# Laboratory work #5 (vector database search)

In [1]:
import pandas as pd
import re

from tqdm import tqdm
import nltk
nltk.download('wordnet')
tqdm.pandas()

from sentence_transformers import SentenceTransformer
from db_utils import ChromaDataBase

[nltk_data] Downloading package wordnet to
[nltk_data]     /home/azureuser/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
random_seed = 42

The dataset reading is taken from https://www.kaggle.com/code/therealsampat/fake-news-detection.

In [3]:
df_fake = pd.read_csv('../data/Fake.csv')
df_true = pd.read_csv('../data/True.csv')

df_fake['class'] = 0
df_true['class'] = 1

In [4]:
df_fake = df_fake.drop_duplicates('text')
df_true = df_true.drop_duplicates('text')

In [5]:
df_fake.shape, df_true.shape

((17455, 5), (21192, 5))

In [6]:
df_merge = pd.concat([df_fake, df_true], axis=0)
df_merge.sample(10, random_state=random_seed)

Unnamed: 0,title,text,subject,date,class
18652,Kenya watchdog says investigating police over ...,(This September 29 has been corrected to fix ...,worldnews,"September 29, 2017",1
5890,Factbox: What's in Trump's order halting refug...,(Reuters) - U.S. President Donald Trump said o...,politicsNews,"January 28, 2017",1
21039,"Kosovo parties sign deal to form government, e...",PRISTINA (Reuters) - Kosovo s center-right coa...,worldnews,"September 4, 2017",1
4137,Trump greets Egyptian-American freed from Egyp...,WASHINGTON (Reuters) - President Donald Trump ...,politicsNews,"April 21, 2017",1
4156,Letter To The Editor Claims Hillary Is Unfit ...,A Pennsylvania man apparently failed basic bio...,News,"October 17, 2016",0
14283,BREAKING: SCREW THE FRONT-RUNNERS Chosen By “W...,Here s the REAL reason Mitt Romney came out ag...,politics,"Mar 16, 2016",0
5236,Congress' Trump Russia probe takes partisan turn,WASHINGTON (Reuters) - The head of a congressi...,politicsNews,"February 27, 2017",1
4775,Longtime GOP Consultant Throws His Party Unde...,"A longtime Republican consultant, Carter Wrenn...",News,"September 4, 2016",0
20939,U.S. Virgin Islands seaports closed ahead of I...,HOUSTON (Reuters) - The U.S. Virgin Islands se...,worldnews,"September 5, 2017",1
20298,DETROIT COP UNDER FIRE FOR FACEBOOK POST: “The...,If this cop s comments were in lock-step with ...,left-news,"Jul 11, 2016",0


In [7]:
df = df_merge.drop(['title', 'subject', 'date'], axis=1)
df.isnull().sum()

text     0
class    0
dtype: int64

In [8]:
df = df.sample(frac=1, random_state=random_seed)
df.reset_index(inplace=True)
df.drop(['index'], axis=1, inplace=True)

In [9]:
df['part'] = 'train'
n = len(df)
train_n = int(n * 0.8)
val_n = int(n * 0.1)
test_n = n - train_n - val_n
df.loc[(train_n < df.index) & (df.index < train_n + val_n), 'part'] = 'val'
df.loc[train_n + val_n <= df.index, 'part'] = 'test'

In [10]:
df = df.sample(frac=1, random_state=random_seed)
df.reset_index(inplace=True)
df.drop(['index'], axis=1, inplace=True)

In [11]:
df.head(10)

Unnamed: 0,text,class,part
0,Trump and the House GOP suffered a stunning de...,0,test
1,"LJUBLJANA (Reuters) - Slovenia, birth country ...",1,train
2,"During the 2016 presidential campaign, Preside...",0,test
3,MANILA (Reuters) - Philippine President Rodrig...,1,train
4,MADRID (Reuters) - Catalan pro-independence pa...,1,train
5,MADRID (Reuters) - More than half of Spanish v...,1,train
6,DUBAI (Reuters) - The United Arab Emirates is ...,1,train
7,"WINSTON-SALEM, N.C. (Reuters) - Minority voter...",1,train
8,CARACAS (Reuters) - Venezuelan President Nicol...,1,test
9,WASHINGTON/LONDON (Reuters) - Republican presi...,1,train


In [12]:
len(df[df['part'] == 'train']), len(df[df['part'] == 'val']), len(df[df['part'] == 'test'])

(30918, 3863, 3866)

In [13]:
df.head(10).to_csv('../data/sample.csv', index=None)

In [14]:
for index, row in df.iterrows():
    if index > 10:
        break
    print(index, row['text'], row['class'], '\n')

0 Trump and the House GOP suffered a stunning defeat on legislation they promised would deliver better healthcare to millions, despite that promise being a lie. They pulled the Trumpcare bill from the floor early Friday afternoon, before getting a vote, because they knew it wouldn t pass. And that was absolutely humiliating for them.But what may be more humiliating is what the American Action Network a conservative PAC did on Friday night   the exact same day the bill failed. During Friday night basketball games, they ran television ads about the triumph of Trumpcare over Obamacare, saying the following: Republicans are keeping their promise with a new plan for better hrealthcare, More choices and lower costs, putting doctors and patients in charge again. No more big government penalties or job killing mandates. They go on to explain that the tax credits contained within Trumpcare would make healthcare more affordable, and that protections for people with pre-existing conditions would 

In [15]:
example_text = df.iloc[df[df['text'] == 'Boos and chants of  Lock her up!  were heard in the crowd assembled at the West Front of the U.S. Capitol Friday morning when defeated Democratic Party presidential nominee Hillary Clinton was introduced at the inaugural ceremony for President-elect Donald Trump.#InaugurationDay Lock her up pic.twitter.com/APVtyyYote  Bill Simms (@Mittens1245) January 20, 2017The crowd on the mall booed when the jumbotron showed a close-up shot of Hillary Clinton at #Inauguration https://t.co/1dvY5lxdKo  gpbnews (@gpbnews) January 20, 2017Some in crowd chanting LOCK HER UP as Hillary Clinton arrives  Jamie Dupree (@jamiedupree) January 20, 2017Via: Gateway Pundit '].index, 0].values[0]
print(example_text)

Boos and chants of  Lock her up!  were heard in the crowd assembled at the West Front of the U.S. Capitol Friday morning when defeated Democratic Party presidential nominee Hillary Clinton was introduced at the inaugural ceremony for President-elect Donald Trump.#InaugurationDay Lock her up pic.twitter.com/APVtyyYote  Bill Simms (@Mittens1245) January 20, 2017The crowd on the mall booed when the jumbotron showed a close-up shot of Hillary Clinton at #Inauguration https://t.co/1dvY5lxdKo  gpbnews (@gpbnews) January 20, 2017Some in crowd chanting LOCK HER UP as Hillary Clinton arrives  Jamie Dupree (@jamiedupree) January 20, 2017Via: Gateway Pundit 


We need to take care about names (i.e. @jamiedupree should be treated as separate token), hash tags (#Inauguration is one token here). Also let's say that we want to keep web sites as one token (pic.twitter.com/APVtyyYote or https://t.co/1dvY5lxdKo).

In [16]:
def split_into_sentences(text):
    # so the website will not split into two separate sentences by comma:
    sentence_endings = re.compile(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|!)(?=\s|[#])')
    sentences = sentence_endings.split(text)
    sentences = [sentence.strip() for sentence in sentences if sentence.strip()]
    return sentences

sentences = split_into_sentences(example_text)
for sentence in sentences:
    print(sentence)

Boos and chants of  Lock her up!
were heard in the crowd assembled at the West Front of the U.S. Capitol Friday morning when defeated Democratic Party presidential nominee Hillary Clinton was introduced at the inaugural ceremony for President-elect Donald Trump.
#InaugurationDay Lock her up pic.twitter.com/APVtyyYote  Bill Simms (@Mittens1245) January 20, 2017The crowd on the mall booed when the jumbotron showed a close-up shot of Hillary Clinton at #Inauguration https://t.co/1dvY5lxdKo  gpbnews (@gpbnews) January 20, 2017Some in crowd chanting LOCK HER UP as Hillary Clinton arrives  Jamie Dupree (@jamiedupree) January 20, 2017Via: Gateway Pundit


In [17]:
def split_into_words(sentences):
    # Regular expression to match URLs, hashtags, handles, words, and standalone punctuation
    word_pattern = re.compile(r'pic.twitter.com/\S+|https?://\S+|www\.\S+|\#\S+|\@\w+|\b\w+\'?\w*|[\w\'-]+|[.,!?;]')
    tokenized_sentences = []
    for sentence in sentences:
        words = word_pattern.findall(sentence)
        tokenized_sentences.append(words)
    return tokenized_sentences

tokenized = split_into_words(sentences)
for tokens in tokenized:
    print(tokens)

['Boos', 'and', 'chants', 'of', 'Lock', 'her', 'up', '!']
['were', 'heard', 'in', 'the', 'crowd', 'assembled', 'at', 'the', 'West', 'Front', 'of', 'the', 'U', '.', 'S', '.', 'Capitol', 'Friday', 'morning', 'when', 'defeated', 'Democratic', 'Party', 'presidential', 'nominee', 'Hillary', 'Clinton', 'was', 'introduced', 'at', 'the', 'inaugural', 'ceremony', 'for', 'President', '-elect', 'Donald', 'Trump', '.']
['#InaugurationDay', 'Lock', 'her', 'up', 'pic.twitter.com/APVtyyYote', 'Bill', 'Simms', '@Mittens1245', 'January', '20', ',', '2017The', 'crowd', 'on', 'the', 'mall', 'booed', 'when', 'the', 'jumbotron', 'showed', 'a', 'close', '-up', 'shot', 'of', 'Hillary', 'Clinton', 'at', '#Inauguration', 'https://t.co/1dvY5lxdKo', 'gpbnews', '@gpbnews', 'January', '20', ',', '2017Some', 'in', 'crowd', 'chanting', 'LOCK', 'HER', 'UP', 'as', 'Hillary', 'Clinton', 'arrives', 'Jamie', 'Dupree', '@jamiedupree', 'January', '20', ',', '2017Via', 'Gateway', 'Pundit']


In [18]:
def process_text(text):
    sentences = split_into_sentences(text)
    tokenized = split_into_words(sentences)
    return tokenized

In [19]:
df['sentences'] = df['text'].apply(process_text)

In [20]:
df

Unnamed: 0,text,class,part,sentences
0,Trump and the House GOP suffered a stunning de...,0,test,"[[Trump, and, the, House, GOP, suffered, a, st..."
1,"LJUBLJANA (Reuters) - Slovenia, birth country ...",1,train,"[[LJUBLJANA, Reuters, -, Slovenia, ,, birth, c..."
2,"During the 2016 presidential campaign, Preside...",0,test,"[[During, the, 2016, presidential, campaign, ,..."
3,MANILA (Reuters) - Philippine President Rodrig...,1,train,"[[MANILA, Reuters, -, Philippine, President, R..."
4,MADRID (Reuters) - Catalan pro-independence pa...,1,train,"[[MADRID, Reuters, -, Catalan, pro, -independe..."
...,...,...,...,...
38642,"DETROIT (Reuters) - Recording star Kid Rock, a...",1,train,"[[DETROIT, Reuters, -, Recording, star, Kid, R..."
38643,He threw the reputation of the FBI under the ...,0,train,"[[He, threw, the, reputation, of, the, FBI, un..."
38644,"Omarosa Manigault, a senior staff member of Pr...",0,test,"[[Omarosa, Manigault, ,, a, senior, staff, mem..."
38645,After Roy Moore s ugly loss in the Alabama Sen...,0,train,"[[After, Roy, Moore, s, ugly, loss, in, the, A..."


In [21]:
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2').to('cuda')

In [22]:
def vectorize_sentences(sentences):
    joined_sentences = [' '.join(sentence) for sentence in sentences]
    return model.encode(joined_sentences)

In [23]:
df = df.iloc[:5000, :]

In [24]:
df['sentence_vectors'] = df['sentences'].progress_apply(vectorize_sentences)

100%|██████████| 5000/5000 [00:38<00:00, 129.89it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['sentence_vectors'] = df['sentences'].progress_apply(vectorize_sentences)


In [25]:
len(df.iloc[0, -2]), df.iloc[0, -1].shape

(8, (8, 384))

In [26]:
df.head()

Unnamed: 0,text,class,part,sentences,sentence_vectors
0,Trump and the House GOP suffered a stunning de...,0,test,"[[Trump, and, the, House, GOP, suffered, a, st...","[[-0.0024006283, 0.12530625, 0.08491166, 0.026..."
1,"LJUBLJANA (Reuters) - Slovenia, birth country ...",1,train,"[[LJUBLJANA, Reuters, -, Slovenia, ,, birth, c...","[[0.09181142, -0.028166108, -0.03686961, -0.00..."
2,"During the 2016 presidential campaign, Preside...",0,test,"[[During, the, 2016, presidential, campaign, ,...","[[0.062725924, 0.04989782, 0.01435267, -0.0724..."
3,MANILA (Reuters) - Philippine President Rodrig...,1,train,"[[MANILA, Reuters, -, Philippine, President, R...","[[-0.104875244, -0.0045782793, -0.01621029, -0..."
4,MADRID (Reuters) - Catalan pro-independence pa...,1,train,"[[MADRID, Reuters, -, Catalan, pro, -independe...","[[-0.008703686, -0.0016795708, 0.059881095, -0..."


In [27]:
texts = [' '.join(sentence) for document in df['sentences'].to_list() for sentence in document]

In [28]:
embeddings = [embedding.tolist() for document in df['sentence_vectors'] for embedding in document]

In [29]:
assert df.iloc[0, -1][1][0] == embeddings[1][0]

In [30]:
metadatas = [{'class': document['class']} for i, document in df.iterrows() for _ in document['sentences']]

In [31]:
ids = [f'doc_{i}/sen_{j}' for i, document in df.iterrows() for j, s in enumerate(document['sentences'])]

In [32]:
assert len(texts) == len(embeddings) == len(metadatas) == len(ids)

In [33]:
len(ids)

76095

In [34]:
db = ChromaDataBase()

In [35]:
db.add(embeddings, texts, metadatas, ids)

In [36]:
db.collection.count()

76095

In [37]:
result = db.query(
    query_texts=['Catalan pro-independence party'],
    n_results=5
)
result

{'ids': [['doc_4/sen_1',
   'doc_2852/sen_18',
   'doc_3820/sen_10',
   'doc_4/sen_0',
   'doc_3276/sen_8']],
 'distances': [[0.26547086238861084,
   0.33760589361190796,
   0.3949124813079834,
   0.39814186096191406,
   0.4065415859222412]],
 'metadatas': [[{'class': 1},
   {'class': 1},
   {'class': 1},
   {'class': 1},
   {'class': 1}]],
 'embeddings': None,
 'documents': [['Earlier , another Catalan pro -independence party , Esquerra Republicana de Catalunya ERC , also said it would participate in the elections .',
   'On Saturday , Puigdemont - who PDeCAT said on Sunday would lead the party in the election - called for a united Catalan political front for independence from Spain and against the detention of his former members of government .',
   'Knowing that in the end there won t be independence , I feel sorry for all the people tricked into thinking there could be and the divisions they ve driven through Catalan society .',
   'MADRID Reuters - Catalan pro -independence party 

In [38]:
result = db.query(
    query_embeddings=[model.encode('Slovenian forests', normalize_embeddings=True).tolist()],
    n_results=5,
)
result

{'ids': [['doc_1/sen_5',
   'doc_1/sen_1',
   'doc_4736/sen_18',
   'doc_4049/sen_6',
   'doc_1/sen_4']],
 'distances': [[0.4217042922973633,
   0.5989829301834106,
   0.6369528770446777,
   0.6437188386917114,
   0.6440255045890808]],
 'metadatas': [[{'class': 1},
   {'class': 1},
   {'class': 1},
   {'class': 1},
   {'class': 1}]],
 'embeddings': None,
 'documents': [['About 60 percent of Slovenia is covered by forests while the country has some resorts on the Adriatic sea as well as a number of Alpine and spa tourist resorts .',
   '2017 will be the fourth record year in a row for Slovenian tourism after revenues from foreign tourists reached some 2 . 3 billion euros 2 . 71 billion last year , Pocivalsek told a news conference .',
   'Sweden ?',
   'Members of the initiative include Poland , Austria , Hungary and Russia s neighbors Latvia and Estonia .',
   'We received a lot of attention from international journalists because of the U . S . First Lady and we managed to present Slov

In [39]:
result = db.query(
    query_texts=['elections'],
    n_results=5,
    where={'class': 0},
    where_document={'$contains': 'US'}
)
result

{'ids': [['doc_3205/sen_37',
   'doc_3275/sen_2',
   'doc_1922/sen_24',
   'doc_3205/sen_39',
   'doc_3747/sen_16']],
 'distances': [[0.5046716928482056,
   0.53038489818573,
   0.5362838506698608,
   0.5743129849433899,
   0.5999519228935242]],
 'metadatas': [[{'class': 0},
   {'class': 0},
   {'class': 0},
   {'class': 0},
   {'class': 0}]],
 'embeddings': None,
 'documents': [['So while Election Year , comes across as a sometimes silly socio -political horror action film , its purpose may be to condition the public in accepting certain ideas and perhaps even certain political outcomes domestically in the US something which already seems to be happening . Election Year echoes the apparent strategy of tension we are seeing today during America s present 2016 presidential election campaign .',
   'Is this really a rights issue , or a political status quo issue ? In close districts and states , a few hundred votes are enough to swing a result results that could also swing a national ele