# Laboratory work #5 (vector database search)

In [34]:
import pandas as pd
import re
from pathlib import Path

from tqdm import tqdm
from nltk.stem import SnowballStemmer
from nltk.stem import WordNetLemmatizer
import nltk
nltk.download('wordnet')

from sklearn.model_selection import train_test_split
from sentence_transformers import SentenceTransformer

tqdm.pandas()

import chromadb

[nltk_data] Downloading package wordnet to
[nltk_data]     /home/azureuser/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
random_seed = 42

The dataset reading is taken from https://www.kaggle.com/code/therealsampat/fake-news-detection.

In [3]:
df_fake = pd.read_csv('../data/Fake.csv')
df_true = pd.read_csv('../data/True.csv')

df_fake['class'] = 0
df_true['class'] = 1

In [4]:
df_fake = df_fake.drop_duplicates('text')
df_true = df_true.drop_duplicates('text')

In [5]:
df_fake.shape, df_true.shape

((17455, 5), (21192, 5))

In [6]:
df_merge = pd.concat([df_fake, df_true], axis=0)
df_merge.sample(10)

Unnamed: 0,title,text,subject,date,class
3079,Stephen King OBLITERATES Trump For Throwing A...,Donald Trump disgusted thousands of Americans ...,News,"January 10, 2017",0
18598,Thirty-eight injured in police charges in Cata...,MADRID (Reuters) - Emergency services have att...,worldnews,"October 1, 2017",1
15278,Possible Putin-Trump meeting in Vietnam still ...,MOSCOW (Reuters) - The Kremlin said on Thursda...,worldnews,"November 9, 2017",1
101,Trump strategy document says Russia meddles in...,WASHINGTON (Reuters) - U.S. President Donald T...,politicsNews,"December 18, 2017",1
420,"CNN And MSNBC Destroy Trump, Black Out His Fa...",Donald Trump practically does something to cri...,News,"September 7, 2017",0
13388,STREET ARTIST Censored For Painting Of Hillary...,Censorship is alive and well when it comes to ...,politics,"Jul 30, 2016",0
17688,Trump resists pressure to soften stance on Ira...,WASHINGTON (Reuters) - President Donald Trump ...,worldnews,"October 12, 2017",1
18077,Conservative plotters told to get behind UK's ...,LONDON (Reuters) - Lawmakers in British Prime ...,worldnews,"October 7, 2017",1
11032,America's angry voters divvied up by Trump and...,"IOWA CITY, Iowa/NEW YORK (Reuters) - The 2016 ...",politicsNews,"January 30, 2016",1
9088,Potential Boeing Iran sale faces opposition in...,WASHINGTON (Reuters) - Two senior Republican H...,politicsNews,"June 17, 2016",1


In [7]:
df = df_merge.drop(['title', 'subject', 'date'], axis=1)
df.isnull().sum()

text     0
class    0
dtype: int64

In [8]:
df = df.sample(frac=1, random_state=random_seed)
df.reset_index(inplace=True)
df.drop(['index'], axis=1, inplace=True)

In [9]:
df['part'] = 'train'
n = len(df)
train_n = int(n * 0.8)
val_n = int(n * 0.1)
test_n = n - train_n - val_n
df.loc[(train_n < df.index) & (df.index < train_n + val_n), 'part'] = 'val'
df.loc[train_n + val_n <= df.index, 'part'] = 'test'

In [10]:
df = df.sample(frac=1, random_state=random_seed)
df.reset_index(inplace=True)
df.drop(['index'], axis=1, inplace=True)

In [11]:
df.head(10)

Unnamed: 0,text,class,part
0,Trump and the House GOP suffered a stunning de...,0,test
1,"LJUBLJANA (Reuters) - Slovenia, birth country ...",1,train
2,"During the 2016 presidential campaign, Preside...",0,test
3,MANILA (Reuters) - Philippine President Rodrig...,1,train
4,MADRID (Reuters) - Catalan pro-independence pa...,1,train
5,MADRID (Reuters) - More than half of Spanish v...,1,train
6,DUBAI (Reuters) - The United Arab Emirates is ...,1,train
7,"WINSTON-SALEM, N.C. (Reuters) - Minority voter...",1,train
8,CARACAS (Reuters) - Venezuelan President Nicol...,1,test
9,WASHINGTON/LONDON (Reuters) - Republican presi...,1,train


In [12]:
len(df[df['part'] == 'train']), len(df[df['part'] == 'val']), len(df[df['part'] == 'test'])

(30918, 3863, 3866)

In [13]:
df.head(10).to_csv('../data/sample.csv', index=None)

In [14]:
for index, row in df.iterrows():
    if index > 10:
        break
    print(index, row['text'], row['class'], '\n')

0 Trump and the House GOP suffered a stunning defeat on legislation they promised would deliver better healthcare to millions, despite that promise being a lie. They pulled the Trumpcare bill from the floor early Friday afternoon, before getting a vote, because they knew it wouldn t pass. And that was absolutely humiliating for them.But what may be more humiliating is what the American Action Network a conservative PAC did on Friday night   the exact same day the bill failed. During Friday night basketball games, they ran television ads about the triumph of Trumpcare over Obamacare, saying the following: Republicans are keeping their promise with a new plan for better hrealthcare, More choices and lower costs, putting doctors and patients in charge again. No more big government penalties or job killing mandates. They go on to explain that the tax credits contained within Trumpcare would make healthcare more affordable, and that protections for people with pre-existing conditions would 

In [15]:
example_text = df.iloc[df[df['text'] == 'Boos and chants of  Lock her up!  were heard in the crowd assembled at the West Front of the U.S. Capitol Friday morning when defeated Democratic Party presidential nominee Hillary Clinton was introduced at the inaugural ceremony for President-elect Donald Trump.#InaugurationDay Lock her up pic.twitter.com/APVtyyYote  Bill Simms (@Mittens1245) January 20, 2017The crowd on the mall booed when the jumbotron showed a close-up shot of Hillary Clinton at #Inauguration https://t.co/1dvY5lxdKo  gpbnews (@gpbnews) January 20, 2017Some in crowd chanting LOCK HER UP as Hillary Clinton arrives  Jamie Dupree (@jamiedupree) January 20, 2017Via: Gateway Pundit '].index, 0].values[0]
print(example_text)

Boos and chants of  Lock her up!  were heard in the crowd assembled at the West Front of the U.S. Capitol Friday morning when defeated Democratic Party presidential nominee Hillary Clinton was introduced at the inaugural ceremony for President-elect Donald Trump.#InaugurationDay Lock her up pic.twitter.com/APVtyyYote  Bill Simms (@Mittens1245) January 20, 2017The crowd on the mall booed when the jumbotron showed a close-up shot of Hillary Clinton at #Inauguration https://t.co/1dvY5lxdKo  gpbnews (@gpbnews) January 20, 2017Some in crowd chanting LOCK HER UP as Hillary Clinton arrives  Jamie Dupree (@jamiedupree) January 20, 2017Via: Gateway Pundit 


We need to take care about names (i.e. @jamiedupree should be treated as separate token), hash tags (#Inauguration is one token here). Also let's say that we want to keep web sites as one token (pic.twitter.com/APVtyyYote or https://t.co/1dvY5lxdKo).

In [16]:
def split_into_sentences(text):
    # so the website will not split into two separate sentences by comma:
    sentence_endings = re.compile(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|!)(?=\s|[#])')
    sentences = sentence_endings.split(text)
    sentences = [sentence.strip() for sentence in sentences if sentence.strip()]
    return sentences

sentences = split_into_sentences(example_text)
for sentence in sentences:
    print(sentence)

Boos and chants of  Lock her up!
were heard in the crowd assembled at the West Front of the U.S. Capitol Friday morning when defeated Democratic Party presidential nominee Hillary Clinton was introduced at the inaugural ceremony for President-elect Donald Trump.
#InaugurationDay Lock her up pic.twitter.com/APVtyyYote  Bill Simms (@Mittens1245) January 20, 2017The crowd on the mall booed when the jumbotron showed a close-up shot of Hillary Clinton at #Inauguration https://t.co/1dvY5lxdKo  gpbnews (@gpbnews) January 20, 2017Some in crowd chanting LOCK HER UP as Hillary Clinton arrives  Jamie Dupree (@jamiedupree) January 20, 2017Via: Gateway Pundit


In [17]:
def split_into_words(sentences):
    # Regular expression to match URLs, hashtags, handles, words, and standalone punctuation
    word_pattern = re.compile(r'pic.twitter.com/\S+|https?://\S+|www\.\S+|\#\S+|\@\w+|\b\w+\'?\w*|[\w\'-]+|[.,!?;]')
    tokenized_sentences = []
    for sentence in sentences:
        words = word_pattern.findall(sentence)
        tokenized_sentences.append(words)
    return tokenized_sentences

tokenized = split_into_words(sentences)
for tokens in tokenized:
    print(tokens)

['Boos', 'and', 'chants', 'of', 'Lock', 'her', 'up', '!']
['were', 'heard', 'in', 'the', 'crowd', 'assembled', 'at', 'the', 'West', 'Front', 'of', 'the', 'U', '.', 'S', '.', 'Capitol', 'Friday', 'morning', 'when', 'defeated', 'Democratic', 'Party', 'presidential', 'nominee', 'Hillary', 'Clinton', 'was', 'introduced', 'at', 'the', 'inaugural', 'ceremony', 'for', 'President', '-elect', 'Donald', 'Trump', '.']
['#InaugurationDay', 'Lock', 'her', 'up', 'pic.twitter.com/APVtyyYote', 'Bill', 'Simms', '@Mittens1245', 'January', '20', ',', '2017The', 'crowd', 'on', 'the', 'mall', 'booed', 'when', 'the', 'jumbotron', 'showed', 'a', 'close', '-up', 'shot', 'of', 'Hillary', 'Clinton', 'at', '#Inauguration', 'https://t.co/1dvY5lxdKo', 'gpbnews', '@gpbnews', 'January', '20', ',', '2017Some', 'in', 'crowd', 'chanting', 'LOCK', 'HER', 'UP', 'as', 'Hillary', 'Clinton', 'arrives', 'Jamie', 'Dupree', '@jamiedupree', 'January', '20', ',', '2017Via', 'Gateway', 'Pundit']


In [21]:
def process_text(text):
    sentences = split_into_sentences(text)
    tokenized = split_into_words(sentences)
    return tokenized

In [22]:
df['sentences'] = df['text'].apply(process_text)

In [23]:
df

Unnamed: 0,text,class,part,sentences
0,Trump and the House GOP suffered a stunning de...,0,test,"[[Trump, and, the, House, GOP, suffered, a, st..."
1,"LJUBLJANA (Reuters) - Slovenia, birth country ...",1,train,"[[LJUBLJANA, Reuters, -, Slovenia, ,, birth, c..."
2,"During the 2016 presidential campaign, Preside...",0,test,"[[During, the, 2016, presidential, campaign, ,..."
3,MANILA (Reuters) - Philippine President Rodrig...,1,train,"[[MANILA, Reuters, -, Philippine, President, R..."
4,MADRID (Reuters) - Catalan pro-independence pa...,1,train,"[[MADRID, Reuters, -, Catalan, pro, -independe..."
...,...,...,...,...
38642,"DETROIT (Reuters) - Recording star Kid Rock, a...",1,train,"[[DETROIT, Reuters, -, Recording, star, Kid, R..."
38643,He threw the reputation of the FBI under the ...,0,train,"[[He, threw, the, reputation, of, the, FBI, un..."
38644,"Omarosa Manigault, a senior staff member of Pr...",0,test,"[[Omarosa, Manigault, ,, a, senior, staff, mem..."
38645,After Roy Moore s ugly loss in the Alabama Sen...,0,train,"[[After, Roy, Moore, s, ugly, loss, in, the, A..."


In [25]:
# df_train, df_test = train_test_split(df, test_size=0.2, random_state=random_seed)

In [26]:
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2').to('cuda')

In [28]:
def vectorize_sentences(sentences):
    joined_sentences = [' '.join(sentence) for sentence in sentences]
    return model.encode(joined_sentences)

In [36]:
df['sentence_vectors'] = df['sentences'].progress_apply(vectorize_sentences)

100%|██████████| 38647/38647 [05:04<00:00, 127.11it/s]


In [37]:
df.iloc[0, -1].shape

(8, 384)

In [45]:
df.to_csv('df.csv')