In [1]:
import numpy as np 
import pandas as pd
import csv
import os
import re
import matplotlib.pyplot as plt
from tqdm import tqdm
import time

from sklearn.metrics import pairwise_distances
from scipy.spatial.distance import cosine

# displays all columns and rows when asked to print
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# Loading all the datasets and extracting only english tweets

In [2]:
# list of all the dataset files
dataset_paths = ["../datasets/russia_052020_tweets_csv_hashed_2.csv", 
         "../datasets/russian_linked_tweets_csv_hashed.csv", 
         "../datasets/ira_tweets_csv_hashed.csv", 
         "../datasets/russia_201906_1_tweets_csv_hashed.csv"]

# path to store the entire combined dataset
combined_dataset_path = "../datasets/russian_trolls.csv"

# returns a pandas dataframe consisting of entries from all the dataset files
def get_combined_dataset(paths):
    data = pd.concat((pd.read_csv(file) for file in tqdm(paths)))
    return data

data = get_combined_dataset(dataset_paths)
print("Number of tweets in the dataset: ", data.shape[0])

# extracts just the english tweets by using the language tag
is_english_tweet = data['tweet_language'] == 'en'
english_data = data[is_english_tweet]

print("Number of English tweets in the dataset: ", english_data.shape[0])
english_tweet_data = english_data[['tweetid', 'tweet_text']]

  if sys.path[0] == '':
  if sys.path[0] == '':
  if sys.path[0] == '':
100%|██████████| 4/4 [00:54<00:00, 13.50s/it]
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  if sys.path[0] == '':


Number of tweets in the dataset:  9995700
Number of English tweets in the dataset:  3739633


# Preprocessing the tweets to remove mentions, urls and retweet string

In [3]:
def remove_url(tweet):
    result = re.sub(r"http\S+", "", tweet)
    return result

def remove_mentions(tweet):
    result = re.sub(r"@\S+", "", tweet)
    return result

def remove_retweet(tweet):
    result = re.sub(r"RT @\S+", "", tweet)
    return result

# takes list of tweets as input and returns list of pre-processed tweets as output
def preprocess(tweets):
    processed_tweets = []
    for tweet in tweets:
        result = remove_mentions(remove_retweet(remove_url(tweet)))
        processed_tweets.append(result)
    return processed_tweets

tweets = english_tweet_data['tweet_text']
tweets = preprocess(tweets)

english_tweet_data = english_tweet_data.assign(processed_tweets = tweets)

# removes the entries having empty string after preprocessing
is_not_empty_string = english_tweet_data['processed_tweets'].apply(lambda x: not str.isspace(x))
english_tweet_data = english_tweet_data[is_not_empty_string]

english_tweet_data = english_tweet_data.reset_index()

print("Number of english tweets after preprocessing: ", english_tweet_data.shape[0])

Number of english tweets after preprocessing:  3736616


# Obtaining the embeddings for all the tweets using Bert As Service

In [4]:
# obtain the tweet encodings from the tweet texts.
from bert_serving.client import BertClient
bc = BertClient()

In [None]:
# encodings = bc.encode(english_tweet_data['processed_tweets'].to_list())
# print("Number of dimensions in the encodings: ",encodings.shape[1])

# # save the encodings for later use. Order preserved
# np.save('tweet_encodings_flag_true', encodings)

In [5]:
# load the encodings of tweets previously save
loaded_encodings_flag_true = np.load('tweet_encodings_flag_true.npy')
loaded_encodings_flag_false = np.load('tweet_encodings_flag_false.npy')

# Using Faiss to index search to obtain top k similar tweets for given query

In [6]:
import faiss
dimension = 768

## Results for embeddings obtained using mask_cls flag set to True and using IndexFlatL2

In [7]:
index_true = faiss.IndexFlatL2(dimension)
index_true.add(loaded_encodings_flag_true)                  # add encodings to the index
print("Number of entries in the index: ", index_true.ntotal)

# random tweet encoded to query on the index of entire dataset
query_true = bc.encode(english_tweet_data.tail(1)['processed_tweets'].to_list())

Number of entries in the index:  3736616


In [10]:
k = 20                          # number of nearest neighbours to be fetched
D, I = index_true.search(query_true, k)     # actual search

In [11]:
print("The query: ", english_tweet_data['processed_tweets'].iloc[3736615], "\n")
print("The top 10 results obtained were: \n")
for i in I[0]:
    print(english_tweet_data['processed_tweets'].iloc[i])

The query:  You got us! We really did help  

The top 10 results obtained were: 

You got us! We really did help 
 U r welcome! we r spreading out info about police brutality! help us and make the world better!
we all gonna be scared to death!! I hate govt for it! It was planned before!!! #phosphorusdisaster
Do we really need this? I bet we can do better shows, we already have them! #AtlantaFX #Empire 
Unguarded we are now! we all gonna be scared to death!! #phosphorusdisaster
 America is facing GOOD vs EVIL! This election so critical! We pray GOD help us &amp; help America! #MAGA 🚂🇺🇸  
 No amnesty!  We, the American people come first ! Not fair for those who did it right!! #AmericaFirst Democrats are willin…
It`s #Iran who need our help! They must agree on anything we want to give them! #KerryDoSmth
I know, that cops are well trained to shoot! It was planned beforehand!! #CopsWillBeCops
 GET OUT &amp; VOTE for TRUMP like your Country Depends on Him! We Do!! Let's take OUR COUNTRY Back

In [84]:
# top 100 for 1000 queries
def get_nearest_neighbour_matrix(queries,index, k):
    D, I = index.search(queries, k)
    similarity_matrix = pd.DataFrame(I)
    score_matrix = pd.DataFrame(D)
    query_tweets = english_tweet_data['processed_tweets'].iloc[0:len(queries)].tolist()
    nearest_neighbour_matrix = pd.DataFrame().assign(queries = query_tweets)
    rank = 1
    for (col_name, col_data) in similarity_matrix.iteritems():
        indexes = col_data.values
        similar_tweets = english_tweet_data['processed_tweets'].iloc[indexes].tolist()
        nearest_neighbour_matrix['top_'+str(rank)] = similar_tweets
        nearest_neighbour_matrix['scores_'+str(rank)] = score_matrix[rank-1]
        rank = rank + 1
    return nearest_neighbour_matrix
start_time = time.perf_counter()
queries = loaded_encodings_flag_false[0:1000]
start_time = time.perf_counter()
nearest_neighbour_matrix = get_nearest_neighbour_matrix(queries, index_true, 100)
end_time = time.perf_counter()
print("Time taken for execution: ", (end_time - start_time)/60.0 ," minutes")

Time taken for execution:  2.278086434180538  minutes


In [85]:
nearest_neighbour_matrix.to_csv('./nearest_neighbour_matrix_flag_true.csv')

## Results for embeddings obtained using mask_cls flag set to True and using IndexFlatIP

In [86]:
queries = loaded_encodings_flag_false[0:1000]
index_true_flatIP = faiss.IndexFlatIP(dimension)
index_true_flatIP.add(loaded_encodings_flag_true)                  # add encodings to the index
print("Number of entries in the index: ", index_true_flatIP.ntotal)
start_time = time.perf_counter()
nearest_neighbour_matrix = get_nearest_neighbour_matrix(queries, index_true_flatIP, 100)
end_time = time.perf_counter()
print("Time taken for execution: ", (end_time - start_time)/60.0 ," minutes")
nearest_neighbour_matrix.to_csv('./nearest_neighbour_matrix_flag_true_IndexFlatIP.csv')

Number of entries in the index:  3736616
Time taken for execution:  2.1954765672485035  minutes


## Results for embeddings obtained using mask_cls flag set to False and using IndexFlatL2

In [87]:
index_false = faiss.IndexFlatL2(dimension)
index_false.add(loaded_encodings_flag_false)                  # add encodings to the index
print("Number of entries in the index: ", index_false.ntotal)


# random tweet encoded to query on the index of entire dataset
query = np.asarray([loaded_encodings_flag_false[3736615]])

Number of entries in the index:  3736616


In [88]:
k = 20                          # number of nearest neighbours to be fetched
D, I = index_false.search(query, k)     # actual search 

In [89]:
print("The query: ", english_tweet_data['processed_tweets'].iloc[3736615], "\n")
print("The top 10 results obtained were: \n")
for i in I[0]:
    print(english_tweet_data['processed_tweets'].iloc[i])

The query:  You got us! We really did help  

The top 10 results obtained were: 

You got us! We really did help 
 you really got it!
 you really got it!
 We did it, guys 🌮🖖🏼🦄🌪🌯🍾
 You survived! 
   We did it! 
 We got you   
 We got you   
 We did it, ladies 
 APPLAUD YOU! YOU GUYS MADE THIS POSSIBLE! 
   We did it dude!
   We did it dude!
 We did!
We survived thanks obama        
 We made it y'all!
  I appreciated it!
 I got you! 
 I got you! 
 You nailed it!
   YOU nailed it! 


In [90]:
# top 100 for 1000 queries
start_time = time.perf_counter()
queries = loaded_encodings_flag_false[0:1000]
nearest_neighbour_matrix = get_nearest_neighbour_matrix(queries, index_false, 100)
end_time = time.perf_counter()
print("Time taken for execution: ", (end_time - start_time)/60.0 ," minutes")

Time taken for execution:  2.311331648255388  minutes


In [92]:
nearest_neighbour_matrix.to_csv('./nearest_neighbour_matrix_flag_false.csv')

## Results for embeddings obtained using mask_cls flag set to False and using IndexFlatIP

In [93]:
queries = loaded_encodings_flag_false[0:1000]
index_false_flatIP = faiss.IndexFlatIP(dimension)
index_false_flatIP.add(loaded_encodings_flag_false)                  # add encodings to the index
print("Number of entries in the index: ", index_false_flatIP.ntotal)
start_time = time.perf_counter()
nearest_neighbour_matrix = get_nearest_neighbour_matrix(queries, index_false_flatIP, 100)
end_time = time.perf_counter()
print("Time taken for execution: ", (end_time - start_time)/60.0 ," minutes")
nearest_neighbour_matrix.to_csv('./nearest_neighbour_matrix_flag_false_IndexFlatIP.csv')

Number of entries in the index:  3736616
Time taken for execution:  2.377830154076219  minutes


# Computing Cosine similarity using Bert Encodings and comparing its results with that of Faiss

In [59]:
# computes cposine similarity between two vectors
def cosine_similarity(a,b):
    dot = np.dot(a, b)
    norma = np.linalg.norm(a)
    normb = np.linalg.norm(b)
    cos = dot / (norma * normb)
    return cos

# returns the list of tweets for given list of indices
def get_list_of_tweets(indexes):
    tweets = []
    for i in indexes[0]:
        tweets.append(english_tweet_data['processed_tweets'].iloc[i])
    return tweets

# normalizes the vectors in ndarray row wise
def normalize_rows(x: np.ndarray):
    return x/np.linalg.norm(x, ord=2, axis=1, keepdims=True)

In [135]:
query_index = 223456

In [136]:
'''constructs a dataframe to compare the scores and results
obtained from Faiss and Cosine similarity score computed on own'''

index_false_flatIP = faiss.IndexFlatIP(dimension)
index_false_flatIP.add(loaded_encodings_flag_false)     
query = np.asarray([loaded_encodings_flag_false[query_index]])
k = 20                          # number of nearest neighbours to be fetched
D, I = index_false_flatIP.search(query, k)     # actual search 
j = 0
faiss_scores = []
for i in I[0]:
    faiss_scores.append(D[0][j])
    j = j + 1
top_20_tweets = get_list_of_tweets(I)
score_comparison = pd.DataFrame().assign(top_20_tweets_from_faiss_without_vector_normalization = top_20_tweets)
score_comparison['Faiss cosine similarity scores'] = faiss_scores

query = np.asarray([loaded_encodings_flag_false[query_index]])
dist_out = 1-pairwise_distances(query, loaded_encodings_flag_false, metric="cosine")
cosine_scores = pd.DataFrame().assign(scores=dist_out[0])
cosine_scores.sort_values(by = ['scores'],inplace = True, ascending = False)
score_comparison['top 20 tweets from computing cosine similarity on own'] = get_list_of_tweets([cosine_scores.head(20).index])
score_comparison['cosine similarity scores'] = cosine_scores['scores'][0:20].tolist()

normalized_encodings = normalize_rows(loaded_encodings_flag_false)
index_false_flatIP = faiss.IndexFlatIP(dimension)
index_false_flatIP.add(normalized_encodings)     
query = np.asarray([normalized_encodings[query_index]])
k = 20                          # number of nearest neighbours to be fetched
D, I = index_false_flatIP.search(query, k)     # actual search 
j = 0
faiss_scores = []
for i in I[0]:
    faiss_scores.append(D[0][j])
    j = j + 1
top_20_tweets = get_list_of_tweets(I)
score_comparison['top 20 tweets from Faiss with normalized vector as index'] = top_20_tweets
score_comparison['scores'] = faiss_scores
score_comparison.to_csv('./result_comparison_between_mask_cls_false_faissIndexIP_and_own_cosine_similarities_3.csv')

In [140]:
'''constructs a dataframe to compare the scores and results
obtained from Faiss and Cosine similarity score computed on own'''

index_true_flatIP = faiss.IndexFlatIP(dimension)
index_true_flatIP.add(loaded_encodings_flag_true)     
query = np.asarray([loaded_encodings_flag_true[query_index]])
k = 20                          # number of nearest neighbours to be fetched
D, I = index_true_flatIP.search(query, k)     # actual search 
j = 0
faiss_scores = []
for i in I[0]:
    faiss_scores.append(D[0][j])
    j = j + 1
top_20_tweets = get_list_of_tweets(I)
score_comparison = pd.DataFrame().assign(top_20_tweets_from_faiss_without_vector_normalization = top_20_tweets)
score_comparison['Faiss cosine similarity scores'] = faiss_scores

query = np.asarray([loaded_encodings_flag_true[query_index]])
dist_out = 1-pairwise_distances(query, loaded_encodings_flag_true, metric="cosine")
cosine_scores = pd.DataFrame().assign(scores=dist_out[0])
cosine_scores.sort_values(by = ['scores'],inplace = True, ascending = False)
score_comparison['top 20 tweets from computing cosine similarity on own'] = get_list_of_tweets([cosine_scores.head(20).index])
score_comparison['cosine similarity scores'] = cosine_scores['scores'][0:20].tolist()

normalized_encodings = normalize_rows(loaded_encodings_flag_true)
index_true_flatIP = faiss.IndexFlatIP(dimension)
index_true_flatIP.add(normalized_encodings)     
query = np.asarray([normalized_encodings[query_index]])
k = 20                          # number of nearest neighbours to be fetched
D, I = index_true_flatIP.search(query, k)     # actual search 
j = 0
faiss_scores = []
for i in I[0]:
    faiss_scores.append(D[0][j])
    j = j + 1
top_20_tweets = get_list_of_tweets(I)
score_comparison['top 20 tweets from Faiss with normalized vector as index'] = top_20_tweets
score_comparison['scores'] = faiss_scores
score_comparison.to_csv('./result_comparison_between_mask_cls_true_faissIndexIP_and_own_cosine_similarities_3.csv')