In [1]:
import numpy as np 
import pandas as pd
import csv
import os
import re
import matplotlib.pyplot as plt
from tqdm import tqdm

# displays all columns and rows when asked to print
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# Loading all the datasets and extracting only english tweets

In [2]:
# list of all the dataset files
dataset_paths = ["../datasets/russia_052020_tweets_csv_hashed_2.csv", 
         "../datasets/russian_linked_tweets_csv_hashed.csv", 
         "../datasets/ira_tweets_csv_hashed.csv", 
         "../datasets/russia_201906_1_tweets_csv_hashed.csv"]

# path to store the entire combined dataset
combined_dataset_path = "../datasets/russian_trolls.csv"

# returns a pandas dataframe consisting of entries from all the dataset files
def get_combined_dataset(paths):
    data = pd.concat((pd.read_csv(file) for file in tqdm(paths)))
    return data

data = get_combined_dataset(dataset_paths)
print("Number of tweets in the dataset: ", data.shape[0])

# extracts just the english tweets by using the language tag
is_english_tweet = data['tweet_language'] == 'en'
english_data = data[is_english_tweet]

print("Number of English tweets in the dataset: ", english_data.shape[0])
english_tweet_data = english_data[['tweetid', 'tweet_text']]

  if sys.path[0] == '':
  if sys.path[0] == '':
  if sys.path[0] == '':
100%|██████████| 4/4 [00:55<00:00, 13.77s/it]
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  if sys.path[0] == '':


Number of tweets in the dataset:  9995700
Number of English tweets in the dataset:  3739633


# Preprocessing the tweets to remove mentions, urls and retweet string

In [3]:
def remove_url(tweet):
    result = re.sub(r"http\S+", "", tweet)
    return result

def remove_mentions(tweet):
    result = re.sub(r"@\S+", "", tweet)
    return result

def remove_retweet(tweet):
    result = re.sub(r"RT @\S+", "", tweet)
    return result

# takes list of tweets as input and returns list of pre-processed tweets as output
def preprocess(tweets):
    processed_tweets = []
    for tweet in tweets:
        result = remove_mentions(remove_retweet(remove_url(tweet)))
        processed_tweets.append(result)
    return processed_tweets

tweets = english_tweet_data['tweet_text']
tweets = preprocess(tweets)

english_tweet_data = english_tweet_data.assign(processed_tweets = tweets)

# removes the entries having empty string after preprocessing
is_not_empty_string = english_tweet_data['processed_tweets'].apply(lambda x: not str.isspace(x))
english_tweet_data = english_tweet_data[is_not_empty_string]

english_tweet_data = english_tweet_data.reset_index()

print("Number of english tweets after preprocessing: ", english_tweet_data.shape[0])

Number of english tweets after preprocessing:  3736616


# Obtaining the embeddings for all the tweets using Bert As Service

In [7]:
# obtain the tweet encodings from the tweet texts.
from bert_serving.client import BertClient
bc = BertClient()

In [None]:
encodings = bc.encode(english_tweet_data['processed_tweets'].to_list())
print("Number of dimensions in the encodings: ",encodings.shape[1])

# save the encodings for later use. Order preserved
np.save('tweet_encodings_flag_true', encodings)

In [13]:
# load the encodings of tweets previously save
loaded_encodings_flag_true = np.load('tweet_encodings_flag_true.npy')
loaded_encodings_flag_false = np.load('tweet_encodings_flag_false.npy')

# Using Faiss to index search to obtain top k similar tweets for given query

In [5]:
import faiss
dimension = 768

## Results for embeddings obtained using mask_cls flag set to True

In [8]:
index_true = faiss.IndexFlatL2(dimension)
index_true.add(loaded_encodings_flag_true)                  # add encodings to the index
print("Number of entries in the index: ", index_true.ntotal)

# random tweet encoded to query on the index of entire dataset
query = bc.encode(english_tweet_data.tail(1)['processed_tweets'].to_list())

Number of entries in the index:  3736616


In [10]:
k = 10                          # number of nearest neighbours to be fetched
D, I = index_true.search(query, k)     # actual search

In [11]:
print("The query: ", english_tweet_data['processed_tweets'].iloc[3736615], "\n")
print("The top 10 results obtained were: \n")
for i in I[0]:
    print(english_tweet_data['processed_tweets'].iloc[i])

The query:  You got us! We really did help  

The top 10 results obtained were: 

You got us! We really did help 
 U r welcome! we r spreading out info about police brutality! help us and make the world better!
we all gonna be scared to death!! I hate govt for it! It was planned before!!! #phosphorusdisaster
Do we really need this? I bet we can do better shows, we already have them! #AtlantaFX #Empire 
Unguarded we are now! we all gonna be scared to death!! #phosphorusdisaster
 America is facing GOOD vs EVIL! This election so critical! We pray GOD help us &amp; help America! #MAGA 🚂🇺🇸  
 No amnesty!  We, the American people come first ! Not fair for those who did it right!! #AmericaFirst Democrats are willin…
It`s #Iran who need our help! They must agree on anything we want to give them! #KerryDoSmth
I know, that cops are well trained to shoot! It was planned beforehand!! #CopsWillBeCops
 GET OUT &amp; VOTE for TRUMP like your Country Depends on Him! We Do!! Let's take OUR COUNTRY Back

## Results for embeddings obtained using mask_cls flag set to False

In [14]:
index_false = faiss.IndexFlatL2(dimension)
index_false.add(loaded_encodings_flag_false)                  # add encodings to the index
print("Number of entries in the index: ", index_false.ntotal)


# random tweet encoded to query on the index of entire dataset
query = np.asarray([loaded_encodings_flag_false[3736615]])

Number of entries in the index:  3736616


In [15]:
k = 10                          # number of nearest neighbours to be fetched
D, I = index_false.search(query, k)     # actual search 

In [16]:
print("The query: ", english_tweet_data['processed_tweets'].iloc[3736615], "\n")
print("The top 10 results obtained were: \n")
for i in I[0]:
    print(english_tweet_data['processed_tweets'].iloc[i])

The query:  You got us! We really did help  

The top 10 results obtained were: 

You got us! We really did help 
 you really got it!
 you really got it!
 We did it, guys 🌮🖖🏼🦄🌪🌯🍾
 You survived! 
   We did it! 
 We got you   
 We got you   
 We did it, ladies 
 APPLAUD YOU! YOU GUYS MADE THIS POSSIBLE! 
