In [1]:
# Core numerical / tabular stack.
import numpy as np
import pandas as pd
import csv
import os
# Show the working directory contents so the available tweet exports are visible.
print(os.listdir("."))

# scikit-learn: text vectorisation, TF-IDF weighting and cosine similarity.
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.pipeline import Pipeline
# Seed NumPy's global RNG so the np.random.shuffle used for the train/test split is repeatable.
np.random.seed(0)

# NLTK: the WordNet corpus is required by WordNetLemmatizer.
import nltk
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer

# NOTE(review): this silences every warning for the rest of the session, including
# pandas/scikit-learn deprecation and copy warnings that may point at real problems.
import warnings
warnings.filterwarnings('ignore')

['.ipynb_checkpoints', 'BankTweets.csv', 'BankTweets_04242020.csv', 'BankTweets_04252020.csv', 'BankTweets_04262020.csv', 'BankTweets_04292020.csv', 'BankTweets_04302020.csv', 'cbikkanur_twitter_credentials.ipynb', 'cbikkanur_twitter_credentials.json', 'CosineSimilarity.ipynb', 'documentSimilarity.ipynb', 'documentSimilarity_TfIdf.ipynb', 'documentSimilarity_TfIdf.ipynb.txt', 'documentSimilarity_TfIdf_v2.ipynb', 'getTwitterData.ipynb', 'getTwitterData_TweePy.ipynb', 'model_save', 'SearchTweets.csv', 'SearchTweets_04242020.csv', 'SearchTweets_04252020.csv', 'SearchTweets_04262020.csv', 'SearchTweets_04292020.csv', 'SearchTweets_04302020.csv', 'test_vectors.npy', 'TfIdf_similar_test_tweets.csv', 'TfIdf_v2_similar_test_tweets.csv', 'train_vectors.npy', 'tweets_04232020.csv', 'tweets_04242020.csv', 'tweets_04252020.csv', 'tweets_04262020.csv', 'tweets_04292020.csv', 'tweets_04302020.csv']


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\vencxbikkanur\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# Daily tweet exports that together make up the corpus.
all_files = [
    './tweets_04232020.csv',
    './tweets_04242020.csv',
    './tweets_04252020.csv',
    './tweets_04262020.csv',
    './tweets_04292020.csv',
    './tweets_04302020.csv',
]

# Read each export (its first CSV column is used as the index), then stack
# them into one frame with a fresh 0..N-1 index.
df_list = [
    pd.read_csv(filename, sep=',', header=0, encoding='utf-8', index_col=0)
    for filename in all_files
]
df_tweets = pd.concat(df_list, ignore_index=True)
df_tweets.head()

Unnamed: 0,Tweet_Id,User_Id,User_Name,User_Screen_Name,Theme,Text
0,1253375593271394312,80374332,Citibank,Citibank,Bank/Financial,Registering for online access and activating y...
1,1253322747851296768,80374332,Citibank,Citibank,Bank/Financial,Protect your CARES Act payments: Validate comm...
2,1253066045700681731,80374332,Citibank,Citibank,Bank/Financial,It’s simple to set up a payment account in the...
3,1252960358454767616,80374332,Citibank,Citibank,Bank/Financial,We’re committed to helping provide the support...
4,1252673469852143618,80374332,Citibank,Citibank,Bank/Financial,Protect Yourself from COVID-19 Scams: Don’t re...


In [3]:
# Drop rows whose "Text" repeats an earlier row, then renumber the survivors 0..N-1
# so that row labels and row positions coincide for the index-based split below.
df_tweets = df_tweets.drop_duplicates(subset=["Text"]).reset_index(drop=True)
df_tweets.shape

(7700, 6)

## Preprocess data: 

In [4]:
query_index = 10  # row printed before and after cleaning as a spot check
df = pd.DataFrame({'text': df_tweets['Text']})
print('{:=^100}\n\n {}'.format(' Raw text ', df.text.loc[query_index]))

# One lemmatizer for the whole column instead of a new instance per tweet.
lemmatizer = WordNetLemmatizer()

# Missing tweets become empty strings so the string steps below cannot hit a float NaN.
df['text'] = df['text'].fillna('')

# regex=True is spelled out because the default of Series.str.replace differs between
# pandas versions; without it newer releases treat these patterns as literal text and
# silently skip the cleaning. Raw strings avoid the invalid '\S' escape.
df['text'] = df['text'].str.replace(r'http\S+', '', regex=True) # removing URLs
df['text'] = df['text'].str.replace(r'[^A-Za-z0-9]+', ' ', regex=True) # retain only alphanumeric

# NOTE(review): lemmatize() is handed the whole tweet rather than individual tokens.
# The sample output keeps plurals such as "heroes" unchanged, so this step appears to
# be a no-op. Lemmatising token by token would change the TF-IDF vocabulary and the
# reported accuracy, so it is left as is here - confirm the intent before changing it.
df['text'] = df['text'].map(lemmatizer.lemmatize) # lemmatization
df['text'] = df['text'].str.lower() # to lower case

print('\n{:=^100}\n\n {}'.format(' Preprocessed text ', df.text.loc[query_index]))


 To our heroes going out into the world each day – thank you. Citi is supporting COVID-19 relief efforts around the world to help those bravely showing up for the rest of us. https://t.co/oaknGpLLlX https://t.co/wRZ7o6lupM


 to our heroes going out into the world each day thank you citi is supporting covid 19 relief efforts around the world to help those bravely showing up for the rest of us 


In [5]:
# 80/20 split expressed as row counts; the test part takes whatever is left after rounding down.
train_size = 0.8
total_indices_size = len(df)
train_indices_size = int(train_size * total_indices_size)
test_indices_size = total_indices_size - train_indices_size
print('Total records: {} \nTrain records: {} \n Test records: {}'.format(total_indices_size, train_indices_size, test_indices_size))

# Shuffle the row positions in place (driven by NumPy's global RNG) and cut once
# at the train size: the head becomes the training ids, the tail the test ids.
total_indices_array = np.arange(total_indices_size)
np.random.shuffle(total_indices_array)
train_indices, test_indices = np.split(total_indices_array, [train_indices_size])

Total records: 7700 
Train records: 6160 
 Test records: 1540


In [6]:
# Select rows by label in shuffled order, so row i of each frame corresponds to
# train_indices[i] / test_indices[i].
train_set = df.loc[train_indices]
test_set = df.loc[test_indices]
print(train_set.shape, test_set.shape)

(6160, 1) (1540, 1)


## CountVectorizer & TF-IDF

In [7]:
# Word-level 1- to 3-gram counts with English stop words removed, accents stripped
# and text lowercased.
count_vectorizer = CountVectorizer(
    analyzer='word',
    lowercase=True,
    strip_accents='unicode',
    stop_words='english',
    ngram_range=(1, 3),
)
tfidf = TfidfTransformer()

# Counts first, TF-IDF re-weighting second.
pipeline_steps = [('count_vectorizer', count_vectorizer), ('tfidf', tfidf)]
pipeline = Pipeline(steps=pipeline_steps)

# Fit on the training tweets only; the test tweets are projected with the fitted pipeline.
tf_idf_train_matrix = pipeline.fit_transform(train_set["text"])
tf_idf_test_matrix = pipeline.transform(test_set["text"])

In [8]:
# Report the size of the fitted training matrix (rows x columns).
print("Train Matrix Shape:", tf_idf_train_matrix.shape)

Train Matrix Shape: (6160, 126789)


In [9]:
def print_similar_texts(query_text, query_text_vector,  train_matrix, train_indices, df_tweets, n = 5):
    """Print the ``n`` training tweets most similar to a query tweet.

    Parameters
    ----------
    query_text : the query tweet as it should be shown to the reader.
    query_text_vector : vector of the query, compared against ``train_matrix``
        with ``cosine_similarity``.
    train_matrix : one row per training tweet.
    train_indices : array mapping row position in ``train_matrix`` to a row
        label of ``df_tweets``.
    df_tweets : frame holding the original tweets in its ``"Text"`` column.
    n : number of neighbours to print.

    Returns
    -------
    None; everything is written to stdout.
    """
    scores = cosine_similarity(query_text_vector, train_matrix).flatten()
    # Row positions of the n highest scores, best first.
    best_rows = scores.argsort()[::-1][:n]
    best_texts = df_tweets.loc[train_indices[best_rows], "Text"]

    print('\nInput Text:\n {} \n'.format(query_text))
    for rank, (row, sim_text) in enumerate(zip(best_rows, best_texts), start=1):
        banner = 'Similar Text: {} || Similar Score: {}'.format(rank, np.round(scores[row], 3))
        print('=' * 30, banner, '=' * 30, '\n')
        print(sim_text, '\n')

In [10]:
# Spot-check one held-out tweet: position `query_index` of the test split gives both
# the row of the test TF-IDF matrix and, via test_indices, the label of the raw tweet.
query_index = 33
query_text_vector = tf_idf_test_matrix[query_index]
query_text = df_tweets.loc[test_indices[query_index], "Text"]
print_similar_texts(
    query_text=query_text,
    query_text_vector=query_text_vector,
    train_matrix=tf_idf_train_matrix,
    train_indices=train_indices,
    df_tweets=df_tweets,
    n=5,
)


Input Text:
 NASA Presents Trump with NEW Space Grade Ventilator and Fog Disinfectant... https://t.co/ZKQN8FtLXQ via @YouTube 


NASA Presents Trump with NEW Space Grade Ventilator and Fog Disinfectant... https://t.co/cDKqTVyD0d via @YouTube 


RT @IrmaBel53130008: NASA Presents Trump with NEW Space Grade Ventilator &amp; Fog Disinfectant to Combat the Coronavirus!  How great will this… 


RT @Wyn1745: NASA Presents Trump with NEW Space Grade Ventilator &amp; Fog Disinfectant to Combat #Coronavirus- #ChinaVirus #COVID19

This will… 


😎 NASA presents @realDonaldTrump
with Space grade Ventilator!

https://t.co/M0VweCsPxQ 


@ClaudiaMyTime @ABC Trump Steaks, Trump University, Trump Ice, Trump Airlines, Trump Magazine...should I keep going? 🤦 



In [11]:
def find_similar_n(tf_idf_train_matrix, train_indices, tf_idf_test_matrix, test_indices, test_set_df, df_tweets, n=5):
    """Annotate every test tweet with its ``n`` most similar training tweets.

    For each row of ``tf_idf_test_matrix`` the cosine similarity to every row
    of ``tf_idf_train_matrix`` is computed, the ``n`` best training rows are
    kept, and their texts, rounded scores and themes are collected. The
    predicted theme is the most frequent theme among those neighbours; ties
    go to the theme of the closest neighbour, so the result is reproducible.

    Parameters
    ----------
    tf_idf_train_matrix : one row per training tweet.
    train_indices : array mapping row position in the train matrix to a row
        label of ``df_tweets``.
    tf_idf_test_matrix : one row per test tweet, in the same order as
        ``test_indices`` and ``test_set_df``.
    test_indices : row labels of the test tweets in ``df_tweets``.
    test_set_df : frame with one row per test tweet; it is copied, not modified.
    df_tweets : frame holding the original ``"Text"`` and ``"Theme"`` columns.
    n : number of neighbours per test tweet (must be >= 1).

    Returns
    -------
    pandas.DataFrame
        Copy of ``test_set_df`` with the extra columns ``original_theme``,
        ``similar_texts``, ``similar_scores``, ``top_similar_score``,
        ``similar_themes`` and ``top_similar_themes``.

    Raises
    ------
    ValueError
        If ``n`` is smaller than 1.
    """
    from collections import Counter  # local import keeps this cell self-contained

    if n < 1:
        raise ValueError("n must be at least 1, got {}".format(n))

    similar_texts_list = []
    similar_scores_list = []
    similar_top_score_list = []
    similar_themes_list = []
    similar_top_themes_list = []
    for vector in tf_idf_test_matrix:
        cosine_similarities_n = cosine_similarity(vector, tf_idf_train_matrix).flatten()
        # Row positions of the n highest scores, best first.
        similar_doc_indices = cosine_similarities_n.argsort()[::-1][:n]
        top_tweet_Indices = train_indices[similar_doc_indices]

        neighbours = df_tweets.loc[top_tweet_Indices, ["Text", "Theme"]]
        similar_texts = list(neighbours["Text"])
        similar_themes = list(neighbours["Theme"])
        similar_scores = [np.round(score, 3) for score in cosine_similarities_n[similar_doc_indices]]

        similar_texts_list.append(similar_texts)
        similar_scores_list.append(similar_scores)
        similar_top_score_list.append(similar_scores[0])
        similar_themes_list.append(similar_themes)

        # Majority vote. max() scans the neighbours best-first and keeps the first
        # theme with the highest count, so ties resolve to the nearest neighbour's
        # theme rather than to the arbitrary iteration order of a set.
        theme_counts = Counter(similar_themes)
        top_similar_theme = max(similar_themes, key=theme_counts.__getitem__)
        similar_top_themes_list.append(top_similar_theme)

    # Work on a copy so the caller's frame does not silently grow new columns.
    df = test_set_df.copy()
    # Positional assignment, like the list-valued columns below; index alignment
    # would yield NaN whenever test_set_df is not labelled by test_indices.
    df["original_theme"] = df_tweets.loc[test_indices, "Theme"].to_numpy()
    df["similar_texts"] = similar_texts_list
    df["similar_scores"] = similar_scores_list
    df["top_similar_score"] = similar_top_score_list
    df["similar_themes"] = similar_themes_list
    df["top_similar_themes"] = similar_top_themes_list
    return df

In [21]:
# Score every test tweet against the training set, keeping its 20 nearest neighbours.
similar_test_df = find_similar_n(
    tf_idf_train_matrix=tf_idf_train_matrix,
    train_indices=train_indices,
    tf_idf_test_matrix=tf_idf_test_matrix,
    test_indices=test_indices,
    test_set_df=test_set,
    df_tweets=df_tweets,
    n=20,
)
similar_test_df.head()

Unnamed: 0,text,original_theme,similar_texts,similar_scores,top_similar_score,similar_themes,top_similar_themes
4648,catnip coffee you dont have to give me a new ...,coffee,"[Coffee https://t.co/Cp4Qq0K9Ad, love that i c...","[0.235, 0.155, 0.141, 0.138, 0.137, 0.136, 0.1...",0.235,"[coffee, coffee, coffee, dog, coffee, ramadan,...",coffee
362,candicontreras7 hello thank you for tweeting ...,Bank/Financial,"[@LarkBACK Hello, thank you for tweeting us. S...","[0.786, 0.785, 0.785, 0.783, 0.783, 0.782, 0.7...",0.786,"[Bank/Financial, Bank/Financial, Bank/Financia...",Bank/Financial
4645,teekhi mirchh harriisqureshi lemme healuh maj...,coffee,"[Coffee https://t.co/Cp4Qq0K9Ad, RT @NajeemHan...","[0.24, 0.162, 0.149, 0.148, 0.14, 0.123, 0.113...",0.24,"[coffee, ramadan, covid-19, Bank/Financial, co...",coffee
2086,rt trumpwarroom are you kidding me michigan de...,covid-19,[RT @JCAPoquoson: Punished for telling the tru...,"[0.618, 0.213, 0.149, 0.107, 0.097, 0.094, 0.0...",0.618,"[covid-19, covid-19, covid-19, covid-19, nasa,...",covid-19
2988,state rep whitsett nearly died from coronaviru...,covid-19,[RT @GOPChairwoman: State Rep. Whitsett nearly...,"[0.801, 0.065, 0.058, 0.057, 0.051, 0.049, 0.0...",0.801,"[covid-19, covid-19, @Citi, covid-19, cake, ca...",covid-19


In [22]:
# Percentage of test tweets whose neighbour-voted theme equals their own theme.
theme_matches = similar_test_df["original_theme"] == similar_test_df["top_similar_themes"]
accuracy = 100 * theme_matches.sum() / len(similar_test_df)
print("Accuracy on test data in predicting theme: {}%".format(np.round(accuracy,2)))

Accuracy on test data in predicting theme: 85.84%


In [23]:
# Persist the per-tweet similarity results to a CSV file in the working directory.
similar_test_df.to_csv("TfIdf_v2_similar_test_tweets.csv")