In [1]:
# Core numerics / data handling and standard-library helpers.
import numpy as np
import pandas as pd
import csv
import os
# Print the entries of the current working directory (shows which CSV exports are present).
print(os.listdir("."))

# scikit-learn utilities: splitting helpers, text vectorizers, cosine similarity and pipelines.
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.pipeline import Pipeline
# Seed NumPy's global RNG so that np.random-based shuffling is repeatable between runs.
np.random.seed(0)

import warnings
# NOTE(review): this silences every warning for the rest of the session,
# including pandas/scikit-learn deprecation and copy warnings.
warnings.filterwarnings('ignore')

['.ipynb_checkpoints', 'BankTweets.csv', 'BankTweets_04242020.csv', 'BankTweets_04252020.csv', 'BankTweets_04262020.csv', 'cbikkanur_twitter_credentials.ipynb', 'cbikkanur_twitter_credentials.json', 'CosineSimilarity.ipynb', 'documentSimilarity.ipynb', 'documentSimilarity_TfIdf.ipynb', 'documentSimilarity_TfIdf.ipynb.txt', 'getTwitterData.ipynb', 'getTwitterData_TweePy.ipynb', 'SearchTweets.csv', 'SearchTweets_04242020.csv', 'SearchTweets_04252020.csv', 'SearchTweets_04262020.csv', 'tweets_04232020.csv', 'tweets_04242020.csv', 'tweets_04252020.csv', 'tweets_04262020.csv']


In [2]:
# Tweet exports to combine. Each CSV has a header row, and its first column
# is used as the row index when reading.
all_files = [
    './tweets_04232020.csv',
    './tweets_04242020.csv',
    './tweets_04252020.csv',
    './tweets_04262020.csv',
]

# Read every export with identical parser settings, then stack the frames
# into one table with a fresh 0..n-1 index.
df_list = [
    pd.read_csv(csv_path, sep=',', header=0, encoding='utf-8', index_col=0)
    for csv_path in all_files
]
df_tweets = pd.concat(df_list, ignore_index=True)
df_tweets

Unnamed: 0,Tweet_Id,User_Id,User_Name,User_Screen_Name,Theme,Text
0,1253375593271394312,80374332,Citibank,Citibank,Bank/Financial,Registering for online access and activating y...
1,1253322747851296768,80374332,Citibank,Citibank,Bank/Financial,Protect your CARES Act payments: Validate comm...
2,1253066045700681731,80374332,Citibank,Citibank,Bank/Financial,It’s simple to set up a payment account in the...
3,1252960358454767616,80374332,Citibank,Citibank,Bank/Financial,We’re committed to helping provide the support...
4,1252673469852143618,80374332,Citibank,Citibank,Bank/Financial,Protect Yourself from COVID-19 Scams: Don’t re...
...,...,...,...,...,...,...
11995,1254444893642149890,1230136435237363713,TRUMP IS THE COVID19 MURDERER!,sellers_hank,coffee,@secupp @sarah310 Putting that sugary crap in ...
11996,1254444893096808449,32665279,¯\_(ツ)_/¯ Anonymous,tildejac,coffee,@Grady_Booch @ADMGoldfish Just don't knock ove...
11997,1254444890936602625,15063195,Jonathan Middleton,jpmiddleton,coffee,RT @joncoopertweets: I literally spit out my c...
11998,1254444888516632578,77998665,cocotu,cocotucafe,coffee,You have to visit Quillabamba in Peru. Beautif...


In [3]:
# Keep only the first row for each (Text, User_Id) pair, then renumber the
# rows 0..n-1 so that labels and positions coincide again.
df_tweets = (
    df_tweets
    .drop_duplicates(subset=["Text", "User_Id"])
    .reset_index(drop=True)
)
df_tweets.shape

(6498, 6)

In [4]:
# One-column working frame holding only the tweet text; it carries the same
# row labels as df_tweets.
df = pd.DataFrame({'text': df_tweets['Text']})
df.tail()

Unnamed: 0,text
6493,@secupp @sarah310 Putting that sugary crap in ...
6494,@Grady_Booch @ADMGoldfish Just don't knock ove...
6495,RT @joncoopertweets: I literally spit out my c...
6496,You have to visit Quillabamba in Peru. Beautif...
6497,RT @spacecowbot: let it be known that i do not...


In [5]:
# Split sizes: the training count is truncated to an integer and the test
# set receives whatever rows remain.
train_size = 0.8
total_indices_size = len(df)
train_indices_size = int(train_size * total_indices_size)
test_indices_size = total_indices_size - train_indices_size
print(total_indices_size, train_indices_size, test_indices_size)

# Shuffle the row numbers in place using NumPy's global RNG, then cut the
# shuffled array at the training size.
total_indices_array = np.arange(total_indices_size)
np.random.shuffle(total_indices_array)
train_indices = total_indices_array[:train_indices_size]
test_indices = total_indices_array[train_indices_size:]

6498 5198 1300


In [6]:
# Pull the rows for each side of the split by label; the result keeps the
# shuffled order of the index arrays.
train_set = df.loc[train_indices]
test_set = df.loc[test_indices]
print(train_set.shape, test_set.shape)
train_set.head()

(5198, 1) (1300, 1)


Unnamed: 0,text
5602,@Citi Baloney! Worst of the worst. @Citi your...
3609,RT @MaliikObee: ESPN: “They take him with the ...
3520,RT @Paradise_pups: The dog walks you https://t...
1622,If you applied through Small Business Online B...
2443,"@AIRlunchbox @TIDAL @TMobile Nope not you,I am..."


## CountVectorizer & Tf-Idf:

In [7]:
# Word-level counts over 1- to 3-grams, lowercased, with accents stripped and
# English stop words removed.
count_vectorizer = CountVectorizer(
    analyzer='word',
    lowercase=True,
    strip_accents='unicode',
    stop_words='english',
    ngram_range=(1, 3),
)
tfidf = TfidfTransformer()
pipeline = Pipeline(steps=[
    ('count_vectorizer', count_vectorizer),
    ('tfidf', tfidf),
])

# Fit on the training text only; the test text is transformed with the
# already-fitted pipeline.
tf_idf_train_matrix = pipeline.fit_transform(train_set["text"])
tf_idf_test_matrix = pipeline.transform(test_set["text"])

In [8]:
# NOTE(review): .toarray() builds a dense copy of the entire training matrix
# just to print it; with a large vocabulary this can need a lot of memory.
# Consider printing a small slice or only the shape instead.
print(tf_idf_train_matrix.toarray())
# Although labelled "Vector Shape", the value printed is the shape of the
# *test* matrix (rows = test tweets, columns = vocabulary terms).
print("Vector Shape:", tf_idf_test_matrix.shape)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
Vector Shape: (1300, 99874)


In [9]:
# Sanity check: compare the test tweet at position 2 against every training
# tweet and print it next to its closest match.
cosine_similarities = cosine_similarity(tf_idf_test_matrix[2], tf_idf_train_matrix).ravel()
# The last entry of the ascending argsort is the position of the largest score.
top_Indices = int(cosine_similarities.argsort()[-1])
# Map the position inside the training matrix back to a row label of df_tweets.
top_tweet_Indices = train_indices[top_Indices]

print("\nInput Text:\n", df_tweets.loc[test_indices[2], "Text"], "\n")
print("\nSimilar Text:\n", df_tweets.loc[top_tweet_Indices, "Text"], "\n")


Input Text:
 @PSCC_Tualatin Hi Jonathan, the applications are held in submission order for processing if additional funding becomes available from Congress. Thank you for your continued patience. -EC 


Similar Text:
 @Addvalue2others Hi, the applications are held in submission order for processing if additional funding becomes available from Congress. Thank you for your continued patience. -EC 



In [10]:
def find_similar(tf_idf_train_matrix, train_indices, tf_idf_test_matrix, test_indices, test_set_df, df_tweets):
    """Pair every test tweet with its most similar training tweet.

    For each row of ``tf_idf_test_matrix`` the cosine similarity against all
    rows of ``tf_idf_train_matrix`` is computed and the best-scoring training
    row is selected.

    Parameters
    ----------
    tf_idf_train_matrix : 2-D matrix (sparse or dense)
        One row per training document.
    train_indices : sequence
        ``train_indices[i]`` is the row label in ``df_tweets`` of training
        row ``i``.
    tf_idf_test_matrix : 2-D matrix (sparse or dense)
        One row per test document, same column space as the training matrix.
    test_indices : sequence
        ``test_indices[i]`` is the row label in ``df_tweets`` of test row ``i``.
    test_set_df : pandas.DataFrame
        Frame with one row per test document, in the same order as
        ``tf_idf_test_matrix``. It is NOT modified.
    df_tweets : pandas.DataFrame
        Source frame providing the ``"Text"`` and ``"Theme"`` columns.

    Returns
    -------
    pandas.DataFrame
        A copy of ``test_set_df`` with the added columns ``original_theme``,
        ``similar_text``, ``similar_score`` and ``similar_theme``.

    Raises
    ------
    ValueError
        If the training matrix has no rows, or if the number of test rows,
        ``test_indices`` and ``test_set_df`` rows do not agree.
    """
    similar_text_list = []
    similar_score_list = []
    similar_theme_list = []

    # Slice one row at a time so the query is always 2-D, whether the matrix
    # is sparse or a dense ndarray.
    for row_pos in range(tf_idf_test_matrix.shape[0]):
        vector = tf_idf_test_matrix[row_pos:row_pos + 1]
        cosine_similarities = cosine_similarity(vector, tf_idf_train_matrix).flatten()
        # Only the best match is needed, so take the maximum directly instead
        # of sorting every score. On ties the first training row wins.
        similar_doc_index = int(cosine_similarities.argmax())
        top_tweet_Index = train_indices[similar_doc_index]
        similar_text_list.append(df_tweets.loc[top_tweet_Index, "Text"])
        similar_score_list.append(cosine_similarities[similar_doc_index])
        similar_theme_list.append(df_tweets.loc[top_tweet_Index, "Theme"])

    # Work on a copy: the caller's frame must not silently gain columns.
    df = test_set_df.copy()
    # Assign by position (not by index alignment): row i of the result
    # corresponds to test_indices[i], whatever the frame's own index is.
    df["original_theme"] = df_tweets.loc[test_indices, "Theme"].to_numpy()
    df["similar_text"] = similar_text_list
    df["similar_score"] = similar_score_list
    df["similar_theme"] = similar_theme_list
    return df

In [11]:
# Attach, to every test tweet, its nearest training tweet together with the
# similarity score and both themes.
similar_test_df = find_similar(
    tf_idf_train_matrix=tf_idf_train_matrix,
    train_indices=train_indices,
    tf_idf_test_matrix=tf_idf_test_matrix,
    test_indices=test_indices,
    test_set_df=test_set,
    df_tweets=df_tweets,
)
similar_test_df

Unnamed: 0,text,original_theme,similar_text,similar_score,similar_theme
2725,RT @MikeDentale: Airlines sending me “we’re in...,airlines,RT @MikeDentale: Airlines sending me “we’re in...,1.000000,airlines
2454,@Yo_ItsMONROE EVERY! One. The one where i live...,car,Red velvet cake turn out good 😋,0.132593,cake
1459,"@PSCC_Tualatin Hi Jonathan, the applications a...",Bank/Financial,"@Addvalue2others Hi, the applications are held...",0.845090,Bank/Financial
3867,RT @AsemWORLD: Freestyle Friday S2Ep11 \nClima...,tokyo,RT @AsemWORLD: Freestyle Friday S2Ep11 \nClima...,1.000000,tokyo
1903,#Citi appoints Kristine Braden as Head of the ...,Bank/Financial,Citi names Peter Crawley as Head of Treasury a...,0.177375,Bank/Financial
...,...,...,...,...,...
4931,A team at NASA developed a ventilator tailored...,nasa,NASA team developed a ventilator tailored for ...,0.715347,nasa
3264,RT @Citi: Jane Fraser discusses how business l...,@Citi,RT @Citi: Jane Fraser discusses how business l...,1.000000,@Citi
1653,Clients and small business owners: if you've b...,Bank/Financial,@BreakJuicery Find the latest updates on the P...,0.198989,Bank/Financial
2607,isaiah got him and i matching tokyo ghoul shir...,tokyo,Ima rewatch Tokyo ghoul,0.333418,tokyo


In [12]:
# A prediction counts as correct when the nearest training tweet carries the
# same theme as the test tweet.
theme_matches = similar_test_df["original_theme"] == similar_test_df["similar_theme"]
accuracy = 100 * np.sum(theme_matches) / similar_test_df.shape[0]
print("Accuracy on test data in predicting theme: {}%".format(np.round(accuracy, 2)))

Accuracy on test data in predicting theme: 81.69%
