In [1]:
# Standard library
import csv
import os
import warnings

# Third-party
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import (
    CountVectorizer,
    TfidfTransformer,
    TfidfVectorizer,
)
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.utils import shuffle

# List the files sitting next to the notebook (the tweet CSVs are read from here).
print(os.listdir("."))

# Seed NumPy's global RNG so the np.random.shuffle-based split is repeatable.
np.random.seed(0)

# NOTE: this silences every warning category for the rest of the session.
warnings.filterwarnings('ignore')

['.ipynb_checkpoints', 'BankTweets.csv', 'BankTweets_04242020.csv', 'cbikkanur_twitter_credentials.ipynb', 'cbikkanur_twitter_credentials.json', 'CosineSimilarity.ipynb', 'documentSimilarity_TfIdf.ipynb', 'getTwitterData.ipynb', 'getTwitterData_TweePy.ipynb', 'SearchTweets.csv', 'SearchTweets_04242020.csv', 'tweets_04232020.csv', 'tweets_04242020.csv']


In [2]:
# Tweet exports that together form the corpus.
all_files = ['./tweets_04232020.csv', './tweets_04242020.csv']

# Read each export (its first CSV column is used as the row index), then stack
# the frames under a fresh 0..n-1 index.
df_list = [
    pd.read_csv(filename, sep=',', header=0, encoding='utf-8', index_col=0)
    for filename in all_files
]

df_tweets = pd.concat(df_list, ignore_index=True)
df_tweets.tail()
       

Unnamed: 0,Tweet_Id,User_Id,User_Name,User_Screen_Name,Theme,Text
5995,1253808235439706114,1160776206280736768,Steel,Steel48653653,coffee,@Facetiouslyfun @IttyTater @PumpkinPatch200 @I...
5996,1253808231559753729,2603663594,James Eman,je_eman77,coffee,RT @joncoopertweets: I literally spit out my c...
5997,1253808227302510593,2868464198,Kim Wagner,KimBWagner,coffee,RT @joncoopertweets: I literally spit out my c...
5998,1253808218909761538,69258603,Noel Jose,noeljose,coffee,RT @mikhailquijano: Literally the sound of my ...
5999,1253808218519863296,412136044,Mafeyisi Molea,Therealmolea,coffee,RT @Writers_Write: 'What on earth could be mor...


In [3]:
# Working frame holding only the tweet body, under a lower-case column name.
df = pd.DataFrame({'text': df_tweets['Text']})
df.tail()

Unnamed: 0,text
5995,@Facetiouslyfun @IttyTater @PumpkinPatch200 @I...
5996,RT @joncoopertweets: I literally spit out my c...
5997,RT @joncoopertweets: I literally spit out my c...
5998,RT @mikhailquijano: Literally the sound of my ...
5999,RT @Writers_Write: 'What on earth could be mor...


In [4]:
# Sizes of the 80/20 split; the test side takes whatever the truncated train size leaves over.
total_indices_size = len(df)
train_size = 0.8
train_indices_size = int(train_size * total_indices_size)
test_indices_size = total_indices_size - train_indices_size
print(total_indices_size, train_indices_size, test_indices_size)

# Shuffle the row positions 0..n-1 in place (uses NumPy's global RNG), then cut
# the shuffled array once: the head becomes train, the tail becomes test.
total_indices_array = np.arange(total_indices_size)
np.random.shuffle(total_indices_array)
train_indices, test_indices = np.split(total_indices_array, [train_indices_size])

6000 4800 1200


In [5]:
# Select the rows of each split by label, in the shuffled order of the index arrays.
train_set = df.loc[train_indices]
test_set = df.loc[test_indices]
print(train_set.shape, test_set.shape)
train_set.head(5)

(4800, 1) (1200, 1)


Unnamed: 0,text
4320,@jeffersongraham 💙 Stay healthy and safe. We'r...
2006,@joshram @AmericanAir @AskCiti Just found out ...
5689,@lilrayma tokyo bullet is quaking
472,@dubey1312 I apologize for the late response d...
1370,The annual Christmas with the Currys Winter Wo...


## CountVectorizer & Tf-Idf:

In [6]:
# Stage 1: counts of lower-cased, accent-stripped word 1- to 3-grams with English
# stop words removed. Stage 2: re-weight those counts to TF-IDF.
count_vectorizer = CountVectorizer(
    analyzer='word',
    lowercase=True,
    strip_accents='unicode',
    stop_words='english',
    ngram_range=(1, 3),
)
tfidf = TfidfTransformer()
pipeline = Pipeline(steps=[
    ('count_vectorizer', count_vectorizer),
    ('tfidf', tfidf),
])

# Fit on the training tweets only; the test tweets are transformed with the
# already-fitted pipeline so both matrices share the same columns.
tf_idf_train_matrix = pipeline.fit_transform(train_set["text"])
tf_idf_test_matrix = pipeline.transform(test_set["text"])

In [7]:
# Preview only the first few rows. Densifying the whole sparse training matrix
# (4800 x 72660 here) just to print a truncated view allocates gigabytes.
print(tf_idf_train_matrix[:5].toarray())
print("Vector Shape:", tf_idf_test_matrix.shape)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
Vector Shape: (1200, 72660)


In [8]:
# Sanity check: look up the closest training tweet for the test tweet at position 2.
cosine_similarities = cosine_similarity(tf_idf_test_matrix[2], tf_idf_train_matrix).ravel()

# Last entry of the ascending argsort is the position of the highest score in the
# train matrix; train_indices maps that position back to a df_tweets row label.
top_Indices = int(cosine_similarities.argsort()[-1])
top_tweet_Indices = train_indices[top_Indices]

input_text = df_tweets.loc[test_indices[2], "Text"]
similar_text = df_tweets.loc[top_tweet_Indices, "Text"]
print("\nInput Text:\n", input_text, "\n")
print("\nSimilar Text:\n", similar_text, "\n")


Input Text:
 Why are we not getting a paper towel or something to prevent it from going into the wall? Why are we just recording? Also, why the fuck hasnt your dog Beeb let out to pee? https://t.co/cz575yU0RT 


Similar Text:
 RT @fkajack: fuck it... dog vibin’ to kylie minogue https://t.co/ClMDR6rR9d 



In [9]:
def find_similar(tf_idf_train_matrix, train_indices, tf_idf_test_matrix, test_indices, test_set_df, df_tweets):
    """Pair every test tweet with its most cosine-similar training tweet.

    Each row of ``tf_idf_test_matrix`` is scored against all rows of
    ``tf_idf_train_matrix`` and the highest-scoring training row is kept.

    Parameters
    ----------
    tf_idf_train_matrix : matrix with one row per training tweet.
    train_indices : sequence mapping a train-matrix row position to the
        ``df_tweets`` row label of that tweet.
    tf_idf_test_matrix : matrix with one row per test tweet; iterated row by row.
    test_indices : ``df_tweets`` row labels of the test tweets.
    test_set_df : DataFrame with one row per row of ``tf_idf_test_matrix``, in
        the same order. It is NOT modified.
    df_tweets : DataFrame providing the ``"Text"`` and ``"Theme"`` columns.

    Returns
    -------
    pandas.DataFrame
        A copy of ``test_set_df`` with four added columns: ``original_theme``
        (aligned on the row labels in ``test_indices``), ``similar_text``,
        ``similar_score`` and ``similar_theme`` (filled positionally).
    """
    matched_train_labels = []
    similar_score_list = []
    for vector in tf_idf_test_matrix:
        cosine_similarities = cosine_similarity(vector, tf_idf_train_matrix).flatten()
        # Last element of the ascending argsort == first element of the reversed
        # one, so the selected row (including tie order) is unchanged.
        similar_doc_index = cosine_similarities.argsort()[-1]
        matched_train_labels.append(train_indices[similar_doc_index])
        similar_score_list.append(cosine_similarities[similar_doc_index])

    # One label-based lookup for all matches instead of two scalar lookups per row.
    matched_rows = df_tweets.loc[matched_train_labels, ["Text", "Theme"]]

    # Work on a copy so the caller's frame does not silently gain columns.
    result = test_set_df.copy()
    result["original_theme"] = df_tweets.loc[test_indices, "Theme"]
    # .to_numpy() drops the training labels so these columns are assigned by
    # position rather than being re-aligned on the test frame's index.
    result["similar_text"] = matched_rows["Text"].to_numpy()
    result["similar_score"] = similar_score_list
    result["similar_theme"] = matched_rows["Theme"].to_numpy()
    return result

In [10]:
# Match every held-out tweet to its closest training tweet and collect both themes.
similar_test_df = find_similar(
    tf_idf_train_matrix=tf_idf_train_matrix,
    train_indices=train_indices,
    tf_idf_test_matrix=tf_idf_test_matrix,
    test_indices=test_indices,
    test_set_df=test_set,
    df_tweets=df_tweets,
)
similar_test_df

Unnamed: 0,text,original_theme,similar_text,similar_score,similar_theme
5378,RT @FantasyDoggo: @Bypbap (Reuploaded cus I ha...,dog,Arnette is a dog,0.287913,dog
5980,love that i currently dont have to pay for gas...,coffee,RT @jimmyzrz: If its to hot for you to be outs...,0.123564,dog
5395,Why are we not getting a paper towel or someth...,dog,RT @fkajack: fuck it... dog vibin’ to kylie mi...,0.152034,dog
239,@tamihonesty2 We know frustrating this experie...,Bank/Financial,@tamihonesty2 We know frustrating this experie...,1.000000,Bank/Financial
5353,RT @chico_dykey: that is alot of pee. i know t...,dog,RT @chico_dykey: that is alot of pee. i know t...,1.000000,dog
542,"@hanzield Hi there, I wanted to check back in ...",Bank/Financial,"@CountryBroad35 Hi there, I wanted to check ba...",0.936144,Bank/Financial
2725,RT @MikeDentale: Airlines sending me “we’re in...,airlines,RT @MikeDentale: Airlines sending me “we’re in...,1.000000,airlines
2454,Carson Wentz should take a massive shit on the...,car,My car is on her last life 😔💔,0.119688,car
1459,"@1029Mo Hello, I received your DM and will res...",Bank/Financial,"@1029Mo Hello, I received your DM and will res...",1.000000,Bank/Financial
3867,@kylejenks1 What songs are on your road trip p...,Bank/Financial,@kylejenks1 What songs are on your road trip p...,1.000000,Bank/Financial


In [11]:
# Percentage of test tweets whose nearest training tweet carries the same theme.
# NOTE(review): several rows in the table above match a training tweet with
# similarity 1.0 (identical text), so this figure may overstate how well the
# approach generalises to unseen tweets - consider de-duplicating before splitting.
theme_matches = similar_test_df["original_theme"] == similar_test_df["similar_theme"]
accuracy = 100 * np.sum(theme_matches) / len(similar_test_df)
print("Accuracy on test data in predicting theme: {}%".format(np.round(accuracy, 2)))

Accuracy on test data in predicting theme: 90.5%
