In [6]:
# Core numerical / tabular libraries.
import numpy as np
import pandas as pd
import csv  # NOTE(review): not referenced in the visible cells — confirm it is still needed
import os
# Print the entries of the current working directory (the CSV read later uses a "./" relative path).
print(os.listdir("."))

# scikit-learn building blocks: splitting helpers, text vectorizers, TF-IDF weighting,
# cosine similarity and the Pipeline wrapper.
# NOTE(review): shuffle, train_test_split and TfidfVectorizer are not referenced in the
# visible cells — confirm before removing.
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.pipeline import Pipeline


import warnings
# Silences every warning category for the rest of the session.
# NOTE(review): this also hides pandas / scikit-learn warnings that may point at real
# problems (e.g. writing to a slice of a DataFrame) — consider narrowing the filter.
warnings.filterwarnings('ignore')

['.ipynb_checkpoints', 'BankTweets.csv', 'cbikkanur_twitter_credentials.ipynb', 'cbikkanur_twitter_credentials.json', 'CosineSimilarity.ipynb', 'documentSimilarity_TfIdf-Copy1.ipynb', 'documentSimilarity_TfIdf.ipynb', 'getTwitterData.ipynb', 'getTwitterData_TweePy.ipynb', 'SearchTweets.csv', 'tweets_04232020.csv']


In [2]:
# Load the tweet export. The first row is the header and the first column
# is used as the row index rather than as a data column.
df_tweets = pd.read_csv(
    './tweets_04232020.csv',
    sep=',',
    header=0,
    encoding='utf-8',
    index_col=0,
)
df_tweets.head()

Unnamed: 0,Tweet_Id,User_Id,User_Name,User_Screen_Name,Theme,Text
0,1253375593271394312,80374332,Citibank,Citibank,Bank/Financial,Registering for online access and activating y...
1,1253322747851296768,80374332,Citibank,Citibank,Bank/Financial,Protect your CARES Act payments: Validate comm...
2,1253066045700681731,80374332,Citibank,Citibank,Bank/Financial,It’s simple to set up a payment account in the...
3,1252960358454767616,80374332,Citibank,Citibank,Bank/Financial,We’re committed to helping provide the support...
4,1252673469852143618,80374332,Citibank,Citibank,Bank/Financial,Protect Yourself from COVID-19 Scams: Don’t re...


In [3]:
# Working frame that keeps only the tweet body, exposed under the
# lower-case column name 'text' and sharing df_tweets' row index.
df = pd.DataFrame({'text': df_tweets['Text']})
df.tail()

Unnamed: 0,text
2995,RT @fairycakes: No one: \n\nArticle about an a...
2996,RT @Mom_Overboard: Mom culture is putting a co...
2997,RT @soxloverhk: Coffee break https://t.co/kamo...
2998,Next is decaf you brew it just like coffee.
2999,coffee tastes so good at night🖤


In [4]:
# Number of leading shuffled positions used for training; everything after
# them is held out for testing.
TRAIN_SIZE = 2700

total_indices = df_tweets.shape[0]
# Fail loudly instead of silently producing an empty test split (and a
# meaningless accuracy further down) when the dataset is too small.
assert total_indices > TRAIN_SIZE, (
    "dataset has {} rows; need more than {} to leave a test split".format(total_indices, TRAIN_SIZE)
)

# Row positions 0..N-1, shuffled in place with a fixed seed so the split is reproducible.
total_indices_array = np.arange(total_indices)
np.random.seed(0)
np.random.shuffle(total_indices_array)
print(total_indices_array[:5])


train_indices, test_indices = total_indices_array[:TRAIN_SIZE], total_indices_array[TRAIN_SIZE:]
print(train_indices[:5])

[ 311 1025 1587 2941 2980]
[ 311 1025 1587 2941 2980]


In [5]:
# Label-based lookup: the shuffled integers are matched against df's row index.
train_set = df.loc[train_indices]
test_set = df.loc[test_indices]
print(train_set.shape, test_set.shape)
train_set.head()

(2700, 1) (300, 1)


Unnamed: 0,text
311,@dane206 Good Morning. Please call us directly...
1025,@Carmen67336604 We apologize for the higher th...
1587,"@siehavas We apologize for how you're feeling,..."
2941,@leeoralexandra I did 60 hrs a couple years ag...
2980,Whipped coffee is the best thing I never knew ...


## CountVectorizer & Tf-Idf:

In [20]:
# Token counts over lower-cased word 1- to 3-grams, with accents stripped and
# English stop words removed.
count_vectorizer = CountVectorizer(
    analyzer='word',
    lowercase=True,
    strip_accents='unicode',
    stop_words='english',
    ngram_range=(1, 3),
)
# Re-weights the raw counts into TF-IDF scores.
tfidf = TfidfTransformer()
pipeline = Pipeline(
    steps=[
        ('count_vectorizer', count_vectorizer),
        ('tfidf', tfidf),
    ]
)

# The pipeline is fitted on the training tweets only; the test tweets are
# transformed with that already-fitted pipeline.
tf_idf_train_matrix = pipeline.fit_transform(train_set["text"])
tf_idf_test_matrix = pipeline.transform(test_set["text"])

In [21]:
# Preview only the first few rows. Calling .toarray() on the whole matrix
# allocates a dense rows x vocabulary float array (over a gigabyte at this
# vocabulary size) just to print a truncated repr.
print(tf_idf_train_matrix[:5].toarray())
print("Vector Shape:", tf_idf_test_matrix.shape)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
Vector Shape: (300, 55352)


In [23]:
# Sanity check: for one held-out tweet (position 2 of the test split), show
# the training tweet with the highest cosine similarity.
query_vector = tf_idf_test_matrix[2]
cosine_similarities = cosine_similarity(query_vector, tf_idf_train_matrix).flatten()

# argsort is ascending, so reverse it and keep the first entry (best match).
ranked_positions = cosine_similarities.argsort()[::-1]
top_Indices = ranked_positions[:1].item(0)

# Translate the row position inside the training matrix back to a df_tweets label.
top_tweet_Indices = train_indices[top_Indices]

input_text = df_tweets.loc[test_indices[2], "Text"]
matched_text = df_tweets.loc[top_tweet_Indices, "Text"]
print("\nInput Text:\n", input_text, "\n")
print("\nSimilar Text:\n", matched_text, "\n")


Input Text:
 @carinad95 @HiltonHotels you're welcome, thank you for your service. 


Similar Text:
 @j_rice5 you're welcome, thank you for sharing your feedback! ^A 



In [24]:
def find_similar(tf_idf_train_matrix, train_indices, tf_idf_test_matrix, test_indices, test_set_df, df_tweets):
    """Pair every test document with its most similar training document.

    For each row of ``tf_idf_test_matrix`` the cosine similarity against all
    rows of ``tf_idf_train_matrix`` is computed and the best-scoring training
    row is kept.

    Parameters
    ----------
    tf_idf_train_matrix : 2-D matrix
        One row per training document.
    train_indices : sequence
        ``train_indices[i]`` is the ``df_tweets`` row label of training row ``i``.
    tf_idf_test_matrix : 2-D matrix
        One row per test document; iterated row by row.
    test_indices : sequence
        ``df_tweets`` row labels of the test documents, in the row order of
        ``tf_idf_test_matrix``.
    test_set_df : pandas.DataFrame
        One row per test document, in the same order. It is NOT modified.
    df_tweets : pandas.DataFrame
        Source frame providing the ``"Text"`` and ``"Theme"`` columns.

    Returns
    -------
    pandas.DataFrame
        A copy of ``test_set_df`` with four extra columns: ``original_theme``,
        ``similar_text``, ``similar_score`` and ``similar_theme``.
    """
    best_positions = []
    best_scores = []
    for vector in tf_idf_test_matrix:
        cosine_similarities = cosine_similarity(vector, tf_idf_train_matrix).flatten()
        # argsort is ascending: reverse it and take the first entry to get the
        # best match. The expression is kept as-is so that ties are resolved
        # exactly as before.
        best_position = cosine_similarities.argsort()[::-1][:1].item(0)
        best_positions.append(best_position)
        best_scores.append(cosine_similarities[best_position])

    # Map row positions inside the training matrix back to df_tweets labels.
    matched_labels = [train_indices[position] for position in best_positions]

    # Work on a copy so the caller's frame (often itself a slice of a larger
    # frame) is never written to.
    result = test_set_df.copy()
    # All four columns are assigned positionally, so they stay consistent with
    # each other whatever index test_set_df carries.
    result["original_theme"] = df_tweets.loc[test_indices, "Theme"].to_numpy()
    result["similar_text"] = df_tweets.loc[matched_labels, "Text"].to_numpy()
    result["similar_score"] = best_scores
    result["similar_theme"] = df_tweets.loc[matched_labels, "Theme"].to_numpy()
    return result

In [25]:
# Attach the nearest training tweet (text, score, theme) to every test tweet.
similar_test_df = find_similar(
    tf_idf_train_matrix=tf_idf_train_matrix,
    train_indices=train_indices,
    tf_idf_test_matrix=tf_idf_test_matrix,
    test_indices=test_indices,
    test_set_df=test_set,
    df_tweets=df_tweets,
)
similar_test_df

Unnamed: 0,text,original_theme,similar_text,similar_score,similar_theme
809,"The health &amp; well-being of our customers, ...",Bank/Financial,We are here to help our customers impacted by ...,0.169773,Bank/Financial
433,"@thingblue Thanks for reaching out, if any oth...",Bank/Financial,"@itsChrisWallace Thanks for reaching out, if a...",0.875699,Bank/Financial
1107,"@carinad95 @HiltonHotels you're welcome, thank...",Bank/Financial,"@j_rice5 you're welcome, thank you for sharing...",0.398690,Bank/Financial
2091,RT @SierraClub: BIG NEWS: Responding to pressu...,@Citi,RT @SierraClub: BIG NEWS: Responding to pressu...,1.000000,@Citi
860,@tspeakn0w As spots open up on the wait list w...,Bank/Financial,@kylejenks1 Looks like you’re on the wait list...,0.216130,Bank/Financial
274,@Nicole_813 1. Log in to your Bluebird Account...,Bank/Financial,"@KatieAr66709631 Hi Katie, we have resolved th...",0.175671,Bank/Financial
967,@lljillibean This just left is breathless! #Ja...,Bank/Financial,@courtttxoxox This just left us breathless and...,0.603614,Bank/Financial
2962,RT @fairycakes: No one: \n\nArticle about an a...,coffee,RT @fairycakes: No one: \n\nArticle about an a...,1.000000,coffee
2139,"RT @RollingStone: Amy Poehler, Chris Pratt, Ni...",covid-19,"@NantucketGal Thanks, Amy. We will review and ...",0.111420,Bank/Financial
1346,Congratulations to our very own @sekou_kaalund...,Bank/Financial,RT @CarinaGuillard: Congratulations to @Bankof...,0.193900,Bank/Financial


In [26]:
# A prediction counts as correct when the theme of the nearest training tweet
# equals the test tweet's own theme.
theme_matches = similar_test_df["original_theme"] == similar_test_df["similar_theme"]
accuracy = 100 * np.sum(theme_matches) / len(similar_test_df)
print("Accuracy on test data in predicting theme: {}%".format(np.round(accuracy, 2)))

Accuracy on test data in predicting theme: 88.67%
