In [12]:
import numpy as np
import pandas as pd
import csv
import os
# List the working directory; the tweets CSV loaded later is read from "./".
print(os.listdir("."))

# NOTE(review): shuffle, train_test_split, TfidfVectorizer and csv do not appear
# to be used in the visible cells -- confirm before pruning.
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.pipeline import Pipeline
# Seed NumPy's global RNG; the np.random.shuffle-based train/test split depends on it.
np.random.seed(0)

import warnings
# NOTE(review): this silences every warning category for the rest of the session,
# including pandas/scikit-learn warnings that may point at real problems.
warnings.filterwarnings('ignore')

['.ipynb_checkpoints', 'BankTweets.csv', 'cbikkanur_twitter_credentials.ipynb', 'cbikkanur_twitter_credentials.json', 'CosineSimilarity.ipynb', 'documentSimilarity_TfIdf.ipynb', 'getTwitterData.ipynb', 'getTwitterData_TweePy.ipynb', 'SearchTweets.csv', 'tweets_04232020.csv']


In [5]:
# Load the collected tweets; the first CSV column becomes the row index.
df_tweets = pd.read_csv(
    './tweets_04232020.csv',
    sep=',',
    header=0,
    encoding='utf-8',
    index_col=0,
)
df_tweets.head()

Unnamed: 0,Tweet_Id,User_Id,User_Name,User_Screen_Name,Theme,Text
0,1253375593271394312,80374332,Citibank,Citibank,Bank/Financial,Registering for online access and activating y...
1,1253322747851296768,80374332,Citibank,Citibank,Bank/Financial,Protect your CARES Act payments: Validate comm...
2,1253066045700681731,80374332,Citibank,Citibank,Bank/Financial,It’s simple to set up a payment account in the...
3,1252960358454767616,80374332,Citibank,Citibank,Bank/Financial,We’re committed to helping provide the support...
4,1252673469852143618,80374332,Citibank,Citibank,Bank/Financial,Protect Yourself from COVID-19 Scams: Don’t re...


In [6]:
# Working frame holding only the tweet body, exposed under a lowercase
# 'text' column and sharing df_tweets' row index.
df = pd.DataFrame({'text': df_tweets['Text']})
df.tail()

Unnamed: 0,text
2995,RT @fairycakes: No one: \n\nArticle about an a...
2996,RT @Mom_Overboard: Mom culture is putting a co...
2997,RT @soxloverhk: Coffee break https://t.co/kamo...
2998,Next is decaf you brew it just like coffee.
2999,coffee tastes so good at night🖤


In [25]:
# Split sizes: train_size is the fraction of rows used for training,
# the remainder goes to the test partition.
total_indices_size = df.shape[0]
train_size = 0.8
train_indices_size = int(train_size * total_indices_size)
test_indices_size = total_indices_size - train_indices_size
print(total_indices_size, train_indices_size, test_indices_size)

# Shuffle the frame's own index *labels* rather than positions 0..n-1.
# Every later cell looks rows up with label-based .loc, so positional
# integers are only correct when the CSV index happens to be 0..n-1.
# np.array(...) makes a copy, so the in-place shuffle never touches df.index.
total_indices_array = np.array(df.index)
np.random.shuffle(total_indices_array)
train_indices = total_indices_array[:train_indices_size]
test_indices = total_indices_array[train_indices_size:]

3000 2400 600


In [26]:
# Materialise both partitions by label lookup, in the shuffled order, so that
# row k of each partition corresponds to train_indices[k] / test_indices[k].
train_set = df.loc[train_indices]
test_set = df.loc[test_indices]
print(train_set.shape, test_set.shape)
train_set.head(5)

(2400, 1) (600, 1)


Unnamed: 0,text
2193,RT @ksadjadpour: A day in the life of Donald T...
745,Here’s how to contact us.\nStudent loans: http...
1128,"@stilldevelopin5 hi, connecting you to our cus..."
1337,"For Noelle Santos, founder of @thelitbar, open..."
2669,RT @inoueart: Inoue Takehiko drew a painting o...


## CountVectorizer & Tf-Idf:

In [27]:
# Stage 1: word-level counts over 1- to 3-grams, lowercased, accents stripped,
# English stop words removed. Stage 2: re-weight the counts with TF-IDF.
count_vectorizer = CountVectorizer(
    analyzer='word',
    lowercase=True,
    strip_accents='unicode',
    stop_words='english',
    ngram_range=(1, 3),
)
tfidf = TfidfTransformer()
pipeline = Pipeline(steps=[
    ('count_vectorizer', count_vectorizer),
    ('tfidf', tfidf),
])

# Fit on the training text only; the test text is transformed with the
# already-fitted pipeline.
tf_idf_train_matrix = pipeline.fit_transform(train_set["text"])
tf_idf_test_matrix = pipeline.transform(test_set["text"])

In [28]:
# Densify only a few rows for the preview: calling .toarray() on the whole
# train matrix allocates rows x vocabulary floats just to print a truncated repr.
print(tf_idf_train_matrix[:5].toarray())
print("Vector Shape:", tf_idf_test_matrix.shape)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
Vector Shape: (600, 50612)


In [29]:
# Sanity check: take the third test tweet, find its closest training tweet
# by cosine similarity, and print both texts side by side.
query_vector = tf_idf_test_matrix[2]
cosine_similarities = cosine_similarity(query_vector, tf_idf_train_matrix).flatten()
# argsort is ascending, so its last entry is the position of the best match.
top_Indices = int(cosine_similarities.argsort()[-1])
# Translate the train-matrix row position back to the tweet's index in df_tweets.
top_tweet_Indices = train_indices[top_Indices]

input_text = df_tweets.loc[test_indices[2], "Text"]
similar_text = df_tweets.loc[top_tweet_Indices, "Text"]
print("\nInput Text:\n", input_text, "\n")
print("\nSimilar Text:\n", similar_text, "\n")


Input Text:
 Commercial cards have a crucial role to play in the digitization of the B2B payment experience. Learn more: https://t.co/L1HVpy19P5 https://t.co/yySulJIXkT 


Similar Text:
 #Citi Commercial Cards has launched an intelligent virtual agent capability within its call centers in the U.S., leveraging technology from @interactionsco, a leading conversational AI company. Learn more: https://t.co/CvRMEeN342 



In [30]:
def find_similar(tf_idf_train_matrix, train_indices, tf_idf_test_matrix, test_indices, test_set_df, df_tweets):
    """Pair every test tweet with its most cosine-similar training tweet.

    Parameters
    ----------
    tf_idf_train_matrix : 2-D matrix
        One row per training tweet.
    train_indices : sequence
        ``train_indices[k]`` is the ``df_tweets`` index label of row ``k``
        of ``tf_idf_train_matrix``.
    tf_idf_test_matrix : 2-D matrix
        One row per test tweet, in the same row order as ``test_set_df``.
    test_indices : sequence
        ``df_tweets`` index labels of the test tweets.
    test_set_df : pandas.DataFrame
        Test rows; it is copied, never modified.
    df_tweets : pandas.DataFrame
        Full tweet table providing the ``"Text"`` and ``"Theme"`` columns.

    Returns
    -------
    pandas.DataFrame
        A copy of ``test_set_df`` with the added columns ``original_theme``,
        ``similar_text``, ``similar_score`` and ``similar_theme``.

    Raises
    ------
    ValueError
        If ``tf_idf_train_matrix`` has no rows to compare against.
    """
    if tf_idf_train_matrix.shape[0] == 0:
        raise ValueError("tf_idf_train_matrix has no rows to compare against")

    similar_text_list = []
    similar_score_list = []
    similar_theme_list = []
    for vector in tf_idf_test_matrix:
        cosine_similarities = cosine_similarity(vector, tf_idf_train_matrix).flatten()
        # argsort is ascending, so the last position is the best match. This is
        # the same element the reversed-argsort lookup selects, ties included.
        similar_doc_index = int(cosine_similarities.argsort()[-1])
        # Map the train-matrix row position back to the tweet's label in df_tweets.
        top_tweet_Index = train_indices[similar_doc_index]
        similar_text_list.append(df_tweets.loc[top_tweet_Index, "Text"])
        similar_score_list.append(cosine_similarities[similar_doc_index])
        similar_theme_list.append(df_tweets.loc[top_tweet_Index, "Theme"])

    # Work on a copy so the caller's frame is not silently given new columns
    # (it is typically a slice of another frame).
    result_df = test_set_df.copy()
    # This Series is aligned to result_df by index label.
    result_df["original_theme"] = df_tweets.loc[test_indices, "Theme"]
    # The list columns are positional: one entry per test-matrix row.
    result_df["similar_text"] = similar_text_list
    result_df["similar_score"] = similar_score_list
    result_df["similar_theme"] = similar_theme_list
    return result_df

In [31]:
# Pair every held-out tweet with its nearest training tweet and show the result.
similar_test_df = find_similar(
    tf_idf_train_matrix=tf_idf_train_matrix,
    train_indices=train_indices,
    tf_idf_test_matrix=tf_idf_test_matrix,
    test_indices=test_indices,
    test_set_df=test_set,
    df_tweets=df_tweets,
)
similar_test_df

Unnamed: 0,text,original_theme,similar_text,similar_score,similar_theme
2590,RT @notsoevilrick: NASA faked the cow jumping ...,nasa,"Please NASA just blast me to the moon, it's th...",0.262069,nasa
1208,@jwgvico @ChaseforBiz We'll be sending a follo...,Bank/Financial,@rogerdalal @ChaseforBiz We understand. We'll ...,0.589371,Bank/Financial
1912,Commercial cards have a crucial role to play i...,Bank/Financial,#Citi Commercial Cards has launched an intelli...,0.174710,Bank/Financial
2311,@MySportsUpdate raiders fans got a DOG!!,dog,@caitiedelaney I can’t believe I didn’t know y...,0.185222,dog
1266,@butch17372553 We’re updating the site to make...,Bank/Financial,@Meagan03730911 We’re updating the site to mak...,0.894816,Bank/Financial
1044,@LynneSmithGA Connecting you with @AskAmex to ...,Bank/Financial,@FrancoB411 Connecting you with @AskAmex to he...,0.771474,Bank/Financial
1159,@duris Please give us a call using the number ...,Bank/Financial,@techsmarterusa Please call the number on the ...,0.739709,Bank/Financial
386,"@CampPantsz No, I'm sorry we don't. ^B",Bank/Financial,@CampPantsz Please try your call again at a la...,0.233068,Bank/Financial
2091,RT @SierraClub: BIG NEWS: Responding to pressu...,@Citi,RT @SierraClub: BIG NEWS: Responding to pressu...,1.000000,@Citi
23,Healthcare workers on the frontline fighting t...,Bank/Financial,Healthcare workers on the frontline fighting t...,0.930016,Bank/Financial


In [32]:
# Percentage of test tweets whose nearest training tweet carries the same theme.
theme_matches = similar_test_df["original_theme"] == similar_test_df["similar_theme"]
n_test_rows = similar_test_df.shape[0]
accuracy = 100 * np.sum(theme_matches) / n_test_rows
print("Accuracy on test data in predicting theme: {}%".format(np.round(accuracy, 2)))

Accuracy on test data in predicting theme: 87.17%
