In [1]:
import numpy as np
import pandas as pd
import csv
import os
print(os.listdir("."))

from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.pipeline import Pipeline
np.random.seed(0)

import warnings
warnings.filterwarnings('ignore')

['.ipynb_checkpoints', 'BankTweets.csv', 'BankTweets_04242020.csv', 'BankTweets_04252020.csv', 'BankTweets_04262020.csv', 'BankTweets_04292020.csv', 'cbikkanur_twitter_credentials.ipynb', 'cbikkanur_twitter_credentials.json', 'CosineSimilarity.ipynb', 'documentSimilarity.ipynb', 'documentSimilarity_TfIdf.ipynb', 'documentSimilarity_TfIdf.ipynb.txt', 'getTwitterData.ipynb', 'getTwitterData_TweePy.ipynb', 'model_save', 'SearchTweets.csv', 'SearchTweets_04242020.csv', 'SearchTweets_04252020.csv', 'SearchTweets_04262020.csv', 'SearchTweets_04292020.csv', 'test_vectors.npy', 'TfIdf_similar_test_tweets.csv', 'train_vectors.npy', 'tweets_04232020.csv', 'tweets_04242020.csv', 'tweets_04252020.csv', 'tweets_04262020.csv', 'tweets_04292020.csv']


In [2]:
# Daily tweet exports that together make up the corpus.
all_files = [
    './tweets_04232020.csv',
    './tweets_04242020.csv',
    './tweets_04252020.csv',
    './tweets_04262020.csv',
    './tweets_04292020.csv',
]

# Read each export (first CSV column is used as the index), then stack
# them into one frame with a fresh 0..N-1 index.
df_list = [
    pd.read_csv(path, sep=',', header=0, encoding='utf-8', index_col=0)
    for path in all_files
]
df_tweets = pd.concat(df_list, ignore_index=True)
df_tweets.head()

Unnamed: 0,Tweet_Id,User_Id,User_Name,User_Screen_Name,Theme,Text
0,1253375593271394312,80374332,Citibank,Citibank,Bank/Financial,Registering for online access and activating y...
1,1253322747851296768,80374332,Citibank,Citibank,Bank/Financial,Protect your CARES Act payments: Validate comm...
2,1253066045700681731,80374332,Citibank,Citibank,Bank/Financial,It’s simple to set up a payment account in the...
3,1252960358454767616,80374332,Citibank,Citibank,Bank/Financial,We’re committed to helping provide the support...
4,1252673469852143618,80374332,Citibank,Citibank,Bank/Financial,Protect Yourself from COVID-19 Scams: Don’t re...


In [3]:
# Keep the first occurrence of every (Text, User_Id) pair and renumber the
# surviving rows 0..N-1 so labels and positions coincide again.
df_tweets = (
    df_tweets
    .drop_duplicates(["Text", "User_Id"])
    .reset_index(drop=True)
)
df_tweets.shape

(7958, 6)

In [4]:
# Single-column working frame holding only the tweet text, sharing
# df_tweets' row labels.
df = pd.DataFrame({'text': df_tweets['Text']})
df.tail()

Unnamed: 0,text
7953,RT @lucxs__x: You're the coffee that I need in...
7954,RT @RoberteLove: I had to go to the store this...
7955,@mykalmphoto Nice photos. When this is all ov...
7956,RT @minturnalexandr: Ten years ago I bought th...
7957,RT @IvankaTrump: Small businesses like Amy Wri...


In [5]:
# Sizes of the train/test partitions (train gets the floor of the 80% share).
total_indices_size = df.shape[0]
train_size = 0.8
train_indices_size = int(train_size * total_indices_size)
test_indices_size = total_indices_size - train_indices_size
print('Total records: {} \nTrain records: {} \n Test records: {}'.format(total_indices_size, train_indices_size, test_indices_size))

# Shuffle the row labels 0..N-1 in place (uses the global NumPy RNG), then
# cut the permutation at the train size.
total_indices_array = np.arange(total_indices_size)
np.random.shuffle(total_indices_array)
train_indices = total_indices_array[:train_indices_size]
test_indices = total_indices_array[train_indices_size:]

Total records: 7958 
Train records: 6366 
 Test records: 1592


In [6]:
# Select the rows for each partition by label; both frames keep the
# original row labels from df.
train_set = df.loc[train_indices]
test_set = df.loc[test_indices]
print(train_set.shape, test_set.shape)

(6366, 1) (1592, 1)


## CountVectorizer & TF-IDF

In [7]:
# Word-level counts over 1- to 3-grams, lower-cased, accent-stripped and
# with English stop words removed.
count_vectorizer = CountVectorizer(
    stop_words='english',
    ngram_range=(1, 3),
    strip_accents='unicode',
    analyzer='word',
    lowercase=True,
)
tfidf = TfidfTransformer()
pipeline = Pipeline(steps=[
    ('count_vectorizer', count_vectorizer),
    ('tfidf', tfidf),
])

# Fit on the training text only; the test text is transformed with the
# already-fitted pipeline.
tf_idf_train_matrix = pipeline.fit_transform(train_set["text"])
tf_idf_test_matrix = pipeline.transform(test_set["text"])

In [8]:
# Report the (rows, columns) shape of the training TF-IDF matrix.
print("Train Matrix Shape: {}".format(tf_idf_train_matrix.shape))

Train Matrix Shape: (6366, 120508)


In [9]:
def print_similar_texts(query_text, query_text_vector,  train_matrix, train_indices, df_tweets, n = 5):
    """Print the ``n`` training tweets most similar to a query tweet.

    Parameters
    ----------
    query_text : the query text, echoed at the top of the report.
    query_text_vector : vectorised query, compared against ``train_matrix``
        with ``cosine_similarity``.
    train_matrix : vectorised training texts, one row per training text.
    train_indices : array mapping a row position of ``train_matrix`` to the
        corresponding row label in ``df_tweets``.
    df_tweets : DataFrame with a ``"Text"`` column, looked up by label.
    n : number of matches to print (default 5).

    Returns
    -------
    None. The report is written to standard output.
    """
    scores = cosine_similarity(query_text_vector, train_matrix).flatten()
    # Positions of the n highest scores, best first.
    ranked_positions = np.argsort(scores)[::-1][:n]
    matched_texts = df_tweets.loc[train_indices[ranked_positions], "Text"]

    banner = '*' * 30
    print('\nInput Text:\n {} \n'.format(query_text))
    for rank, (position, matched_text) in enumerate(zip(ranked_positions, matched_texts), start=1):
        heading = 'Similar Text: {} || Similar Score: {}'.format(rank, np.round(scores[position], 3))
        print(banner, heading, banner, '\n')
        print(matched_text, '\n')

In [10]:
# Pick one held-out tweet by its position in the test split: its raw text
# comes from df_tweets, its vector from the matching row of the test matrix.
query_index = 5
query_text = df_tweets.loc[test_indices[query_index], "Text"]
query_text_vector = tf_idf_test_matrix[query_index]
print_similar_texts(
    query_text=query_text,
    query_text_vector=query_text_vector,
    train_matrix=tf_idf_train_matrix,
    train_indices=train_indices,
    df_tweets=df_tweets,
    n=5,
)


Input Text:
 @xoxjessiica I am sorry to hear this. It should appear on the next or following statement. To forward to our internal team on your behalf, DM account/card type only, full name and contact phone number already on profile. No PIN or account numbers. ^Deanna https://t.co/gvC4jAOA9c 

****************************** Similar Text: 1 || Similar Score: 0.387 ****************************** 

@conorFa16113956 We suggest deleting the previous public tweet and DM the full name, and contact phone number only that is already on profile. Email and Twitter are not secure banking communication channels. Do not DM any PIN or account numbers. ^Deanna https://t.co/gvC4jAOA9c 

****************************** Similar Text: 2 || Similar Score: 0.345 ****************************** 

@this_is_Manu Thanks for reaching out. We are here to help. Let us review this further. DM the account/card type only (checking, credit card, retail branded store card, etc) and country where it was open. Do not repl

In [11]:
def find_similar_n(tf_idf_train_matrix, train_indices, tf_idf_test_matrix, test_indices, test_set_df, df_tweets, n=5):
    """Find the ``n`` most similar training tweets for every test tweet.

    For each row of ``tf_idf_test_matrix`` the cosine similarity against all
    rows of ``tf_idf_train_matrix`` is computed, the ``n`` best matches are
    kept, and a theme is predicted by majority vote over those matches.

    Parameters
    ----------
    tf_idf_train_matrix : vectorised training texts, one row per text.
    train_indices : array mapping a row position of ``tf_idf_train_matrix``
        to the corresponding row label in ``df_tweets``.
    tf_idf_test_matrix : vectorised test texts, iterated row by row; rows
        must be in the same order as ``test_set_df``.
    test_indices : row labels in ``df_tweets`` of the test texts.
    test_set_df : DataFrame of test texts. It is NOT modified; the result is
        built on a copy.
    df_tweets : DataFrame with ``"Text"`` and ``"Theme"`` columns.
    n : number of neighbours to keep per test text (default 5).

    Returns
    -------
    pandas.DataFrame
        A copy of ``test_set_df`` with the added columns ``original_theme``,
        ``similar_texts``, ``similar_scores``, ``top_similar_score``,
        ``similar_themes`` and ``top_similar_themes``.
    """
    similar_texts_list = []
    similar_scores_list = []
    similar_top_score_list = []
    similar_themes_list = []
    similar_top_themes_list = []
    for vector in tf_idf_test_matrix:
        cosine_similarities_n = cosine_similarity(vector, tf_idf_train_matrix).flatten()
        # Positions of the n highest similarities, best first.
        similar_doc_indices = cosine_similarities_n.argsort()[::-1][:n]
        top_tweet_Indices = train_indices[similar_doc_indices]

        similar_texts = list(df_tweets.loc[top_tweet_Indices, "Text"])
        similar_texts_list.append(similar_texts)

        similar_scores = [np.round(score, 3) for score in cosine_similarities_n[similar_doc_indices]]
        similar_scores_list.append(similar_scores)
        similar_top_score_list.append(similar_scores[0])

        similar_themes = list(df_tweets.loc[top_tweet_Indices, "Theme"])
        similar_themes_list.append(similar_themes)

        # Majority vote over the neighbours' themes. Iterating the ranked
        # list (not a set) makes ties deterministic: max() returns the first
        # maximal item, i.e. the tied theme whose best neighbour ranks
        # highest. Set iteration order depends on string hashing and can
        # change between interpreter runs.
        top_similar_theme = max(similar_themes, key=similar_themes.count)
        similar_top_themes_list.append(top_similar_theme)

    # Build the result on a copy so the caller's frame is left untouched and
    # the function can be re-run on the same input.
    df = test_set_df.copy()
    # Index-aligned assignment: relies on test_set_df being labelled with
    # the same row labels as test_indices.
    df["original_theme"] = df_tweets.loc[test_indices, "Theme"]
    df["similar_texts"] = similar_texts_list
    df["similar_scores"] = similar_scores_list
    df["top_similar_score"] = similar_top_score_list
    df["similar_themes"] = similar_themes_list
    df["top_similar_themes"] = similar_top_themes_list
    return df

In [12]:
# Retrieve the 10 nearest training tweets for every test tweet and predict
# each test tweet's theme from them.
similar_test_df = find_similar_n(
    tf_idf_train_matrix=tf_idf_train_matrix,
    train_indices=train_indices,
    tf_idf_test_matrix=tf_idf_test_matrix,
    test_indices=test_indices,
    test_set_df=test_set,
    df_tweets=df_tweets,
    n=10,
)
similar_test_df

Unnamed: 0,text,original_theme,similar_texts,similar_scores,top_similar_score,similar_themes,top_similar_themes
6953,"RT @Essence: Join us TOMORROW, 4/27 at 7 PM ES...",Bank/Financial,[Good Night Everyone. We will return promptly ...,"[0.122, 0.116, 0.086, 0.082, 0.077, 0.075, 0.0...",0.122,"[Bank/Financial, Bank/Financial, coffee, car, ...",Bank/Financial
6031,RT @Jordan_Sather_: I can (almost) guarantee y...,nasa,[RT @Jordan_Sather_: I can (almost) guarantee ...,"[1.0, 1.0, 1.0, 1.0, 1.0, 0.043, 0.041, 0.039,...",1.000,"[nasa, nasa, nasa, nasa, nasa, nasa, nasa, nas...",nasa
239,@roth_jroth1515 Hi Jennifer. Do you have a mom...,Bank/Financial,"[@roth_jroth1515 hi Jennifer, sorry to hear th...","[0.391, 0.306, 0.155, 0.149, 0.145, 0.139, 0.1...",0.391,"[Bank/Financial, Bank/Financial, Bank/Financia...",Bank/Financial
5720,RT @gguksluver: jimins face when he realized j...,cake,[RT @gguksluver: jimins face when he realized ...,"[1.0, 1.0, 0.058, 0.055, 0.05, 0.05, 0.049, 0....",1.000,"[cake, cake, cake, cake, cake, cake, cake, cak...",cake
7102,RT @SaralPatel: Shri @RahulGandhi will be inte...,covid-19,[RT @srinivasiyc: Shri @RahulGandhi will be in...,"[0.917, 0.904, 0.904, 0.058, 0.039, 0.035, 0.0...",0.917,"[covid-19, covid-19, covid-19, covid-19, covid...",covid-19
542,@xoxjessiica I am sorry to hear this. It shoul...,Bank/Financial,[@conorFa16113956 We suggest deleting the prev...,"[0.387, 0.345, 0.338, 0.317, 0.292, 0.269, 0.2...",0.387,"[Bank/Financial, Bank/Financial, Bank/Financia...",Bank/Financial
2725,RT @MikeDentale: Airlines sending me “we’re in...,airlines,[RT @MikeDentale: Airlines sending me “we’re i...,"[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...",1.000,"[airlines, airlines, airlines, airlines, airli...",airlines
2454,@Yo_ItsMONROE EVERY! One. The one where i live...,car,"[Red velvet cake turn out good 😋, Playing Qura...","[0.128, 0.114, 0.109, 0.095, 0.093, 0.084, 0.0...",0.128,"[cake, ramadan, car, Bank/Financial, cake, cak...",car
1459,"@PSCC_Tualatin Hi Jonathan, the applications a...",Bank/Financial,"[@Addvalue2others Hi, the applications are hel...","[0.845, 0.827, 0.759, 0.661, 0.235, 0.191, 0.1...",0.845,"[Bank/Financial, Bank/Financial, Bank/Financia...",Bank/Financial
5764,RT @ZorbitsMath: Extra! Extra! For today’s #Zo...,cake,[RT @RaraMilf: I take extra cream in my coffee...,"[0.147, 0.127, 0.123, 0.113, 0.097, 0.097, 0.0...",0.147,"[coffee, Bank/Financial, Bank/Financial, Bank/...",coffee


In [13]:
# Share of test tweets (in percent) whose majority-vote theme equals the
# theme they were collected under.
theme_matches = similar_test_df["original_theme"] == similar_test_df["top_similar_themes"]
accuracy = 100 * theme_matches.sum() / len(similar_test_df)
print("Accuracy on test data in predicting theme: {}%".format(np.round(accuracy,2)))

Accuracy on test data in predicting theme: 82.1%


In [14]:
# Persist the per-tweet retrieval results next to the notebook. A file with
# this name already appears in the directory listing printed by the first
# cell, so re-running this cell replaces it.
similar_test_df.to_csv("TfIdf_similar_test_tweets.csv")