In [1]:
import numpy as np
import pandas as pd
import csv
import os
import random
random.seed(0)
from pprint import pprint  # pretty-printer
from collections import defaultdict # word counter
#print(os.listdir("."))

# sklearn
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.pipeline import Pipeline
np.random.seed(0)

# gensim
from gensim import corpora
from gensim import models
from gensim import similarities
from gensim import utils
from gensim.models.doc2vec import LabeledSentence
from gensim.models.doc2vec import TaggedDocument
from gensim.models import Doc2Vec

#nltk
import nltk
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 
#nltk.download('wordnet')
#nltk.download('punkt')
#nltk.download('stopwords')
stop_words = set(stopwords.words('english')) 
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer

import warnings
warnings.filterwarnings('ignore')

## Load Data:

In [2]:
all_files = ['./tweets_04232020.csv', './tweets_04242020.csv', './tweets_04252020.csv', './tweets_04262020.csv', './tweets_04292020.csv', './tweets_04302020.csv']

df_list = []
for filename in all_files:
    df_list.append(pd.read_csv(filename, sep=',',header=0, encoding='utf-8', index_col = 0))
    
df_tweets = pd.concat(df_list, ignore_index=True)
df_tweets.head()     

Unnamed: 0,Tweet_Id,User_Id,User_Name,User_Screen_Name,Theme,Text
0,1253375593271394312,80374332,Citibank,Citibank,Bank/Financial,Registering for online access and activating y...
1,1253322747851296768,80374332,Citibank,Citibank,Bank/Financial,Protect your CARES Act payments: Validate comm...
2,1253066045700681731,80374332,Citibank,Citibank,Bank/Financial,It’s simple to set up a payment account in the...
3,1252960358454767616,80374332,Citibank,Citibank,Bank/Financial,We’re committed to helping provide the support...
4,1252673469852143618,80374332,Citibank,Citibank,Bank/Financial,Protect Yourself from COVID-19 Scams: Don’t re...


In [3]:
df_tweets.drop_duplicates(["Text"], inplace=True) # remove duplicates in place and reset index
df_tweets = df_tweets.reset_index(drop=True)
df_tweets.shape

(7700, 6)

## Preprocess data: 

In [4]:
def preprocess(df):
    df['text'] = df['text'].str.replace('http\S+', '') # removing URLs
    df['text'] = df['text'].str.replace('[^A-Za-z]+', ' ') # retain only alphanumeric
    df['text'] = df['text'].map(lambda x: x.lower()) # to lower case
    df['text'] = df['text'].map(lambda x: WordNetLemmatizer().lemmatize(x)) # lemmatization   
    df['text'] = df['text'].map(lambda x: word_tokenize(x)) # tokenize words
    df['text'] = df['text'].map(lambda x: [word for word in x if word not in stop_words]) # remove stop words 
    return df

In [5]:
query_index = 0
df = pd.DataFrame()
df['text'] = df_tweets['Text']
print('{:=^100}\n\n {}'.format(' Raw text ', df.text.loc[query_index])) 

df = preprocess(df)
print('\n{:=^100}\n\n {}'.format(' Preprocessed text ', df.text.loc[query_index])) 


 Registering for online access and activating your Citi® card is easy in the Citi Mobile® App. Learn more in the video below. https://t.co/Ec1rIUVX0l


 ['registering', 'online', 'access', 'activating', 'citi', 'card', 'easy', 'citi', 'mobile', 'app', 'learn', 'video']


## Split Data:

In [6]:
total_indices_size = df.shape[0]
train_size = 0.98
train_indices_size = int(train_size * total_indices_size)
test_indices_size = total_indices_size - train_indices_size
print('Total records: {} \nTrain records: {} \n Test records: {}'.format(total_indices_size, train_indices_size, test_indices_size))

total_indices_array = np.array([x for x in range(total_indices_size)])
np.random.shuffle(total_indices_array)
train_indices, test_indices = total_indices_array[:train_indices_size], total_indices_array[train_indices_size:]

Total records: 7700 
Train records: 7546 
 Test records: 154


In [7]:
train_set, test_set = df.loc[train_indices], df.loc[test_indices]
print(train_set.shape, test_set.shape)

(7546, 1) (154, 1)


In [8]:
train_set.loc[0, "text"]

['registering',
 'online',
 'access',
 'activating',
 'citi',
 'card',
 'easy',
 'citi',
 'mobile',
 'app',
 'learn',
 'video']

## Tag Documents:

In [9]:
def tag_documents(df):   
    df_indices = df.index.to_list()      
    for i in range(df.shape[0]):
        yield TaggedDocument(df.loc[df_indices[i], "text"], [df_indices[i]])

In [10]:
train_corpus = list(tag_documents(train_set))
pprint(train_corpus[:1])

[TaggedDocument(words=['penelopeonthego', 'lindseyboylan', 'heerjeet', 'perfect', 'guy', 'go', 'lying', 'dog', 'faced', 'pony', 'soldier', 'guy'], tags=[2241])]


In [11]:
test_corpus = list(tag_documents(test_set))
pprint(test_corpus[:1])

[TaggedDocument(words=['rt', 'blackpinkfml', 'jennie', 'apart', 'protect', 'family', 'abandoned', 'dog', 'adoption', 'campaign', 'since', 'donated', 'alot', 'supplie'], tags=[6242])]


## Doc2Vec:

In [12]:
model = Doc2Vec(vector_size=100, min_count=2, epochs=40)
model.build_vocab(train_corpus)

In [13]:
model.train(train_corpus, total_examples=model.corpus_count, epochs=model.epochs)

In [14]:
vector = model.infer_vector(['i', 'love', 'espresso', 'coffee'])
print(vector)

[ 0.17554526 -0.06015937 -0.08083551 -0.03107999 -0.16151412 -0.21280283
  0.02225097 -0.00215548 -0.19113825  0.06573348  0.02756668  0.10615117
  0.09008887  0.09060719  0.0042705   0.0968703  -0.10958012  0.10060077
  0.1920451  -0.00855945 -0.01824979  0.10325608 -0.32036293 -0.01775225
  0.06614857 -0.12968439  0.0542037   0.03963283  0.0479433   0.07808828
  0.03196692  0.03435186  0.06628725 -0.03315438  0.11596455  0.05443031
 -0.09314461  0.13202398  0.02717078 -0.08545028 -0.10960717 -0.03497319
 -0.07082701 -0.03742527  0.01837434 -0.03016695 -0.12433695 -0.10806288
 -0.16194136  0.12176057  0.07192167  0.02410932  0.04107241  0.0583943
  0.01048861 -0.11878662 -0.0889902  -0.00433639 -0.04975854 -0.04022327
 -0.10307834 -0.06266695 -0.1612618   0.0697923  -0.17655678  0.05534594
  0.04645586  0.08291026 -0.02102945 -0.08578876 -0.08884513 -0.02174282
  0.0674123   0.12275301  0.06646194 -0.13852721 -0.0189216   0.1968533
 -0.19540337  0.03114284 -0.01444465 -0.03642225  0.0

## Infer Vectors:

In [15]:
train_corpus_infer_list = [] 
for i in range(len(train_corpus)):
    inferred_vector = model.infer_vector(train_corpus[i].words)
    train_corpus_infer_list.append(inferred_vector)
    
train_corpus_matrix = np.matrix(train_corpus_infer_list) 
train_corpus_matrix.shape

(7546, 100)

In [16]:
test_corpus_infer_list = [] 
for i in range(len(test_corpus)):
    inferred_vector = model.infer_vector(test_corpus[i].words)
    test_corpus_infer_list.append(inferred_vector)
    
test_corpus_matrix = np.matrix(test_corpus_infer_list) 
test_corpus_matrix.shape

(154, 100)

## Find Similar Comments:

In [17]:
def print_similar_texts(query_text, query_text_vector,  train_matrix, train_indices, df_tweets, n = 5):
    cosine_similarities_n = cosine_similarity(query_text_vector, train_matrix).flatten()
    top_Indices = cosine_similarities_n.argsort()[::-1][:n]
    top_tweet_Indices = train_indices[top_Indices]
    
    print('\nInput Text:\n {} \n'.format(query_text))
    for index, sim_text in enumerate(df_tweets.loc[top_tweet_Indices, "Text"]):
        print('=' * 30, 'Similar Text: {} || Similar Score: {}'.format(index+1, np.round(cosine_similarities_n[top_Indices[index]], 3)), '=' * 30, '\n')
        print(sim_text, '\n')

In [18]:
query_index = 33
query_text = df_tweets.loc[test_indices[query_index], "Text"]
query_text_vector = test_corpus_matrix[query_index]
print_similar_texts(query_text, query_text_vector, train_corpus_matrix, train_indices, df_tweets, n = 5)


Input Text:
 @81diosab I am sorry for the delayed response. Due to COVID-19 causing higher volume, clients are experiencing longer wait times worldwide. Were you able to reach Customer Service? If not, DM the account/card type only. No PIN or account numbers. ^Deanna https://t.co/gvC4jB6b0K 


@On2Whls Let us review this further with you. We're sorry for the delayed response. Due to higher volumes during COVID-19, clients are experiencing longer wait times worldwide. DM the full name and type of account/card only. No PIN or account numbers. ^Deanna https://t.co/gvC4jAOA9c 


@coolmaneesh Thanks for your patience, and I am sorry for the delay. Please DM your account type. No account numbers in the reply please. ^Brian https://t.co/gvC4jAOA9c 


@JinWang34339222 Hi. Please check your DM. ^Ana 


@FlyinGaurav Due to COVID-19 impact causing higher volumes worldwide, please DM the concerns, account/card type only, and the country where it was open. Do not reply with any PIN or account numb

## Quantify the goodnes of similarity:

In [19]:
def find_similar_n(train_matrix, train_indices, test_matrix, test_indices, test_set_df, df_tweets, n=5):
    similar_texts_list = []
    similar_scores_list = []
    similar_top_score_list = []
    similar_themes_list = []
    similar_top_themes_list = []
    for vector in test_matrix:
        cosine_similarities_n = cosine_similarity(vector, train_matrix).flatten()
        similar_doc_indices =  cosine_similarities_n.argsort()[::-1][:n]
        top_tweet_Indices = train_indices[similar_doc_indices]
        
        similar_texts = [text for text in df_tweets.loc[top_tweet_Indices, "Text"]]
        similar_texts_list.append(similar_texts)
        
        similar_scores = [np.round(score, 3) for score in cosine_similarities_n[similar_doc_indices]]
        similar_scores_list.append(similar_scores)
        similar_top_score_list.append(similar_scores[0])
                                                                   
        similar_themes = [theme for theme in df_tweets.loc[top_tweet_Indices, "Theme"]]
        similar_themes_list.append(similar_themes)
        
        top_similar_theme = max(set(similar_themes), key=lambda x: similar_themes.count(x))
        similar_top_themes_list.append(top_similar_theme)
    
    df = test_set_df.copy() 
    df["original_theme"] = df_tweets.loc[test_indices, "Theme"]
    df["similar_texts"] = similar_texts_list
    df["similar_scores"] = similar_scores_list  
    df["top_similar_score"] = similar_top_score_list
    df["similar_themes"] = similar_themes_list
    df["top_similar_themes"] = similar_top_themes_list
    return df

In [20]:
similar_test_df = find_similar_n(train_corpus_matrix, train_indices, test_corpus_matrix, test_indices, test_set, df_tweets, n=20)
similar_test_df.head()

Unnamed: 0,text,original_theme,similar_texts,similar_scores,top_similar_score,similar_themes,top_similar_themes
6242,"[rt, blackpinkfml, jennie, apart, protect, fam...",dog,[@pulte In need of help with Credit Card debt ...,"[0.6, 0.552, 0.54, 0.539, 0.531, 0.529, 0.524,...",0.6,"[dog, dog, dog, dog, dog, dog, cake, dog, dog,...",dog
6521,"[ericawerner, feliciasonmez, long, term, care,...",airlines,[@MichaelRapaport @NYCMayor Don’t forget about...,"[0.677, 0.667, 0.664, 0.658, 0.636, 0.632, 0.6...",0.677,"[airlines, airlines, airlines, airlines, airli...",airlines
4226,"[wsnt, car, back, damnit, wan, na, listen, new...",car,[RT @mitchellvii: I'm willing to take my chanc...,"[0.509, 0.487, 0.484, 0.478, 0.455, 0.453, 0.4...",0.509,"[covid-19, dog, car, coffee, car, Bank/Financi...",car
5819,"[hijonramirez, hello, thank, reaching, better,...",Bank/Financial,"[@JoseJrq777 Hi Jose, thank you for reaching o...","[0.945, 0.942, 0.936, 0.927, 0.924, 0.922, 0.9...",0.945,"[Bank/Financial, Bank/Financial, Bank/Financia...",Bank/Financial
6167,"[rt, houseofbakes, th, raspberry, chocolate, m...",cake,"[RT @LexiTriplet: For my 11th birthday, those ...","[0.82, 0.815, 0.815, 0.809, 0.798, 0.786, 0.78...",0.82,"[cake, cake, cake, ramadan, Bank/Financial, ca...",cake


In [21]:
accuracy = 100 * np.sum(similar_test_df["original_theme"] == similar_test_df["top_similar_themes"])/similar_test_df.shape[0]
print("Accuracy on test data in predicting theme: {}%".format(np.round(accuracy,2)))

Accuracy on test data in predicting theme: 77.27%


In [22]:
similar_test_df.to_csv("Doc2Vec_v1_similar_test_tweets.csv")