In [1]:
import numpy as np
import pandas as pd
import csv
import os
from pprint import pprint  # pretty-printer
from collections import defaultdict # word counter
#print(os.listdir("."))

# sklearn
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.pipeline import Pipeline
np.random.seed(0)

# gensim
from gensim import corpora
from gensim import models
from gensim import similarities

#nltk
import nltk
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 
#nltk.download('wordnet')
#nltk.download('punkt')
#nltk.download('stopwords')
stop_words = set(stopwords.words('english')) 
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer

import warnings
warnings.filterwarnings('ignore')

In [2]:
all_files = ['./tweets_04232020.csv', './tweets_04242020.csv', './tweets_04252020.csv', './tweets_04262020.csv', './tweets_04292020.csv', './tweets_04302020.csv']

df_list = []
for filename in all_files:
    df_list.append(pd.read_csv(filename, sep=',',header=0, encoding='utf-8', index_col = 0))
    
df_tweets = pd.concat(df_list, ignore_index=True)
df_tweets.head()     

Unnamed: 0,Tweet_Id,User_Id,User_Name,User_Screen_Name,Theme,Text
0,1253375593271394312,80374332,Citibank,Citibank,Bank/Financial,Registering for online access and activating y...
1,1253322747851296768,80374332,Citibank,Citibank,Bank/Financial,Protect your CARES Act payments: Validate comm...
2,1253066045700681731,80374332,Citibank,Citibank,Bank/Financial,It’s simple to set up a payment account in the...
3,1252960358454767616,80374332,Citibank,Citibank,Bank/Financial,We’re committed to helping provide the support...
4,1252673469852143618,80374332,Citibank,Citibank,Bank/Financial,Protect Yourself from COVID-19 Scams: Don’t re...


In [3]:
df_tweets.drop_duplicates(["Text"], inplace=True) # remove duplicates in place and reset index
df_tweets = df_tweets.reset_index(drop=True)
df_tweets.shape

(7700, 6)

## Preprocess data: 

In [4]:
def preprocess(df):
    df['text'] = df['text'].str.replace('http\S+', '') # removing URLs
    df['text'] = df['text'].str.replace('[^A-Za-z]+', ' ') # retain only alphanumeric
    df['text'] = df['text'].map(lambda x: x.lower()) # to lower case
    df['text'] = df['text'].map(lambda x: WordNetLemmatizer().lemmatize(x)) # lemmatization   
    df['text'] = df['text'].map(lambda x: word_tokenize(x)) # tokenize words
    df['text'] = df['text'].map(lambda x: [word for word in x if word not in stop_words]) # remove stop words 
    return df

In [5]:
query_index = 0
df = pd.DataFrame()
df['text'] = df_tweets['Text']
print('{:=^100}\n\n {}'.format(' Raw text ', df.text.loc[query_index])) 

df = preprocess(df)
print('\n{:=^100}\n\n {}'.format(' Preprocessed text ', df.text.loc[query_index])) 


 Registering for online access and activating your Citi® card is easy in the Citi Mobile® App. Learn more in the video below. https://t.co/Ec1rIUVX0l


 ['registering', 'online', 'access', 'activating', 'citi', 'card', 'easy', 'citi', 'mobile', 'app', 'learn', 'video']


## Split Data:

In [6]:
total_indices_size = df.shape[0]
train_size = 0.98
train_indices_size = int(train_size * total_indices_size)
test_indices_size = total_indices_size - train_indices_size
print('Total records: {} \nTrain records: {} \n Test records: {}'.format(total_indices_size, train_indices_size, test_indices_size))

total_indices_array = np.array([x for x in range(total_indices_size)])
np.random.shuffle(total_indices_array)
train_indices, test_indices = total_indices_array[:train_indices_size], total_indices_array[train_indices_size:]

Total records: 7700 
Train records: 7546 
 Test records: 154


In [7]:
train_set, test_set = df.loc[train_indices], df.loc[test_indices]
print(train_set.shape, test_set.shape)

(7546, 1) (154, 1)


## Dictionary & Corpus:

In [8]:
dictionary = corpora.Dictionary(train_set['text'])
dictionary.save('./tweets.dict')  
print(dictionary)

corpus = [dictionary.doc2bow(text) for text in train_set['text']]
corpora.MmCorpus.serialize('./tweets_corpus.mm', corpus)

Dictionary(19457 unique tokens: ['dog', 'faced', 'go', 'guy', 'heerjeet']...)


## Latent Semantic Indexing:

In [9]:
lsi = models.LsiModel(corpus, id2word=dictionary, num_topics=100)

In [10]:
index = similarities.MatrixSimilarity(lsi[corpus])  # transform corpus to LSI space and index it

In [11]:
index.save('./tweets.index')
index = similarities.MatrixSimilarity.load('./tweets.index')

In [12]:
# sanity check
n = 5
doc = "covid-19 pandemic"
vec_bow = dictionary.doc2bow(doc.lower().split())
vec_lsi = lsi[vec_bow]  # convert the query to LSI space
sims = index[vec_lsi]  # perform a similarity query against the corpus
top_Indices = sims.argsort()[::-1][:n]
top_tweet_Indices = train_indices[top_Indices]

print('\nInput Text:\n {} \n'.format(doc))
for i, sim_text in enumerate(df_tweets.loc[top_tweet_Indices, "Text"]):
        print('=' * 30, 'Similar Text: {} || Similar Score: {}'.format(i+1, np.round(sims[top_Indices[i]], 3)), '=' * 30, '\n')
        print(sim_text, '\n')


Input Text:
 covid-19 pandemic 


This #EarthDay feels different than others. We examine the other ways the pandemic has taken a toll on nature despite lower carbon emissions.  https://t.co/KkzyjIakoI 


@hopd87 @Citibank The pandemic has challenged us all. We're trying our best to look out for clients and our employees. I appreciate your patience w/ our delayed response. Send a DM us with your account type/brand and the country it was opened. Never send acct numbers or PINs. ^Mike 


@FK1tten The pandemic has challenged us all. We're trying our best to look out for clients and our employees. I appreciate your patience w/ our delayed response. Send a DM us with your account type/brand and the country it was opened. Never send acct numbers or PINs. ^Mike 


@Steve_Font The pandemic has challenged us all. We're trying our best to look out for clients and our employees. I appreciate your patience w/ our delayed response. Send a DM us with your account type/brand and the country it was op

In [13]:
def print_similar_texts(query_text, query_text_vector,  index, train_indices, df_tweets, n = 5):
    sims = index[query_text_vector]  # perform a similarity query against the corpus
    top_Indices = sims.argsort()[::-1][:n]
    top_tweet_Indices = train_indices[top_Indices]

    print('\nInput Text:\n {} \n'.format(query_text))
    for index, sim_text in enumerate(df_tweets.loc[top_tweet_Indices, "Text"]):
        print('=' * 30, 'Similar Text: {} || Similar Score: {}'.format(index+1, np.round(sims[top_Indices[index]], 3)), '=' * 30, '\n')
        print(sim_text, '\n')

In [14]:
query_index = 27
query_text = df_tweets.loc[test_indices[query_index], "Text"]
query_text_list = df.loc[test_indices[query_index], "text"]
query_vec_bow = dictionary.doc2bow(query_text_list)
query_text_vector = lsi[query_vec_bow] 
print_similar_texts(query_text, query_text_vector,  index, train_indices, df_tweets, n = 5)


Input Text:
 Listened to the news in my car and sports news lasted 10 seconds. 🤣 


Jennifer Aniston shared the quarantine parody of the cute Friends theme song | Instant News https://t.co/geFZUQ9y0W https://t.co/05MlyEoI6h 


@helvo6 The big car behind you🛣 


Update he just put a car in neutral and drove it down the garage 


I always wonder why my old car gave up on my ass, but than makayla reminds me how I use to dick it on a daily 😂😂😂 https://t.co/uHl6vYLmW7 


NO MORE bad news .. not on here or in DM’s or texts emails or phone. Not by FedX car bus train horse &amp; buggy or a sign in my yard. Not by TV newspaper Instagram FB Facetime radio or drones. In short don’t relay ANY bad news to ME.  BILLIONS of good things out there. FIND ONE 



In [15]:
def find_similar_n(train_index_lsi, train_indices, test_set, test_indices, test_set_df, df_tweets, n=5):
    similar_texts_list = []
    similar_scores_list = []
    similar_top_score_list = []
    similar_themes_list = []
    similar_top_themes_list = []
    for vector in test_set.text:
        #print(vector)
        query_vec_bow = dictionary.doc2bow(vector)
        sims = train_index_lsi[lsi[query_vec_bow]]  # perform a similarity query against the corpus
        top_Indices = sims.argsort()[::-1][:n]
        top_tweet_Indices = train_indices[top_Indices]
                
        similar_texts = [text for text in df_tweets.loc[top_tweet_Indices, "Text"]]
        similar_texts_list.append(similar_texts)
        
        similar_scores = [np.round(score, 3) for score in sims[top_Indices]]
        similar_scores_list.append(similar_scores)
        similar_top_score_list.append(similar_scores[0])
                                                                   
        similar_themes = [theme for theme in df_tweets.loc[top_tweet_Indices, "Theme"]]
        similar_themes_list.append(similar_themes)
        
        top_similar_theme = max(set(similar_themes), key=lambda x: similar_themes.count(x))
        similar_top_themes_list.append(top_similar_theme)
    
    df = test_set_df.copy() 
    df["original_theme"] = df_tweets.loc[test_indices, "Theme"]
    df["similar_texts"] = similar_texts_list
    df["similar_scores"] = similar_scores_list  
    df["top_similar_score"] = similar_top_score_list
    df["similar_themes"] = similar_themes_list
    df["top_similar_themes"] = similar_top_themes_list
    return df

In [16]:
similar_test_df = find_similar_n(index, train_indices, test_set, test_indices, test_set, df_tweets, n=20)
similar_test_df.head()

Unnamed: 0,text,original_theme,similar_texts,similar_scores,top_similar_score,similar_themes,top_similar_themes
6242,"[rt, blackpinkfml, jennie, apart, protect, fam...",dog,[RT @jory_c505: Wienerschnitzel makes a fine c...,"[0.986, 0.986, 0.985, 0.985, 0.985, 0.985, 0.9...",0.986,"[dog, dog, dog, dog, dog, dog, dog, dog, dog, ...",dog
6521,"[ericawerner, feliciasonmez, long, term, care,...",airlines,[Now playing Low Cost Airlines by ! https://t...,"[0.84, 0.838, 0.837, 0.837, 0.836, 0.836, 0.83...",0.84,"[airlines, airlines, airlines, airlines, airli...",airlines
4226,"[wsnt, car, back, damnit, wan, na, listen, new...",car,"[new car seat headrest is so gooooood, *TOYOTA...","[0.789, 0.775, 0.743, 0.734, 0.726, 0.721, 0.7...",0.789,"[car, car, car, car, car, car, car, car, car, ...",car
5819,"[hijonramirez, hello, thank, reaching, better,...",Bank/Financial,"[@JoseJrq777 Hi Jose, thank you for reaching o...","[0.961, 0.956, 0.952, 0.951, 0.94, 0.93, 0.925...",0.961,"[Bank/Financial, Bank/Financial, Bank/Financia...",Bank/Financial
6167,"[rt, houseofbakes, th, raspberry, chocolate, m...",cake,"[RT @LexiTriplet: For my 11th birthday, those ...","[0.987, 0.98, 0.977, 0.977, 0.975, 0.974, 0.97...",0.987,"[cake, cake, cake, cake, cake, cake, cake, cak...",cake


In [17]:
accuracy = 100 * np.sum(similar_test_df["original_theme"] == similar_test_df["top_similar_themes"])/similar_test_df.shape[0]
print("Accuracy on test data in predicting theme: {}%".format(np.round(accuracy,2)))

Accuracy on test data in predicting theme: 87.01%


In [18]:
similar_test_df.to_csv("LSI_v1_similar_test_tweets.csv")