In [2]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from spacy.lang.en.stop_words import STOP_WORDS as stopwords


In [3]:
# Load the e-mail corpus, using the CSV's "Unnamed: 0" column as the index.
df = pd.read_csv("emails.csv", index_col="Unnamed: 0")

# Every model below is fitted with one topic fewer than there are rows.
Number_of_Topics = len(df) - 1

In [4]:
# scikit-learn documents stop_words as 'english', a list, or None. spaCy's
# STOP_WORDS is a set, which releases with parameter validation reject, so it
# is converted to a (sorted, hence reproducible) list here.
tfidf_text_vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),  # unigrams and bigrams
    stop_words=sorted(stopwords),
    min_df=1,
    max_df=0.9,  # drop terms that occur in more than 90% of the documents
)
tfidf_text_vectors = tfidf_text_vectorizer.fit_transform(df['text'])

  % sorted(inconsistent)


# NMF

In [5]:
from sklearn.decomposition import NMF

# Fit NMF on the TF-IDF matrix. The fit_transform result is kept as W (one row
# per document, see the labelling loop below) and components_ as H.
nmf_text_model = NMF(
    n_components=Number_of_Topics,
    random_state=42,
)
W_text_matrix = nmf_text_model.fit_transform(X=tfidf_text_vectors)
H_text_matrix = nmf_text_model.components_



In [6]:
def display_topics(model, features, no_top_words=3):
    """Print and collect the highest-weighted words of every topic.

    Args:
        model: Fitted object exposing ``components_``; it is iterated row by
            row, each row holding the per-feature weights of one topic.
        features: Sequence of feature names, indexed by column position of
            ``model.components_``.
        no_top_words: Maximum number of words reported per topic. When a
            topic has fewer features than this, all of them are reported.

    Returns:
        list[str]: One label per topic, the concatenation of its
        ``"  word (share)"`` entries ordered from heaviest to lightest word.
    """
    topics_list = []
    for words in model.components_:
        total = words.sum()
        largest = words.argsort()[::-1]  # feature indices, heaviest first
        # Slicing (instead of indexing 0..no_top_words-1) cannot run past the
        # end of a small vocabulary.
        top_indices = largest[:max(no_top_words, 0)]
        entries = []
        for idx in top_indices:
            # Share of the row total in percent; a zero total would otherwise
            # produce nan, so it is reported as 0.
            share = abs(words[idx] * 100.0 / total) if total != 0 else 0.0
            entry = "  %s (%2.2f)" % (features[idx], share)
            print(entry)
            entries.append(entry)
        topics_list.append("".join(entries))
    return topics_list

In [7]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement.
all_topics = display_topics(nmf_text_model, tfidf_text_vectorizer.get_feature_names_out())

  activity (3.42)
  access (2.59)
  google (2.44)
  read (1.48)
  tech (1.06)
  min read (1.06)
  taj (1.85)
  brand (0.93)
  hotel (0.93)
  travel (1.38)
  tripadvisor (1.26)
  travellers (1.16)
  google (3.68)
  account (1.40)
  new account (1.35)
  fund (1.75)
  nfo (1.73)
  taiwan (1.48)
  wazirx (3.66)
  trading (1.99)
  zero (1.99)
  data (1.49)
  min (1.43)
  data science (0.67)
  citibank (2.20)
  help bank (1.57)
  bank (1.39)
  novotel (4.17)
  novotel goa (3.98)
  dona (3.78)
  cloud (3.01)
  free (2.96)
  google cloud (2.87)
  weta (0.59)
  unity (0.55)
  techcrunch (0.42)
  rs (1.23)
  price (0.74)
  et (0.61)
  app (0.98)
  live (0.98)
  markets (0.98)
  hdfc (3.15)
  hdfc mutual (2.29)
  mutual (2.03)
  news (3.84)
  sources (2.08)
  google news (1.56)
  destinations (1.14)
  experiences (1.14)
  taj (1.05)
  innercircle (1.76)
  taj (1.59)
  taj innercircle (1.54)
  changes (2.87)
  changes review (2.22)
  suspect locked (2.22)
  access (3.27)
  google (2.70)
  apps (2.



In [8]:
# Relative size of each NMF topic: the column sums of W (weight accumulated by
# a topic over all documents) as a percentage of the total weight in W.
topic_weight = np.sum(W_text_matrix, axis=0)
topic_weight / np.sum(W_text_matrix) * 100.0

array([6.11729751, 3.18550487, 3.01559608, 4.6819436 , 2.18977506,
       2.91200478, 4.07438733, 3.87301655, 2.5267187 , 4.34278666,
       3.9977564 , 4.8795175 , 4.61359092, 4.6786314 , 5.13978605,
       4.45422522, 4.96702775, 5.51668049, 5.32331304, 4.56935137,
       4.43401928, 5.32212482, 5.18494461])

In [9]:
# Label every row of df with the top-words string of its dominant NMF topic.
# argmax(axis=1) returns, per row of W, the first column holding the row
# maximum, so the whole column is built in one pass instead of through a
# manual counter and per-cell df.iat writes.
dominant_topic = W_text_matrix.argmax(axis=1)
df["NMF_topic"] = [all_topics[k] for k in dominant_topic]

# TruncatedSVD

In [10]:
from sklearn.decomposition import TruncatedSVD

# Fit truncated SVD on the same TF-IDF matrix. The fit_transform result is
# kept as W (one row per document) and components_ as H.
svd_text_model = TruncatedSVD(
    n_components=Number_of_Topics,
    random_state=42,
)
W_svd_text_matrix = svd_text_model.fit_transform(X=tfidf_text_vectors)
H_svd_text_matrix = svd_text_model.components_

In [11]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement.
all_topics = display_topics(svd_text_model, tfidf_text_vectorizer.get_feature_names_out())

  google (2.05)
  access (1.20)
  activity (1.20)
  read (0.58)
  min (0.45)
  min read (0.38)
  taj (1.14)
  tripadvisor (1.02)
  restaurants (0.54)
  tripadvisor (15.04)
  travellers (7.80)
  travel (6.58)
  google (1.12)
  news (0.79)
  cloud (0.58)
  google (5.37)
  news (3.18)
  cloud (2.81)
  wazirx (25.51)
  trading (16.32)
  zero (14.12)
  data (0.83)
  min (0.50)
  weta (0.49)
  novotel (4.18)
  novotel goa (3.98)
  goa dona (3.78)
  novotel (6.87)
  novotel goa (6.54)
  goa dona (6.21)
  wazirx (8.49)
  data (5.52)
  citibank (5.15)
  weta (9.14)
  hdfc (8.86)
  unity (8.48)
  cloud (4.16)
  free (4.07)
  google cloud (3.98)
  free (6.99)
  cloud (6.96)
  google cloud (6.80)
  hdfc (28.52)
  hdfc mutual (20.74)
  mutual (16.43)
  apps (22.89)
  google (18.25)
  account (14.45)
  experiences (6.06)
  destinations (6.06)
  designed (5.19)
  innercircle (4.56)
  taj innercircle (3.99)
  15 savings (2.85)
  accountyou (17.27)
  review protect (17.21)
  accountif (17.21)
  access 

In [12]:
# Label every row of df with the top-words string of its dominant SVD topic.
# argmax(axis=1) returns, per row of W, the first column holding the row
# maximum, so the whole column is built in one pass instead of through a
# manual counter and per-cell df.iat writes.
# NOTE(review): SVD projections can be negative, so the largest signed value
# is not necessarily the strongest component by magnitude -- confirm intended.
dominant_topic = W_svd_text_matrix.argmax(axis=1)
df["SVD_topic"] = [all_topics[k] for k in dominant_topic]

# LDA

In [13]:
from sklearn.feature_extraction.text import CountVectorizer
# scikit-learn documents stop_words as 'english', a list, or None. spaCy's
# STOP_WORDS is a set, which releases with parameter validation reject, so it
# is converted to a (sorted, hence reproducible) list here.
count_text_vectorizer = CountVectorizer(
    ngram_range=(1, 2),  # unigrams and bigrams
    stop_words=sorted(stopwords),
    min_df=1,
    max_df=0.9,  # drop terms that occur in more than 90% of the documents
)
count_text_vectors = count_text_vectorizer.fit_transform(df["text"])

  % sorted(inconsistent)


In [14]:
from sklearn.decomposition import LatentDirichletAllocation

# Fit LDA on the raw term counts. The fit_transform result is kept as W (one
# row per document) and components_ as H.
lda_text_model = LatentDirichletAllocation(
    n_components=Number_of_Topics,
    random_state=42,
)
W_lda_text_matrix = lda_text_model.fit_transform(X=count_text_vectors)
H_lda_text_matrix = lda_text_model.components_

In [15]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement.
all_topics = display_topics(lda_text_model, count_text_vectorizer.get_feature_names_out())

  free (2.61)
  cloud (2.07)
  google (1.96)
  taj (0.83)
  destinations (0.73)
  experiences (0.73)
  zone medium (0.01)
  finish google (0.01)
  find answers (0.01)
  travel (1.10)
  tripadvisor (1.06)
  travellers (1.01)
  2021 (1.25)
  wazirx (1.25)
  november (0.77)
  zone medium (0.01)
  finish google (0.01)
  find answers (0.01)
  zone medium (0.01)
  finish google (0.01)
  find answers (0.01)
  followfull (0.34)
  newsgooglecomwelcome google (0.34)
  communities followfull (0.01)
  read (1.66)
  tech (0.99)
  min read (0.90)
  zone medium (0.01)
  finish google (0.01)
  find answers (0.01)
  taj (1.52)
  hotel (0.76)
  brand (0.76)
  zone medium (0.01)
  finish google (0.01)
  find answers (0.01)
  zone medium (0.01)
  finish google (0.01)
  find answers (0.01)
  google (2.53)
  account (1.36)
  access (1.33)
  novotel (2.48)
  goa (2.36)
  novotel goa (2.36)
  tripadvisor (1.16)
  hdfc (0.95)
  mutual (0.69)
  rs (1.00)
  price (0.60)
  buy (0.60)
  app (0.67)
  live (0.53)
  

In [16]:
# Label every row of df with the top-words string of its dominant LDA topic.
# argmax(axis=1) returns, per row of W, the first column holding the row
# maximum, so the whole column is built in one pass instead of through a
# manual counter and per-cell df.iat writes.
dominant_topic = W_lda_text_matrix.argmax(axis=1)
df["LDA_topic"] = [all_topics[k] for k in dominant_topic]

In [17]:
# Persist the frame, including the topic label columns added above.
df.to_csv(path_or_buf="email_topics.csv")

