In [2]:
import pandas as pd
import numpy as np
import gzip, shutil
import io
import time

In [None]:
# Enron Emails:
# orig source: www.cs.cmu.edu/~enron
# retrieved from https://archive.ics.uci.edu/ml/datasets/bag+of+words

# D=39861
# W=28102
# N=6,400,000 (approx)

# D is the number of documents, W is the
# number of words in the vocabulary, and N is the total number of words
# in the collection (below, NNZ is the number of nonzero counts in the
# bag-of-words).

# The format of the docword.*.txt file is 3 header lines, followed by 
# NNZ triples: 
# --- 
# D 
# W 
# NNZ 
# docID wordID count 
# docID wordID count 
# docID wordID count 
# docID wordID count 


In [21]:

#import the docword file skipping the first 3 rows that contain the following metadata:
# D 39861
# W 28102
# NNZ 3710420



with gzip.open('docword.enron.txt.gz') as f:

    features_train = pd.read_csv(f, header=None, skiprows=3, sep=' ')

#label the columns
features_train.columns = ['docID','wordID','count']
features_train.head()

features_train.shape



(3710420, 3)

In [57]:
### Next step is to deal with the .txt file
# The format of the vocab.*.txt file is line contains wordID=n.
vocab = pd.read_csv('vocab.enron.txt', header=None)
vocab.index = np.arange(1, len(vocab) + 1)
# vocab.columns = ['wordID','word']
vocab.head()


Unnamed: 0,0
1,aaa
2,aaas
3,aactive
4,aadvantage
5,aaker


In [58]:
# Start Here
# Join word list back to sparse matrix
merged = pd.merge(features_train, vocab, how='left', left_on='wordID', right_index=True)
merged.columns=['docID','wordID','count','word']
merged.head()


Unnamed: 0,docID,wordID,count,word
0,1,118,1,access
1,1,285,1,additional
2,1,1229,1,april
3,1,1688,1,authorize
4,1,2068,1,basis


In [40]:
# Use this to test smaller subsets of the merged data
mergedTest = merged.loc[merged['docID'] <10]
    

mergedTest.head()


Unnamed: 0,docID,wordID,count,word
0,1,118,1,access
1,1,285,1,additional
2,1,1229,1,april
3,1,1688,1,authorize
4,1,2068,1,basis


In [62]:
# Add trailing whitespace and then create new column that fills out repeated words from the original doc
DF = merged
DF["word"]= DF["word"]+" "
DF["W_array"]= DF["word"] * DF["count"]
DF.head(50)

Unnamed: 0,docID,wordID,count,word,W_array
0,1,118,1,access,access
1,1,285,1,additional,additional
2,1,1229,1,april,april
3,1,1688,1,authorize,authorize
4,1,2068,1,basis,basis
5,1,5299,1,contract,contract
6,1,6941,1,discharge,discharge
7,1,7223,1,doesnt,doesnt
8,1,8904,1,expected,expected
9,1,9358,1,fewer,fewer


In [45]:
# Get start time for the loop
start = time.time()

# Loop to create list of documents to feed into topic models / only append list for uniques

documents=[]
for i in DF['docID']:
    try:
        doc = "".join(DF["W_array"].loc[DF['docID']==i])
        if doc not in documents:
            documents.append(doc)
    except:
        continue
        
# Time taken
elapTime = (time.time() - start)

print("Loop took " + str(elapTime) + " seconds.")
print("Full dataset will take approximately " + str(((len(merged)*(elapTime/len(DF)))/60)) + " minutes.")

documents


Loop took 19959.538918972015 seconds.
Full dataset will take approximately 332.6589819828669 minutes.


['access additional april authorize basis contract discharge doesnt expected fewer flow forecast half holtz imagination insight keith lower lowered note outflow panned power range site southern_california stayed trader website west ',
 'able adjustment agreement alternatives appraisal appraisal appraisal areas austin balance based basis believe believe builder building class closer comfortable comment common comp component concern concern conference cost cost cost cost cost cost cover discuss drive estimates estimates expert feel financing financing financing focus follow foot formas gathered high housing houston increase interim interim keith land larger level location marcos market market nature occupancy office optimistic overly performing permanent permanent potentially price pro proceed produce proforma project project project project project properties property rates ready reagan real_estate recognize rent rental rental research reviewed reviewed sales seasonal selling significan

In [59]:
len(documents)

36782

In [46]:
#Import feature_extraction vectorizers from sklearn
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

no_features = 1000


# NMF is able to use tf-idf

tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, max_features=no_features, stop_words='english')

tfidf = tfidf_vectorizer.fit_transform(documents)

tfidf_feature_names = tfidf_vectorizer.get_feature_names()



# LDA can only use raw term counts for LDA because it is a probabilistic graphical model

tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=no_features, stop_words='english')

tf = tf_vectorizer.fit_transform(documents)

tf_feature_names = tf_vectorizer.get_feature_names()

In [50]:
# Import NMF and LDA functions from sklearn
from sklearn.decomposition import NMF, LatentDirichletAllocation

no_topics = 10

# Run NMF

nmf = NMF(n_components=no_topics, random_state=1, alpha=.1, l1_ratio=.5, init='nndsvd').fit(tfidf)

# Run LDA

lda = LatentDirichletAllocation(n_topics=no_topics, max_iter=5, learning_method='online', learning_offset=50.,random_state=0).fit(tf)



In [61]:
def display_topics(model, feature_names, no_top_words):

    for topic_idx, topic in enumerate(model.components_):

        print("Topic %d: " % (topic_idx))

        print(" ".join([feature_names[i]

                        for i in topic.argsort()[:-no_top_words - 1:-1]]))

no_top_words = 20

print("NMF topics:")
display_topics(nmf, tfidf_feature_names, no_top_words)
print(" ")
print("LDA topics:")
display_topics(lda, tf_feature_names, no_top_words)

NMF topics:
Topic 0: 
going think ill thing hope game night guy weekend look fax texas phone jeff sure friday dinner thought team talk
Topic 1: 
power energy california electricity davis billion utilities states governor utility company californias consumer plant percent committee edison generator power_plant public
Topic 2: 
meeting attend scheduled discuss monday conference agenda thursday tuesday office wednesday friday committee lynn tomorrow date plan room held number
Topic 3: 
attached file comment draft letter document review final copy version agreement list send michelle received summary request revised memo presentation
Topic 4: 
click free offer link online gift receive special web list site holiday save visit order send page card travel account
Topic 5: 
contract message corp prohibited strictly andor party copies material property affiliate review evidence reply receive create basis offer received error
Topic 6: 
company business group trading services deal team project ma

In [210]:
DLseries = pd.Series(doclist)
type(DLseries)

pandas.core.series.Series

In [211]:
DLseriesDup = DLseries.drop_duplicates()
documents = DLseriesDup.tolist()
documents

['access additional april authorize basis contract discharge doesnt expected fewer flow forecast half holtz imagination insight keith lower lowered note outflow panned power range site southern_california stayed trader website west ',
 'able adjustment agreement alternatives appraisal appraisal appraisal areas austin balance based basis believe believe builder building class closer comfortable comment common comp component concern concern conference cost cost cost cost cost cost cover discuss drive estimates estimates expert feel financing financing financing focus follow foot formas gathered high housing houston increase interim interim keith land larger level location marcos market market nature occupancy office optimistic overly performing permanent permanent potentially price pro proceed produce proforma project project project project project properties property rates ready reagan real_estate recognize rent rental rental research reviewed reviewed sales seasonal selling significan

In [81]:

# Add trailing whitespace and then create new column that fills out repeated words from the original doc
# mergedTest["word"]= mergedTest["word"]+" "
# mergedTest["W_array"]= mergedTest["word"] * mergedTest["count"]
mergedTest.head()


Unnamed: 0,docID,wordID,count,word,W_array,doc_array
409,9,280,1,addictive,addictive,addictive anytime arcade arcade classic classi...
410,9,1116,1,anytime,anytime,addictive anytime arcade arcade classic classi...
411,9,1267,2,arcade,arcade arcade,addictive anytime arcade arcade classic classi...
412,9,4402,2,classic,classic classic,addictive anytime arcade arcade classic classi...
413,9,8818,1,exchange,exchange,addictive anytime arcade arcade classic classi...


In [78]:
# Create an array that joins all of the words
mergedTest["doc_array"]="".join(mergedTest["W_array"])
mergedTest["doc_array"][:1]

409    addictive anytime arcade arcade classic classi...
Name: doc_array, dtype: object

In [100]:
# doclev = pd.DataFrame(mergedTest.groupby(by=['docID'])['doc_array'].mode() ).reset_index()
doclev = mergedTest.groupby('docID')['doc_array'].first()
type(doclev)

pandas.core.series.Series

In [17]:
#Enron data from kaggle https://www.kaggle.com/jaykrishna/topic-modeling-enron-email-dataset/data


enronemail = pd.read_csv("C:/Users/Ben/Desktop/UNH Classes/DATA 900/enron/emails.csv")

In [19]:
enronemail.message

0         Message-ID: <18782981.1075855378110.JavaMail.e...
1         Message-ID: <15464986.1075855378456.JavaMail.e...
2         Message-ID: <24216240.1075855687451.JavaMail.e...
3         Message-ID: <13505866.1075863688222.JavaMail.e...
4         Message-ID: <30922949.1075863688243.JavaMail.e...
5         Message-ID: <30965995.1075863688265.JavaMail.e...
6         Message-ID: <16254169.1075863688286.JavaMail.e...
7         Message-ID: <17189699.1075863688308.JavaMail.e...
8         Message-ID: <20641191.1075855687472.JavaMail.e...
9         Message-ID: <30795301.1075855687494.JavaMail.e...
10        Message-ID: <33076797.1075855687515.JavaMail.e...
11        Message-ID: <25459584.1075855687536.JavaMail.e...
12        Message-ID: <13116875.1075855687561.JavaMail.e...
13        Message-ID: <2707340.1075855687584.JavaMail.ev...
14        Message-ID: <2465689.1075855687605.JavaMail.ev...
15        Message-ID: <1115198.1075855687626.JavaMail.ev...
16        Message-ID: <19773657.10758556