# Cycle through each year and extract the major topics

In [111]:
import os
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.externals import joblib
from sklearn.feature_extraction.text import TfidfVectorizer
import operator
from sklearn.externals import joblib
from sklearn import decomposition


In [112]:
%matplotlib inline

In [113]:
# Load the pickled email DataFrame.
# NOTE(review): this is an absolute, machine-specific path, so the notebook
# only runs where this file exists; pickle files should only be loaded from
# trusted sources.
total_bldgsim = pd.read_pickle("/Users/nus/twenty-years-of-bldgsim-textmining/total_email_data.pkl")

In [114]:
# Summarise the loaded frame: index range, columns, dtypes and non-null counts
total_bldgsim.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 20029 entries, 1999-03-04 08:26:46 to 2018-11-08 18:42:22
Data columns (total 4 columns):
From        20029 non-null object
DateTime    20029 non-null datetime64[ns]
Subject     20029 non-null object
Body        20028 non-null object
dtypes: datetime64[ns](1), object(3)
memory usage: 782.4+ KB


# For each year, compute the TF-IDF matrix and extract that year's topics

In [115]:
def rank_terms( A, terms ):
    """Rank vocabulary terms by their total weight over all documents.

    Parameters
    ----------
    A : document-term matrix (documents in rows, terms in columns). Any
        object whose ``sum(axis=0)`` yields one total per column is accepted,
        whether that result is a 2-D single-row matrix (as ``np.matrix``
        returns) or a 1-D vector (as a plain ``np.ndarray`` returns).
    terms : sequence of term labels; ``terms[col]`` names column ``col`` of A.

    Returns
    -------
    list of ``(term, weight)`` tuples, sorted by descending weight.
    """
    # get the sums over each column; flatten to 1-D so the lookup below works
    # for both the 2-D (matrix) and the 1-D (ndarray) form of the result
    sums = np.asarray(A.sum(axis=0)).ravel()
    # map weights to the terms
    weights = {term: sums[col] for col, term in enumerate(terms)}
    # rank the terms by their weight over all documents
    return sorted(weights.items(), key=operator.itemgetter(1), reverse=True)

In [116]:
def get_descriptor( terms, H, topic_index, top ):
    """Return the `top` highest-weighted terms of one topic.

    terms       -- term labels; terms[i] names column i of H
    H           -- 2-D array of term weights, indexed as H[topic, term]
    topic_index -- row of H to describe
    top         -- number of terms to return
    """
    # argsort is ascending, so flip it to walk from the heaviest term down,
    # then keep only the leading `top` column positions
    ranked_columns = np.argsort( H[topic_index,:] )[::-1][0:top]
    # translate column positions back into their term labels
    return [ terms[column] for column in ranked_columns ]

In [117]:
# Number of topics: passed to NMF as n_components for every year and used as
# the number of topic descriptors collected per year
k = 10

# Run through each year, get the top 10 topics and their top 10 words

Collect each year's topic words in a DataFrame, tagged with the year.

In [118]:
# Accumulator: the per-year loop appends one descriptor DataFrame per year
all_topics = []

In [119]:
# Reset the accumulator here so that re-running this cell does not append a
# second copy of every year's topics.
all_topics = []

# The stop-word list is the same for every year, so read it once instead of
# once per loop iteration (one entry per line, surrounding whitespace removed).
with open( "/Users/nus/twenty-years-of-bldgsim-textmining/stopwords_annual_analysis_3.txt", "r" ) as fin:
    custom_stop_words = [ line.strip() for line in fin ]

for year in total_bldgsim.index.year.unique():
    # rows whose timestamp falls in this year; missing values are replaced by
    # the placeholder string "x" before the Body column is extracted
    bldg_sim_peryear = total_bldgsim[total_bldgsim.index.year == year]
    out_array = np.array(bldg_sim_peryear.fillna("x").Body)
    print(year)

    # use a custom stopwords list, set the minimum term-document frequency to 20
    vectorizer = CountVectorizer(stop_words = custom_stop_words, min_df = 20, encoding='latin-1')
    A = vectorizer.fit_transform(out_array)
    print( "Created %d X %d document-term matrix" % (A.shape[0], A.shape[1]) )

    # extract the resulting vocabulary
    # NOTE(review): newer scikit-learn releases replace get_feature_names()
    # with get_feature_names_out() -- adjust if the environment is upgraded.
    terms = vectorizer.get_feature_names()
    print("Vocabulary has %d distinct terms" % len(terms))

    # we can pass in the same preprocessing parameters; from here on A is the
    # TF-IDF matrix, while `terms` still comes from the count vectorizer above
    vectorizer = TfidfVectorizer(stop_words=custom_stop_words, min_df = 20, encoding='latin-1')
    A = vectorizer.fit_transform(out_array)
    print("Created %d X %d TF-IDF-normalized document-term matrix" % (A.shape[0], A.shape[1]) )

    # persist this year's matrix and vocabulary, then read them back from disk
    pkl_path = "/Users/nus/twenty-years-of-bldgsim-textmining/emails-raw"+str(year)+".pkl"
    joblib.dump((A,terms), pkl_path)
    (A,terms) = joblib.load( pkl_path )
    print( "Loaded %d X %d document-term matrix" % (A.shape[0], A.shape[1]) )

    # factorise the TF-IDF matrix into k topics
    model = decomposition.NMF( init="nndsvd", n_components=k )
    # apply the model and extract the two factor matrices
    W = model.fit_transform( A )
    H = model.components_

    # collect (and print) the 10 highest-weighted terms of every topic
    descriptors = []
    for topic_index in range(k):
        descriptors.append( get_descriptor( terms, H, topic_index, 10) )
        str_descriptor = ", ".join( descriptors[topic_index] )
        print("Topic %02d: %s" % ( topic_index+1, str_descriptor ) )

    # transpose so that each column holds one topic and each row one rank
    # position, then tag every row with the year
    descriptors_df = pd.DataFrame(descriptors).T
    descriptors_df['year'] = year
    all_topics.append(descriptors_df)

1999
Created 152 X 42 document-term matrix
Vocabulary has 42 distinct terms
Created 152 X 42 TF-IDF-normalized document-term matrix
Loaded 152 X 42 document-term matrix
Topic 01: doe, 1e, user, version, 0400, input, text, experience, 0700, project
Topic 02: ashrae, bill, 0400, large, interested, 0500, jun, looking, references, text
Topic 03: jason, administrator, consultant, analytics, user, wrote, 0600, input, 0500, text
Topic 04: number, jun, see, large, bill, better, hourly, 04, phone, doe
Topic 05: systems, better, available, source, text, large, web, tools, 04, interested
Topic 06: thermal, program, interested, web, tools, project, phone, 0500, see, version
Topic 07: software, experience, looking, 0600, better, tools, 04, web, program, version
Topic 08: source, hourly, wrote, good, 0700, 1e, interested, references, looking, program
Topic 09: research, project, 0700, phone, good, jun, program, experience, available, analytics
Topic 10: information, interested, looking, site, web, g

Created 2217 X 2707 document-term matrix
Vocabulary has 2707 distinct terms
Created 2217 X 2707 TF-IDF-normalized document-term matrix
Loaded 2217 X 2707 document-term matrix
Topic 01: loop, heat, pump, water, chiller, boiler, karen, heating, cooling, walkerman
Topic 02: software, developing, studioma, modeling, rosiers, des, robert, studio, recommendations, 251
Topic 03: baseline, leed, existing, orientation, appendix, ashrae, addendum, addenda, proposed, orientations
Topic 04: elevator, vikram, elevators, sami, direct, schedule, lordaecksargent, lighting, leed, aeck
Topic 05: bb, jeff, haberl, tamu, texas, esl, laboratory, blown, jhaberl, 3581
Topic 06: load, unmet, hours, loads, cooling, zones, process, glass, heating, met
Topic 07: 206, hargis, brandon, nichols, 448, biz, stewart, 8707, 3376, 228
Topic 08: fan, nexant, vav, john, 626, 430, aulbach, jaulbach, powered, power
Topic 09: ramana, koti, bnim, elements, 64105, kansas, division, missouri, 1635, renewable
Topic 10: equest, y

Created 846 X 2116 document-term matrix
Vocabulary has 2116 distinct terms
Created 846 X 2116 TF-IDF-normalized document-term matrix
Loaded 846 X 2116 document-term matrix
Topic 01: actual, app, models, jacob, appendix, iesve, values, nathan, leed, eskewdumezripple
Topic 02: designbuilder, training, days, architects, energyplus, course, registration, engineers, attend, performance
Topic 03: julien, lighting, gouv, dutel, des, class, qc, vous, est, space
Topic 04: utility, bills, plant, days, april, period, wsp, month, degree, billing
Topic 05: certification, experience, technical, usgbc, leed, skills, team, preferred, engineering, position
Topic 06: weather, whiteboxtechnologies, tmy3, joe, climate, wichert, epw, huang, white, web
Topic 07: ibpsa, usa, news, newsletter, subscribe, conference, latest, ac, uk, 2015
Topic 08: vrf, sefaira, cooling, curves, yahoo, heating, fan, air, doas, heat
Topic 09: openstudio, radiance, diva, mit, studio, open, daylighting, energyplus, daylight, chris

In [120]:
# Stack the per-year descriptor tables into one frame, renumbering rows from 0
all_topics_df = pd.concat(all_topics, ignore_index=True)

In [121]:
# Check row count, columns and dtypes of the combined table
all_topics_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 11 columns):
0       200 non-null object
1       200 non-null object
2       200 non-null object
3       200 non-null object
4       200 non-null object
5       200 non-null object
6       200 non-null object
7       200 non-null object
8       200 non-null object
9       200 non-null object
year    200 non-null int64
dtypes: int64(1), object(10)
memory usage: 17.3+ KB


In [122]:
# Display the combined table: within each year, rows are rank positions and
# the numbered columns are topics
all_topics_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,year
0,doe,ashrae,jason,number,systems,thermal,software,source,research,information,1999
1,1e,bill,administrator,jun,better,program,experience,hourly,project,interested,1999
2,user,0400,consultant,see,available,interested,looking,wrote,0700,looking,1999
3,version,large,analytics,large,source,web,0600,good,phone,site,1999
4,0400,interested,user,bill,text,tools,better,0700,good,web,1999
5,input,0500,wrote,better,large,project,tools,1e,jun,good,1999
6,text,jun,0600,hourly,web,phone,04,interested,program,bill,1999
7,experience,looking,input,04,tools,0500,web,references,experience,available,1999
8,0700,references,0500,phone,04,see,program,looking,available,tools,1999
9,project,text,text,doe,interested,version,version,program,analytics,phone,1999


In [123]:
# Reshape to long form: 'year' is kept as the identifier, the topic column
# label goes to 'variable' and the term goes to 'value'
all_topics_df_melted = pd.melt(all_topics_df, id_vars='year')

In [124]:
# Preview the first rows of the long-form table
all_topics_df_melted.head()

Unnamed: 0,year,variable,value
0,1999,0,doe
1,1999,0,1e
2,1999,0,user
3,1999,0,version
4,1999,0,0400


In [125]:
# Check row count, columns and dtypes of the long-form table
all_topics_df_melted.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 3 columns):
year        2000 non-null int64
variable    2000 non-null object
value       2000 non-null object
dtypes: int64(1), object(2)
memory usage: 46.9+ KB


In [126]:
# How often each term occurs across all yearly topic descriptors,
# most frequent first
wordfreq = all_topics_df_melted["value"].value_counts()

In [127]:
# The 40 most frequent topic terms
wordfreq.head(40)

leed           24
energyplus     20
cooling        20
ashrae         18
heat           18
weather        18
equest         17
performance    16
software       16
air            15
baseline       15
doe            15
modeling       15
experience     15
source         14
program        14
fan            14
ibpsa          14
available      13
water          13
engineering    13
information    12
web            12
systems        12
heating        12
usa            12
thermal        12
temperature    11
doe2           11
0500           11
training       10
conference     10
university     10
joe            10
load           10
yahoo           9
jeff            9
research        9
version         9
interested      8
Name: value, dtype: int64

In [128]:
# Display the whole frequency table (the display abbreviates the middle rows)
wordfreq

leed                   24
energyplus             20
cooling                20
ashrae                 18
heat                   18
weather                18
equest                 17
performance            16
software               16
air                    15
baseline               15
doe                    15
modeling               15
experience             15
source                 14
program                14
fan                    14
ibpsa                  14
available              13
water                  13
engineering            13
information            12
web                    12
systems                12
heating                12
usa                    12
thermal                12
temperature            11
doe2                   11
0500                   11
                       ..
qm                      1
espr                    1
recommendations         1
dcmit_esiyok            1
qc                      1
lexington               1
bat                     1
eskewdumezri

In [129]:
# Write the term counts to word_frequency.csv; the relative path is resolved
# against the current working directory
wordfreq.to_csv("word_frequency.csv")