In [1]:
import nltk
from textblob import TextBlob as tb
import math
import io
import codecs
from __future__ import division
import numpy
import pandas as pd
import matplotlib
%matplotlib inline 


In [2]:
#http://stevenloria.com/finding-important-words-in-a-document-using-tf-idf/

#computes "term frequency" which is the number of times a word appears in a document blob, 
#normalized by dividing by the total number of words in blob. 
def tf1(word, blob):
    return blob.words.count(word)

def tf2(word, blob):
    return blob.words.count(word) / len(blob.words)

#returns the number of documents containing word. A generator expression is passed to the sum() function.
def n_containing(word, bloblist):
    return sum(1 for blob in bloblist if word in blob.words)

#computes "inverse document frequency" which measures how common a word is among all documents in bloblist. 
#The more common a word is, the lower its idf. 
#We take the ratio of the total number of documents to the number of documents containing word, 
#then take the log of that. Add 1 to the divisor to prevent division by zero.
def idf(word, bloblist):
     return math.log(len(bloblist) / (1 + n_containing(word, bloblist)))

#computes the TF-IDF score. It is simply the product of  tf and idf.
def tfidf(word, blob, bloblist):
    return tf2(word, blob) * idf(word, bloblist)

In [3]:
import sys
import re
import os
import string
import re
os.chdir('/users/asheets/Documents/Work_Computer_new/Work_Computer/Grad_School/PREDICT_453/Notebooks/DSI/')

all_docs = []
all_docs2 = ""
blob_list = []
d = 1

cachedStopWords = nltk.corpus.stopwords.words('english')
pattern = re.compile(r'\b(' + r'|'.join(cachedStopWords) + r')\b\s*')

for i in range(40):
    doc_name = 'DSI' + str(d) + '.txt'
    try:
        with open(doc_name, 'r') as f:
            sample = f.read()
        sample = sample.decode('utf-8')
       #sample = sample.decode('ascii')
        sample = sample.lower()
        sample = re.sub(r'[^\w]', ' ', sample)
        sample = ''.join([i for i in sample if not i.isdigit()])
        sample = pattern.sub('', sample)
        sample = "".join(l for l in sample if l not in string.punctuation)
        sample2 = " ".join(k for k in tb(sample).noun_phrases)
        all_docs.append({'DSInum': d, 'raw_text': sample, 'noun_phrases_only':
        sample2})
        all_docs2 = all_docs2 + sample2
        blob_list.append(tb(sample2))
        d = d + 1
    except IOError:
        d = d + 1
        pass

all_documents = pd.DataFrame(all_docs)


In [6]:
#compare my article
blob1 = tb(all_documents['noun_phrases_only'][23])
allblob = tb(all_docs2)
print 'there are' , len(blob1.words) , 'words in DSI 24'
print 'there are' , len(allblob.words) , 'words across the entire Corpus'
term_freq = [tf1(word,blob1) for word in blob1.words]
term_rel_freq = [tf2(word,blob1) for word in blob1.words]
all_docs_term_freq = [tf1(word,allblob) for word in blob1.words]
all_docs_term_rel_freq = [tf2(word,allblob) for word in blob1.words]
tf_df1 = pd.DataFrame({'word': blob1.words, 'all_docs_term_freq': all_docs_term_freq, 'doc1_term_freq': term_freq, 'all_docs_term_rel_freq': all_docs_term_rel_freq, 'doc1_term_rel_freq': term_rel_freq})

docs_containing1 = pd.DataFrame({'word': blob1.words, 'doc_freq': [n_containing(word,blob_list) for word in blob1.words]})

df = pd.merge(tf_df1,docs_containing1,on='word',how='outer').drop_duplicates()

df['intermediate_calc'] = (len(blob_list) / df['doc_freq']).astype(float)
df['idf'] = df['intermediate_calc'].apply(math.log)
df = df.sort_values("idf",ascending=True)

tfidf_df = pd.DataFrame({'word': blob1.words, 'tf_idf_doc1': [tfidf(word, blob1, blob_list) for word in blob1.words]})
tfidf_df_all = pd.DataFrame({'word': blob1.words, 'tf_idf_all_docs': [tfidf(word, allblob, blob_list) for word in blob1.words]})
tf_idf = pd.merge(tfidf_df,tfidf_df_all,on='word',how='outer').drop_duplicates()

df = pd.merge(df,tf_idf,on='word',how='outer').drop_duplicates()

df_final = df[["word","doc1_term_freq","all_docs_term_freq","all_docs_term_rel_freq","doc1_term_rel_freq","doc_freq","idf","tf_idf_all_docs","tf_idf_doc1"]]
df_final = df_final.sort_values(by=['tf_idf_doc1'], ascending=[False])
df_final.to_csv("/users/asheets/Documents/Work_Computer_new/Work_Computer/Grad_School/PREDICT_453/Notebooks/DSI24_tfidf.txt", sep='\t')
df_final = df_final.round(4)
df_final.sort_values(by=['tf_idf_doc1'], ascending=[False]).head(n=10)

there are 252 words in DSI 24
there are 11643 words across the entire Corpus


Unnamed: 0,word,doc1_term_freq,all_docs_term_freq,all_docs_term_rel_freq,doc1_term_rel_freq,doc_freq,idf,tf_idf_all_docs,tf_idf_doc1
40,energy,10,57,0.0049,0.0397,9,1.4404,0.0065,0.053
107,era,4,6,0.0005,0.0159,3,2.539,0.0012,0.0357
62,future,5,11,0.0009,0.0198,6,1.8458,0.0016,0.0336
17,order,8,50,0.0043,0.0317,14,0.9985,0.004,0.0295
76,coal,4,21,0.0018,0.0159,5,2.0281,0.0033,0.0293
63,emissions,4,18,0.0015,0.0159,6,1.8458,0.0026,0.0269
52,clean,4,18,0.0015,0.0159,7,1.6917,0.0024,0.0247
30,regulations,5,20,0.0017,0.0198,10,1.335,0.0021,0.0246
36,environmental,5,26,0.0022,0.0198,10,1.335,0.0028,0.0246
29,executive,5,20,0.0017,0.0198,11,1.2397,0.002,0.0229


In [7]:
##Compare just the two articles we know to be similar
blob2 = tb(all_documents['noun_phrases_only'][27])
term_freq = [tf1(word,blob2) for word in blob2.words]
term_rel_freq = [tf2(word,blob2) for word in blob2.words]
tf_df2 = pd.DataFrame({'word': blob2.words, 'doc2_term_freq': term_freq, 'doc2_term_rel_freq': term_rel_freq})
two_tf = pd.merge(tf_df1,tf_df2,on='word',how='outer').drop_duplicates()

docs_containing1 = pd.DataFrame({'word': blob1.words, 'doc_freq': [n_containing(word,blob_list) for word in blob1.words]})
docs_containing2 = pd.DataFrame({'word': blob2.words, 'doc_freq': [n_containing(word,blob_list) for word in blob2.words]})

doc_freq = pd.concat([docs_containing1,docs_containing2]).drop_duplicates().reset_index(drop=True)
df = pd.merge(doc_freq,two_tf,on='word',how='inner')
df = df.sort_values("doc_freq",ascending=False)

df['intermediate_calc'] = (len(blob_list) / df['doc_freq']).astype(float)
df['idf'] = df['intermediate_calc'].apply(math.log)
df = df.sort_values("idf",ascending=False)

scores1 = pd.DataFrame({'word': blob1.words, 'tf_idf_doc1': [tfidf(word, blob1, blob_list) for word in blob1.words]})
scores2 = pd.DataFrame({'word': blob2.words, 'tf_idf_doc2': [tfidf(word, blob2, blob_list) for word in blob2.words]})

two_tfidf= pd.merge(scores1,scores2,on='word',how='outer').drop_duplicates()
two_tfidf = two_tfidf[['word', 'tf_idf_doc1', 'tf_idf_doc2']]
two_tfidf = two_tfidf.sort_values(by=['tf_idf_doc1', 'tf_idf_doc2'], ascending=[False,False])
two_tfidf.head(n=15)


Unnamed: 0,word,tf_idf_doc1,tf_idf_doc2
192,energy,0.052976,0.041503
266,era,0.035735,0.011665
473,future,0.033565,
122,order,0.029509,0.03853
329,coal,0.029299,0.028692
388,emissions,0.026852,0.026295
422,clean,0.024732,0.040366
270,environmental,0.024597,0.032116
407,regulations,0.024597,0.012847
107,executive,0.022871,0.017917


In [5]:
#compare all DSIs sing all pre-defined functions
DSI_list = all_documents["DSInum"]
for i, blob in enumerate(blob_list):
    print("Top words in document {}".format(DSI_list[i]))
    scores = {word: tfidf(word, blob, blob_list) for word in blob.words}
    sorted_words = sorted(scores.items(), key=lambda x: x[1], reverse=True)
    for word, score in sorted_words[:10]:
        print("\tWord: {}, TF-IDF: {}".format(word, round(score, 5)))
        
        

Top words in document 1
	Word: registry, TF-IDF: 0.06366
	Word: muslim, TF-IDF: 0.0549
	Word: immigration, TF-IDF: 0.03351
	Word: source, TF-IDF: 0.03289
	Word: risk, TF-IDF: 0.03289
	Word: ban, TF-IDF: 0.03289
	Word: countries, TF-IDF: 0.03238
	Word: terrorism, TF-IDF: 0.03183
	Word: muslims, TF-IDF: 0.03183
	Word: database, TF-IDF: 0.03183
Top words in document 2
	Word: nuclear, TF-IDF: 0.16906
	Word: arms, TF-IDF: 0.07044
	Word: race, TF-IDF: 0.04859
	Word: proliferation, TF-IDF: 0.04226
	Word: uranium, TF-IDF: 0.04226
	Word: treaty, TF-IDF: 0.04226
	Word: friday, TF-IDF: 0.04135
	Word: weapons, TF-IDF: 0.03644
	Word: corp, TF-IDF: 0.03644
	Word: comments, TF-IDF: 0.03533
Top words in document 3
	Word: act, TF-IDF: 0.06386
	Word: infrastructure, TF-IDF: 0.03026
	Word: ban, TF-IDF: 0.02494
	Word: corruption, TF-IDF: 0.02413
	Word: american, TF-IDF: 0.02127
	Word: workers, TF-IDF: 0.0208
	Word: clean, TF-IDF: 0.01916
	Word: education, TF-IDF: 0.01845
	Word: constitutional, TF-IDF: 0.0

In [35]:
RTV = pd.read_table('RTV.txt',sep='\t')

RTVblob = tb(str(tuple(RTV.Terms.tolist())).replace("'", ""))
RTV_final = list(RTV['Terms'])

tf_list = []

for i, blob in enumerate(blob_list):
    for item in RTV_final:
        tf_list.append({'DSInum': DSI_list[i], 'Terms': item, 'term_freq': blob.words.count(item)})

#print pd.DataFrame(tf_list)
tf_df = pd.DataFrame(tf_list)
tf_df2 = pd.merge(tf_df,RTV,on='Terms',how='inner')
tf_df_agg = tf_df2.groupby(['DSInum', 'Equivalence_Class']).sum()
tf_df_agg.to_csv("RTV_frequencies.txt", sep='\t')


KeyError: 'DSInum'

In [39]:
tf_df3 = pd.read_table('RTV_frequencies.txt',sep='\t')
new_tf = tf_df3[tf_df3['DSInum'] == 1]
new_tf = new_tf[["Equivalence_Class","term_freq"]]

for i in range(1,38):
    try:
        tmp = tf_df3[tf_df3['DSInum'] == DSI_list[i]]
        tmp = tmp[["Equivalence_Class","term_freq"]]
        new_tf = pd.merge(new_tf, tmp, on='Equivalence_Class', how='outer')
    except IOError:
        pass  


my_columns = ["Equivalence_Class"]
for i in DSI_list:
    my_columns.append('DSI' + str(i))
new_tf.columns = my_columns
new_tf.to_csv("RTV_frequencies_final.txt", sep='\t')
new_tf


Unnamed: 0,Equivalence_Class,DSI1,DSI2,DSI3,DSI4,DSI5,DSI6,DSI7,DSI8,DSI9,...,DSI29,DSI30,DSI31,DSI32,DSI33,DSI34,DSI35,DSI36,DSI37,DSI39
0,American Health Care Act,0,0,1,1,0,0,0,1,0,...,0,0,0,0,0,0,3,0,0,0
1,Consumer Permission,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Department of Homeland Security,1,0,0,0,0,0,0,0,0,...,0,3,0,0,0,0,0,0,0,0
3,EPA,0,0,0,0,0,0,0,0,0,...,0,0,0,0,3,0,0,0,0,0
4,FCC,0,0,0,0,0,0,0,0,0,...,5,0,0,0,0,0,0,0,0,0
5,FreeTrade,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,Freedom Caucus,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,Gerrymander,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
8,Healthcare,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,House of Representatives,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
