# 1. Split the whole text file into individual daily reports

In [1]:
# Read the complete report dump into memory.  The `with` block guarantees
# that the file handle is closed once the read finishes (or fails).
with open('Seattle_Reports.txt', 'rb') as doc:
    text = doc.read()

# Header text that the splitting loop searches for to locate the start of
# each page inside `text`.
head = '''CONTRACTORS QUALITY CONTROL REPORT (QCR)
DAILY LOG OF CONSTRUCTION - MILITARY'''

In [2]:
contents = {}  # key: file number; value: text of the corresponding report (all pages joined)
fileNum_reptNum = {}  # key: file number; value: report number
fileNum_date = {}  # key: file number; value: date of the report
start = 0

import re

fileNum = 0

# Walk through `text` from one page header to the next.  The loop stops when
# find() reports that no further header exists, so the number of pages does
# not have to be hard-coded.
curPos = text.find(head, start)  # position of the first page header
while curPos != -1:
    start = curPos + 1
    nextPos = text.find(head, start)
    # After the last page find() returns -1; slicing up to -1 would silently
    # drop the final character, so the last page runs to the end of the text.
    pageEnd = nextPos if nextPos != -1 else len(text)
    content = text[curPos + len(head):pageEnd]  # page body without its header
    curPos = nextPos

    if r'Page 1 of' in content:
        # First page of a new report: register the report and its metadata.
        fileNum += 1

        # The report number sits in the few characters just before the
        # "Page 1 of" marker; strip the line break and the 'R' character.
        pagePos = text.find(r'Page 1 of', start)
        repNum = text[pagePos - 7:pagePos - 2].replace('\n', '').replace('R', '')
        fileNum_reptNum[fileNum] = repNum

        # The date is whatever lies between "DATE" and "PROJECT".
        dateStart = text.find(r'DATE', start) + 4
        dateEnd = text.find(r'PROJECT', start)
        date = text[dateStart:dateEnd].replace('\n', '')
        fileNum_date[fileNum] = date

        contents[fileNum] = content
    else:
        # Continuation page: append it to the report currently being built.
        # The '' default avoids a TypeError (None + str) when a continuation
        # page shows up before any "Page 1 of" page has been seen.
        contents[fileNum] = contents.get(fileNum, '') + content

# Parenthesised single-string form prints the same text as the original
# `print a, '\n', b` statement on Python 2 and also parses on Python 3.
print('%d \n%d' % (len(contents), fileNum))

881 
881


# 2. Obtain the set of stop words

In [3]:
# Load the custom stop words, one per line.  The `with` block closes the
# file as soon as every line has been read.
with open(r"ehabs_stop_words.txt", 'r') as stop_file:
    ehab_stop_words = [line.strip() for line in stop_file]

# NOTE(review): this import rebinds the name `text`, which until now held the
# raw contents of Seattle_Reports.txt.  Re-running the report-splitting cell
# after this one will fail unless the file is read again.
from sklearn.feature_extraction import text

# Union of the custom stop words and the English stop words shipped with the package.
my_stop_words = text.ENGLISH_STOP_WORDS.union(ehab_stop_words)

# 3. Convert the reports into a word-frequency matrix. This step results in a matrix of 6782 words by 881 reports.

In [4]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words model: split every report into words, discard the stop words
# and count the occurrences of each remaining word per report.
count_vect = CountVectorizer(analyzer='word', stop_words=my_stop_words)
report_texts = contents.values()
X_train_counts = count_vect.fit_transform(report_texts)
#X_train_counts.shape 

In [5]:
# Reverse lookup of the fitted vocabulary: feature index -> word.
inv_dic = dict(
    (index, term) for term, index in count_vect.vocabulary_.iteritems()
)

# 4. Re-represent the reports as a TF-IDF matrix.

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Word-level tokenisation with the combined stop-word list; every entry of
# the resulting matrix is a TF-IDF weight.
tfidf = TfidfVectorizer(analyzer='word', stop_words=my_stop_words)
tfidf_input = contents.values()
X_train_tfidf = tfidf.fit_transform(tfidf_input)

In [7]:
from sklearn.cluster import KMeans

# Cluster the words, not the reports: the TF-IDF matrix is transposed before
# it is handed to k-means.
# No random_state is given, so the random initialisation (and therefore the
# cluster labels) can differ from run to run.
word_rows = X_train_tfidf.transpose()
kmeans_word_tfidf = KMeans(n_clusters=10, init='random')
kmeans_word_tfidf.fit(word_rows)

# 5. Save the matrix of word frequency to a file words_counts_doc.csv

In [8]:
import csv
import numpy
import re

# Compiled once instead of on every loop iteration.
# NOTE(review): match() only anchors at the start of the word, so this skips
# words that begin with optional lowercase letters followed by a digit
# (e.g. "12", "ab12c").  A word such as "a_1" is NOT skipped, although the
# original comment said "exclude the words containing numbers" - confirm
# whether a search for any digit was intended.
numeric_word = re.compile(r'[a-z]*[\d]+[a-z]*')

with open('words_counts_doc.csv', 'wb') as f:
    writer = csv.writer(f)
    # Transposed so that row i holds the counts of word i.
    counts_toarray = X_train_counts.transpose().toarray()
    # Only the three leading columns get a header; the remaining columns of
    # every row are the values of counts_row.
    writer.writerow(['word_number', 'word_label', 'word'])
    # Iterate over the real number of words rather than a hard-coded 6782,
    # so the export keeps working when the vocabulary size changes.
    for word_num in range(counts_toarray.shape[0]):
        word = inv_dic[word_num]
        if numeric_word.match(word):
            continue
        counts_row = counts_toarray[word_num].ravel()
        writer.writerow(numpy.append(
            [word_num, kmeans_word_tfidf.labels_[word_num], word.encode('utf-8')],
            counts_row))
    # No explicit f.close(): leaving the with block closes the file.

# 6. Save the matrix of TF-IDF to a file words_tfidf.csv

In [9]:
with open('words_tfidf.csv', 'wb') as f:
    writer = csv.writer(f)
    # Transposed so that row i holds the TF-IDF weights of word i.
    tfidf_toarray = X_train_tfidf.transpose().toarray()
    # Only the three leading columns get a header; the remaining columns of
    # every row are the values of tfidf_row.
    writer.writerow(['word_number', 'word_label', 'word'])
    # Compiled once instead of on every loop iteration.
    # NOTE(review): match() only anchors at the start of the word, so this
    # skips words that begin with optional lowercase letters followed by a
    # digit; a word such as "a_1" is NOT skipped, although the original
    # comment said "exclude the words containing numbers" - confirm intent.
    numeric_word = re.compile(r'[a-z]*[\d]+[a-z]*')
    # Iterate over the real number of words rather than a hard-coded 6782,
    # so the export keeps working when the vocabulary size changes.
    for word_num in range(tfidf_toarray.shape[0]):
        word = inv_dic[word_num]
        if numeric_word.match(word):
            continue
        tfidf_row = tfidf_toarray[word_num].ravel()
        writer.writerow(numpy.append(
            [word_num, kmeans_word_tfidf.labels_[word_num], word.encode('utf-8')],
            tfidf_row))
    # No explicit f.close(): leaving the with block closes the file.

In [27]:
#print counts_toarray[1408].ravel()

[2 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 0 1 1 0 1 1 1 0 1 1 1 1 0 1 1 1 1 1 1 1 1
 1 1 2 2 1 0 0 1 1 0 0 0 0 0 0 1 1 1 0 0 1 0 0 0 0 1 1 1 0 0 1 1 1 1 0 1 1
 0 1 0 1 1 1 0 0 1 1 1 1 0 2 1 1 0 0 1 1 1 1 0 1 1 1 0 0 1 1 0 1 0 0 0 1 0
 0 1 1 1 1 0 2 1 1 2 0 0 1 1 1 0 0 1 0 0 0 0 0 0 1 1 0 1 0 1 1 1 1 0 0 1 1
 1 0 0 0 1 1 1 1 0 1 0 0 0 0 1 1 0 0 0 0 0 1 0 0 0 0 0 1 0 0 1 1 1 1 0 0 1
 1 1 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 

In [29]:
#print tfidf_toarray[1408].ravel()

[ 0.15378242  0.05950127  0.08472414  0.08451068  0.08537696  0.
  0.08060372  0.07650611  0.07152997  0.07507595  0.08439237  0.06768123
  0.0526512   0.06670933  0.05411764  0.06818517  0.          0.05994355
  0.06394068  0.          0.05649891  0.05386026  0.06228488  0.
  0.04865489  0.05016467  0.0479381   0.04722444  0.          0.04046313
  0.04219505  0.03837212  0.0437995   0.04921116  0.04568467  0.04859322
  0.04151164  0.05208733  0.05507306  0.12412487  0.09565195  0.02803787
  0.          0.          0.04900426  0.04370436  0.          0.          0.
  0.          0.          0.          0.04553469  0.04507099  0.04370429
  0.          0.          0.04532671  0.          0.          0.          0.
  0.04204527  0.04635155  0.04685844  0.          0.          0.03412059
  0.0391355   0.03760459  0.04372262  0.          0.03943014  0.03539594
  0.          0.04384833  0.          0.03960106  0.04037027  0.02934158
  0.          0.          0.02138647  0.04381216  0.0351379