In [1]:
author = 'AJ Wilson'
from datetime import datetime
print("Prepared by: {}".format(author))
print("Last Edit: {}".format(datetime.now()))
!python -V

Prepared by: AJ Wilson
Last Edit: 2017-02-15 15:19:51.808000


Python 2.7.12 :: Anaconda custom (64-bit)


***
# **Document Clustering and Matching**
## Unsupervised Learning Test 2


In [2]:
import pandas as pd
import numpy as np
import cPickle as pickle
import re
import string

#sklearn
from sklearn.preprocessing import StandardScaler, Normalizer, MaxAbsScaler
from sklearn.cluster import KMeans
from sklearn.pipeline import make_pipeline
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA, NMF, TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer

#scipy
from scipy.cluster.hierarchy import linkage, dendrogram, fcluster
from scipy.stats import pearsonr
from scipy.sparse import csr_matrix

#matplotlib
import matplotlib.pyplot as plt

#nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer
from nltk import sent_tokenize, word_tokenize

In [3]:
#location in other project folder where the source documents are
file_path = 'C:\\Users\\ajwil\\OneDrive\\Documents\\00_public\\Group Projects\\'

In [23]:
#mongod.exe --dbpath D:\mongodata
from pymongo import MongoClient

connection = MongoClient()
db = connection.bsa_files
collection = db.bsa_files

In [5]:
unique_content_hash_list = pickle.load(open((file_path+'unique_content_hash_list.pickle'),'r'))
len(unique_content_hash_list),type(unique_content_hash_list)

(4221, list)

In [6]:
full_text_corpus = []
document_names = []
used_content_hashes = []
discarded_content_hashes = []

for item in unique_content_hash_list:
    document = collection.find_one({'Content_Hash':item})
    
    if document['Content'] != None and len(document['Content'])>1 and len(document['Content'])<500000 and document['Language']=='english':
        full_text_corpus.append(document['Content'])
        document_names.append(document['File_Name'])
        used_content_hashes.append(item)
    else:
        discarded_content_hashes.append(item)
     

In [7]:
len(discarded_content_hashes)

260

In [8]:
len(full_text_corpus), len(document_names)

(3961, 3961)

In [9]:
extra_stop_words = ['a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u','v','w','x','y','z']

my_stopwords = stopwords.words('english') + stopwords.words('spanish') + extra_stop_words

stemmer = SnowballStemmer('english')

In [10]:
def tokenize(text):
    text = re.sub(r'[^\w\s]','',text)
    text = re.sub(r'[0-9_]','',text)
    # first tokenize by sentence, then by word to ensure that punctuation is caught as it's own token
    tokens = [word.lower() for sent in sent_tokenize(text) for word in word_tokenize(sent)]
    filtered_tokens = []
    # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
    for token in tokens:
        if re.search('[a-zA-Z]', token) and token not in my_stopwords and len(token)<30: #30 = len(longest english word)
            filtered_tokens.append(token)
    return filtered_tokens

In [43]:
tfidf = TfidfVectorizer(tokenizer=tokenize)

In [44]:
csr_mat = tfidf.fit_transform(full_text_corpus)

In [45]:
csr_mat.toarray().shape

(3961L, 191419L)

In [14]:
words = tfidf.get_feature_names()

In [15]:
scaler = MaxAbsScaler()
nmf = NMF(n_components = 25)
normalizer = Normalizer()

In [16]:
pipeline = make_pipeline(scaler, nmf, normalizer)

In [17]:
norm_features = pipeline.fit_transform(csr_mat)

In [18]:
norm_features.shape

(3961L, 25L)

In [19]:
df = pd.DataFrame(norm_features, index = document_names)

In [20]:
document = df.loc[document_names[50]]

In [21]:
similarities = df.dot(document)
similarities.nlargest(10)

Accounts_Payable_Year_End_2015.pdf        1.000000
Year_End_Accounts%20Payable_2013.pdf      0.998363
GL_Summary_Loading_PR_%208-3-11.pdf       0.989468
Record_Product_Sales.pdf                  0.985634
fm_pro_for_ste.pdf                        0.983687
JTE_Glossary.pdf                          0.979944
Gift-Policies-Procedures-BSAF-2014.pdf    0.979682
PeopleSoft_User_Group_20150909.pdf        0.979543
Chapter_13_Asset_Management.pdf           0.979366
bp-newsletter-gift-chart.pdf              0.978068
dtype: float64

In [22]:
document = df.loc[document_names[15]]
similarities = df.dot(document)
similarities.nlargest(10)

Camping.pdf                   1.000000
Camping_previous.pdf          0.999717
Backpacking_previous.pdf      0.998194
Backpacking.pdf               0.998068
Fly-Fishing.pdf               0.989535
Woodwork.pdf                  0.985105
Automotive_Maintenance.pdf    0.982050
Fire_Safety.pdf               0.980162
Crime_Prevention.pdf          0.980143
First_Aid.pdf                 0.979337
dtype: float64

In [32]:
#temp1 = pickle.dumps(csr_mat)
#temp2 = pickle.loads(temp1)
#temp2.toarray() == csr_mat.toarray()

In [31]:
#temp1 = pickle.dumps(df)
#temp2 = pickle.loads(temp1)
#temp2 == df

In [34]:
collection.update_one({"Document_Class":"english_resources"},{"$set":{"TFIDF_Vocabulary":words}})

<pymongo.results.UpdateResult at 0x1d325828>

In [35]:
len(collection.find_one({"Document_Class":"english_resources"})["TFIDF_Vocabulary"])

191419

In [48]:
tfidf2 = TfidfVectorizer(tokenizer=tokenize, vocabulary = collection.find_one({"Document_Class":"english_resources"})["TFIDF_Vocabulary"])
test1 = tfidf.transform(full_text_corpus[0:2])
test2 = tfidf2.fit_transform(full_text_corpus[0:2])
test1.toarray() == test2.toarray()
#test1.shape, test2.shape

array([[ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True]], dtype=bool)