Things to do: 
1. Preserve TF-IDF Vectorizer for re-use (*done- can vectorize new document with same vocab*)
2. Create matrix of document distances 
    (*exists as document distances.  Needs to be able to expand to add new documents, or at least just measure new against existing 
        Content Hash is added to resource objects in database
        Distance Matrix cannot be loaded to DB as is, working to pickle to StringIO, store and retrieve from DB as binary object*)
3. Find thresholds for document matching

Other ways to compare documents
1. Word count
2. Iterations per word
3. Compare sentances: if sentance in other document & vice/versa

Concept to test: 
    If documents are very similar by TF-IDF vector, compare word-count and assume larger is the base document.  Compare iterations of every word between the two documents... what % of smaller doc's words are in the larger doc vs what % of larger doc's words that are in the smaller.  Last, cut smaller document into sentances, and see how many (%) of the sentances are found in the larger document.  
    
Older documents might be smaller, that they would grow over time

### Imports

In [1]:
import logging
logging.basicConfig(format='%(levelname)s : %(message)s', level=logging.INFO)
logging.root.level = logging.INFO 

In [112]:
from __future__ import division, print_function
import pickle
from StringIO import StringIO

#full packages
#import string
import numpy as np
import pandas as pd
import re
#import os
#import codecs

#sklearn tools
#from sklearn import feature_extraction
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity, cosine_distances 
#from sklearn.manifold import MDS
#from sklearn.cluster import KMeans
#from sklearn.externals import joblib

#nltk tools
from nltk import sent_tokenize
from nltk import word_tokenize
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords


#scipy tools
#from scipy.cluster.hierarchy import ward, dendrogram

#gensim
#from gensim import corpora, models, similarities 

In [3]:
import collections
from django.utils.encoding import smart_str, smart_unicode
def convertU(data):
    if isinstance(data, basestring):
        return smart_str(data)
    elif isinstance(data, collections.Mapping):
        return dict(map(convertU, data.iteritems()))
    elif isinstance(data, collections.Iterable):
        return type(data)(map(convertU, data))
    else:
        return data

### Initialize Database

In [4]:
from pymongo import MongoClient
# Connect to the mongo local database
connection = MongoClient()
db = connection.bsa_files
collection = db.bsa_files

In [100]:
! python -c "import pymongo;print pymongo.__version__"
[method for method in dir(collection) if callable(getattr(collection, method))]

3.3.1


['_BaseObject__read_preference',
 '_Collection__create',
 '_Collection__create_index',
 '_Collection__database',
 '_Collection__find_and_modify',
 '__call__',
 '__class__',
 '__delattr__',
 '__eq__',
 '__format__',
 '__getattr__',
 '__getattribute__',
 '__getitem__',
 '__hash__',
 '__init__',
 '__iter__',
 '__ne__',
 '__new__',
 '__next__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '_command',
 '_count',
 '_delete',
 '_insert',
 '_insert_one',
 '_legacy_write',
 '_socket_for_primary_reads',
 '_socket_for_reads',
 '_socket_for_writes',
 '_update',
 'aggregate',
 'bulk_write',
 'count',
 'create_index',
 'create_indexes',
 'database',
 'delete_many',
 'delete_one',
 'distinct',
 'drop',
 'drop_index',
 'drop_indexes',
 'ensure_index',
 'find',
 'find_and_modify',
 'find_one',
 'find_one_and_delete',
 'find_one_and_replace',
 'find_one_and_update',
 'group',
 'index_information',
 'initialize_ordered_bulk_op',
 'initiali

### Build Text Corpus and perform Corpus wide TF-IDF

In [6]:
unique_content_hash_list = pickle.load(open('unique_content_hash_list.pickle','r'))
print(len(unique_content_hash_list))

4221


In [8]:
english_text_corpus = []
english_content_hash = []
spanish_text_corpus = []
spanish_content_hash = []

for unique_hash in unique_content_hash_list:
    document = collection.find_one({"Content_Hash":unique_hash})
    if document["Content"]!=None and len(document["Content"])>1:
        if document["Language"]=='english':
            english_text_corpus.append(document["Content"])
            english_content_hash.append(document["Content_Hash"])
        elif document["Language"]=='spanish':
            spanish_text_corpus.append(document["Content"])
            spanish_content_hash.append(document["Content_Hash"])

In [9]:
print(len(english_text_corpus),len(english_content_hash))
print(len(spanish_text_corpus),len(spanish_content_hash))

3979 3979
240 240


In [10]:
#special stopwords to remove common scouting terms and other erroneous text that continually showed up in the results
bsa_stop_words = ['scout','boy','cub','train','council',
                  'a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u','v','w','x','y','z']

english_stopwords = stopwords.words('english') + bsa_stop_words
spanish_stopwords = stopwords.words('spanish') + bsa_stop_words + stopwords.words('english')

english_stemmer = SnowballStemmer("english")
spanish_stemmer = SnowballStemmer("spanish")

def tokenize_and_stem_english(text):
    # first tokenize by sentence, then by word to ensure that punctuation is caught as it's own token
    tokens = [word for sent in sent_tokenize(text) for word in word_tokenize(sent)]
    filtered_tokens = []
    # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
    stems = [english_stemmer.stem(t) for t in filtered_tokens]
    return stems

def tokenize_and_stem_spanish(text):
    # first tokenize by sentence, then by word to ensure that punctuation is caught as it's own token
    tokens = [word for sent in sent_tokenize(text) for word in word_tokenize(sent)]
    filtered_tokens = []
    # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
    stems = [spanish_stemmer.stem(t) for t in filtered_tokens]
    return stems


def tokenize_only(text):
    # first tokenize by sentence, then by word to ensure that punctuation is caught as it's own token
    tokens = [word.lower() for sent in sent_tokenize(text) for word in word_tokenize(sent)]
    filtered_tokens = []
    # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
    return filtered_tokens

In [11]:
english_vocab_stemmed = []
english_vocab_tokenized = []
spanish_vocab_stemmed = []
spanish_vocab_tokenized = []

for e in english_text_corpus:
    words_stemmed = tokenize_and_stem_english(e)
    english_vocab_stemmed.extend(words_stemmed)
    words_tokenized = tokenize_only(e)
    english_vocab_tokenized.extend(words_tokenized)

for s in spanish_text_corpus:
    words_stemmed = tokenize_and_stem_spanish(s)
    spanish_vocab_stemmed.extend(words_stemmed)
    words_tokenized = tokenize_only(s)
    spanish_vocab_tokenized.extend(words_tokenized)

In [74]:
print(len(english_vocab_stemmed),len(english_vocab_tokenized))
print(len(spanish_vocab_stemmed),len(english_vocab_tokenized))

10947405 10947405
1986384 10947405


In [12]:
english_vocab_frame = pd.DataFrame({'words':english_vocab_tokenized}, index = english_vocab_stemmed)
print('There are {} items in english_vocab_frame'.format(str(english_vocab_frame.shape[0])))

spanish_vocab_frame = pd.DataFrame({'words':spanish_vocab_tokenized}, index = spanish_vocab_stemmed)
print('There are {} items in spanish_vocab_frame'.format(str(spanish_vocab_frame.shape[0])))

with open('english_vocab_frame2.pickle','w') as p:
    pickle.dump(english_vocab_frame,p)
with open('spanish_vocab_frame2.pickle','w') as q:
    pickle.dump(spanish_vocab_frame,q)

There are 10947405 items in english_vocab_frame
There are 1986384 items in spanish_vocab_frame


In [None]:
english_vocab_frame = pickle.load(open('english_vocab_frame2.pickle','r'))
spanish_vocab_frame = pickle.load(open('spanish_vocab_frame2.pickle','r'))

In [13]:
english_tfidf_vectorizer = TfidfVectorizer(max_df=0.8, 
                                           max_features=200000,
                                           min_df=0.2, 
                                           stop_words=english_stopwords,
                                           use_idf=True, 
                                           tokenizer=tokenize_and_stem_english, 
                                           ngram_range=(1,3))

spanish_tfidf_vectorizer = TfidfVectorizer(max_df=0.8, 
                                           max_features=200000,
                                           min_df=0.2, 
                                           stop_words=spanish_stopwords,
                                           use_idf=True, 
                                           tokenizer=tokenize_and_stem_spanish, 
                                           ngram_range=(1,3))

In [14]:
english_tfidf_matrix = english_tfidf_vectorizer.fit_transform(english_text_corpus)
print(english_tfidf_matrix.shape)

spanish_tfidf_matrix = spanish_tfidf_vectorizer.fit_transform(spanish_text_corpus)
print(spanish_tfidf_matrix.shape)

(3979, 313)
(240, 790)


In [16]:
english_terms = english_tfidf_vectorizer.get_feature_names()
spanish_terms = spanish_tfidf_vectorizer.get_feature_names()

In [21]:
print(len(english_terms),len(spanish_terms))

313 790


In [22]:
english_dist = 1-cosine_similarity(english_tfidf_matrix)
spanish_dist = 1-cosine_similarity(spanish_tfidf_matrix)

In [118]:
with open('english_distance2.pickle','w') as p:
    pickle.dump(english_dist,p)
with open('spanish_distance2.pickle','w') as q:
    pickle.dump(spanish_dist,q)

In [23]:
print(english_dist.shape)

(3979L, 3979L)


In [45]:
non_equals = []
for x in range (0,3979):
    if abs(english_dist[x][x]) > 0.00001:
        non_equals.append(x)
print(len(non_equals))
print(non_equals)

275
[3, 6, 21, 33, 39, 52, 99, 127, 131, 137, 164, 170, 189, 192, 202, 208, 227, 242, 259, 273, 279, 280, 281, 301, 341, 349, 362, 363, 387, 392, 394, 409, 416, 455, 482, 486, 501, 514, 517, 528, 567, 569, 576, 581, 585, 627, 628, 644, 670, 675, 676, 681, 690, 704, 720, 722, 738, 804, 808, 815, 818, 820, 828, 832, 853, 871, 891, 926, 944, 946, 955, 977, 987, 989, 1007, 1015, 1021, 1025, 1028, 1031, 1078, 1093, 1110, 1125, 1133, 1149, 1155, 1162, 1175, 1176, 1180, 1185, 1193, 1215, 1226, 1240, 1264, 1266, 1277, 1305, 1307, 1320, 1344, 1347, 1349, 1361, 1362, 1366, 1380, 1392, 1411, 1412, 1419, 1424, 1430, 1440, 1441, 1442, 1463, 1477, 1485, 1495, 1502, 1529, 1557, 1565, 1583, 1584, 1596, 1613, 1615, 1616, 1629, 1641, 1643, 1673, 1705, 1709, 1722, 1746, 1757, 1758, 1807, 1815, 1841, 1852, 1861, 1896, 1897, 1924, 1935, 1936, 1941, 1944, 1955, 1974, 1976, 1999, 2057, 2080, 2082, 2095, 2128, 2151, 2169, 2171, 2175, 2211, 2215, 2264, 2283, 2302, 2306, 2372, 2373, 2395, 2417, 2434, 2441, 2444

In [57]:
x = 3860
print(english_dist[0][x])
print(english_dist[x][0])
print(english_dist[x][x])
print(english_text_corpus[x])

1.0
1.0
1.0
˘ˇˆ˙˘˝˛ ˝˚˜ !"!# $% ˝&˝''˝ (˙˝( ˙˚˝% ) *)&+ )˙ ,˝ '˝'˜ ! - ˜" '˝% ˙˚ ˘˚ ! ˙˚ ˘''˚ ( ./012 ,˝˚˚ " .* - ˚ ))) 3-' , " /˚2% 4 +'-5&˚ 4-˝',,˜ ! ˝',˘" ˆ˝,˘ ! ˘!˚˝#+6 57 ,˚˝,˘" ˝,˘ 5 .˚'˝˚'"˝ ˝ - ,' 80 *˝ ˘˘ˇ˘ˇ +˝/(˙ 2" - ˘˚' /$%# ˝2 ˘!˚˝#+6 59 /$%# ˝2 .˝" '˝' ("˝˚ ''' ' ˚"˝ ˚ /$%'˝ '˝" ˜ !-!2 $! $! $! ˘!˚˝#+6 51 ˆˇ˙˝ˇ .&˝˚" ˚* '#˚ ˝* ˘'˚ ˚˚˛ ˝'"˚ ˝˝˘ .&" '˜˚ !#˝˝ 7:87 #˝ ;ˆ<˝''/ '2.˝ ˝"˝˝ #˝"˝ "˛˝"˜ !#"= -"=(/ ˝ !#"=!#"!#2 ˘!˚˝#+6 50 ˛" ; ˚<˝'' , ˚" ;<˝/>˙? 2 ˝ & ˝ ;˜<;@<˝˝˚ ˚'* # ' ˛ˇ˘ˇ "'" "˝' ''"˝ *˝ ˝' ˚˝ ˙ ', "˝ !#'


In [67]:
print(type(english_dist))

<type 'numpy.ndarray'>


In [19]:
print(type(english_content_hash), 
      type(english_text_corpus), 
      type(english_dist),
      type(english_dist[0]))

<type 'list'> <type 'list'> <type 'numpy.ndarray'> <type 'numpy.ndarray'>


In [43]:
print(english_content_hash[0])
print('*'*75)
print(type(english_text_corpus[0]))
print(english_text_corpus[0])
print('*'*75)
print(len(english_dist[0]))
print(english_dist[0])
print('*'*75)
print(type(english_tfidf_matrix[0]))
print(english_tfidf_matrix[0])

c7ba405cc29859106ac28a01c71ad03c5393644d
***************************************************************************
<type 'unicode'>
Solid to the Core Providing Resources to Support Units, Chapters, and Lodges Every once in a while when you™re working on a pioneer ing project, you™ll find a spar that looks great but that turns out to be weak and unreliable. Maybe its center has been eaten away by insects. Or maybe it has natural splits inside that you can't see. You can test a spar fo r soundness by holding one end and rapping the other end sharply on a rock. If it™s sound you'll hear it ring. Otherwise, you™ll want to toss it aside and find a good, solid spar to work with. Some people are like defective spars. They look great on the outside and they may have appealing personalities, the kind of guys and girls you think you would like to know. But when you do get to know them better, you find that they™re like a defective sparŠweak inside . They don™t have the strength of character to

### Test and Append TF-IDF array to DB Objects

In [50]:
temp_array = english_tfidf_matrix[0].toarray()
print(type(temp_array))
print(temp_array.shape)
print(len(temp_array[0]))
print(temp_array[0])

<type 'numpy.ndarray'>
(1L, 313L)
313
[ 0.          0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.          0.
  0.11552897  0.          0.          0.          0.          0.          0.
  0.          0.          0.10422113  0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.          0.
  0.          0.          0.          

In [57]:
simple_array = english_tfidf_matrix[0].toarray()[0].tolist()
print(type(simple_array))
print(len(simple_array))

<type 'list'>
313


In [58]:
collection.update_many({"Content_Hash":english_content_hash[0]},
                       {'$set':{"TFIDF_Vector":english_tfidf_matrix[0].toarray()[0].tolist()}})

<pymongo.results.UpdateResult at 0x16d738b40>

In [60]:
for x in range(0,len(english_content_hash)):
    collection.update_many({"Content_Hash":english_content_hash[x]},
                       {'$set':{"TFIDF_Vector":english_tfidf_matrix[x].toarray()[0].tolist()}})

In [61]:
print(collection.find_one({"Content_Hash":english_content_hash[-1]}))

{u'Domain': u'bsaseabase.org', u'Hash': u'acd7f9d3865638176146d2eaf8698313fdfc83b3', u'UID': u'BSA_631', u'Language': u'english', u'URL': u'http://bsaseabase.org/filestore/commissioner/pdf/commissioner_update_march2016.pdf', u'File_Name': u'Commissioner_Update_March2016.pdf', u'Content': u'Commissioner Update March 2016 Last week we announced a new report called In Progress Contacts that can be run from the Report tab on your district and council dashboards. It is located after the District Contact Stats 2016 report in the drop down list. The reason we added this report was so commissioners can better manage those contacts that have not been marked complete. The report can be us ed by administrative commissioners to insure work recorded is not lost. Once a contact is started a commissioner or professional has 60 days to complete it. This limitation was set to encourage timely action be taken which will in turn allow for administ rative commissioners and district executives the ability 

In [62]:
for x in range(0,len(spanish_content_hash)):
    collection.update_many({"Content_Hash":spanish_content_hash[x]},
                       {'$set':{"TFIDF_Vector":spanish_tfidf_matrix[x].toarray()[0].tolist()}})

In [63]:
print(collection.find_one({"Content_Hash":spanish_content_hash[-1]}))

{u'Domain': u'bsaseabase.org', u'Hash': u'8fa8d904b9ba9f0326473ad594b85cc44af96e62', u'UID': u'BSA_1806', u'Language': u'spanish', u'URL': u'http://bsaseabase.org/filestore/marketing/310-740_spn/venturing/310-740-52_fliers/pdfs/310-740-52-3_fill.pdf', u'File_Name': u'310-740-52-3_fill.pdf', u'Folder': u'PDFs', u'Content': u'Toma sel\u02dces. Sirve al pr\xf3jimo.Obt\xe9n con\u02dcanza.Construye amistades.Toma riesgos. Convive.Haz la diferencia.Busca oportunidades.Deja un legado.Orienta a otros.Persigue la aventura.Ampl\xeda tus horizontes.AQU\xcd COMIENZA LA AVENTURA EN SEUNSCOUT.ORG. LA BASE PARA UN GRAN FUTURO.', u'Document_Class': u'Marketing Experience Mgmt', u'File_Size': 4217834, u'TFIDF_Vector': [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0

In [69]:
array_test = np.array(collection.find_one({"Content_Hash":english_content_hash[-1]})["TFIDF_Vector"])
print(type(array_test))
print(len(array_test))
print(array_test)

<type 'numpy.ndarray'>
313
[ 0.          0.          0.          0.          0.06688394  0.          0.
  0.          0.          0.          0.          0.          0.18101502
  0.          0.04859089  0.          0.          0.          0.          0.
  0.          0.          0.          0.12746195  0.          0.          0.
  0.          0.11186231  0.          0.          0.          0.
  0.06322667  0.          0.          0.          0.          0.
  0.05629388  0.          0.          0.0633849   0.          0.          0.
  0.          0.          0.0588243   0.          0.          0.          0.
  0.05873524  0.          0.          0.          0.06110447  0.          0.
  0.          0.23123829  0.          0.          0.49998184  0.          0.
  0.          0.          0.          0.05820701  0.          0.          0.
  0.10955114  0.          0.1352349   0.          0.          0.          0.
  0.          0.18094425  0.          0.          0.          0.          0.


In [23]:
cursor = collection.find({"TFIDF_Vector":{"$exists":False}})
count = 0
document_hashes = []
for document in cursor:
    try:
        document_hashes.append(document['Hash'])
    except:
        continue
print(len(document_hashes))
print(len(set(document_hashes)))

3116
440


In [24]:
cursor = collection.find({"TFIDF_Vector":{"$exists":True}})
count = 0
document_hashes = []
for document in cursor:
    try:
        document_hashes.append(document['Hash'])
    except:
        continue
print(len(document_hashes))
print(len(set(document_hashes)))

33437
5106


### Preserve TF-IDF Vectorizer(s) for use with future documents

In [77]:
test_document = english_text_corpus[0]
#print(test_document)
test_list = []
test_list.append(test_document)
print(test_list)

[u"Solid to the Core Providing Resources to Support Units, Chapters, and Lodges Every once in a while when you\u2122re working on a pioneer ing project, you\u2122ll find a spar that looks great but that turns out to be weak and unreliable. Maybe its center has been eaten away by insects. Or maybe it has natural splits inside that you can't see. You can test a spar fo r soundness by holding one end and rapping the other end sharply on a rock. If it\u2122s sound you'll hear it ring. Otherwise, you\u2122ll want to toss it aside and find a good, solid spar to work with. Some people are like defective spars. They look great on the outside and they may have appealing personalities, the kind of guys and girls you think you would like to know. But when you do get to know them better, you find that they\u2122re like a defective spar\u0160weak inside . They don\u2122t have the strength of character to resist things that you know are wrong, and chances are they will want you to do those things, t

In [98]:
alt_english_tfidf_vectorizer = TfidfVectorizer(max_df=0.8, 
                                           max_features=200000,
                                           min_df=0.2, 
                                           stop_words=english_stopwords,
                                           use_idf=True, 
                                           tokenizer=tokenize_and_stem_english, 
                                           ngram_range=(1,3),
                                           vocabulary = collection.find_one({"Document_Class":"english_resources"})["TFIDF_Vocabulary"])

In [99]:
test_tfidf_matrix = alt_english_tfidf_vectorizer.fit_transform(test_list)#english_text_corpus)
#print(test_tfidf_matrix)
print(test_tfidf_matrix == english_tfidf_matrix[0])

  (0, 0)	True
  (0, 1)	True
  (0, 2)	True
  (0, 3)	True
  (0, 4)	True
  (0, 5)	True
  (0, 6)	True
  (0, 7)	True
  (0, 8)	True
  (0, 9)	True
  (0, 10)	True
  (0, 11)	True
  (0, 12)	True
  (0, 13)	True
  (0, 14)	True
  (0, 15)	True
  (0, 16)	True
  (0, 17)	True
  (0, 18)	True
  (0, 19)	True
  (0, 20)	True
  (0, 21)	True
  (0, 22)	True
  (0, 23)	True
  (0, 24)	True
  :	:
  (0, 283)	True
  (0, 284)	True
  (0, 286)	True
  (0, 287)	True
  (0, 288)	True
  (0, 290)	True
  (0, 291)	True
  (0, 292)	True
  (0, 293)	True
  (0, 294)	True
  (0, 295)	True
  (0, 296)	True
  (0, 297)	True
  (0, 298)	True
  (0, 300)	True
  (0, 301)	True
  (0, 302)	True
  (0, 303)	True
  (0, 304)	True
  (0, 305)	True
  (0, 306)	True
  (0, 308)	True
  (0, 310)	True
  (0, 311)	True
  (0, 312)	True


In [105]:
#collection.update_one({"Document_Class":"resources"},{"$set":{"Document_Class":"english_resources"}})
#collection.update_one({"Document_Class":"english_resources"},{"$set":{"TFIDF_Vocabulary":english_terms}})
#collection.update_one({"Document_Class":"english_resources"},{"$set":{"existing_content_hash":english_content_hash}})


<pymongo.results.UpdateResult at 0x167131ea0>

In [111]:
new_test = collection.find_one({"Document_Class":"english_resources"})
print(new_test.keys())
print(len(new_test["existing_content_hash"]))

[u'Document_Class', u'_id', u'TFIDF_Vocabulary', u'existing_content_hash']
3979


In [102]:
spanish_resource_object = {"Document_Class":"spanish_resources","TFIDF_Vocabulary":spanish_terms}

In [103]:
collection.insert_one(spanish_resource_object)

<pymongo.results.InsertOneResult at 0x16712b168>

In [107]:
#collection.update_one({"Document_Class":"spanish_resources"},{"$set":{"TFIDF_Vocabulary":spanish_terms}})
collection.update_one({"Document_Class":"spanish_resources"},{"$set":{"existing_content_hash":spanish_content_hash}})



<pymongo.results.UpdateResult at 0x16712b2d0>

In [110]:
result = collection.find_one({"Document_Class":"spanish_resources"})
print(result.keys())
print(len(result["existing_content_hash"]))

[u'Document_Class', u'_id', u'TFIDF_Vocabulary', u'existing_content_hash']
240


In [113]:
string_io_test = StringIO()
pickle.dump(english_dist, string_io_test)

In [117]:
test_result = pickle.load(StringIO(string_io_test))
print(type(test_result))

KeyError: '<'