# 2. Model Training 

##### *Libraries Used*

In [7]:
%matplotlib inline
import numpy as np
import pandas as pd
from sqlite3 import dbapi2 as sq3
import os
import multiprocessing
from pprint import pprint
import itertools
from nltk.corpus import stopwords
import gensim
from random import randint
# Widen pandas' display limits (500-character width, up to 100 columns) and
# keep the HTML table repr enabled for notebook output.
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.notebook_repr_html', True)

Using TensorFlow backend.


In [8]:
# Open a SQLite connection to data/processed/comp_data.db (path is relative to
# the notebook's working directory).
# NOTE(review): the connection is never closed in the visible cells — consider
# closing it once the data has been loaded.
comp_db = sq3.connect('data/processed/comp_data.db')

In [9]:
# Load every row and column of the comp_descr table into a DataFrame.
company_pd = pd.read_sql("SELECT * FROM comp_descr;", comp_db)

In [10]:
# Preview the first rows to check which columns were loaded.
company_pd.head()

Unnamed: 0,comp_ID,comp_op_name,comp_prof,prod_name,prim_NAICS_sec,prim_NAICS,prim_bus_activity,tot_sales,num_employees,country_of_ownership,exporting,year_estab
0,1,Agrideria,AGRIDERIA INDUSTRIAL IS AN INTERNATIONAL PROD...,Yellow Corn (Maize) Yellow Corn (Maize) For Hu...,Soybean Farming,111110,Manufacturer/ Processor/ Producer,"over 50,000,000",32.0,Foreign,Yes,2005.0
1,3,Thompsons Limited,"- Commodity handling & processing, Oilseeds (...","Barley, Chopped, Crushed Or Ground /Feed/ Whea...",Support Activitiesfor Crop Production,115110,Manufacturer/ Processor/ Producer,"over 50,000,000",250.0,Foreign,Yes,1924.0
2,4,Belle Pulses Ltd.,,Pulse Products Misc. Yellow & Green Split Peas...,Dry Peaand Bean Farming,111130,Manufacturer/ Processor/ Producer,,,Canada,Yes,
3,7,SunSelect Produce,"At SunSelect, we take our responsibility to o...","Tomato, Bell Pepper, Cucumber Year Round Produ...",Other Food Crops Grown Under Cover,111419,Manufacturer/ Processor/ Producer,,350.0,Canada,Yes,1985.0
4,9,Funk's Blueberries,Cultivate blueberries,Blueberries Blueberries,Non- Citrus Fruitand Tree Nut Farming,111330,Manufacturer/ Processor/ Producer,,,Canada,No,


In [593]:
# Keep only the company ID and the three free-text columns used to build each
# company's document.
# .copy() makes fulldf an independent frame, so the column assignments made in
# later cells do not trigger pandas' SettingWithCopyWarning.
fulldf = company_pd[['comp_ID', 'prim_NAICS_sec', 'comp_prof', 'prod_name']].copy()
fulldf.head()

Unnamed: 0,comp_ID,prim_NAICS_sec,comp_prof,prod_name
0,1,Soybean Farming,AGRIDERIA INDUSTRIAL IS AN INTERNATIONAL PROD...,Yellow Corn (Maize) Yellow Corn (Maize) For Hu...
1,3,Support Activitiesfor Crop Production,"- Commodity handling & processing, Oilseeds (...","Barley, Chopped, Crushed Or Ground /Feed/ Whea..."
2,4,Dry Peaand Bean Farming,,Pulse Products Misc. Yellow & Green Split Peas...
3,7,Other Food Crops Grown Under Cover,"At SunSelect, we take our responsibility to o...","Tomato, Bell Pepper, Cucumber Year Round Produ..."
4,9,Non- Citrus Fruitand Tree Nut Farming,Cultivate blueberries,Blueberries Blueberries


In [594]:
# English stop words from NLTK, stored as a set; sep_7 tests tokens against it.
# NOTE(review): presumably requires the NLTK "stopwords" corpus to be
# downloaded beforehand — confirm.
stop_words = set(stopwords.words("english"))

def sep_7(l):
    """Tokenize a text string and drop stop words.

    The text is tokenized with ``gensim.utils.simple_preprocess``; every token
    found in the module-level ``stop_words`` set is then removed.

    Parameters
    ----------
    l : the text to tokenize.

    Returns
    -------
    list
        The remaining tokens, in their original order.
    """
    tokens = gensim.utils.simple_preprocess(l)
    return [token for token in tokens if token not in stop_words]



In [595]:
# Build one text blob per company by joining the prim_NAICS_sec, comp_prof and
# prod_name columns with single spaces.
# fillna('') guards against missing (None/NaN) fields, which would otherwise
# make str.join raise a TypeError.
# NOTE(review): despite the column name, no stemming is applied here.
text_cols = ['prim_NAICS_sec', 'comp_prof', 'prod_name']
fulldf['stem_words'] = fulldf[text_cols].fillna('').apply(' '.join, axis=1)
fulldf.head()

Unnamed: 0,comp_ID,prim_NAICS_sec,comp_prof,prod_name,stem_words
0,1,Soybean Farming,AGRIDERIA INDUSTRIAL IS AN INTERNATIONAL PROD...,Yellow Corn (Maize) Yellow Corn (Maize) For Hu...,Soybean Farming AGRIDERIA INDUSTRIAL IS AN IN...
1,3,Support Activitiesfor Crop Production,"- Commodity handling & processing, Oilseeds (...","Barley, Chopped, Crushed Or Ground /Feed/ Whea...",Support Activitiesfor Crop Production - Commo...
2,4,Dry Peaand Bean Farming,,Pulse Products Misc. Yellow & Green Split Peas...,Dry Peaand Bean Farming Pulse Products Misc. ...
3,7,Other Food Crops Grown Under Cover,"At SunSelect, we take our responsibility to o...","Tomato, Bell Pepper, Cucumber Year Round Produ...",Other Food Crops Grown Under Cover At SunSele...
4,9,Non- Citrus Fruitand Tree Nut Farming,Cultivate blueberries,Blueberries Blueberries,Non- Citrus Fruitand Tree Nut Farming Cultiva...


In [596]:
# Replace each joined text string with the token list returned by sep_7.
# NOTE(review): the column is overwritten in place, so re-running this cell on
# its own would feed token lists (not strings) back into sep_7.
fulldf['stem_words'] = fulldf['stem_words'].apply(sep_7)

In [597]:
# Split each company ID string on whitespace, giving a list per row
# (e.g. '00001' -> ['00001']); these lists are later passed as TaggedDocument tags.
# NOTE(review): assumes comp_ID holds whitespace-free strings — confirm.
fulldf['comp_ID'] = fulldf['comp_ID'].str.split()

In [598]:
# Narrow the frame to the two columns used from here on: the ID tags and the tokens.
fulldf = fulldf[['comp_ID', 'stem_words']]
fulldf.head()

Unnamed: 0,comp_ID,stem_words
0,[00001],"[soybean, farming, agrideria, industrial, inte..."
1,[00003],"[support, activitiesfor, crop, production, com..."
2,[00004],"[dry, peaand, bean, farming, pulse, products, ..."
3,[00007],"[food, crops, grown, cover, sunselect, take, r..."
4,[00009],"[non, citrus, fruitand, tree, nut, farming, cu..."


### Define a `TaggedSentence` class to Preprocess Text
Below, we define a class that associates each company ID with its respective description by wrapping every pair in a gensim `TaggedDocument`.


In [599]:
class TaggedSentence(object):
    """Re-iterable corpus that pairs each token list with its tag list.

    Iterating yields one ``gensim.models.doc2vec.TaggedDocument`` per entry of
    ``doc_list``, tagged with the entry of ``labels_list`` at the same position.

    Parameters
    ----------
    doc_list : sequence of token lists (e.g. a list or a pandas Series).
    labels_list : sequence of tag lists, aligned by position with ``doc_list``.
    """

    def __init__(self, doc_list, labels_list):
        self.labels_list = labels_list
        self.doc_list = doc_list

    def __iter__(self):
        """Yield a TaggedDocument for every (document, tags) pair, in order.

        Raises
        ------
        IndexError
            If ``labels_list`` runs out before ``doc_list`` does.
        """
        # Pair by position instead of labels_list[idx]: on a pandas Series with
        # an integer index, [idx] is a label lookup, which fails or picks the
        # wrong row once the index is no longer 0..n-1 (e.g. after filtering).
        labels = iter(self.labels_list)
        for doc in self.doc_list:
            try:
                tags = next(labels)
            except StopIteration:
                raise IndexError('labels_list has fewer entries than doc_list')
            yield gensim.models.doc2vec.TaggedDocument(doc, tags)

    def to_array(self):
        """Materialize the corpus as a list, store it on ``self.sentences`` and return it."""
        self.sentences = list(self)
        return self.sentences
    
# Pair each company's token list (stem_words) with its ID tag list (comp_ID).
documents = TaggedSentence(fulldf.stem_words, fulldf.comp_ID)

In [600]:
# Inspect the first tagged document as a sanity check (to_array() builds the
# full list before the first element is taken).
documents.to_array()[0]

TaggedDocument(words=['soybean', 'farming', 'agrideria', 'industrial', 'international', 'producer', 'supplier', 'exporter', 'sought', 'agriculture', 'commodities', 'yellow', 'corn', 'soybean', 'soybean', 'meal', 'grains', 'end', 'buyer', 'willing', 'able', 'purchase', 'commodities', 'work', 'argentina', 'brazil', 'paraguay', 'uruguay', 'able', 'willing', 'supply', 'buyer', 'year', 'round', 'current', 'products', 'work', 'directly', 'producers', 'producer', 'associations', 'co', 'operatives', 'keep', 'prices', 'competitive', 'seeking', 'long', 'short', 'term', 'supplier', 'commodities', 'contact', 'us', 'save', 'time', 'money', 'working', 'genuine', 'seller', 'yellow', 'corn', 'maize', 'yellow', 'corn', 'maize', 'human', 'consumption', 'animal', 'feed', 'minimum', 'quantity', 'mt', 'maximum', 'quantity', 'mt', 'spot', 'year', 'contract', 'soybean', 'soybean', 'human', 'consumption', 'animal', 'feed', 'minimum', 'quantity', 'mt', 'maximum', 'quantity', 'mt', 'spot', 'year', 'contract', '

### Instantiate a Doc2Vec Object
Now, we'll instantiate a Doc2Vec model (PV-DBOW) with 200-dimensional vectors that iterates over the training corpus 55 times. We set the minimum word count to 2, so words that appear only once in the corpus are dropped from the vocabulary.

In [697]:
# Use one worker thread per CPU core reported by the OS.
cores = multiprocessing.cpu_count()

# PV-DBOW (dm=0)
# NOTE(review): newer gensim releases rename `size`/`iter` to
# `vector_size`/`epochs` — confirm against the installed version before upgrading.
model = gensim.models.doc2vec.Doc2Vec(
    dm=0,
    size=200,
    min_count=2,
    iter=55,
    workers=cores,
)
# Scan the tagged corpus to build the vocabulary before training.
model.build_vocab(documents.to_array())
print(model)

2018-06-24 21:56:29,485 - doc2vec - INFO - collecting all words and their counts
2018-06-24 21:56:29,487 - doc2vec - INFO - PROGRESS: at example #0, processed 0 words (0/s), 0 word types, 0 tags
2018-06-24 21:56:29,677 - doc2vec - INFO - PROGRESS: at example #10000, processed 763381 words (4022511/s), 40913 word types, 10000 tags
2018-06-24 21:56:29,934 - doc2vec - INFO - PROGRESS: at example #20000, processed 1897617 words (4433218/s), 66173 word types, 20000 tags
2018-06-24 21:56:30,160 - doc2vec - INFO - PROGRESS: at example #30000, processed 2877825 words (4346666/s), 83627 word types, 30000 tags
2018-06-24 21:56:30,445 - doc2vec - INFO - PROGRESS: at example #40000, processed 4139545 words (4438121/s), 98564 word types, 40000 tags
2018-06-24 21:56:30,589 - doc2vec - INFO - collected 106182 word types and 46873 unique tags from a corpus of 46873 examples and 4818734 words
2018-06-24 21:56:30,590 - word2vec - INFO - Loading a fresh vocabulary
2018-06-24 21:56:30,716 - word2vec - INF

Doc2Vec(dbow,d200,n5,mc2,s0.001,t4)


Now we train Doc2Vec on the dataset corpus.

In [698]:
# Train on the full tagged corpus for the number of epochs stored on the model
# (model.iter); %time reports the CPU and wall time of the call.
%time model.train(documents.to_array(), total_examples=model.corpus_count, epochs=model.iter)


2018-06-24 21:56:34,652 - word2vec - INFO - training model with 4 workers on 64789 vocabulary and 200 features, using sg=1 hs=0 sample=0.001 negative=5 window=5
2018-06-24 21:56:35,658 - word2vec - INFO - PROGRESS: at 0.43% examples, 856555 words/s, in_qsize 7, out_qsize 0
2018-06-24 21:56:36,662 - word2vec - INFO - PROGRESS: at 0.75% examples, 897319 words/s, in_qsize 7, out_qsize 0
2018-06-24 21:56:37,662 - word2vec - INFO - PROGRESS: at 1.24% examples, 979367 words/s, in_qsize 7, out_qsize 0
2018-06-24 21:56:38,662 - word2vec - INFO - PROGRESS: at 1.62% examples, 1059579 words/s, in_qsize 7, out_qsize 0
2018-06-24 21:56:39,667 - word2vec - INFO - PROGRESS: at 2.17% examples, 1072562 words/s, in_qsize 7, out_qsize 0
2018-06-24 21:56:40,671 - word2vec - INFO - PROGRESS: at 2.63% examples, 1101008 words/s, in_qsize 7, out_qsize 0
2018-06-24 21:56:41,679 - word2vec - INFO - PROGRESS: at 3.10% examples, 1116433 words/s, in_qsize 7, out_qsize 0
2018-06-24 21:56:42,690 - word2vec - INFO - 

2018-06-24 21:57:47,085 - word2vec - INFO - PROGRESS: at 32.66% examples, 1160569 words/s, in_qsize 7, out_qsize 0
2018-06-24 21:57:48,093 - word2vec - INFO - PROGRESS: at 33.21% examples, 1159791 words/s, in_qsize 7, out_qsize 0
2018-06-24 21:57:49,094 - word2vec - INFO - PROGRESS: at 33.62% examples, 1160466 words/s, in_qsize 7, out_qsize 0
2018-06-24 21:57:50,107 - word2vec - INFO - PROGRESS: at 34.08% examples, 1160698 words/s, in_qsize 7, out_qsize 0
2018-06-24 21:57:51,109 - word2vec - INFO - PROGRESS: at 34.50% examples, 1161540 words/s, in_qsize 7, out_qsize 0
2018-06-24 21:57:52,118 - word2vec - INFO - PROGRESS: at 35.06% examples, 1160668 words/s, in_qsize 7, out_qsize 0
2018-06-24 21:57:53,125 - word2vec - INFO - PROGRESS: at 35.48% examples, 1161702 words/s, in_qsize 7, out_qsize 0
2018-06-24 21:57:54,131 - word2vec - INFO - PROGRESS: at 35.93% examples, 1161964 words/s, in_qsize 7, out_qsize 0
2018-06-24 21:57:55,138 - word2vec - INFO - PROGRESS: at 36.33% examples, 116174

2018-06-24 21:58:59,480 - word2vec - INFO - PROGRESS: at 66.02% examples, 1170613 words/s, in_qsize 7, out_qsize 0
2018-06-24 21:59:00,497 - word2vec - INFO - PROGRESS: at 66.45% examples, 1170955 words/s, in_qsize 7, out_qsize 0
2018-06-24 21:59:01,499 - word2vec - INFO - PROGRESS: at 66.89% examples, 1171539 words/s, in_qsize 7, out_qsize 0
2018-06-24 21:59:02,506 - word2vec - INFO - PROGRESS: at 67.36% examples, 1171417 words/s, in_qsize 8, out_qsize 0
2018-06-24 21:59:03,507 - word2vec - INFO - PROGRESS: at 67.88% examples, 1171076 words/s, in_qsize 8, out_qsize 0
2018-06-24 21:59:04,516 - word2vec - INFO - PROGRESS: at 68.32% examples, 1171437 words/s, in_qsize 7, out_qsize 0
2018-06-24 21:59:05,520 - word2vec - INFO - PROGRESS: at 68.74% examples, 1172126 words/s, in_qsize 7, out_qsize 0
2018-06-24 21:59:06,534 - word2vec - INFO - PROGRESS: at 69.23% examples, 1171879 words/s, in_qsize 6, out_qsize 1
2018-06-24 21:59:07,540 - word2vec - INFO - PROGRESS: at 69.74% examples, 117183

2018-06-24 22:00:11,958 - word2vec - INFO - PROGRESS: at 99.48% examples, 1177144 words/s, in_qsize 7, out_qsize 0
2018-06-24 22:00:12,964 - word2vec - INFO - PROGRESS: at 99.88% examples, 1177199 words/s, in_qsize 7, out_qsize 0
2018-06-24 22:00:13,162 - word2vec - INFO - worker thread finished; awaiting finish of 3 more threads
2018-06-24 22:00:13,166 - word2vec - INFO - worker thread finished; awaiting finish of 2 more threads
2018-06-24 22:00:13,181 - word2vec - INFO - worker thread finished; awaiting finish of 1 more threads
2018-06-24 22:00:13,185 - word2vec - INFO - worker thread finished; awaiting finish of 0 more threads
2018-06-24 22:00:13,186 - word2vec - INFO - training on 265030370 raw words (257226188 effective words) took 218.5s, 1177116 effective words/s


CPU times: user 9min, sys: 19.4 s, total: 9min 20s
Wall time: 3min 39s


257226188

### Similarity interface

We'll pick a company at random and compare it, by eye, with the companies the model ranks as most similar.

In [801]:
# Pick a random row; the upper bound follows the frame's length instead of a
# hard-coded row count, so the cell keeps working if the dataset changes size.
r = randint(0, len(company_pd) - 1)
query_row = company_pd.iloc[r]
c_ID = query_row.comp_ID

# Ask the model for the document tags closest to the chosen company's tag.
sims = model.docvecs.most_similar(positive=[c_ID], topn=6)
print('Company ({}): «{}»\n'.format(query_row.comp_op_name, query_row.comp_prof))
print(u'SIMILAR BUSINESSES PER MODEL %s:\n' % model)
for sim_ID, score in sims:
    # Filter the company table once per match and reuse it for both fields.
    match = company_pd[company_pd.comp_ID == sim_ID]
    print(u'%s: «%s»\n' % ((match.comp_op_name.max(), score), match.comp_prof.max()))

Company (Abbotsford Security Services): « Abbotsford Security Services provides professionally-trained, uniformed security officers for financials institutions, corporate headquarters and office buildings, residential towers and developments, shopping centers, healthcare facilities, public and private events, and personal VIP Protection.»

SIMILAR BUSINESSES PER MODEL Doc2Vec(dbow,d200,n5,mc2,s0.001,t4):

('365 Patrol Ltd.', 0.5528758764266968): « 365 Patrol Ltd. provides Calgary security guards and security services to businesses, residences, and industry. We do construction site security, oilfield security services, private security, event security, and more.»

('NPSS', 0.5408617258071899): « We provide uniformed security officer services - Port facility Security, Event Security, construction security, etc. Loss Prevention officers, Private Investigation, document service, vehicle patrol»

('Groupe Conseil VCS', 0.5369068384170532): « doormans for bars.»

('ISS Security Ltd', 0.53510

#### Save Model

In [789]:
# Write the trained model to disk, then drop temporary training data from the
# in-memory model while keeping the doctag vectors and inference support.
# NOTE(review): the trim runs after save(), so the file on disk presumably
# still holds the untrimmed model — confirm this order is intended.
# NOTE(review): the logged output of this cell mentions
# 'models/pv_dbow_final.bz2', which does not match the path below; the output
# looks stale.
model.save('models/pv_dbow_final')
model.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True)

2018-06-27 00:00:37,893 - utils - INFO - saving Doc2Vec object under models/pv_dbow_final.bz2, separately None
2018-06-27 00:00:37,910 - utils - INFO - storing np array 'syn0' to models/pv_dbow_final.bz2.wv.syn0.npz
2018-06-27 00:00:40,366 - utils - INFO - not storing attribute syn0norm
2018-06-27 00:00:40,368 - utils - INFO - storing np array 'syn1neg' to models/pv_dbow_final.bz2.syn1neg.npz
2018-06-27 00:00:42,688 - utils - INFO - not storing attribute cum_table
2018-06-27 00:00:54,832 - utils - INFO - saved models/pv_dbow_final.bz2
