## Pipeline 3: Train

### Load Packages

In [1]:
from os import chdir
chdir('../') # make sure you only run this once

In [2]:
import pandas as pd
import lib.database_module as db
import lib.encoding_module as enc
import lib.wiki_module as wiki
from lib.encoding_module import get_searchterm_vector, get_page_vector

### Mine: Select document vectors for all pages from database

In [3]:
db.connect_to_postgres()

Connected to server joshuacook.me.


(<connection object at 0x105566050; dsn: 'user=alex password=xxxxx people captain cell' host=joshuacook.me port=5432 dbname=project_6', closed: 0>,
 <cursor object at 0x11605c240; closed: 0>)

In [4]:
page_vectors = db.select_all_page_vectors()

Connected to server joshuacook.me.


In [5]:
indices = [tup[0] for tup in page_vectors]
vectors = [tup[1] for tup in page_vectors]

In [6]:
page_vectors_df = pd.DataFrame(vectors, index=indices)
page_vectors_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,490,491,492,493,494,495,496,497,498,499
4341789,0.092992,-0.040985,0.006952,0.000243,0.001681,0.004999,0.00343,0.003523,-0.005129,-0.001911,...,-0.021003,-0.039528,0.027716,0.021033,-0.034768,-0.022294,0.031537,-0.00971,-0.010331,0.025944
48201744,0.021301,0.027559,-0.003917,-0.010877,0.013055,0.000934,0.000128,0.010317,0.001444,-0.009792,...,-0.05504,0.00011,0.057817,0.005156,0.058477,0.008807,0.043527,-0.011693,0.064738,0.051296
2514975,0.124983,0.153025,0.026264,-0.257191,-0.210183,0.074847,0.003862,-0.009666,-0.065705,-0.280972,...,-0.022703,0.015894,-0.002476,0.004992,-0.006653,-0.017394,-0.008525,0.017188,-0.001596,0.00012
35135520,0.191357,-0.039003,-0.006018,0.006389,-0.001106,-0.004748,0.015829,0.005612,0.011846,0.028248,...,0.000578,-0.004204,0.030916,0.010484,-0.012454,0.02605,0.038744,0.015332,-0.008053,0.009506
27303975,0.034292,0.039499,-0.006095,-0.015156,0.026112,-0.039749,-0.08144,-0.036546,0.005076,0.018881,...,0.030056,-0.026965,-0.002426,0.042282,0.024082,0.004498,-0.009517,-0.005315,-0.003784,-0.010202


### Mine: Select category ids corresponding to pages from database

In [7]:
page_vectors_df.index

Int64Index([ 4341789, 48201744,  2514975, 35135520, 27303975,    24626,
            11313214, 34709570,   548937, 23607991,
            ...
              556970, 41410502, 37019597, 16105186, 37019370, 10378988,
            16105212,   146048,   638898, 37019616],
           dtype='int64', length=2270)

In [8]:
category_id_list = []
for index in page_vectors_df.index:
    category_id = db.select_categories_for_page(index)
    category_id_list.append(category_id[0][1])

Connected to server joshuacook.me.
Connected to server joshuacook.me.
Connected to server joshuacook.me.
Connected to server joshuacook.me.
Connected to server joshuacook.me.
Connected to server joshuacook.me.
Connected to server joshuacook.me.
Connected to server joshuacook.me.
Connected to server joshuacook.me.
Connected to server joshuacook.me.
Connected to server joshuacook.me.
Connected to server joshuacook.me.
Connected to server joshuacook.me.
Connected to server joshuacook.me.
Connected to server joshuacook.me.
Connected to server joshuacook.me.
Connected to server joshuacook.me.
Connected to server joshuacook.me.
Connected to server joshuacook.me.
Connected to server joshuacook.me.
Connected to server joshuacook.me.
Connected to server joshuacook.me.
Connected to server joshuacook.me.
Connected to server joshuacook.me.
Connected to server joshuacook.me.
Connected to server joshuacook.me.
Connected to server joshuacook.me.
Connected to server joshuacook.me.
Connected to server 

In [9]:
target_vector_df = pd.Series(category_id_list)
target_vector_df.head()

0      695196
1      716903
2     1489690
3      695196
4    34955640
dtype: int64

In [10]:
feature_matrix_df = page_vectors_df

In [11]:
target_vector_df.to_pickle('target_vector.pickle')
feature_matrix_df.to_pickle('feature_matrix.pickle')

### Refine: Create a data dictionary with training and testing sets

In [12]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [13]:
X = pd.read_pickle("feature_matrix.pickle")
y = pd.read_pickle("target_vector.pickle")

In [14]:
lab_enc_y = LabelEncoder()
y = lab_enc_y.fit_transform(y)

In [23]:
joblib.dump(lab_enc_y, "encoded_ys.pickle")

['encoded_ys.pickle']

In [15]:
X.shape, y.shape

((2270, 500), (2270,))

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=42)

data_dictionary = {
    'X'       : X,
    'y'       : y,
    'X_train' : X_train,
    'X_test'  : X_test,
    'y_train' : y_train,
    'y_test'  : y_test,
}

### Model: Fit, Score, and Tune a multi-label classification Model

In [17]:
from sklearn.naive_bayes import GaussianNB

this_naive_classifier = GaussianNB()

this_naive_classifier.fit(data_dictionary['X_train'], 
                          data_dictionary['y_train'])

train_score = this_naive_classifier.score(data_dictionary['X_train'],
                                          data_dictionary['y_train'])

test_score = this_naive_classifier.score(data_dictionary['X_test'],
                                         data_dictionary['y_test'])

print('train: {} test: {}'.format(train_score, test_score))

train: 0.924794359577 test: 0.871478873239


### Present: Pickle tuned model for later use

In [18]:
import pickle

In [19]:
from sklearn.externals import joblib

In [20]:
joblib.dump(this_naive_classifier, 'my_pickled_model.pickle')

['my_pickled_model.pickle']

### Reload Pickled Model

In [21]:
from_pkl_cls = joblib.load("my_pickled_model.pickle")

In [22]:
from_pkl_cls.score(data_dictionary['X_train'],
                   data_dictionary['y_train'])

0.92479435957696832