In [1]:
import json
import pickle
from collections import defaultdict

import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.base import clone
from tqdm import tqdm_notebook

In [2]:
# Load the pre-computed experiments into T.
# NOTE: pickle.load can execute arbitrary code, so only open files you trust.
with open('../data/yelp_training.pkl', mode='rb') as pkl_file:
    T = pickle.load(pkl_file)

In [3]:
# Show the key of the first entry in T together with the type of each element
# of its value; the slice keeps this to a single entry (or none if T is empty).
for name, parts in list(T.items())[:1]:
    print(name, [type(part) for part in parts])

raw_text_tfidf [<class 'numpy.ndarray'>, <class 'list'>, <class 'sklearn.feature_extraction.text.CountVectorizer'>, <class 'sklearn.feature_extraction.text.TfidfTransformer'>]


In [4]:
# Tab-separated reviews table; its rows are looked up by index label when the
# targets are built further down.
D = pd.read_csv(
    '../data/yelp_example_1_small.tsv',
    sep='\t',
)

In [5]:
# Preview the first two rows of the reviews table.
D.iloc[:2]

Unnamed: 0,content,score,business,avgstars
0,This place is WAAAY over priced for the generi...,1,Lee's Buffet,2.0
1,Our taxi driver had told us to go to this plac...,5,Village Pub and Cafe,3.5


### Create training and test sets

In [6]:
from sklearn.model_selection import train_test_split

In [7]:
# Build one train/test split per entry of T.
# v[0] is converted to a sparse feature matrix; v[1] holds the index labels of
# the rows of D whose `score` is used as the target (presumably one label per
# matrix row, in the same order -- train_test_split requires equal lengths).
training = {}
data = tqdm_notebook(T.items())
for k, v in data:
    X = csr_matrix(v[0])
    # Fetch every score with a single label-based lookup instead of
    # materialising one row Series per document via D.loc[i].score.
    row_labels = [int(x) for x in v[1]]
    y = D.loc[row_labels, 'score'].tolist()
    # A fixed random_state keeps the 80/20 split reproducible across runs.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    training[k] = (X_train, X_test, y_train, y_test)

HBox(children=(IntProgress(value=0, max=18), HTML(value='')))




## Training classification models

In [8]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

In [9]:
# Baseline estimators, keyed by the short name under which the fitted models
# are stored.
classifiers = {
    'DTC': DecisionTreeClassifier(),
    'KNN': KNeighborsClassifier(),
}

In [10]:
# Fit every baseline classifier on every training split.
# Layout: trained[experiment_name][classifier_name] -> fitted estimator.
trained = defaultdict(dict)
experiments = tqdm_notebook(training.items())
for k, (x_train, x_test, y_train, y_test) in experiments:
    for cl, model in classifiers.items():
        # clone() returns a fresh, unfitted estimator that keeps the
        # hyperparameters configured on the prototype in `classifiers`;
        # re-instantiating via model.__class__() would silently reset them
        # to the library defaults.
        m = clone(model)
        m.fit(x_train, y_train)
        trained[k][cl] = m

HBox(children=(IntProgress(value=0, max=18), HTML(value='')))




### DNN

In [11]:
from sklearn.preprocessing import OneHotEncoder
from keras.utils import np_utils
from keras.models import Sequential
from keras.layers import Dense

Using TensorFlow backend.


In [12]:
def get_dnn(x_train, hidden_units=100, n_classes=5):
    """Build and compile a one-hidden-layer softmax classifier.

    Args:
        x_train: Training feature matrix. Only ``x_train.shape[1]`` is read,
            to size the input of the first layer.
        hidden_units: Number of units in the ReLU hidden layer (default 100).
        n_classes: Number of units in the softmax output layer (default 5).
            It must match the width of the one-hot targets passed to ``fit``.

    Returns:
        A compiled, not yet fitted, ``Sequential`` model using categorical
        cross-entropy loss, the Adam optimizer and the accuracy metric.
    """
    model = Sequential()
    model.add(Dense(hidden_units, input_dim=x_train.shape[1], activation='relu'))
    model.add(Dense(n_classes, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [13]:
# Train one network per training split and store it next to the baseline
# classifiers under the 'DNN' key.
dnn_runs = tqdm_notebook(training.items())
for split_name, (x_tr, x_te, y_tr, y_te) in dnn_runs:
    # The encoder expects a 2-D input, so the labels are reshaped into a
    # single column before being one-hot encoded.
    label_column = np.array(y_tr).reshape(-1, 1)
    targets = OneHotEncoder().fit_transform(label_column)
    net = get_dnn(x_tr)
    net.fit(x_tr, targets, batch_size=50, epochs=6, verbose=0)
    trained[split_name]['DNN'] = net

HBox(children=(IntProgress(value=0, max=18), HTML(value='')))




## Save

In [14]:
# Persist the train/test splits (the `training` dict) to disk.
with open('../data/yelp_classification_training.pkl', mode='wb') as pkl_file:
    pickle.dump(training, pkl_file)

In [15]:
# Build the structure that gets pickled: same layout as `trained`, except that
# each 'DNN' entry is replaced by the model's JSON description, with its
# weights written to a separate .h5 file named after the experiment.
to_save = {}
for experiment, models in trained.items():
    snapshot = {}
    for model_name, model in models.items():
        if model_name != 'DNN':
            # Every other model is stored in the snapshot as the object itself.
            snapshot[model_name] = model
            continue
        architecture = model.to_json()
        model.save_weights("../data/{}_{}.h5".format(experiment, model_name))
        snapshot['DNN'] = architecture
    to_save[experiment] = snapshot

In [16]:
# Persist the serialisable snapshot of all trained models (`to_save`) to disk.
with open('../data/yelp_classification_experiments.pkl', mode='wb') as pkl_file:
    pickle.dump(to_save, pkl_file)

## Exercise: repeat the same process for regression