In [12]:
import pandas as pd
import numpy as np
from sklearn.svm import LinearSVC, SVC
from sklearn import linear_model, tree, neighbors
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import f1_score, accuracy_score
from sklearn import preprocessing
from nltk import pos_tag
import time
from model import*
from analyze_words import get_df_idf_stops
import joblib

In [2]:
from analyze_words import *

In [3]:
def applyModels(model, x_train, y_train):
    print(model)
    model.fit(x_train, y_train)
    return model


def predictModel(model, x_test):
    prediction = model.predict(x_test)
    return prediction

In [17]:
%%time
df, idf, stop_words = get_df_idf_stops('test_data/merged_data.csv', n=1, lemmatized=False,num_stop_words=0)

CPU times: user 34.6 s, sys: 2.79 s, total: 37.4 s
Wall time: 38.5 s


In [5]:
def transformFeatureSelection(model, x):
    return model.transform(x)

def applyFeatureSelection(model, x_train, y_train):
    model = model.fit(x_train, y_train)
    return model

In [18]:

comb = {"ngram": 1, "lemmatize": False, "stop_word": 0}

(x_train, x_test, y_train, y_test) = train_test_split(df.drop("Rating", axis=1),
                                         df.Rating,
                                         test_size=0.95,
                                         random_state=33)

# Feature Selection
model = linear_model.SGDClassifier(alpha=0.1)
trained_model = applyModels(model, x_train, y_train)
feature_selection_model = SelectFromModel(trained_model)
trained_feature_selection_model = applyFeatureSelection(feature_selection_model,
                                                        x_train, y_train)
x_train = transformFeatureSelection(trained_feature_selection_model,
                                    x_train)
x_test = transformFeatureSelection(trained_feature_selection_model,
                                   x_test)

final_model = applyModels(model, x_train, y_train)
prediction = predictModel(final_model, x_test)

print("Final Model Classification Report")
print(classification_report(prediction, y_test))
print("Accuracy Score")
print(evaluateModel(prediction, y_test))

# Save best Model
joblib.dump(final_model, "optimal_args/final_model.pkl")

# Save best columns, idf, combination, and stop words
feature_idx = trained_feature_selection_model.get_support()
column_names = df.drop("Rating", axis=1).columns[feature_idx]
with open('optimal_args/columns.json', 'w') as f:
    json.dump(list(column_names), f)
with open('optimal_args/idf.json', 'w') as f:
    json.dump(idf, f)
with open('optimal_args/combination.json', 'w') as f:
    json.dump(comb, f)
with open('optimal_args/num_stop_words.json', 'w') as f:
    json.dump(stop_words, f)

Final Model Classification Report
              precision    recall  f1-score   support

           1       1.00      0.95      0.97      1973
           2       0.96      0.99      0.98      1855
           3       0.88      0.94      0.91      1786
           4       0.78      0.74      0.76      1981
           5       0.77      0.77      0.77      1905

    accuracy                           0.88      9500
   macro avg       0.88      0.88      0.88      9500
weighted avg       0.88      0.88      0.87      9500

Accuracy Score
0.9601052631578948


In [50]:
x_train, x_test, y_train, y_test = train_test_split(df.drop("Rating", axis=1),df.Rating,test_size=0.9)
model = linear_model.SGDClassifier()

In [51]:
trained_model = applyModels(model, x_train, y_train)

SGDClassifier()


In [52]:
feature_selection_model = SelectFromModel(trained_model)
trained_feature_selection_model = applyFeatureSelection(feature_selection_model,
                                                        x_train, y_train)
x_train = transformFeatureSelection(trained_feature_selection_model,
                                    x_train)
x_test = transformFeatureSelection(trained_feature_selection_model,
                                   x_test)

In [53]:
x_train

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 1.36605709, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [54]:
feature_idx = trained_feature_selection_model.get_support()

In [55]:
feature_name = df.drop("Rating", axis=1).columns[feature_idx]

In [56]:
final_model = applyModels(model, x_train, y_train)

SGDClassifier()


In [57]:
prediction = predictModel(final_model, x_test)

In [85]:
np.mean(prediction == y_test)


0.9264444444444444

In [93]:
pred_test_df = pd.DataFrame({'predict': prediction,
                             'actual': y_test}).astype('int')
pred_test_df['difference'] = (pred_test_df.predict -
                              pred_test_df.actual).abs()
pred_test_df['weighted_difference'] = pred_test_df['difference'] * \
    pred_test_df['difference']

num_tests = len(pred_test_df.index)
total_deviance = pred_test_df['weighted_difference'].sum()

# maximum deviance is 4 (5 star rating vs 1 star rating)
weighted_accuracy = 1 - (total_deviance / (4 * 4 * num_tests))
weighted_accuracy


0.9861041666666667

In [98]:

def evaluateModel(prediction, y_test):
    '''
    Calculate the accuracy of model based on the proportion of accurate 
    predictions using the testing data. Calculation is weighted by the degree of
    deviance from 'actual rating'.
    '''
    # convert into pandas dataframe for easier handling
    pred_test_df = pd.DataFrame({'predict': prediction,
                                 'actual': y_test}).astype('int')
    pred_test_df['difference'] = (pred_test_df.predict -
                                  pred_test_df.actual).abs()

    num_tests = len(pred_test_df.index)
    total_deviance = pred_test_df['difference'].sum()

    # maximum deviance is 4 (5 star rating vs 1 star rating)
    weighted_accuracy = 1 - (total_deviance / (4 * num_tests))

    return weighted_accuracy

evaluateModel(prediction, y_test)

0.9724166666666667

In [99]:
ngrams = [1, 2, 3, 4, 5]
lemmatizes = [True, False]
stop_words = [0, 10, 20]
alphas = [0.0001, 0.001, 0.01, 0.1, 1]


import itertools
list(itertools.product(ngrams,lemmatizes, stop_words,alphas))

[(1, True, 0, 0.0001),
 (1, True, 0, 0.001),
 (1, True, 0, 0.01),
 (1, True, 0, 0.1),
 (1, True, 0, 1),
 (1, True, 10, 0.0001),
 (1, True, 10, 0.001),
 (1, True, 10, 0.01),
 (1, True, 10, 0.1),
 (1, True, 10, 1),
 (1, True, 20, 0.0001),
 (1, True, 20, 0.001),
 (1, True, 20, 0.01),
 (1, True, 20, 0.1),
 (1, True, 20, 1),
 (1, False, 0, 0.0001),
 (1, False, 0, 0.001),
 (1, False, 0, 0.01),
 (1, False, 0, 0.1),
 (1, False, 0, 1),
 (1, False, 10, 0.0001),
 (1, False, 10, 0.001),
 (1, False, 10, 0.01),
 (1, False, 10, 0.1),
 (1, False, 10, 1),
 (1, False, 20, 0.0001),
 (1, False, 20, 0.001),
 (1, False, 20, 0.01),
 (1, False, 20, 0.1),
 (1, False, 20, 1),
 (2, True, 0, 0.0001),
 (2, True, 0, 0.001),
 (2, True, 0, 0.01),
 (2, True, 0, 0.1),
 (2, True, 0, 1),
 (2, True, 10, 0.0001),
 (2, True, 10, 0.001),
 (2, True, 10, 0.01),
 (2, True, 10, 0.1),
 (2, True, 10, 1),
 (2, True, 20, 0.0001),
 (2, True, 20, 0.001),
 (2, True, 20, 0.01),
 (2, True, 20, 0.1),
 (2, True, 20, 1),
 (2, False, 0, 0.00

In [16]:
import json

In [17]:
comb = {"ngram": 2, "lemmatize": True, "stop_word": 20}

In [18]:
with open('columns.json', 'w') as f:
        json.dump(list(feature_name), f)
with open('idf.json', 'w') as f:
    json.dump(idf, f)
with open('combination.json', 'w') as f:
    json.dump(comb, f)
with open('stop_words.json', 'w') as f:
    json.dump(stop_words, f)

In [19]:
from main import process_input

In [61]:
%%time
x_array = process_input("""Due to COVID we decided to enjoy an at home Valentine's Day dinner via a meal kit. Sadly it was a mistake. It was not a meal kit for 4 as described online and in their instructions, but rather 2.

Total Score: 2.4/5
Ease of Pickup: 2/5 Naperville-They sat in their truck playing on their phones waiting to set up and made us wait 20 minutes. Finally someone demanded theirs so we got ours too.
Quality of ingredients: 5/5
Quality of prepared food: 3/5 - it was akin to take and bake pizza.
Accuracy of kit: 1/5 - Only received pig face for two with a total weight of 9 ounces and  two chocolate chip cookies. The empanadas and green beans were as described.
Value: 1/5

I contacted the restaurant. They told me the ingredients I received were correct except I should have received 4 cookies. They also said it had changed. When? Before or after The Girl and The Goat printed the instructions, advertised the meal kit, or I paid $133 for a 4 person dinner. To their credit they did refund my money. Such a disappointing experience from one of my former favorite restaurants""")

CPU times: user 46.8 ms, sys: 4.56 ms, total: 51.3 ms
Wall time: 51.9 ms


In [62]:
feature_name[x_array.nonzero()]

Index(['got', 'so', 'or', 'after', 'two', 'have', 'received', 'refund',
       'waiting', 'an',
       ...
       'phones', 'their phones', 'covid decided', 'credit', 'face', 'accuracy',
       'so got', 'theirs', 'prepared food', 'score'],
      dtype='object', length=106)

In [63]:
len(x_array)

26360

In [64]:
x_test.shape

(2000, 26360)

In [65]:
prediction = predictModel(final_model, [x_array])

In [66]:
prediction

array([1])

In [42]:
def forward_selection(df):
    '''
    Given a dataset with P predictor variables, uses forward selection to
    select models for every value of K between 1 and P.

    Inputs:
        dataset: (DataSet object) a dataset

    Returns:
        A list (of length P) of Model objects. The first element is the
        model where K=1, the second element is the model where K=2, and so on.
    '''

    models = []
    pred_vars_avail = dataset.pred_vars[:]
    pred_vars_used = []
    num_vars = len(dataset.pred_vars)

    for _ in range(num_vars):
        max_R2 = 0
        max_model = None
        max_variable = None
        for var in pred_vars_avail:
            model = Model(dataset, pred_vars_used + [var])
            if model.R2 > max_R2:
                max_model = model
                max_R2 = model.R2
                max_variable = var
        pred_vars_used.append(max_variable)
        pred_vars_avail.remove(max_variable)
        models.append(max_model)

    return models