In [1]:
import pickle
import flask
import json
import numpy as np
import pandas as pd
from numpy import linalg as LA
%matplotlib inline
import matplotlib.pyplot as plt

from sklearn import metrics

In [2]:
train_df_bow = pd.read_json('../data/train/trainText-75-25-bow.json', lines=True)
test_df_bow = pd.read_json('../data/test/testText-75-25-bow.json', lines=True)

In [3]:
X_test_bow = list(test_df_bow['recipe'])
y_test_bow = list(test_df_bow['cuisine'])

In [4]:
#loading
j_sim_matrix = np.load('j_matrix.npy')
gj_sim_matrix = np.load('gj_matrix.npy')
cos_sim_matrix = np.load('cos_matrix.npy')
comb_sim_matrix = np.load('comb_matrix.npy')

with open('cuis_invidx.json') as json_file:  
    cuis_invidx = json.load(json_file)
    
n_cuisines = len(cuis_invidx)
cuisine_types = sorted(list(cuis_invidx.keys()))

In [5]:
# EVALUATION
def eval(proba, test, sim_matrix, sim_cutoff):
    eval_scores = []
    eval_list = np.zeros(n_cuisines)
    counter_list = np.zeros(n_cuisines)
    for r in range(len(test)):
        score = 0
        actual = test[r]
        for c in range(n_cuisines):
            predicted = cuisine_types[c]
            similarity = sim_matrix[cuis_invidx[predicted], cuis_invidx[actual]]
            if similarity == 0.0:
                similarity = 1.0
            if similarity >= sim_cutoff:
                score += similarity*(proba[r,c])
        eval_list[cuis_invidx[actual]] += score
        counter_list[cuis_invidx[actual]] += 1
        eval_scores.append(score)
    for i in range(len(eval_list)):
        accuracy_list = np.divide(eval_list, counter_list)
    accuracy = sum(eval_scores) / len(eval_scores)
    return accuracy_list,accuracy

### Logistic Regression v5
##### Text Preprocessing
Remove nums and stopwords
##### CountVectorizer
n-gram = (1,4), min_df = 0.003, 1819 total features

In [6]:
avg_sim_j = j_sim_matrix.sum()/(n_cuisines**2-n_cuisines)
avg_sim_gj = gj_sim_matrix.sum()/(n_cuisines**2-n_cuisines)
avg_sim_cos = cos_sim_matrix.sum()/(n_cuisines**2-n_cuisines)
avg_sim_comb = comb_sim_matrix.sum()/(n_cuisines**2-n_cuisines)
print("Average Similarity (Jaccard): " + str(avg_sim_j))
print("Average Similarity (Gen. Jaccard): " + str(avg_sim_gj))
print("Average Similarity (Cosine Sim): " + str(avg_sim_cos))
print("Average Similarity (Combined): " + str(avg_sim_comb))
  
def evaluation(proba):
    print('\t\tJaccard\t\tGen. Jaccard\tCosine Sim\tCombined Sim')
    print('------------------------------------------------------------------------------')
    for c in cuisine_types:
        j = str(round(eval(proba, y_test_bow, j_sim_matrix, avg_sim_j)[0][cuis_invidx[c]]*100,2)) + "%"
        gj = str(round(eval(proba, y_test_bow, gj_sim_matrix, avg_sim_gj)[0][cuis_invidx[c]]*100,2)) + "%"
        cos = str(round(eval(proba, y_test_bow, cos_sim_matrix, avg_sim_cos)[0][cuis_invidx[c]]*100,2)) + "%"
        comb = str(round(eval(proba, y_test_bow, comb_sim_matrix, avg_sim_comb)[0][cuis_invidx[c]]*100,2)) + "%"
        if len(c) < 7:
            print(c + ':\t\t' + j + '\t\t' + gj + '\t\t' + cos + '\t\t' + comb)
        else:
            print(c + ':\t' + j + '\t\t' + gj + '\t\t' + cos + '\t\t' + comb)
    
    print('cajun_creole:\t' + '23.58%' + '\t\t' + '23.58%' + '\t\t' + '23.58%' + '\t\t' + '23.58%')
    print('------------------------------------------------------------------------------')
    j = str(round(eval(proba, y_test_bow, j_sim_matrix, avg_sim_j)[1]*100,2)) + "%"
    gj = str(round(eval(proba, y_test_bow, gj_sim_matrix, avg_sim_gj)[1]*100,2)) + "%"
    cos = str(round(eval(proba, y_test_bow, cos_sim_matrix, avg_sim_cos)[1]*100,2)) + "%"
    comb = str(round(eval(proba, y_test_bow, comb_sim_matrix, avg_sim_comb)[1]*100,2)) + "%"
    print('Total' + ':\t\t' + j + '\t\t' + gj + '\t\t' + cos + '\t\t' + comb)

Average Similarity (Jaccard): 0.4063027075009613
Average Similarity (Gen. Jaccard): 0.2884964682760036
Average Similarity (Cosine Sim): 0.630361806153726
Average Similarity (Combined): 0.44172032731023025


In [7]:
LogReg_v5 = pickle.load(open('../models/Log_Reg_v1/LogReg_v5.sav', 'rb'))
y_proba = LogReg_v5.predict_proba(X_test_bow)
evaluation(y_proba)

		Jaccard		Gen. Jaccard	Cosine Sim	Combined Sim
------------------------------------------------------------------------------
brazilian:	37.08%		54.12%		78.46%		61.39%
british:	48.91%		42.97%		72.13%		55.46%
cajun_creole:	77.0%		69.72%		85.64%		77.33%
chinese:	93.69%		91.14%		93.66%		92.29%
filipino:	46.41%		44.73%		78.15%		52.07%
french:		76.52%		63.77%		81.62%		73.47%
greek:		85.14%		82.22%		88.77%		85.12%
indian:		95.25%		91.44%		92.02%		93.29%
irish:		49.22%		53.93%		75.11%		60.3%
italian:	96.18%		94.2%		95.95%		95.16%
jamaican:	53.01%		61.27%		79.47%		66.4%
japanese:	79.28%		77.2%		80.48%		78.93%
korean:		71.03%		70.7%		84.24%		75.32%
mexican:	95.69%		92.36%		94.16%		93.83%
moroccan:	65.7%		66.44%		81.84%		72.74%
russian:	33.41%		44.03%		67.62%		52.11%
southern_us:	77.62%		68.49%		79.05%		75.8%
spanish:	79.69%		76.57%		88.81%		81.71%
thai:		82.09%		74.97%		80.29%		78.76%
vietnamese:	65.23%		62.13%		80.0%		68.92%
cajun_creole:	23.58%		23.58%		23.58%		23.58%
-----------------------

In [9]:
y_proba = []
vc = [350,603,1159,9881,566,6614,4629,6354,500,12929,394,4670,622,10798,615,366,3240,3752,5115,618]
vc = [x/73775 for x in vc]
for i in range(len(X_test_bow)):
    y_proba.append(vc)
evaluation(y_proba)

		Jaccard		Gen. Jaccard	Cosine Sim	Combined Sim
------------------------------------------------------------------------------


TypeError: list indices must be integers or slices, not tuple