In [1]:
import pickle
import flask
import json
import numpy as np
import pandas as pd
from numpy import linalg as LA
%matplotlib inline
import matplotlib.pyplot as plt

from sklearn import metrics

In [2]:
train_df_bow = pd.read_json('../data/train/trainText-75-25-bow.json', lines=True)
test_df_bow = pd.read_json('../data/test/testText-75-25-bow.json', lines=True)

In [3]:
X_test_bow = list(test_df_bow['recipe'])
y_test_bow = list(test_df_bow['cuisine'])

In [4]:
#loading
j_sim_matrix = np.load('j_matrix.npy')
gj_sim_matrix = np.load('gj_matrix.npy')
cos_sim_matrix = np.load('cos_matrix.npy')
comb_sim_matrix = np.load('comb_matrix.npy')

with open('cuis_invidx.json') as json_file:  
    cuis_invidx = json.load(json_file)
    
n_cuisines = len(cuis_invidx)
cuisine_types = sorted(list(cuis_invidx.keys()))

In [5]:
# EVALUATION
def eval(proba, test, sim_matrix, sim_cutoff):
    eval_scores = []
    eval_list = np.zeros(n_cuisines)
    counter_list = np.zeros(n_cuisines)
    for r in range(len(test)):
        score = 0
        actual = test[r]
        for c in range(n_cuisines):
            predicted = cuisine_types[c]
            similarity = sim_matrix[cuis_invidx[predicted], cuis_invidx[actual]]
            if similarity == 0.0:
                similarity = 1.0
            if similarity >= sim_cutoff:
                score += similarity*(proba[r,c])
        eval_list[cuis_invidx[actual]] += score
        counter_list[cuis_invidx[actual]] += 1
        eval_scores.append(score)
    for i in range(len(eval_list)):
        accuracy_list = np.divide(eval_list, counter_list)
    accuracy = sum(eval_scores) / len(eval_scores)
    return accuracy_list,accuracy

### Logistic Regression v5
##### Text Preprocessing
Remove nums and stopwords
##### CountVectorizer
n-gram = (1,4), min_df = 0.003, 1819 total features

In [6]:
avg_sim_j = j_sim_matrix.sum()/(n_cuisines**2-n_cuisines)
avg_sim_gj = gj_sim_matrix.sum()/(n_cuisines**2-n_cuisines)
avg_sim_cos = cos_sim_matrix.sum()/(n_cuisines**2-n_cuisines)
avg_sim_comb = comb_sim_matrix.sum()/(n_cuisines**2-n_cuisines)
print("Average Similarity (Jaccard): " + str(avg_sim_j))
print("Average Similarity (Gen. Jaccard): " + str(avg_sim_gj))
print("Average Similarity (Cosine Sim): " + str(avg_sim_cos))
print("Average Similarity (Combined): " + str(avg_sim_comb))
  
def evaluation(proba):
    print('\t\tJaccard\t\tGen. Jaccard\tCosine Sim\tCombined Sim')
    print('------------------------------------------------------------------------------')
    for c in cuisine_types:
        j = str(round(eval(proba, y_test_bow, j_sim_matrix, avg_sim_j)[0][cuis_invidx[c]]*100,2)) + "%"
        gj = str(round(eval(proba, y_test_bow, gj_sim_matrix, avg_sim_gj)[0][cuis_invidx[c]]*100,2)) + "%"
        cos = str(round(eval(proba, y_test_bow, cos_sim_matrix, avg_sim_cos)[0][cuis_invidx[c]]*100,2)) + "%"
        comb = str(round(eval(proba, y_test_bow, comb_sim_matrix, avg_sim_comb)[0][cuis_invidx[c]]*100,2)) + "%"
        if len(c) < 7:
            print(c + ':\t\t' + j + '\t\t' + gj + '\t\t' + cos + '\t\t' + comb)
        else:
            print(c + ':\t' + j + '\t\t' + gj + '\t\t' + cos + '\t\t' + comb)
    
    print('cajun_creole:\t' + '23.58%' + '\t\t' + '23.58%' + '\t\t' + '23.58%' + '\t\t' + '23.58%')
    print('------------------------------------------------------------------------------')
    j = str(round(eval(proba, y_test_bow, j_sim_matrix, avg_sim_j)[1]*100,2)) + "%"
    gj = str(round(eval(proba, y_test_bow, gj_sim_matrix, avg_sim_gj)[1]*100,2)) + "%"
    cos = str(round(eval(proba, y_test_bow, cos_sim_matrix, avg_sim_cos)[1]*100,2)) + "%"
    comb = str(round(eval(proba, y_test_bow, comb_sim_matrix, avg_sim_comb)[1]*100,2)) + "%"
    print('Total' + ':\t\t' + j + '\t\t' + gj + '\t\t' + cos + '\t\t' + comb)

Average Similarity (Jaccard): 0.4063027075009613
Average Similarity (Gen. Jaccard): 0.2884964682760036
Average Similarity (Cosine Sim): 0.630361806153726
Average Similarity (Combined): 0.44172032731023025


In [8]:
LogReg = pickle.load(open('../models/Log_Reg_v1/model.sav', 'rb'))
y_proba = LogReg.predict_proba(X_test_bow)
evaluation(y_proba)

  'stop_words.' % sorted(inconsistent))


		Jaccard		Gen. Jaccard	Cosine Sim	Combined Sim
------------------------------------------------------------------------------
brazilian:	33.85%		51.82%		77.14%		59.57%
british:	46.36%		40.31%		68.56%		52.42%
cajun_creole:	75.09%		67.5%		84.08%		75.36%
chinese:	91.55%		88.22%		91.61%		89.75%
filipino:	42.11%		39.75%		76.21%		47.45%
french:		77.49%		66.72%		82.77%		75.58%
greek:		84.48%		81.22%		87.55%		84.16%
indian:		94.6%		90.73%		91.59%		92.82%
irish:		47.6%		52.52%		73.63%		58.76%
italian:	95.24%		93.16%		95.95%		94.61%
jamaican:	46.25%		55.3%		76.64%		61.62%
japanese:	77.8%		75.65%		79.2%		77.49%
korean:		65.92%		65.62%		81.77%		71.1%
mexican:	95.45%		92.44%		94.24%		93.99%
moroccan:	63.76%		64.71%		80.63%		71.16%
russian:	29.74%		40.77%		65.25%		49.1%
southern_us:	72.94%		61.6%		74.47%		70.44%
spanish:	79.36%		76.16%		88.77%		81.45%
thai:		80.97%		73.78%		79.76%		77.8%
vietnamese:	62.38%		58.98%		78.74%		66.45%
cajun_creole:	23.58%		23.58%		23.58%		23.58%
------------------------

In [19]:
y_prob = []
vc = np.array([350,603,1159,9881,566,6614,4629,6354,500,12929,394,4670,622,10798,615,366,3240,3752,5115,618])
vc = vc/73775
for i in range(len(X_test_bow)):
    y_prob.append(vc)
evaluation(np.array(y_prob))

		Jaccard		Gen. Jaccard	Cosine Sim	Combined Sim
------------------------------------------------------------------------------
brazilian:	0.7%		23.9%		57.88%		34.24%
british:	16.84%		7.53%		27.34%		18.07%
cajun_creole:	41.51%		22.99%		51.12%		37.25%
chinese:	48.7%		20.2%		25.35%		22.52%
filipino:	13.06%		11.43%		63.56%		20.29%
french:		49.59%		24.95%		40.69%		39.19%
greek:		35.9%		26.79%		44.02%		35.11%
indian:		51.02%		15.22%		25.03%		34.09%
irish:		7.19%		14.77%		28.05%		18.43%
italian:	56.52%		34.16%		52.28%		47.16%
jamaican:	4.93%		18.5%		51.11%		22.79%
japanese:	28.82%		15.73%		23.48%		23.1%
korean:		13.55%		12.81%		23.27%		16.54%
mexican:	57.34%		28.59%		43.02%		42.28%
moroccan:	14.29%		22.06%		45.78%		31.29%
russian:	4.9%		14.91%		31.54%		20.84%
southern_us:	51.44%		23.59%		32.39%		35.55%
spanish:	36.6%		30.16%		55.4%		40.56%
thai:		39.78%		15.45%		22.52%		22.9%
vietnamese:	19.24%		13.02%		22.97%		17.09%
cajun_creole:	23.58%		23.58%		23.58%		23.58%
------------------------------