In [124]:
# Initial imports.
import pandas as pd
import numpy as np
from path import Path
from ast import literal_eval
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder
import csv
import random

In [88]:
# Reading the clean data
file = 'fully_encoded_recipes_with_cuisine_from_SP.csv'
recipe_df = pd.read_csv(file, index_col=0)
recipe_df

Unnamed: 0,cuisine_SP,onion,garlic,vanilla,lemon,bell pepper,tomato,chocolate,mushrooms,italian cheese,...,harissa,sprouts,squash blossoms,grapefruit,tamarind pulp,savory,baharat,douchi,sucralose,jicama
5,8,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10,11,0,1,0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
16,2,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
17,1,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
20,10,1,1,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9978,11,1,0,0,0,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
9981,10,1,1,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
9984,10,0,1,0,0,1,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0
9988,11,1,1,0,0,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [97]:
encoding_key = pd.read_csv('cuisine_encoding_dict.csv', index_col=0)
encoding_key

Unnamed: 0,African,American,Asian,Creole,English,European,French,Greek,Indian,Irish,Italian,Mexican,Middle Eastern,Southern,Spanish
encoded,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14


## Data are processed to dataframes with and without response variables (cuisines) and the response variables are encoded to variables

In [123]:
italian_df = recipe_df[recipe_df.cuisine_SP==10].copy()
non_italian_df = recipe_df[recipe_df.cuisine_SP!=10]
non_italian_df_indexes = pd.DataFrame(non_italian_df.index.values, columns=['recipe_num'])
non_italian_df_indexes

Unnamed: 0,recipe_num
0,5
1,10
2,16
3,17
4,27
...,...
2507,9961
2508,9963
2509,9978
2510,9988


In [131]:
index_for_balance = set()
while len(index_for_balance)<=440:
    index_for_balance.add(random.randint(0,2511))
index_for_balance = list(index_for_balance)
non_italian_df_balance = non_italian_df.iloc[index_for_balance]
non_italian_df_balance
    

Unnamed: 0,cuisine_SP,onion,garlic,vanilla,lemon,bell pepper,tomato,chocolate,mushrooms,italian cheese,...,sprouts,squash blossoms,grapefruit,tamarind pulp,savory,baharat,douchi,sucralose,jicama,response
8145,1,1,1,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
8149,12,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
29,2,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
53,11,1,0,0,0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8199,13,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8073,11,1,1,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8079,3,1,1,0,0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8124,13,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
8139,11,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [133]:
italian_df_balanced = pd.concat([italian_df, non_italian_df_balance])

In [134]:
italian_df_balanced

Unnamed: 0,cuisine_SP,onion,garlic,vanilla,lemon,bell pepper,tomato,chocolate,mushrooms,italian cheese,...,sprouts,squash blossoms,grapefruit,tamarind pulp,savory,baharat,douchi,sucralose,jicama,response
20,10,1,1,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
23,10,0,0,1,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1
81,10,1,0,0,0,1,1,0,0,1,...,0,0,0,0,0,0,0,0,0,1
82,10,0,0,0,0,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1
87,10,1,1,0,0,0,1,0,0,1,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8073,11,1,1,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8079,3,1,1,0,0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8124,13,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
8139,11,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [140]:
type(italian_df_balanced.cuisine_SP[20])

numpy.int64

In [144]:
italian_df_balanced['response'] = 0
italian_df_balanced.response = italian_df_balanced.response.mask(italian_df_balanced.cuisine_SP==10, 1)
italian_df_balanced.drop(columns='cuisine_SP', inplace=True)

In [146]:
#Define the features set
X = italian_df_balanced.copy()
X = X.drop("response", axis=1)
X

Unnamed: 0,onion,garlic,vanilla,lemon,bell pepper,tomato,chocolate,mushrooms,italian cheese,parsley,...,harissa,sprouts,squash blossoms,grapefruit,tamarind pulp,savory,baharat,douchi,sucralose,jicama
20,1,1,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
23,0,0,1,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
81,1,0,0,0,1,1,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
82,0,0,0,0,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
87,1,1,0,0,0,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8073,1,1,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8079,1,1,0,0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8124,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8139,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [147]:
# define the target

y = italian_df_balanced["response"]
y

20      1
23      1
81      1
82      1
87      1
       ..
8073    0
8079    0
8124    0
8139    0
8140    0
Name: response, Length: 881, dtype: int64

In [148]:
# Splitting into Train and Test sets.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

In [149]:
# Create a random forest classifier.
rf_model = RandomForestClassifier(n_estimators=100, random_state=78) 

In [150]:
# Fitting the model
rf_model = rf_model.fit(X_train, y_train)

In [151]:
# Making predictions using the testing data.
predictions = rf_model.predict(X_test)


In [152]:
# Calculating the confusion matrix.
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
italian_cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], 
    columns=["Predicted 0", "Predicted 1"])
    
italian_cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,105,3
Actual 1,0,113


In [153]:
importance = rf_model.feature_importances_

In [163]:
importance_df = pd.DataFrame(importance, index=X.columns.values, columns=['feature_importance'])
importance_df = importance_df.sort_values('feature_importance', ascending=False)

In [167]:
importance_df.iloc[0:10,:]

Unnamed: 0,feature_importance
italian cheese,0.166733
pasta,0.029957
chili powder,0.028185
basil,0.02456
garlic,0.023339
italian cured meat,0.021532
ladyfingers,0.020401
vanilla,0.01974
lime,0.018776
chocolate,0.018008
