In [1]:
# Initial imports.
import pandas as pd
import numpy as np
from path import Path
from ast import literal_eval
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder
import csv
import random

In [2]:
# Reading the clean data
file = 'fully_encoded_recipes_with_cuisine_from_SP.csv'
recipe_df = pd.read_csv(file, index_col=0)
recipe_df

Unnamed: 0,cuisine_SP,onion,garlic,vanilla,lemon,bell pepper,tomato,chocolate,mushrooms,italian cheese,...,harissa,sprouts,squash blossoms,grapefruit,tamarind pulp,savory,baharat,douchi,sucralose,jicama
5,8,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10,11,0,1,0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
16,2,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
17,1,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
20,10,1,1,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9978,11,1,0,0,0,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
9981,10,1,1,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
9984,10,0,1,0,0,1,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0
9988,11,1,1,0,0,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [3]:
encoding_key = pd.read_csv('cuisine_encoding_dict.csv', index_col=0)
encoding_key

Unnamed: 0,African,American,Asian,Creole,English,European,French,Greek,Indian,Irish,Italian,Mexican,Middle Eastern,Southern,Spanish
encoded,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14


## Data are processed to dataframes with and without response variables (cuisines) and the response variables are encoded to variables

In [4]:
asian_df = recipe_df[recipe_df.cuisine_SP==2].copy()
non_asian_df = recipe_df[recipe_df.cuisine_SP!=2]
non_asian_df_indexes = pd.DataFrame(non_asian_df.index.values, columns=['recipe_num'])
non_asian_df_indexes

Unnamed: 0,recipe_num
0,5
1,10
2,17
3,20
4,23
...,...
2620,9978
2621,9981
2622,9984
2623,9988


In [5]:
index_for_balance = set()
while len(index_for_balance)<=len(asian_df):
    index_for_balance.add(random.randint(0,2624))
index_for_balance = list(index_for_balance)
non_asian_df_balance = non_asian_df.iloc[index_for_balance]
non_asian_df_balance
    

Unnamed: 0,cuisine_SP,onion,garlic,vanilla,lemon,bell pepper,tomato,chocolate,mushrooms,italian cheese,...,harissa,sprouts,squash blossoms,grapefruit,tamarind pulp,savory,baharat,douchi,sucralose,jicama
5,8,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
20,10,1,1,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
7833,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7835,8,0,1,0,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
57,1,0,0,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7616,13,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7649,10,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
7664,10,0,1,0,0,1,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0
7684,1,0,0,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [6]:
asian_df_balanced = pd.concat([asian_df, non_asian_df_balance])

In [7]:
asian_df_balanced

Unnamed: 0,cuisine_SP,onion,garlic,vanilla,lemon,bell pepper,tomato,chocolate,mushrooms,italian cheese,...,harissa,sprouts,squash blossoms,grapefruit,tamarind pulp,savory,baharat,douchi,sucralose,jicama
16,2,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
29,2,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
78,2,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
147,2,1,1,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
164,2,1,1,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7616,13,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7649,10,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
7664,10,0,1,0,0,1,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0
7684,1,0,0,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [8]:
asian_df_balanced['response'] = 0
asian_df_balanced.response = asian_df_balanced.response.mask(asian_df_balanced.cuisine_SP==2, 1)
asian_df_balanced.drop(columns='cuisine_SP', inplace=True)

In [9]:
#Define the features set
X = asian_df_balanced.copy()
X = X.drop("response", axis=1)
X

Unnamed: 0,onion,garlic,vanilla,lemon,bell pepper,tomato,chocolate,mushrooms,italian cheese,parsley,...,harissa,sprouts,squash blossoms,grapefruit,tamarind pulp,savory,baharat,douchi,sucralose,jicama
16,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
29,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
78,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
147,1,1,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
164,1,1,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7616,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7649,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
7664,0,1,0,0,1,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
7684,0,0,1,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [10]:
# define the target

y = asian_df_balanced["response"]
y

16      1
29      1
78      1
147     1
164     1
       ..
7616    0
7649    0
7664    0
7684    0
7690    0
Name: response, Length: 655, dtype: int64

In [11]:
# Splitting into Train and Test sets.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

In [12]:
# Create a random forest classifier.
rf_model = RandomForestClassifier(n_estimators=100, random_state=78) 

In [13]:
# Fitting the model
rf_model = rf_model.fit(X_train, y_train)

In [14]:
# Making predictions using the testing data.
predictions = rf_model.predict(X_test)


In [15]:
# Calculating the confusion matrix.
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
italian_cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], 
    columns=["Predicted 0", "Predicted 1"])
    
italian_cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,74,1
Actual 1,0,89


In [16]:
importance = rf_model.feature_importances_

In [17]:
importance_df = pd.DataFrame(importance, index=X.columns.values, columns=['feature_importance'])
importance_df = importance_df.sort_values('feature_importance', ascending=False)

In [18]:
importance_df.iloc[0:10,:]

Unnamed: 0,feature_importance
soy sauce,0.12823
rice,0.096862
carrots,0.065263
tomato,0.04024
fish sauce,0.033152
gochujang,0.031837
ginger,0.030648
garlic,0.028449
corn,0.026199
wonton wrappers,0.023608
