In [1]:
# Initial imports.
import pandas as pd
import numpy as np
from path import Path
from ast import literal_eval
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder
import csv
import random

In [2]:
# Reading the clean data
file = 'fully_encoded_recipes_with_cuisine_from_SP.csv'
recipe_df = pd.read_csv(file, index_col=0)
recipe_df

Unnamed: 0,cuisine_SP,onion,garlic,vanilla,lemon,bell pepper,tomato,chocolate,mushrooms,italian cheese,...,harissa,sprouts,squash blossoms,grapefruit,tamarind pulp,savory,baharat,douchi,sucralose,jicama
5,8,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10,11,0,1,0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
16,2,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
17,1,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
20,10,1,1,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9978,11,1,0,0,0,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
9981,10,1,1,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
9984,10,0,1,0,0,1,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0
9988,11,1,1,0,0,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [3]:
encoding_key = pd.read_csv('cuisine_encoding_dict.csv', index_col=0)
encoding_key

Unnamed: 0,African,American,Asian,Creole,English,European,French,Greek,Indian,Irish,Italian,Mexican,Middle Eastern,Southern,Spanish
encoded,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14


## Data are processed to dataframes with and without response variables (cuisines) and the response variables are encoded to variables

In [4]:
mexican_df = recipe_df[recipe_df.cuisine_SP==11].copy()
non_mexican_df = recipe_df[recipe_df.cuisine_SP!=11]
non_mexican_df_indexes = pd.DataFrame(non_mexican_df.index.values, columns=['recipe_num'])
non_mexican_df_indexes

Unnamed: 0,recipe_num
0,5
1,16
2,17
3,20
4,23
...,...
2534,9968
2535,9970
2536,9981
2537,9984


In [5]:
index_for_balance = set()
while len(index_for_balance)<=len(mexican_df):
    index_for_balance.add(random.randint(0,2538))
index_for_balance = list(index_for_balance)
non_mexican_df_balance = non_mexican_df.iloc[index_for_balance]
non_mexican_df_balance
    

Unnamed: 0,cuisine_SP,onion,garlic,vanilla,lemon,bell pepper,tomato,chocolate,mushrooms,italian cheese,...,harissa,sprouts,squash blossoms,grapefruit,tamarind pulp,savory,baharat,douchi,sucralose,jicama
8068,13,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
50,13,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8078,8,1,1,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
73,13,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
8085,7,1,1,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7927,1,1,1,0,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
7939,2,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7942,1,0,0,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
7947,1,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [6]:
mexican_df_balanced = pd.concat([mexican_df, non_mexican_df_balance])

In [7]:
mexican_df_balanced

Unnamed: 0,cuisine_SP,onion,garlic,vanilla,lemon,bell pepper,tomato,chocolate,mushrooms,italian cheese,...,harissa,sprouts,squash blossoms,grapefruit,tamarind pulp,savory,baharat,douchi,sucralose,jicama
10,11,0,1,0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
53,11,1,0,0,0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
56,11,0,0,0,0,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
80,11,1,1,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
95,11,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7927,1,1,1,0,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
7939,2,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7942,1,0,0,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
7947,1,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [8]:
mexican_df_balanced['response'] = 0
mexican_df_balanced.response = mexican_df_balanced.response.mask(mexican_df_balanced.cuisine_SP==11, 1)
mexican_df_balanced.drop(columns='cuisine_SP', inplace=True)

In [9]:
#Define the features set
X = mexican_df_balanced.copy()
X = X.drop("response", axis=1)
X

Unnamed: 0,onion,garlic,vanilla,lemon,bell pepper,tomato,chocolate,mushrooms,italian cheese,parsley,...,harissa,sprouts,squash blossoms,grapefruit,tamarind pulp,savory,baharat,douchi,sucralose,jicama
10,0,1,0,1,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
53,1,0,0,0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
56,0,0,0,0,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
80,1,1,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
95,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7927,1,1,0,1,0,0,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
7939,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7942,0,0,1,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7947,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [10]:
# define the target

y = mexican_df_balanced["response"]
y

10      1
53      1
56      1
80      1
95      1
       ..
7927    0
7939    0
7942    0
7947    0
7987    0
Name: response, Length: 827, dtype: int64

In [11]:
# Splitting into Train and Test sets.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

In [12]:
# Create a random forest classifier.
rf_model = RandomForestClassifier(n_estimators=100, random_state=78) 

In [13]:
# Fitting the model
rf_model = rf_model.fit(X_train, y_train)

In [14]:
# Making predictions using the testing data.
predictions = rf_model.predict(X_test)


In [15]:
# Calculating the confusion matrix.
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
italian_cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], 
    columns=["Predicted 0", "Predicted 1"])
    
italian_cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,98,2
Actual 1,0,107


In [16]:
importance = rf_model.feature_importances_

In [17]:
importance_df = pd.DataFrame(importance, index=X.columns.values, columns=['feature_importance'])
importance_df = importance_df.sort_values('feature_importance', ascending=False)

In [18]:
importance_df.iloc[0:10,:]

Unnamed: 0,feature_importance
lime,0.130017
tomato,0.047152
onion,0.037159
corn,0.034509
hot sauce or salsa,0.03181
cilantro,0.030591
avocado,0.029692
chili powder,0.028419
mushrooms,0.027907
enchilada sauce,0.024772
