In [1]:
import json
import re
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.grid_search import GridSearchCV
from text_unidecode import unidecode
## this is a multiclass classification problem: predict a recipe's cuisine from its ingredients
## `verbose` gates the optional progress / diagnostic prints in the cells below
verbose = 0 ## print updates or not - boolean



In [2]:
## optional progress message
if verbose:
    print('all modules imported')

##################################################################
## STEP 1 - READING AND CLEANING DATASETS
##################################################################
## load both JSON files (paths are relative to the working directory);
## the training file is read first, then the test file
train_df, test_df = map(pd.read_json, ('train.json', 'test.json'))

## report the shape of each dataframe
if verbose:
    print('size of training dataset is %s' % (train_df.shape,))
    print('size of test dataset is %s' % (test_df.shape,))

In [4]:
## Normalise every recipe's ingredient list into one cleaned text string.
## The SAME cleaning is applied to the training and the test recipes so that
## both sets are vectorised from identically prepared text.
_DIGITS = re.compile(r'\d+')
_SPECIAL_CHARS = re.compile(r"[-&'%!()/,.]")
_WHITESPACE = re.compile(r'\s+')


def clean_ingredients(ingredients):
    """Turn one recipe's ingredients into a single cleaned string.

    Steps, in order: join the names with spaces, lower-case and strip,
    drop digit runs, transliterate with ``unidecode``, replace the special
    characters ``- & ' % ! ( ) / , .`` with spaces, then collapse runs of
    whitespace into a single space.

    Parameters
    ----------
    ingredients : iterable of str
        Ingredient names of one recipe.

    Returns
    -------
    str
        The cleaned, space-separated ingredient text.
    """
    text = ' '.join(ingredients).lower().strip()
    ## digits are removed before transliteration (order kept from the original code)
    text = unidecode(_DIGITS.sub('', text))
    text = _SPECIAL_CHARS.sub(' ', text)
    return _WHITESPACE.sub(' ', text).strip()


all_ingredients_train = [clean_ingredients(ings) for ings in train_df['ingredients']]
all_cuisines = list(train_df['cuisine'])

## the test recipes used to skip the special-character / whitespace clean-up
## that the training recipes received; both now share one code path
all_ingredients_test = [clean_ingredients(ings) for ings in test_df['ingredients']]

## number of unique ingredients and cuisines in the training dataset
if verbose:
    ## the original referenced an undefined name (`all_ingredients`), raising a
    ## NameError whenever verbose was on; count the distinct raw ingredient
    ## names across the training recipes instead
    unique_ingredients = set(ing for ings in train_df['ingredients'] for ing in ings)
    print('total number of ingedients are %s' % len(unique_ingredients))
    print('total number of cusines are %s' % len(set(all_cuisines)))

In [5]:
## sanity check: each cleaned ingredient list should have one entry per dataframe row
print(train_df.shape)
print(len(all_ingredients_train))
print(test_df.shape)
print(len(all_ingredients_test))

(39774, 3)
39774
(9944, 2)
9944


In [6]:
## label encoder maps cuisine names to integer class ids;
## tf-idf vectorizer turns each cleaned ingredient string into a feature row
lbl = LabelEncoder()
tfidf = TfidfVectorizer()

## encode the training targets
y = lbl.fit_transform(all_cuisines)

## fit the vectorizer on the training text only, then reuse the fitted
## vectorizer to transform the test text into the same feature space
train = tfidf.fit_transform(all_ingredients_train).astype('float32')
test = tfidf.transform(all_ingredients_test).astype('float32')

In [7]:
## ---- hyper-parameter search for the random forest ----
## NOTE(review): GridSearchCV is imported from `sklearn.grid_search` at the top
## of this file; that module is deprecated in favour of `sklearn.model_selection`.
nfolds = 5 ## use 5-fold cross validation to check the best parameters

## candidate values; list() keeps them plain lists on any Python version
n_estimators_values = list(range(100, 1000, 100))   ## 100, 200, ..., 900
max_features_values = ['sqrt', 'log2']
max_depth_values = list(range(4, 10, 3))             ## 4, 7
min_samples_split_values = list(range(3, 7, 2))      ## 3, 5
min_samples_leaf_values = list(range(1, 3, 2))       ## 1
param_grid = {
    'n_estimators': n_estimators_values,
    'max_features': max_features_values,
    'max_depth': max_depth_values,
    'min_samples_split': min_samples_split_values,
    'min_samples_leaf': min_samples_leaf_values,
}

## seed the forest so repeated runs of the search pick the same best_params_;
## the seed matches the random_state used for the final model
grid_search = GridSearchCV(RandomForestClassifier(random_state=1), param_grid, cv=nfolds)
grid_search.fit(train, y)
print(grid_search.best_params_)

{'max_features': 'sqrt', 'min_samples_split': 5, 'n_estimators': 500, 'max_depth': 7, 'min_samples_leaf': 1}


In [8]:
## final random forest; the five tree/forest size settings are the values
## printed as best_params_ by the grid-search cell
model = RandomForestClassifier(
    n_estimators=500,
    max_features='sqrt',
    max_depth=7,
    min_samples_split=5,
    min_samples_leaf=1,
    class_weight='balanced_subsample',
    oob_score=True,
    random_state=1,
    verbose=True,
)

In [9]:
## train the forest on the tf-idf training matrix and the encoded cuisine labels
model.fit(X=train, y=y)

[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:   14.4s finished


RandomForestClassifier(bootstrap=True, class_weight='balanced_subsample',
            criterion='gini', max_depth=7, max_features='sqrt',
            max_leaf_nodes=None, min_impurity_split=1e-07,
            min_samples_leaf=1, min_samples_split=5,
            min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=1,
            oob_score=True, random_state=1, verbose=True, warm_start=False)

In [10]:
## predict encoded classes for the test matrix, then map them back to cuisine names
cuisine_pred = model.predict(test)
cuisine_pred_labels = lbl.inverse_transform(cuisine_pred)

## recipe ids come straight from the test dataframe
ids = test_df['id']

## assemble the two-column submission (id first, then cuisine) and write it
## to disk without the dataframe index
output = pd.DataFrame(
    dict(id=ids, cuisine=cuisine_pred_labels),
    columns=['id', 'cuisine'],
)
output.to_csv('random_forest_submission.csv', index=False)

[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    1.3s finished


In [11]:
## display the fitted forest's importance scores (one value per tf-idf vocabulary term)
model.feature_importances_

The history saving thread hit an unexpected error (OperationalError('disk I/O error',)).History will not be written to the database.


array([ 0.        ,  0.        ,  0.        , ...,  0.        ,
        0.        ,  0.00012988])

In [15]:
## label each importance score with its tf-idf term and rank the terms
## from most to least important
important_features = pd.Series(
    model.feature_importances_,
    index=tfidf.get_feature_names(),
).sort_values(ascending=False)

In [16]:
## display the ranked importances (highest first)
important_features

soy           0.036080
sesame        0.034731
fish          0.029654
sauce         0.028493
cumin         0.023670
olive         0.023118
tortillas     0.022256
lime          0.021804
feta          0.019691
cilantro      0.018670
thyme         0.017235
mirin         0.017011
garam         0.016601
allspice      0.016369
cheese        0.015951
ginger        0.015873
masala        0.015764
coconut       0.012860
curry         0.012193
cachaca       0.011600
rice          0.011333
cajun         0.011228
seasoning     0.009927
coriander     0.009597
cinnamon      0.009360
gochujang     0.009321
oregano       0.009148
seeds         0.008962
thai          0.008886
sake          0.008819
                ...   
master        0.000000
masur         0.000000
matcha        0.000000
matsutake     0.000000
mature        0.000000
matzos        0.000000
maui          0.000000
mayer         0.000000
mayonnais     0.000000
mccormick     0.000000
mcintosh      0.000000
meatballs     0.000000
meatloaf   