In [8]:
import json
import re
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.grid_search import GridSearchCV
from text_unidecode import unidecode
import xgboost as xgb
from xgboost.sklearn import XGBClassifier 

In [9]:
## `verbose` switches the optional diagnostic output used throughout the notebook.
## No visible cell defines it, so on a fresh kernel ("Restart & Run All") the
## first `if verbose:` would raise a NameError. Keep a value that was set earlier
## if there is one; otherwise fall back to quiet mode.
try:
    verbose
except NameError:
    verbose = False

if verbose:
    print('all modules imported')

##################################################################
## STEP 1 - READING AND CLEANING DATASETS
##################################################################
## both files are read relative to the current working directory
train_df = pd.read_json('train.json')
test_df = pd.read_json('test.json')

## check the shape of training and test dataset
## (single formatted argument so the output is the same on Python 2 and 3)
if verbose:
    print('size of training dataset is %s' % (train_df.shape,))
    print('size of test dataset is %s' % (test_df.shape,))

In [10]:
## read the ingredient list and do some cleaning
def clean_ingredients(ingredients):
    """Turn one recipe's ingredient list into a single normalized text document.

    Steps, in order: join the ingredient names with spaces, lowercase and
    strip, remove digit runs, transliterate with `unidecode`, replace the
    special characters - & ' % ! ( ) / , . with spaces, then collapse runs of
    whitespace into one space and strip the ends.

    Parameters
    ----------
    ingredients : iterable of str
        Ingredient names of one recipe.

    Returns
    -------
    The cleaned, space-separated document for that recipe.
    """
    text = ' '.join(ingredients).lower().strip()
    ## digits are removed before transliteration, as in the original pipeline
    text = unidecode(re.sub(r'\d+', '', text))
    ## one character class instead of a chain of str.replace calls
    text = re.sub(r"[-&'%!()/,.]", ' ', text)
    return re.sub(r'\s+', ' ', text).strip()


## Train and test go through exactly the same cleaning. Previously the
## special-character and whitespace cleanup was applied to the train documents
## only, so the two sets were preprocessed differently.
all_ingredients_train = [clean_ingredients(ings) for ings in train_df['ingredients']]
all_cuisines = train_df['cuisine'].tolist()

all_ingredients_test = [clean_ingredients(ings) for ings in test_df['ingredients']]

## number of unique ingredients and cuisine in the dataset
if verbose:
    ## count distinct individual ingredient names in the training set
    ## (this used to reference an undefined name, `all_ingredients`)
    unique_ingredients = set(ing for ings in train_df['ingredients'] for ing in ings)
    print('total number of ingedients are %s' % len(unique_ingredients))
    print('total number of cusines are %s' % len(set(all_cuisines)))

In [11]:
## show each frame's shape next to the number of cleaned documents built from
## it, so the two can be compared by eye
for value in (train_df.shape, len(all_ingredients_train),
              test_df.shape, len(all_ingredients_test)):
    print(value)

(39774, 3)
39774
(9944, 2)
9944


In [12]:
## encode the cuisine names as integer class labels
lbl = LabelEncoder()
y = lbl.fit_transform(all_cuisines)

## fit the tf-idf vectorizer on the training documents only, then reuse the
## fitted vectorizer to transform the test documents; both matrices are cast
## to float32
tfidf = TfidfVectorizer()
train = tfidf.fit_transform(all_ingredients_train).astype('float32')
test = tfidf.transform(all_ingredients_test).astype('float32')

In [13]:
## number of folds for every cross-validation run in this notebook
nfolds = 5

## the first tuning step picks the number of estimators: boosting is stopped
## once the error no longer improves. that step works on xgboost's own
## DMatrix container, built here from the tf-idf features and encoded labels
xgtrain = xgb.DMatrix(data=train, label=y)

In [33]:
## Booster parameters for the cross-validation run that picks the tree count.
## NOTE(review): 'n_estimators' is a parameter of the sklearn wrapper; with the
## native xgb.cv API the number of rounds is given by num_boost_round, so this
## entry presumably has no effect there - confirm. It is kept so that any other
## consumer of this dict still finds the key.
param_dist = {'n_estimators': 500,
              'learning_rate': 0.03,
              'subsample': 0.7,
              'max_depth':  7,
              'colsample_bytree': 0.8,
              'min_child_weight': 2,
              'objective': 'multi:softmax',
              ## one class per distinct cuisine seen by the label encoder
              ## (previously hard-coded as 20)
              'num_class': len(lbl.classes_)
             }

In [34]:
## stratified k-fold cross-validation on the multiclass error rate; up to 2000
## boosting rounds are allowed and early stopping uses a 50-round patience
cvresult = xgb.cv(
    param_dist,
    xgtrain,
    num_boost_round=2000,
    nfold=nfolds,
    metrics='merror',
    early_stopping_rounds=50,
    stratified=True,
)

## the length of cvresult is used as the tuned number of trees; the recorded
## run printed a shape of (884, 4), and 884 is the n_estimators value used in
## the grid search cell

In [35]:
## (rows, columns) of the cross-validation result table
print(cvresult.shape)

(884, 4)


In [None]:
## now lets start tuning the parameters
## candidate values for each hyper-parameter in the exhaustive grid
max_depth_values = list(range(4, 10, 3))  ## -> [4, 7]
min_child_weight_values = [6, 8, 10, 12]
## no trailing comma here: it used to wrap this list in a 1-tuple, so the whole
## list would have been passed to the model as one `subsample` value
subsample_values = [i / 10.0 for i in range(6, 10)]
colsample_bytree_values = [i / 10.0 for i in range(6, 10)]
gamma_values = [i / 10.0 for i in range(0, 5)]

param_grid = {'max_depth': max_depth_values,
              'min_child_weight': min_child_weight_values,
              'subsample': subsample_values,
              'colsample_bytree': colsample_bytree_values,
              'gamma': gamma_values}

## NOTE(review): n_estimators=884 is the cv result length obtained with
## learning_rate 0.03, while this estimator uses learning_rate 0.1 - confirm
## that this combination is intended.
base_model = XGBClassifier(learning_rate=0.1, n_estimators=884,
                           objective='multi:softmax', scale_pos_weight=1, seed=1)

## 'roc_auc' is a binary-classification scorer and is rejected for this
## multiclass target; accuracy is the complement of the 'merror' metric used
## in the cross-validation cell
grid_search = GridSearchCV(estimator=base_model, param_grid=param_grid,
                           scoring='accuracy', n_jobs=-1, iid=False, cv=nfolds)
grid_search.fit(train, y)
print(grid_search.best_params_)