In [2]:
import json
import re
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.grid_search import GridSearchCV
from text_unidecode import unidecode
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from xgboost.sklearn import XGBClassifier

In [4]:
## progress messages guarded by `if verbose:` are printed only when this is truthy
verbose = 1
if verbose:
    print('all modules imported')

##################################################################
## STEP 1 - READING AND CLEANING DATASETS
##################################################################
## load the raw train / test recipes from JSON files in the working directory
train_df = pd.read_json('train.json')
test_df = pd.read_json('test.json')

## report the (rows, columns) of both frames
if verbose:
    print('size of training dataset is %s' % (train_df.shape,))
    print('size of test dataset is %s' % (test_df.shape,))

all modules imported
size of training dataset is (39774, 3)
size of test dataset is (9944, 2)


In [6]:
## Turn every recipe's ingredient list into one cleaned text string.
## The exact same cleaning is applied to the train and the test recipes so the
## vectorizer is fitted and applied on identically normalised text.
def clean_ingredients(ingredients):
    """Collapse one recipe's list of ingredient strings into a single cleaned string.

    Steps, in order: join the ingredients with spaces, lower-case and strip,
    delete runs of digits, pass the text through `unidecode`, replace the
    characters - & ' % ! ( ) / , . with a space, then squeeze consecutive
    whitespace into one space and strip the ends.

    ingredients -- iterable of strings (one recipe's ingredient list)
    returns     -- the cleaned string for that recipe
    """
    text = ' '.join(ingredients).lower().strip()
    text = unidecode(re.sub(r'\d+', '', text))
    ## a leading '-' inside a character class is a literal hyphen
    text = re.sub(r"[-&'%!()/,.]", ' ', text)
    return re.sub(r'\s+', ' ', text).strip()


all_ingredients_train = [clean_ingredients(ings) for ings in train_df['ingredients']]
all_cuisines = train_df['cuisine'].tolist()

all_ingredients_test = [clean_ingredients(ings) for ings in test_df['ingredients']]

## number of unique ingredients and cuisine in the dataset
## NOTE: the first figure is the number of distinct cleaned recipe strings
## (one string per recipe), not the number of distinct individual ingredients
if verbose:
    print('total number of ingedients are %d' % len(set(all_ingredients_train)))
    print('total number of cusines are %d' % len(set(all_cuisines)))

 total number of ingedients are 39674
total number of cusines are 20


In [7]:
## sanity check: each cleaned list should have one entry per row of its frame
for size in (train_df.shape, len(all_ingredients_train),
             test_df.shape, len(all_ingredients_test)):
    print(size)

(39774, 3)
39774
(9944, 2)
9944


In [8]:
## encode the cuisine names as integer class ids
lbl = LabelEncoder()
y = lbl.fit_transform(all_cuisines)

## fit the tf-idf vectorizer on the training recipes only, then reuse the
## fitted vectorizer to transform the test recipes
tfidf = TfidfVectorizer()
train = tfidf.fit_transform(all_ingredients_train).astype(np.float32)
test = tfidf.transform(all_ingredients_test).astype(np.float32)

In [9]:
## NOTE(review): no cell visible here reads `nfolds`; the blending setup cell
## defines its own `n_folds` instead
nfolds = 5 ## use 5-fold cross validation to check the best parameters

In [10]:
## base model 1: one-vs-rest wrapper around an RBF-kernel SVM.
## The estimator is only constructed in this cell; it is fitted later in the
## loop over `clfs`.
model1 = OneVsRestClassifier(
    SVC(
        C=100,
        kernel='rbf',
        gamma=0.1,
        probability=False,
        tol=0.001,
        cache_size=200,
        verbose=True,
        random_state=1,
    )
)

In [11]:
## base model 2: random forest (constructed here, fitted later)
model2 = RandomForestClassifier(
    n_estimators=500,
    max_features='sqrt',
    max_depth=7,
    min_samples_split=5,
    min_samples_leaf=1,
    verbose=True,
    random_state=1,
    oob_score=True,
    class_weight='balanced_subsample',
)

In [12]:
## wrap the tf-idf features and the encoded labels in xgboost's DMatrix
## NOTE(review): xgtrain is not referenced by any other cell visible here
xgtrain = xgb.DMatrix(train, label=y)

## base model 3: gradient-boosted trees with fixed hyper-parameters.
## No training or early stopping happens in this cell; n_estimators is
## hard-coded and the model is fitted later.
model3 = XGBClassifier(
    learning_rate=0.05,
    n_estimators=884,
    max_depth=7,
    min_child_weight=8,
    gamma=0.1,
    subsample=0.7,
    colsample_bytree=0.8,
    objective='multi:softmax',
    scale_pos_weight=1,
    seed=1,
)

In [16]:
## the base models whose outputs become the features of the stacked model
clfs = [model1, model2, model3]
n_folds = 5

## zero-filled blend matrices: one row per sample, one column per base model
dataset_blend_train = np.zeros(shape=(train.shape[0], len(clfs)))
dataset_blend_test = np.zeros(shape=(test.shape[0], len(clfs)))

In [22]:
## display the first column of the training blend matrix
dataset_blend_train[:,0]

array([ 0.,  0.,  0., ...,  0.,  0.,  0.])

In [None]:
## Fit every base model on the full training set and store its predicted
## (label-encoded) class in column j of the blend matrices.
for j, clf in enumerate(clfs):
    print('%s %s' % (j, clf))
    clf.fit(train, y)
    ## predict() yields one encoded label per row, which is what a single
    ## blend column can hold. predict_proba() would give one column per class,
    ## must never be fed its own output, and is not available on an SVC built
    ## with probability=False.
    ## NOTE(review): the train column holds in-sample predictions; out-of-fold
    ## predictions would give the second-level model less optimistic features.
    dataset_blend_train[:, j] = clf.predict(train)
    dataset_blend_test[:, j] = clf.predict(test)

0 OneVsRestClassifier(estimator=SVC(C=100, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma=0.1, kernel='rbf',
  max_iter=-1, probability=False, random_state=1, shrinking=True,
  tol=0.001, verbose=True),
          n_jobs=1)
[LibSVM][LibSVM][LibSVM][LibSVM]

In [None]:
## second-level model: logistic regression trained on the blend matrix
print('')
print("Training Logistic Regression classifier.")
# C parameter here set through experimentation.
clf = LogisticRegression(C=10).fit(dataset_blend_train, y)
y_pred = clf.predict(dataset_blend_test)

In [None]:
## map the stacked model's encoded predictions back to cuisine names
## (the logistic-regression cell stores its predictions in `y_pred`)
cuisine_pred_labels = lbl.inverse_transform(y_pred)

In [None]:
## recipe ids of the test set, in the same row order as the predictions
ids = test_df['id']

## assemble the two submission columns and write them out without the index
output = pd.DataFrame({'cuisine': cuisine_pred_labels, 'id': ids})[['id', 'cuisine']]
output.to_csv('stacking_submission.csv', index=False)