<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Adding-Cuisines-as-Features" data-toc-modified-id="Adding-Cuisines-as-Features-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Adding Cuisines as Features</a></span></li><li><span><a href="#Adding-City-as-Feature" data-toc-modified-id="Adding-City-as-Feature-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Adding City as Feature</a></span></li><li><span><a href="#Numerical-Feature-Cleaning" data-toc-modified-id="Numerical-Feature-Cleaning-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Numerical Feature Cleaning</a></span></li><li><span><a href="#Other-useful-Derived-Features" data-toc-modified-id="Other-useful-Derived-Features-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Other useful Derived Features</a></span></li><li><span><a href="#Model-Training-on-Complete-Data" data-toc-modified-id="Model-Training-on-Complete-Data-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Model Training on Complete Data</a></span></li><li><span><a href="#Ensembling-lightgbm-and-RF-model" data-toc-modified-id="Ensembling-lightgbm-and-RF-model-6"><span class="toc-item-num">6&nbsp;&nbsp;</span>Ensembling lightgbm and RF model</a></span></li><li><span><a href="#Second-Layer---Mode-based-Ensemble" data-toc-modified-id="Second-Layer---Mode-based-Ensemble-7"><span class="toc-item-num">7&nbsp;&nbsp;</span>Second Layer - Mode based Ensemble</a></span></li></ul></div>

In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm_notebook as tqdm
pd.set_option("max_columns", 1000)

# Load data
train = pd.read_excel('../input/Data_Train.xlsx', index_col=None)
test = pd.read_excel('../input/Data_Test.xlsx', index_col=None)
print(train.shape)
print(test.shape)

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


(11094, 9)
(2774, 8)


In [2]:
target = "Delivery_Time"
train_target = train[target]
train.drop([target], axis=1, inplace=True)

In [3]:
# Converting the classes to integer values. 
# Since this is a multi class classfication problem. The class mapping will be useful when ensembling various models.
train_target = train_target.apply(lambda x: x.split()[0]).astype(int)

class_map = {}
class_map_rev = {}
for a,b in enumerate(sorted(train_target.unique())):
    class_map[b] = a
    class_map_rev[a] = b
print("class mapping {}".format(class_map))
print("class mapping reverse {}".format(class_map_rev))

train_target = train_target.map(class_map)

class mapping {10: 0, 20: 1, 30: 2, 45: 3, 65: 4, 80: 5, 120: 6}
class mapping reverse {0: 10, 1: 20, 2: 30, 3: 45, 4: 65, 5: 80, 6: 120}


In [4]:
alldata = pd.concat([train, test], axis=0, sort=False, ignore_index=True)


def determine_unique_cuisines():
    cuisines_list = [val for val in alldata['Cuisines'].str.split(",")]
    cuisines_list = [",".join([v.strip() for v in val]) for val in cuisines_list]
    unique_cuisines = set(",".join(cuisines_list).split(","))
    print("total unique cuisines {}".format(len(unique_cuisines)))
    print("unique cuisines are {}".format(unique_cuisines))

    return unique_cuisines

unique_cuisines =  determine_unique_cuisines()

total unique cuisines 101
unique cuisines are {'North Indian', 'French', 'Mithai', 'Konkan', 'Bohri', 'Cafe', 'Bakery', 'Turkish', 'North Eastern', 'Mishti', 'Bubble Tea', 'Hot dogs', 'Vietnamese', 'Pizza', 'Japanese', 'Odia', 'Mughlai', 'Tamil', 'Maharashtrian', 'Rolls', 'Momos', 'Bengali', 'Naga', 'Mangalorean', 'Indian', 'Thai', 'Mexican', 'Korean', 'Tibetan', 'Chettinad', 'Biryani', 'Parsi', 'Raw Meats', 'Street Food', 'Middle Eastern', 'Assamese', 'Burger', 'Rajasthani', 'Roast Chicken', 'Tex-Mex', 'South Indian', 'Bangladeshi', 'Ice Cream', 'Goan', 'Malaysian', 'Poké', 'Asian', 'Finger Food', 'Iranian', 'Cantonese', 'Coffee', 'South American', 'Seafood', 'Lucknowi', 'Charcoal Chicken', 'American', 'Bar Food', 'Paan', 'Malwani', 'Kebab', 'Lebanese', 'Belgian', 'Continental', 'Andhra', 'Frozen Yogurt', 'Wraps', 'Italian', 'Kashmiri', 'Bihari', 'African', 'Kerala', 'Sri Lankan', 'Greek', 'Chinese', 'Gujarati', 'Desserts', 'Awadhi', 'Juices', 'Indonesian', 'Healthy Food', 'Brazilian'

# Adding Cuisines as Features

In [5]:
cuisine_vals_list = []
for i, row in tqdm(alldata.iterrows()):
    location_dict = {k:0 for k in unique_cuisines}
    for k in row['Cuisines'].split(","):
        location_dict[k.strip()] = 1
    cuisine_vals_list.append(location_dict)
    
alldata = alldata.drop("Cuisines", axis=1)
alldata = pd.concat((alldata, pd.DataFrame(cuisine_vals_list).fillna(0).astype(np.int8)), axis=1)
alldata.head()

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




Unnamed: 0,Restaurant,Location,Average_Cost,Minimum_Order,Rating,Votes,Reviews,North Indian,French,Mithai,Konkan,Bohri,Cafe,Bakery,Turkish,North Eastern,Mishti,Bubble Tea,Hot dogs,Vietnamese,Pizza,Japanese,Odia,Mughlai,Tamil,Maharashtrian,Rolls,Momos,Bengali,Naga,Mangalorean,Indian,Thai,Mexican,Korean,Tibetan,Chettinad,Biryani,Parsi,Raw Meats,Street Food,Middle Eastern,Assamese,Burger,Rajasthani,Roast Chicken,Tex-Mex,South Indian,Bangladeshi,Ice Cream,Goan,Malaysian,Poké,Asian,Finger Food,Iranian,Cantonese,Coffee,South American,Seafood,Lucknowi,Charcoal Chicken,American,Bar Food,Paan,Malwani,Kebab,Lebanese,Belgian,Continental,Andhra,Frozen Yogurt,Wraps,Italian,Kashmiri,Bihari,African,Kerala,Sri Lankan,Greek,Chinese,Gujarati,Desserts,Awadhi,Juices,Indonesian,Healthy Food,Brazilian,European,Hyderabadi,Spanish,Portuguese,Afghan,BBQ,Arabian,Fast Food,Burmese,German,Steak,Sandwich,Modern Indian,Tea,Mediterranean,Beverages,Nepalese,Israeli,Sushi,Salad
0,ID_6321,"FTI College, Law College Road, Pune",₹200,₹50,3.5,12,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1
1,ID_2882,"Sector 3, Marathalli",₹100,₹50,3.5,11,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,ID_1595,Mumbai Central,₹150,₹50,3.6,99,30,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
3,ID_5929,"Sector 1, Noida",₹250,₹99,3.7,176,95,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,ID_6123,"Rmz Centennial, I Gate, Whitefield",₹200,₹99,3.2,521,235,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0


# Adding City as Feature

In [6]:
city = ['Pune', 'Bangalore', 'Mumbai', 'Noida', 'Bangalore', 'Delhi', 'Pune', 'Delhi', 'Kolkata',
        'Mumbai', 'Pune', 'Kolkata', 'Noida', 'Noida', 'Bangalore', 'Gurgaon', 'Kolkata',
        'Mumbai', 'Pune', 'Hyderabad', 'Bangalore', 'Kolkata', 'Delhi', 'Hyderabad',
        'Gurgaon', 'Delhi', 'Mumbai', 'Delhi', 'Pune', 'Hyderabad', 'Kolkata', 'Hyderabad', 'Hyderabad', 'Hyderabad', 'Bangalore']

location = alldata['Location'].unique().tolist()

city_mapping = dict(zip(location, city))
print("{:<15}{}".format('City', 'Location'))
print("".join("-"*50))
for key, val in city_mapping.items():
    print("{:<15}{}".format(val, key))

alldata['City'] = alldata['Location'].map(city_mapping)
alldata = alldata.drop("Location", axis=1)

City           Location
--------------------------------------------------
Pune           FTI College, Law College Road, Pune
Bangalore      Sector 3, Marathalli
Mumbai         Mumbai Central
Noida          Sector 1, Noida
Bangalore      Rmz Centennial, I Gate, Whitefield
Delhi          Delhi University-GTB Nagar
Pune           Yerawada, Pune, Maharashtra
Delhi          Delhi Administration Flats, Timarpur
Kolkata        Moulali, Kolkata
Mumbai         Dockyard Road, Mumbai CST Area
Pune           Pune University
Kolkata        Gora Bazar, Rajbari, North Dumdum, Kolkata
Noida          D-Block, Sector 63, Noida
Noida          Sector 14, Noida
Bangalore      Mico Layout, Stage 2, BTM Layout,Bangalore
Gurgaon        Laxman Vihar Industrial Area, Sector 3A, Gurgoan
Kolkata        Tiretti, Kolkata
Mumbai         Sandhurst Road, Mumbai CST Area
Pune           MG Road, Pune
Hyderabad      Hyderabad Public School, Begumpet
Bangalore      Majestic
Kolkata        Chandni Chowk, Kolkata
Delhi    

# Numerical Feature Cleaning

In [7]:
for col in ['Rating', 'Votes', 'Reviews']:
    alldata[col] = pd.to_numeric(alldata[col], errors='coerce').fillna(-99)
    
alldata['Average_Cost'] = pd.to_numeric(alldata['Average_Cost'].str[1:].str.replace(',',''), errors='coerce')
alldata['Average_Cost'] = alldata['Average_Cost'].fillna(-99).astype(int)

alldata['Minimum_Order'] = alldata['Minimum_Order'].str[1:].astype(int)

# Other useful Derived Features

In [8]:
alldata['Minimum_Order_Zero'] = np.where(alldata['Minimum_Order'] == 0, 1, 0)

alldata['Reviews_by_Votes'] = alldata['Reviews'] / alldata['Votes']
alldata['Minimum_Order_to_Cost'] = alldata['Minimum_Order'] / alldata['Average_Cost']
alldata["Location_num_Res"] = alldata["City"].map(alldata.groupby("City").Restaurant.nunique())
alldata["Restaurant_branch_count"] = alldata["Restaurant"].map(alldata["Restaurant"].value_counts())

alldata.columns.tolist()

['Restaurant',
 'Average_Cost',
 'Minimum_Order',
 'Rating',
 'Votes',
 'Reviews',
 'North Indian',
 'French',
 'Mithai',
 'Konkan',
 'Bohri',
 'Cafe',
 'Bakery',
 'Turkish',
 'North Eastern',
 'Mishti',
 'Bubble Tea',
 'Hot dogs',
 'Vietnamese',
 'Pizza',
 'Japanese',
 'Odia',
 'Mughlai',
 'Tamil',
 'Maharashtrian',
 'Rolls',
 'Momos',
 'Bengali',
 'Naga',
 'Mangalorean',
 'Indian',
 'Thai',
 'Mexican',
 'Korean',
 'Tibetan',
 'Chettinad',
 'Biryani',
 'Parsi',
 'Raw Meats',
 'Street Food',
 'Middle Eastern',
 'Assamese',
 'Burger',
 'Rajasthani',
 'Roast Chicken',
 'Tex-Mex',
 'South Indian',
 'Bangladeshi',
 'Ice Cream',
 'Goan',
 'Malaysian',
 'Poké',
 'Asian',
 'Finger Food',
 'Iranian',
 'Cantonese',
 'Coffee',
 'South American',
 'Seafood',
 'Lucknowi',
 'Charcoal Chicken',
 'American',
 'Bar Food',
 'Paan',
 'Malwani',
 'Kebab',
 'Lebanese',
 'Belgian',
 'Continental',
 'Andhra',
 'Frozen Yogurt',
 'Wraps',
 'Italian',
 'Kashmiri',
 'Bihari',
 'African',
 'Kerala',
 'Sri Lankan

In [9]:
num_cols = ['Votes', 'Reviews', 'Rating', 'Average_Cost', 'Minimum_Order', 
            'Restaurant_branch_count', 'Location_num_Res', 'Reviews_by_Votes', 'Minimum_Order_to_Cost']
cat_cols = [col for col in alldata.columns if col not in num_cols]
features = pd.get_dummies(alldata.drop(num_cols, axis=1), columns=cat_cols, sparse=True)

In [10]:
features = features.sparse.to_coo()

In [11]:
import scipy
num_features=scipy.sparse.coo_matrix(alldata[num_cols].values) 

In [12]:
features=scipy.sparse.hstack([features, num_features]).tocsr()

In [13]:
train_ohe = features[:train.shape[0], :]
test_ohe = features[train.shape[0]:, :]

print(train_ohe.shape)
print(test_ohe.shape)

(11094, 8882)
(2774, 8882)


In [14]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(train_ohe, train_target, test_size=0.20, random_state=314, stratify=train_target)

In [15]:
import lightgbm as lgb

lgb_fit_params={"early_stopping_rounds":50, 
            "eval_metric" : 'multi_logloss', 
            "eval_set" : [(X_test,y_test)],
            'eval_names': ['valid'],
            'verbose':100
           }

lgb_params = {'boosting_type': 'gbdt',
 'objective': 'multiclass',
 'metric': 'multi_logloss',
 'verbose': 0,
 'bagging_fraction': 0.8,
 'bagging_freq': 1,
 'num_class': 7,
 'feature_fraction': 0.8,
 'lambda_l1': 0.01,
 'lambda_l2': 0.01,
 'learning_rate': 0.1,
 'max_bin': 255,
 'max_depth': -1,
 'min_data_in_bin': 1,
 'min_data_in_leaf': 1,
 'num_leaves': 31}
lgb_params

{'boosting_type': 'gbdt',
 'objective': 'multiclass',
 'metric': 'multi_logloss',
 'verbose': 0,
 'bagging_fraction': 0.8,
 'bagging_freq': 1,
 'num_class': 7,
 'feature_fraction': 0.8,
 'lambda_l1': 0.01,
 'lambda_l2': 0.01,
 'learning_rate': 0.1,
 'max_bin': 255,
 'max_depth': -1,
 'min_data_in_bin': 1,
 'min_data_in_leaf': 1,
 'num_leaves': 31}

In [16]:
clf_lgb = lgb.LGBMClassifier(n_estimators=10000, **lgb_params, random_state=123456789, n_jobs=-1)
clf_lgb.fit(X_train, y_train, **lgb_fit_params)
clf_lgb.best_iteration_

Training until validation scores don't improve for 50 rounds.
[100]	valid's multi_logloss: 0.582777
[200]	valid's multi_logloss: 0.560735
[300]	valid's multi_logloss: 0.555217
Early stopping, best iteration is:
[303]	valid's multi_logloss: 0.554839


303

# Model Training on Complete Data

In [17]:
clf_lgb_fulldata = lgb.LGBMClassifier(n_estimators=int(clf_lgb.best_iteration_*1.2), **lgb_params)
clf_lgb_fulldata.fit(train_ohe, train_target)

LGBMClassifier(bagging_fraction=0.8, bagging_freq=1, boosting_type='gbdt',
               class_weight=None, colsample_bytree=1.0, feature_fraction=0.8,
               importance_type='split', lambda_l1=0.01, lambda_l2=0.01,
               learning_rate=0.1, max_bin=255, max_depth=-1,
               metric='multi_logloss', min_child_samples=20,
               min_child_weight=0.001, min_data_in_bin=1, min_data_in_leaf=1,
               min_split_gain=0.0, n_estimators=363, n_jobs=-1, num_class=7,
               num_leaves=31, objective='multiclass', random_state=None,
               reg_alpha=0.0, reg_lambda=0.0, silent=True, subsample=1.0,
               subsample_for_bin=200000, subsample_freq=0, ...)

In [18]:
%%time
from sklearn.ensemble import RandomForestClassifier
clf_rf_fulldata=RandomForestClassifier(n_estimators=2000, max_features=0.1)
clf_rf_fulldata.fit(train_ohe, train_target)

CPU times: user 16min 30s, sys: 768 ms, total: 16min 30s
Wall time: 16min 30s


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features=0.1, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=2000,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [19]:
%%time
clf_rf2_fulldata=RandomForestClassifier(n_estimators=1000, max_features=0.1)
clf_rf2_fulldata.fit(train_ohe, train_target)

CPU times: user 8min 12s, sys: 392 ms, total: 8min 13s
Wall time: 8min 13s


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features=0.1, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=1000,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

# Ensembling lightgbm and RF model

In [20]:
predictions = np.mean((clf_lgb_fulldata.predict_proba(test_ohe), 
                       clf_rf_fulldata.predict_proba(test_ohe)), axis=0)
predictions = np.argmax(predictions, axis=1)

In [21]:
filename = 'submission_mean_ens_model17.xlsx'

# Make submission
submission = pd.DataFrame({target: pd.Series(predictions).map(class_map_rev).apply(lambda x: str(x)+" minutes")})
submission.to_excel(filename, index=False)
submission.head()

Unnamed: 0,Delivery_Time
0,30 minutes
1,30 minutes
2,30 minutes
3,30 minutes
4,30 minutes


In [23]:
from scipy.stats import hmean
predictions = hmean((np.clip(clf_lgb_fulldata.predict_proba(test_ohe), 0.001, 1),
                     np.clip(clf_rf_fulldata.predict_proba(test_ohe), 0.001, 1)), axis=0)
predictions = np.argmax(predictions, axis=1)

In [24]:
filename = 'submission_hmean_ens_model17.xlsx'

# Make submission
submission2 = pd.DataFrame({target: pd.Series(predictions).map(class_map_rev).apply(lambda x: str(x)+" minutes")})
submission2.to_excel(filename, index=False)
submission2.head()

Unnamed: 0,Delivery_Time
0,30 minutes
1,30 minutes
2,30 minutes
3,30 minutes
4,30 minutes


In [26]:
predictions = np.mean((clf_lgb_fulldata.predict_proba(test_ohe), 
                       clf_rf2_fulldata.predict_proba(test_ohe)), axis=0)
predictions = np.argmax(predictions, axis=1)

In [27]:
filename = 'submission_mean2_ens_model17.xlsx'

# Make submission
submission3 = pd.DataFrame({target: pd.Series(predictions).map(class_map_rev).apply(lambda x: str(x)+" minutes")})
submission3.to_excel(filename, index=False)
submission3.head()

Unnamed: 0,Delivery_Time
0,30 minutes
1,30 minutes
2,30 minutes
3,30 minutes
4,30 minutes


# Second Layer - Mode based Ensemble 

In [29]:
pred1 = pd.read_excel('submission_mean_ens_model17.xlsx', index_col=None)
pred2 = pd.read_excel('submission_hmean_ens_model17.xlsx', index_col=None)
pred3 = pd.read_excel('submission_mean2_ens_model17.xlsx', index_col=None)

In [30]:
predictions = (pd.concat((pred1['Delivery_Time'],
                    pred2['Delivery_Time'],
                    pred3['Delivery_Time']), axis=1)).mode(axis=1)

In [31]:
submission_final = pd.DataFrame({'Delivery_Time': predictions[0]})
submission_final.to_excel("runs_3ensemble_v2.xlsx", index=False)
submission_final.head()

Unnamed: 0,Delivery_Time
0,30 minutes
1,30 minutes
2,30 minutes
3,30 minutes
4,30 minutes
