In [1]:
# importing libraries.
import pandas as pd
import numpy as np 

## Loading Datasets
test = pd.read_csv('/home/aakash/Genpact/test_main.csv')
train = pd.read_csv('/home/aakash/Genpact/train/train.csv') ## This is the demand data for raw materials.
train_meal = pd.read_csv('/home/aakash/Genpact/train/meal_info.csv')
train_centers = pd.read_csv('/home/aakash/Genpact/train/fulfilment_center_info.csv')

In [2]:
## EDA
print('train dataset shape',train.shape) # So we have a huge dataset
print('train dataset columns', train.columns)     

train dataset shape (456548, 9)
train dataset columns Index(['id', 'week', 'center_id', 'meal_id', 'checkout_price', 'base_price',
       'emailer_for_promotion', 'homepage_featured', 'num_orders'],
      dtype='object')


In [3]:
test_id = test.id
print(test.shape)
print(test.columns) ## So num_orders is our target variable.

(32573, 8)
Index(['id', 'week', 'center_id', 'meal_id', 'checkout_price', 'base_price',
       'emailer_for_promotion', 'homepage_featured'],
      dtype='object')


In [4]:
train.dtypes ## so clearly our target varibale is Numerical
## So this is a regression analysis problem.

id                         int64
week                       int64
center_id                  int64
meal_id                    int64
checkout_price           float64
base_price               float64
emailer_for_promotion      int64
homepage_featured          int64
num_orders                 int64
dtype: object

In [5]:
train.head()
## A himepage featured meal is likely to have higher num_order.

Unnamed: 0,id,week,center_id,meal_id,checkout_price,base_price,emailer_for_promotion,homepage_featured,num_orders
0,1379560,1,55,1885,136.83,152.29,0,0,177
1,1466964,1,55,1993,136.83,135.83,0,0,270
2,1346989,1,55,2539,134.86,135.86,0,0,189
3,1338232,1,55,2139,339.5,437.53,0,0,54
4,1448490,1,55,2631,243.5,242.5,0,0,40


In [6]:
## Now what I want to do is convert all the 3 train data's into a single dataset.
# Note that center ID and meal ID are going to be the most imprtant features for us.
# Lets check for the null values. if any. So no null values.
print(len(train.meal_id.unique()))  
# So we have 77 unique meal centres and 51 unique meals.
print(train_meal.category.value_counts()) # So most meals actually coreespont to beverages.
print("  ")
print(train_meal.cuisine.value_counts()) # and most people prefer Thai cuisine.

51
Beverages       12
Biryani          3
Fish             3
Desert           3
Starters         3
Sandwich         3
Pizza            3
Seafood          3
Soup             3
Other Snacks     3
Pasta            3
Extras           3
Salad            3
Rice Bowl        3
Name: category, dtype: int64
  
Thai           15
Continental    12
Italian        12
Indian         12
Name: cuisine, dtype: int64


In [7]:
print(train_centers.shape)
train_centers.city_code.value_counts() ## So most meal centres are in city with city ID 590
train_centers.region_code.value_counts() # and most meal centres have a region code 30.
train_centers.center_type.value_counts() # So Type_A might be some centres of high importance.
train_centers.head()

(77, 5)


Unnamed: 0,center_id,city_code,region_code,center_type,op_area
0,11,679,56,TYPE_A,3.7
1,13,590,56,TYPE_B,6.7
2,124,590,56,TYPE_C,4.0
3,66,648,34,TYPE_A,4.1
4,94,632,34,TYPE_C,3.6


In [8]:
## Now lwts compute the mean number of orders..
x = np.mean(train.num_orders)
print(x)

# lets make a baseline submission on test data using this mean.
pd.DataFrame({'id': test.id, 'num_orders' : x}).to_csv('genpact.csv', index = False)

261.8727603669275


In [9]:
# Now lets create new datasets. 
## Creating columns from centre data.
### For Train data ####
for col in train_centers.columns[1:]:
    train[col] = train.center_id.replace(train_centers.set_index('center_id')[col])
    test[col] = test.center_id.replace(train_centers.set_index('center_id')[col])

for col in train_meal.columns[1:]:
    train[col] = train.meal_id.replace(train_meal.set_index('meal_id')[col])
    test[col] = test.meal_id.replace(train_meal.set_index('meal_id')[col])    
    
train['center_type'] = pd.get_dummies(train['center_type'])
test['center_type'] = pd.get_dummies(test['center_type'])

train['cuisine'] = pd.get_dummies(train['cuisine'])
test['cuisine'] = pd.get_dummies(test['cuisine'])

train['category'] = pd.get_dummies(train['category'])
test['category'] = pd.get_dummies(test['category'])

In [10]:
from scipy.stats import skew
print(skew(train.num_orders))

train.num_orders = np.log1p(train.num_orders)
print(skew(train.num_orders))

6.929943296742635
-0.021917552017371056


In [11]:
train_y = train['num_orders']
train_x = train.drop('num_orders', axis = 1)
train_x = train_x.drop('id', axis = 1)
test = test.drop('id', axis = 1)

In [15]:
## Trying some tree based models
# Using Decision Tree 
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_log_error

## Splitting the training data into train and validation sets.
x_train, x_val, y_train, y_val  = train_test_split(train_x, train_y, test_size = 0.25, random_state = 1)

dtree = DecisionTreeRegressor()
dtree.fit(x_train, y_train)

y_pred_val = np.expm1(dtree.predict(x_val))
true_val = np.expm1(y_val)
y_pred_val = np.clip(y_pred_val, np.expm1(y_train.min()), np.expm1(train_y.max()))

np.sqrt(mean_squared_log_error(y_pred_val, true_val))

0.6448518648395613

In [16]:
## Using Random Forest
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor(n_estimators = 50, 
                           oob_score = 'TRUE', 
                           n_jobs = -1, 
                           max_features = 'auto', 
                           min_samples_leaf = 5)
rf.fit(x_train, y_train)

y_pred_val = np.expm1(rf.predict(x_val))
true_val = np.expm1(y_val)
y_pred_val = np.clip(y_pred_val, np.expm1(y_train.min()), np.expm1(train_y.max()))

np.sqrt(mean_squared_log_error(y_pred_val, true_val))

0.47560510110682547

In [18]:
# Using Extra Tree Regressor
from sklearn.ensemble import ExtraTreesRegressor

et = ExtraTreesRegressor(n_estimators = 50, min_samples_leaf = 4,n_jobs = -1)
et.fit(x_train, y_train)

y_pred_val = np.expm1(et.predict(x_val))
true_val = np.expm1(y_val)
y_pred_val = np.clip(y_pred_val, np.expm1(y_train.min()), np.expm1(train_y.max()))

np.sqrt(mean_squared_log_error(y_pred_val, true_val))

0.4801381624326768

In [19]:
## Final prediction on test dataset.
x = np.expm1(et.predict(test))
pd.DataFrame({'id': test_id, 'num_orders' : x}).to_csv('genpact.csv', index = False)