In [2]:
#Invincible_Predictors_Model_Selection

#Importing Libraries
import pandas as pd
import numpy as np
import sklearn

In [3]:
#Importing CSV Files
air_reserve = pd.read_csv('air_reserve.csv',parse_dates=['visit_datetime', 'reserve_datetime'])
air_store_info = pd.read_csv('air_store_info.csv')
store_id_relation = pd.read_csv('store_id_relation.csv')
date_info = pd.read_csv('date_info.csv',parse_dates=['calendar_date'])
train = pd.read_csv('train.csv',parse_dates=['visit_date'])

In [4]:
#dropping the day of week column as we would already extract it from datetime value.
date_info.drop(columns=['day_of_week'],inplace=True)

#renaming date column so that it will be useful while merging the holiday flag with training data.
date_info.rename(columns={'calendar_date':'visit_date'},inplace=True)

In [5]:
#preparing final training dataset by merging relevant features to the train data.
train_data = train.merge(air_store_info, how='left', on='air_store_id')

#Extracting year month weekday as new features
train_data["visit_year"] = pd.DatetimeIndex(train_data['visit_date']).year
train_data["visit_month"] = pd.DatetimeIndex(train_data['visit_date']).month
train_data["visit_weekday"] = pd.DatetimeIndex(train_data['visit_date']).weekday

#Extracting city ward neighbourhood as new features
train_data['city'] = train_data['air_area_name'].str.split().str[0]
train_data['ward'] = train_data['air_area_name'].str.split().str[1]
train_data['neighborhood'] = train_data['air_area_name'].str.split().str[2]

#Add holiday flag from date info table
train_data = train_data.merge(date_info,how='left',on='visit_date')

#Making all object type columns as categorical columns.
for col in train_data.columns:
    if train_data[col].dtype == 'object':
        train_data[col] = train_data[col].astype('category')
        
#Implementing labelencoding
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

for feature in ['city','ward','neighborhood','holiday_flg','air_genre_name','air_area_name']:
    train_data[feature] = le.fit_transform(train_data[feature].astype(str))

    
# #Implementing OneHotEncoding using get dummies method
train_data = pd.concat([train_data,pd.get_dummies(train_data['holiday_flg'], prefix='holiday_flg')],axis=1)
train_data.drop(['holiday_flg'],axis=1, inplace=True)     

In [6]:
#Adding mean median and min max visitors column grouped by air store id and weekday
group_by_cols = ['air_store_id','visit_weekday']
visitor_stats = train_data\
                .groupby(group_by_cols)\
                ['visitors']\
                .agg(['mean', 'median', 'min','max'])\
                .rename(columns=lambda colname: str(colname)+'_visitors')\
                .reset_index()

In [7]:
visitor_stats

Unnamed: 0,air_store_id,visit_weekday,mean_visitors,median_visitors,min_visitors,max_visitors
0,air_00a91d42b08b08d9,0,22.727273,20.0,1.0,47.0
1,air_00a91d42b08b08d9,1,23.578947,24.0,1.0,41.0
2,air_00a91d42b08b08d9,2,28.243243,28.0,15.0,52.0
3,air_00a91d42b08b08d9,3,29.542857,30.0,15.0,47.0
4,air_00a91d42b08b08d9,4,36.459459,35.0,20.0,57.0
...,...,...,...,...,...,...
5798,air_fff68b929994bfbd,2,4.666667,4.0,1.0,12.0
5799,air_fff68b929994bfbd,3,4.871795,5.0,1.0,12.0
5800,air_fff68b929994bfbd,4,5.682927,5.0,1.0,17.0
5801,air_fff68b929994bfbd,5,7.605263,7.0,2.0,18.0


In [8]:
train_data = train_data.merge(visitor_stats,how='left',on=group_by_cols)

In [81]:
train_data.head()

Unnamed: 0,air_store_id,visit_date,visitors,air_genre_name,air_area_name,latitude,longitude,visit_year,visit_month,visit_weekday,city,ward,neighborhood,holiday_flg_0,holiday_flg_1,mean_visitors,median_visitors,min_visitors,max_visitors
0,air_e3020992d5fe5dfd,2016-07-01,21,6,46,35.670651,139.771861,2016,7,4,7,6,90,1,0,15.925,18.0,3.0,25.0
1,air_e3020992d5fe5dfd,2016-07-02,19,6,46,35.670651,139.771861,2016,7,5,7,6,90,1,0,13.405405,14.0,2.0,30.0
2,air_e3020992d5fe5dfd,2016-07-04,8,6,46,35.670651,139.771861,2016,7,0,7,6,90,1,0,9.933333,10.0,2.0,22.0
3,air_e3020992d5fe5dfd,2016-07-05,11,6,46,35.670651,139.771861,2016,7,1,7,6,90,1,0,11.157895,10.5,2.0,24.0
4,air_e3020992d5fe5dfd,2016-07-06,16,6,46,35.670651,139.771861,2016,7,2,7,6,90,1,0,12.5,13.0,2.0,20.0


In [9]:
train_data.to_pickle("./ModelTraining_TrainData.pkl")

In [10]:
# Create evaluation function (the competition uses Root Mean Square Log Error)
from sklearn.metrics import mean_squared_log_error

def rmsle(y_test, y_preds):
    return np.sqrt(mean_squared_log_error(y_test, y_preds))

In [24]:
#train test split
from sklearn.model_selection import train_test_split
X = train_data.drop(["air_store_id","visit_date","visitors","air_area_name","longitude"], axis=1)
y = train_data["visitors"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)

In [83]:
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 225227 entries, 0 to 239672
Data columns (total 14 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   air_genre_name   225227 non-null  int32  
 1   latitude         225227 non-null  float64
 2   visit_year       225227 non-null  int64  
 3   visit_month      225227 non-null  int64  
 4   visit_weekday    225227 non-null  int64  
 5   city             225227 non-null  int32  
 6   ward             225227 non-null  int32  
 7   neighborhood     225227 non-null  int32  
 8   holiday_flg_0    225227 non-null  uint8  
 9   holiday_flg_1    225227 non-null  uint8  
 10  mean_visitors    225227 non-null  float64
 11  median_visitors  225227 non-null  float64
 12  min_visitors     225227 non-null  float64
 13  max_visitors     225227 non-null  float64
dtypes: float64(5), int32(4), int64(3), uint8(2)
memory usage: 19.3 MB


In [73]:
#Trying simple Linear Regression model

from sklearn.linear_model import LinearRegression
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)
y_preds=lr_model.predict(X_test)
rmsle(y_test, y_preds)

0.5358758809585568

In [74]:
#Trying KNeighbors Regression model

from sklearn.neighbors import KNeighborsRegressor
knr_model = KNeighborsRegressor(n_jobs=-1, n_neighbors=10)
knr_model.fit(X_train, y_train)
y_preds=knr_model.predict(X_test)
rmsle(y_test, y_preds)

0.5226004963298835

In [75]:
#Trying Random Forest Regressor Regression model 

from sklearn.ensemble import RandomForestRegressor

rfrmodel = RandomForestRegressor(n_estimators=20, n_jobs=-1, 
                                 max_samples=None)

rfrmodel.fit(X_train, y_train)
y_preds=rfrmodel.predict(X_test)
rmsle(y_test, y_preds)

0.5450944134760952

In [76]:
from sklearn.model_selection import GridSearchCV

params_grid = { "n_estimators": [20], 
                 "n_jobs": [-1], 
                "max_samples": [None],
               "min_samples_split": [1,5,10,15],
               "min_samples_leaf": [1,2,3,4,5],
               }
grid_search = GridSearchCV(rfrmodel, params_grid,
                           n_jobs=-1, cv=5,
                           verbose=-1, scoring='neg_mean_squared_log_error')
grid_search.fit(X_train, y_train)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:  2.8min
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  4.6min finished


GridSearchCV(cv=5, estimator=RandomForestRegressor(n_estimators=20, n_jobs=-1),
             n_jobs=-1,
             param_grid={'max_samples': [None],
                         'min_samples_leaf': [1, 2, 3, 4, 5],
                         'min_samples_split': [1, 5, 10, 15],
                         'n_estimators': [20], 'n_jobs': [-1]},
             scoring='neg_mean_squared_log_error', verbose=-1)

In [77]:
grid_search.best_estimator_.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'criterion': 'mse',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 5,
 'min_samples_split': 15,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 20,
 'n_jobs': -1,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [78]:
#Trying Random Forest Regressor Regression model 

from sklearn.ensemble import RandomForestRegressor

rfrmodel = RandomForestRegressor(n_estimators=200, min_samples_leaf=5,
                                 min_samples_split=15,
                                 max_features=1, n_jobs=-1, 
                                 )

rfrmodel.fit(X_train, y_train)
y_preds=rfrmodel.predict(X_test)
rmsle(y_test, y_preds)

0.5106684368905897