In [32]:
#Invincible_Predictors_Model_Building_And_Predictions

#Importing Libraries
import pandas as pd
import numpy as np

In [33]:
# Load the raw input tables. Date/time columns are parsed while reading so
# that later merges and date-part extraction operate on real datetimes.
air_reserve = pd.read_csv(
    'air_reserve.csv',
    parse_dates=['visit_datetime', 'reserve_datetime'],
)
air_store_info = pd.read_csv('air_store_info.csv')
store_id_relation = pd.read_csv('store_id_relation.csv')
date_info = pd.read_csv(
    'date_info.csv',
    parse_dates=['calendar_date'],
)
train = pd.read_csv(
    'train.csv',
    parse_dates=['visit_date'],
)
sample_submission = pd.read_csv('sample_submission.csv')

In [34]:
# Prepare the calendar table for joining onto the visit data:
#   * 'day_of_week' is dropped because the weekday is extracted from the
#     visit date itself later on;
#   * 'calendar_date' is renamed to 'visit_date' so the holiday flag can be
#     merged on the same key as the train / submission frames.
# The result is rebound instead of mutated in place, and errors='ignore'
# keeps the cell re-runnable: an in-place drop raises KeyError on a second
# execution because the column is already gone.
date_info = (
    date_info
    .drop(columns=['day_of_week'], errors='ignore')
    .rename(columns={'calendar_date': 'visit_date'})
)

In [35]:
# Build the training frame: every visit row enriched with store metadata,
# calendar parts, area components and the holiday flag.
train_data = pd.merge(train, air_store_info, on='air_store_id', how='left')

# Calendar features derived from the visit date.
visit_dates = pd.DatetimeIndex(train_data['visit_date'])
train_data["visit_year"] = visit_dates.year
train_data["visit_month"] = visit_dates.month
train_data["visit_weekday"] = visit_dates.weekday

# 'air_area_name' is split on whitespace; its first three tokens become
# city, ward and neighborhood.
area_tokens = train_data['air_area_name'].str.split()
for position, part_name in enumerate(['city', 'ward', 'neighborhood']):
    train_data[part_name] = area_tokens.str[position]

# Attach the holiday flag of each visit date from the calendar table.
train_data = train_data.merge(date_info, on='visit_date', how='left')

# Convert every object-typed column to a pandas categorical.
object_columns = [name for name in train_data.columns
                  if train_data[name].dtype == 'object']
train_data = train_data.astype({name: 'category' for name in object_columns})

# Label-encode the categorical features; the single encoder instance is
# re-fitted on each column's string representation in turn.
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

encoded_features = ['city','ward','neighborhood','holiday_flg','air_genre_name','air_area_name']
for feature in encoded_features:
    feature_as_text = train_data[feature].astype(str)
    train_data[feature] = le.fit_transform(feature_as_text)

# One-hot encode the (already integer-coded) holiday flag with get_dummies
# and remove the source column.
holiday_dummies = pd.get_dummies(train_data['holiday_flg'], prefix='holiday_flg')
train_data = pd.concat([train_data, holiday_dummies], axis=1).drop(columns=['holiday_flg'])

In [36]:
# Summary statistics of the visitor counts (mean, median, min, max) for each
# (air_store_id, visit_weekday) group, with the group keys as regular columns.

group_by_cols = ['air_store_id','visit_weekday']
grouped_visitors = train_data.groupby(group_by_cols)['visitors']
visitor_stats = (
    grouped_visitors
    .agg(['mean','median', 'min','max'])
    .add_suffix('_visitors')   # mean -> mean_visitors, median -> median_visitors, ...
    .reset_index()
)

In [37]:
# Display the per-store / per-weekday visitor statistics computed above.
visitor_stats

Unnamed: 0,air_store_id,visit_weekday,mean_visitors,median_visitors,min_visitors,max_visitors
0,air_00a91d42b08b08d9,0,22.727273,20.0,1.0,47.0
1,air_00a91d42b08b08d9,1,23.578947,24.0,1.0,41.0
2,air_00a91d42b08b08d9,2,28.243243,28.0,15.0,52.0
3,air_00a91d42b08b08d9,3,29.542857,30.0,15.0,47.0
4,air_00a91d42b08b08d9,4,36.459459,35.0,20.0,57.0
...,...,...,...,...,...,...
5798,air_fff68b929994bfbd,2,4.666667,4.0,1.0,12.0
5799,air_fff68b929994bfbd,3,4.871795,5.0,1.0,12.0
5800,air_fff68b929994bfbd,4,5.682927,5.0,1.0,17.0
5801,air_fff68b929994bfbd,5,7.605263,7.0,2.0,18.0


In [38]:
# Left-join the store/weekday visitor statistics onto every training row.
train_data = pd.merge(train_data, visitor_stats, on=group_by_cols, how='left')

In [39]:
# Persist the engineered training frame as a pickle file.
pd.to_pickle(train_data, "./PreFinalPred_FullTrainData.pkl")

In [40]:
# Build the prediction frame from the sample submission, applying the same
# feature engineering that was applied to the training data.
#
# Changes compared with the previous version of this cell:
#   * `rsplit` gets the split count as the keyword `n=1`; pandas 2.x no
#     longer accepts it positionally.
#   * Label codes come from encoders fitted on the *training* categories.
#     Fitting a fresh encoder on the test rows numbers the labels by the
#     test set's own sorted values, so the same city / genre / area could
#     receive a different integer than the one the models were trained on.
#   * Both holiday dummy columns are always present, even when the test
#     dates contain a single flag value.
from sklearn.preprocessing import LabelEncoder

sample_submission = pd.read_csv('sample_submission.csv')

# 'id' looks like '<air_store_id>_<visit_date>'; split once from the right.
id_parts = sample_submission['id'].str.rsplit('_', n=1)
sample_submission['air_store_id'] = id_parts.str[0]
sample_submission['visit_date'] = pd.to_datetime(id_parts.str[1])

# Calendar features derived from the visit date.
visit_dates = pd.DatetimeIndex(sample_submission['visit_date'])
sample_submission["visit_year"] = visit_dates.year
sample_submission["visit_month"] = visit_dates.month
sample_submission["visit_weekday"] = visit_dates.weekday

sample_submission = sample_submission.merge(air_store_info, how='left', on='air_store_id')
sample_submission = sample_submission.drop(columns=['id'])

# First three whitespace-separated tokens of the area name.
area_tokens = sample_submission['air_area_name'].str.split()
sample_submission['city'] = area_tokens.str[0]
sample_submission['ward'] = area_tokens.str[1]
sample_submission['neighborhood'] = area_tokens.str[2]

sample_submission = sample_submission.merge(date_info,how='left',on='visit_date')

# Re-create the raw (pre-encoding) training categories. `train_data` already
# holds integer codes at this point, so the original labels are rebuilt from
# the `train`, `air_store_info` and `date_info` tables with the same joins
# that were used to build the training frame.
train_categories = train[['air_store_id', 'visit_date']].merge(
    air_store_info[['air_store_id', 'air_genre_name', 'air_area_name']],
    how='left', on='air_store_id')
train_area_tokens = train_categories['air_area_name'].str.split()
train_categories['city'] = train_area_tokens.str[0]
train_categories['ward'] = train_area_tokens.str[1]
train_categories['neighborhood'] = train_area_tokens.str[2]
train_categories = train_categories.merge(date_info, how='left', on='visit_date')

#Making all object type columns as categorical columns.
for col in sample_submission.columns:
    if sample_submission[col].dtype == 'object':
        sample_submission[col] = sample_submission[col].astype('category')

#Implementing labelencoding with the class order learned from the training labels
le = LabelEncoder()

for feature in ['city','ward','neighborhood','holiday_flg','air_genre_name','air_area_name']:
    le.fit(train_categories[feature].astype(str))
    test_labels = sample_submission[feature].astype(str)
    # Position of each label within the training classes; a label that never
    # occurred in training becomes -1 instead of a misleading valid code.
    codes = pd.Categorical(test_labels, categories=le.classes_).codes
    sample_submission[feature] = codes.astype('int64')

# One-hot encode the holiday flag; reindex guarantees that both expected
# columns exist (filled with 0 when a flag value is absent from the test dates).
holiday_dummies = (
    pd.get_dummies(sample_submission['holiday_flg'], prefix='holiday_flg')
    .reindex(columns=['holiday_flg_0', 'holiday_flg_1'], fill_value=0)
)
sample_submission = pd.concat([sample_submission, holiday_dummies], axis=1)
sample_submission = sample_submission.drop(columns=['holiday_flg'])

# Keep the columns in the same order as the training frame.
sample_submission=sample_submission[['air_store_id', 'visit_date', 'visitors', 'air_genre_name',
       'air_area_name','latitude', 'longitude','visit_year', 'visit_month', 'visit_weekday', 'city',
       'ward', 'neighborhood', 'holiday_flg_0', 'holiday_flg_1']]


In [41]:
# Left-join the store/weekday visitor statistics onto every submission row.
sample_submission = pd.merge(sample_submission, visitor_stats, on=group_by_cols, how='left')

In [42]:
# Fill missing values in the submission features (the left merge with the
# visitor statistics can leave gaps) with the matching column means of the
# training data.
# numeric_only=True limits the means to numeric columns. `train_data` also
# holds non-numeric columns (air_store_id, visit_date); averaging those only
# triggered a warning on older pandas and raises TypeError on pandas 2.x.
sample_submission = sample_submission.fillna(train_data.mean(numeric_only=True))

  """Entry point for launching an IPython kernel.


In [43]:
# Persist the engineered submission frame as a pickle file.
pd.to_pickle(sample_submission, "./sample_submission.pkl")

In [44]:
# Evaluation metric (the competition uses Root Mean Square Log Error).
from sklearn.metrics import mean_squared_log_error

def rmsle(y_test, y_preds):
    """Return the root mean squared logarithmic error.

    Parameters
    ----------
    y_test : array-like
        Ground-truth target values.
    y_preds : array-like
        Predicted target values.

    Returns
    -------
    The square root of ``mean_squared_log_error(y_test, y_preds)``.
    """
    msle = mean_squared_log_error(y_test, y_preds)
    return np.sqrt(msle)

In [45]:
# Split both frames into feature matrix and target. The "test" set is the
# sample submission, i.e. the rows to predict.
non_feature_cols = ["air_store_id","visit_date","visitors","air_area_name","longitude"]

X_train = train_data.drop(columns=non_feature_cols)
Y_train = train_data["visitors"]

X_test = sample_submission.drop(columns=non_feature_cols)
y_test = sample_submission["visitors"]

In [46]:
# Print column names, non-null counts and dtypes of the training features.
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 239673 entries, 0 to 239672
Data columns (total 14 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   air_genre_name   239673 non-null  int32  
 1   latitude         239673 non-null  float64
 2   visit_year       239673 non-null  int64  
 3   visit_month      239673 non-null  int64  
 4   visit_weekday    239673 non-null  int64  
 5   city             239673 non-null  int32  
 6   ward             239673 non-null  int32  
 7   neighborhood     239673 non-null  int32  
 8   holiday_flg_0    239673 non-null  uint8  
 9   holiday_flg_1    239673 non-null  uint8  
 10  mean_visitors    239673 non-null  float64
 11  median_visitors  239673 non-null  float64
 12  min_visitors     239673 non-null  float64
 13  max_visitors     239673 non-null  float64
dtypes: float64(5), int32(4), int64(3), uint8(2)
memory usage: 20.6 MB


In [47]:
# Linear regression: fit on the training features, then predict the
# visitor counts for the submission rows.
from sklearn.linear_model import LinearRegression

lr_model = LinearRegression().fit(X_train, Y_train)   # fit() returns the estimator itself
y_preds_lr = lr_model.predict(X_test)
y_preds_lr

array([10.35217133, 15.47652515, 12.9667967 , ...,  7.37359979,
        8.76314381,  7.24427385])

In [48]:
# K-nearest-neighbours regression (10 neighbours, all CPU cores): fit on the
# training features, then predict the visitor counts for the submission rows.
from sklearn.neighbors import KNeighborsRegressor

knr_model = KNeighborsRegressor(n_neighbors=10, n_jobs=-1).fit(X_train, Y_train)
y_preds_knr = knr_model.predict(X_test)
y_preds_knr

array([10.8, 16.6, 12.6, ..., 10.3,  8.9,  7.5])

In [54]:
#Modelling Random Forest regression
from sklearn.ensemble import RandomForestRegressor

# random_state fixes the forest's randomness (bootstrap samples and the
# features drawn at each split) so that re-running the notebook reproduces
# the same model and the same submission file.
# NOTE(review): max_features=1 is an integer, meaning one candidate feature
# per split; if "all features" was intended it should be 1.0 - confirm.
rfrmodel = RandomForestRegressor(n_estimators=200, min_samples_leaf=5,
                                 min_samples_split=15,
                                 max_features=1, n_jobs=-1,
                                 random_state=42,
                                 )

rfrmodel.fit(X_train, Y_train)
y_preds_rfr=rfrmodel.predict(X_test)
y_preds_rfr

array([12.15815023, 15.87885469, 12.12582352, ...,  8.41997062,
       10.36516211,  7.99060629])

In [55]:
# Start from the untouched sample submission so the output keeps its
# original 'id' column.
final_submission = pd.read_csv(filepath_or_buffer='sample_submission.csv')

In [56]:
# Replace the 'visitors' column with the random-forest predictions.
final_submission = final_submission.assign(visitors=y_preds_rfr)

In [57]:
# Display the submission frame with the predicted visitor counts.
final_submission

Unnamed: 0,id,visitors
0,air_e3020992d5fe5dfd_2017-04-06,12.158150
1,air_e3020992d5fe5dfd_2017-04-07,15.878855
2,air_e3020992d5fe5dfd_2017-04-08,12.125824
3,air_e3020992d5fe5dfd_2017-04-10,10.307418
4,air_e3020992d5fe5dfd_2017-04-11,10.666235
...,...,...
12430,air_4ce7b17062a1bf73_2017-04-18,6.811946
12431,air_4ce7b17062a1bf73_2017-04-19,9.162706
12432,air_4ce7b17062a1bf73_2017-04-20,8.419971
12433,air_4ce7b17062a1bf73_2017-04-21,10.365162


In [58]:
# Write the predictions to CSV without the DataFrame index.
final_submission.to_csv(path_or_buf='prediction_rfr.csv', index=False)