In [30]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import xgboost as xgb
from statsmodels.nonparametric.smoothers_lowess import lowess
pd.set_option('display.max_columns', 100)

In [31]:
# Directory holding the competition CSVs; change this one constant to run the
# notebook on another machine instead of editing every read_csv call.
DATA_DIR = '/home/akshay/Downloads'

df_train = pd.read_csv(DATA_DIR + '/Train_xyqdbho.csv')
df_test = pd.read_csv(DATA_DIR + '/Test_pyI9Owa.csv')

In [32]:
# lowess filter for Direction of Wind to smoothen the noise.....
# The series is smoothed in consecutive 100-row chunks. Chunk bounds are derived
# from len(df_train), so the last (shorter) chunk is handled by the same loop
# instead of a hard-coded row range; `filtered` always ends up with one value per
# training row.
# NOTE(review): a very short final chunk may be poorly conditioned for lowess
# with frac=0.25 -- re-check if the training file size ever changes.
filtered = []
for i in range(0, len(df_train), 100):
    chunk = df_train.Direction_Of_Wind[i:i+100]
    # Direction_Of_Wind is the response; the position inside the chunk is the x-axis.
    filtered.extend(lowess(chunk, range(len(chunk)), frac=0.25, it=0, return_sorted=False))

In [33]:
# lowess filter for Direction of Wind.....
# Same chunked smoothing as for the training frame: consecutive 100-row chunks,
# with bounds derived from len(df_test) so the final partial chunk needs no
# hard-coded row range and `filtered_test` has one value per test row.
# NOTE(review): a very short final chunk may be poorly conditioned for lowess
# with frac=0.25 -- re-check if the test file size ever changes.
filtered_test = []
for i in range(0, len(df_test), 100):
    chunk = df_test.Direction_Of_Wind[i:i+100]
    # Direction_Of_Wind is the response; the position inside the chunk is the x-axis.
    filtered_test.extend(lowess(chunk, range(len(chunk)), frac=0.25, it=0, return_sorted=False))

In [34]:
# Attach the lowess-smoothed wind direction to each frame as a new column.
for frame, smoothed in ((df_train, filtered), (df_test, filtered_test)):
    frame['filtered_dow'] = np.array(smoothed)

In [35]:
#Date-time features
# Parse the day-first date strings once, then derive calendar parts with the
# vectorised `.dt` accessor instead of a Python-level lambda per row.
for frame in (df_test, df_train):
    frame['Date'] = pd.to_datetime(frame.Date, dayfirst=True)

    frame['Weekday'] = frame.Date.dt.weekday  # Monday == 0 ... Sunday == 6
    frame['Month'] = frame.Date.dt.month
    frame['Year'] = frame.Date.dt.year

In [36]:
# aggregating months into seasons
def get_seasons(month):
    """Map a month number to a season code.

    Months 3-6 -> 1, 7-9 -> 2, 10-12 -> 3; anything else (including
    months 1 and 2) -> 4.
    """
    # Everything outside the 3..12 band falls into the catch-all bucket.
    if not 3 <= month <= 12:
        return 4
    if month <= 6:
        return 1
    if month <= 9:
        return 2
    return 3

# Derive the season code from the month column on both frames.
for frame in (df_test, df_train):
    frame['Season'] = frame['Month'].apply(get_seasons)

In [37]:
# Day of the month, taken with the vectorised `.dt` accessor rather than a
# per-row lambda (Date is already a datetime column at this point).
df_train['Day'] = df_train.Date.dt.day
df_test['Day'] = df_test.Date.dt.day

In [38]:
for frame in (df_train, df_test):
    # Approximate ordinal day: every month is counted as 30 days, so this is
    # not the calendar day-of-year.
    frame['day_of_year'] = frame['Month'] * 30 + frame['Day']

    # Percent change features which proved to be very effective.....
    # NOTE(review): pct_change runs over the frame's row order as loaded;
    # confirm that consecutive rows are consecutive observations.
    frame['pct_chng_avg_br_speed'] = frame['Average_Breeze_Speed'].pct_change()
    frame['pct_chng_dow'] = frame['Direction_Of_Wind'].pct_change()
    frame['pct_chng_pressure'] = frame['Average_Atmospheric_Pressure'].pct_change()

    # Midpoint of the max/min ambient pollution readings, then its row-over-row change.
    frame['Avg_pollution'] = (frame['Max_Ambient_Pollution'] + frame['Min_Ambient_Pollution']) / 2
    frame['pct_chng_pollution'] = frame['Avg_pollution'].pct_change()

    frame['pct_chng_avg_moist'] = frame['Average_Moisture_In_Park'].pct_change()

In [39]:
# Rolling mean features again to smooth out noise.....
# `pd.rolling_mean` no longer exists in pandas; `Series.rolling(window).mean()`
# is its replacement and likewise leaves the first window-1 rows as NaN.
for frame in (df_train, df_test):
    for window in (30, 3, 7, 9):
        frame['rolling_mean_avg_br_speed_w%d' % window] = (
            frame.Average_Breeze_Speed.rolling(window=window).mean()
        )

In [40]:
# Trailing rolling means of average atmospheric pressure (3/7/9-row windows).
# Uses `Series.rolling(...).mean()` because `pd.rolling_mean` was removed from pandas.
for frame in (df_train, df_test):
    for window in (3, 7, 9):
        frame['rolling_mean_avg_pressure_w%d' % window] = (
            frame.Average_Atmospheric_Pressure.rolling(window=window).mean()
        )

In [41]:
# Trailing rolling means of average park moisture (3/7/9-row windows).
# Uses `Series.rolling(...).mean()` because `pd.rolling_mean` was removed from pandas.
for frame in (df_train, df_test):
    for window in (3, 7, 9):
        frame['rolling_mean_avg_moist_w%d' % window] = (
            frame.Average_Moisture_In_Park.rolling(window=window).mean()
        )

In [42]:
# Trailing rolling means of wind direction (30/3/7/9-row windows).
# Uses `Series.rolling(...).mean()` because `pd.rolling_mean` was removed from pandas.
for frame in (df_train, df_test):
    for window in (30, 3, 7, 9):
        frame['rolling_mean_dow_w%d' % window] = (
            frame.Direction_Of_Wind.rolling(window=window).mean()
        )

In [43]:
# Trailing rolling means of the averaged pollution column (3/7/9-row windows).
# Uses `Series.rolling(...).mean()` because `pd.rolling_mean` was removed from pandas.
for frame in (df_train, df_test):
    for window in (3, 7, 9):
        frame['rolling_mean_avg_pollution_w%d' % window] = (
            frame.Avg_pollution.rolling(window=window).mean()
        )

In [44]:
# A moonshot feature which helped.....
# Mean of Min_Moisture_In_Park per (Park_ID, Month), computed from the training frame.
x = df_train.groupby(['Park_ID', 'Month'], as_index=False)['Min_Moisture_In_Park'].mean()

# Left-join the aggregate onto both frames; the clashing column coming from `x`
# receives the '_min_moist' suffix, the original column keeps its name.
merge_kwargs = dict(on=['Park_ID', 'Month'], how='left', suffixes=("", '_min_moist'))
df_train = df_train.merge(x, **merge_kwargs)
df_test = df_test.merge(x, **merge_kwargs)

# Range of pressure feature
df_train['range_pressure'] = df_train.Max_Atmospheric_Pressure - df_train.Min_Atmospheric_Pressure
df_test['range_pressure'] = df_test.Max_Atmospheric_Pressure - df_test.Min_Atmospheric_Pressure

In [45]:
# Rolling mean of percent change.....
# 7-row trailing mean of the wind-direction percent change, written with
# `Series.rolling(...).mean()` because `pd.rolling_mean` was removed from pandas.
df_train['rolling_mean_pct_chng_dow'] = df_train.pct_chng_dow.rolling(window=7).mean()
df_test['rolling_mean_pct_chng_dow'] = df_test.pct_chng_dow.rolling(window=7).mean()

In [46]:
# Rolling differences of various features.....
def delta(array):
    """Return the first element of ``array`` minus its second element."""
    first, second = array[0], array[1]
    return first - second

for frame in (df_train, df_test):
    # Previous row minus current row (NaN on the first row). This is what the
    # removed `pd.rolling_apply(series, window=2, func=delta)` produced, written
    # as a vectorised shift so it runs on current pandas.
    frame['change_in_dow'] = frame.Direction_Of_Wind.shift(1) - frame.Direction_Of_Wind
    frame['change_in_avg_br_speed'] = frame.Average_Breeze_Speed.shift(1) - frame.Average_Breeze_Speed

    # Change in breeze speed per unit change in wind direction.
    # NOTE(review): rows where change_in_dow is 0 yield +/-inf (or NaN for 0/0);
    # a later fillna only replaces NaN, so inf values reach the model as-is.
    frame['change_br_speed_pu_dow'] = frame.change_in_avg_br_speed / frame.change_in_dow

In [47]:
# Replace every NaN with the -1 sentinel (the regressor is configured with missing=-1).
df_train, df_test = df_train.fillna(value=-1), df_test.fillna(value=-1)

In [48]:
# Binning direction of wind.....
def DOW_to_cat(dow):
    """Bin a wind direction into 45-wide sectors numbered 1-8.

    [0, 45) -> 1, [45, 90) -> 2, ... [270, 315) -> 7. Any other value maps
    to 8, except the -1 missing-value sentinel, for which None is returned.
    """
    if dow >= 0:
        # Walk the upper edges 45, 90, ..., 315; the first edge above `dow` names the sector.
        for sector, upper_edge in enumerate(range(45, 316, 45), start=1):
            if dow < upper_edge:
                return sector
    if dow == -1:
        return None
    return 8

# Apply the sector binning to the wind direction of both frames.
for frame in (df_train, df_test):
    frame['dow_cat'] = frame['Direction_Of_Wind'].apply(DOW_to_cat)

# Binning Average Breeze speed.....
def beaufort_scale(speed):
    """Bin a breeze speed into four coarse levels.

    [0, 10] -> 1, (10, 50] -> 2, (50, 100] -> 3, anything else -> 4,
    except the -1 missing-value sentinel, for which None is returned.
    """
    if speed == -1:
        return None
    # Values outside 0..100 (including NaN) land in the top bucket.
    if not 0 <= speed <= 100:
        return 4
    if speed <= 10:
        return 1
    if speed <= 50:
        return 2
    return 3

# Apply the speed binning to the average breeze speed of both frames.
for frame in (df_train, df_test):
    frame['Average_bf_scale'] = frame['Average_Breeze_Speed'].apply(beaufort_scale)

In [49]:
# The binning functions return None for the -1 sentinel; turn the resulting
# missing entries back into -1, modifying both frames in place.
for frame in (df_train, df_test):
    frame.fillna(-1, inplace=True)

In [54]:
# Fitting Xgboost.....
# Hyper-parameters gathered in one mapping so they can be read and tuned at a glance.
xgb_params = dict(
    missing=-1,            # the frames use -1 as their missing-value sentinel
    n_estimators=300,
    learning_rate=0.02,
    max_depth=17,
    subsample=0.9,
    min_child_weight=3,
    colsample_bytree=0.7,
    reg_alpha=100,
    reg_lambda=100,
    silent=False,
)
xgb_reg = xgb.XGBRegressor(**xgb_params)

In [51]:
# Selecting final predictors.....
# Every training column is a predictor except the ones listed here
# (identifier, target, raw date parts and intermediate helper columns).
excluded_columns = [
    'change_in_avg_br_speed',
    'change_in_dow',
    'ID',
    'Footfall',
    'Weekday',
    'Date',
    'Year',
    'Location_Type',
    'Avg_pollution',
]
predictors = df_train.columns.drop(excluded_columns)

### Latest-----> 98.07:
Index(['Park_ID', 'Direction_Of_Wind', 'Average_Breeze_Speed',
       'Max_Breeze_Speed', 'Min_Breeze_Speed', 'Var1',
       'Average_Atmospheric_Pressure', 'Max_Atmospheric_Pressure',
       'Min_Atmospheric_Pressure', 'Min_Ambient_Pollution',
       'Max_Ambient_Pollution', 'Average_Moisture_In_Park',
       'Max_Moisture_In_Park', 'Min_Moisture_In_Park', 'filtered_dow', 'Month',
       'Season', 'Day', 'day_of_year', 'pct_chng_avg_br_speed', 'pct_chng_dow',
       'pct_chng_pressure', 'pct_chng_pollution', 'pct_chng_avg_moist',
       'rolling_mean_avg_br_speed_w30', 'rolling_mean_avg_br_speed_w3',
       'rolling_mean_avg_br_speed_w7', 'rolling_mean_avg_br_speed_w9',
       'rolling_mean_avg_pressure_w3', 'rolling_mean_avg_pressure_w7',
       'rolling_mean_avg_pressure_w9', 'rolling_mean_avg_moist_w3',
       'rolling_mean_avg_moist_w7', 'rolling_mean_avg_moist_w9',
       'rolling_mean_dow_w30', 'rolling_mean_dow_w3', 'rolling_mean_dow_w7',
       'rolling_mean_dow_w9', 'rolling_mean_avg_pollution_w3',
       'rolling_mean_avg_pollution_w7', 'rolling_mean_avg_pollution_w9',
       'rolling_mean_pct_chng_dow', 'Min_Moisture_In_Park_min_moist','range_pressure',
       'dow_cat', 'Average_bf_scale', 'change_br_speed_pu_dow'],
      dtype='object')

In [52]:
# Display the final predictor list as the cell output.
predictors

Index(['Park_ID', 'Direction_Of_Wind', 'Average_Breeze_Speed',
       'Max_Breeze_Speed', 'Min_Breeze_Speed', 'Var1',
       'Average_Atmospheric_Pressure', 'Max_Atmospheric_Pressure',
       'Min_Atmospheric_Pressure', 'Min_Ambient_Pollution',
       'Max_Ambient_Pollution', 'Average_Moisture_In_Park',
       'Max_Moisture_In_Park', 'Min_Moisture_In_Park', 'filtered_dow', 'Month',
       'Season', 'Day', 'day_of_year', 'pct_chng_avg_br_speed', 'pct_chng_dow',
       'pct_chng_pressure', 'pct_chng_pollution', 'pct_chng_avg_moist',
       'rolling_mean_avg_br_speed_w30', 'rolling_mean_avg_br_speed_w3',
       'rolling_mean_avg_br_speed_w7', 'rolling_mean_avg_br_speed_w9',
       'rolling_mean_avg_pressure_w3', 'rolling_mean_avg_pressure_w7',
       'rolling_mean_avg_pressure_w9', 'rolling_mean_avg_moist_w3',
       'rolling_mean_avg_moist_w7', 'rolling_mean_avg_moist_w9',
       'rolling_mean_dow_w30', 'rolling_mean_dow_w3', 'rolling_mean_dow_w7',
       'rolling_mean_dow_w9', 'rolling_

Index(['Park_ID', 'Direction_Of_Wind', 'Average_Breeze_Speed',
       'Max_Breeze_Speed', 'Min_Breeze_Speed', 'Var1',
       'Average_Atmospheric_Pressure', 'Max_Atmospheric_Pressure',
       'Min_Atmospheric_Pressure', 'Min_Ambient_Pollution',
       'Max_Ambient_Pollution', 'Average_Moisture_In_Park',
       'Max_Moisture_In_Park', 'Min_Moisture_In_Park', 'filtered_dow', 'Month',
       'Season', 'Day', 'day_of_year', 'pct_chng_avg_br_speed', 'pct_chng_dow',
       'pct_chng_pressure', 'pct_chng_pollution', 'pct_chng_avg_moist',
       'rolling_mean_avg_br_speed_w30', 'rolling_mean_avg_br_speed_w3',
       'rolling_mean_avg_br_speed_w7', 'rolling_mean_avg_br_speed_w9',
       'rolling_mean_avg_pressure_w3', 'rolling_mean_avg_pressure_w7',
       'rolling_mean_avg_pressure_w9', 'rolling_mean_avg_moist_w3',
       'rolling_mean_avg_moist_w7', 'rolling_mean_avg_moist_w9',
       'rolling_mean_dow_w30', 'rolling_mean_dow_w3', 'rolling_mean_dow_w7',
       'rolling_mean_dow_w9', 'rolling_mean_avg_pollution_w3',
       'rolling_mean_avg_pollution_w7', 'rolling_mean_avg_pollution_w9',
       'rolling_mean_pct_chng_dow', 'Min_Moisture_In_Park_min_moist',
       'range_pressure', 'dow_cat', 'Average_bf_scale',
       'change_br_speed_pu_dow'],
      dtype='object')

In [55]:
% time xgb_reg.fit(df_train[predictors], df_train.Footfall)

CPU times: user 14min 11s, sys: 3.51 s, total: 14min 14s
Wall time: 7min 49s


XGBRegressor(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.7,
       gamma=0, learning_rate=0.02, max_delta_step=0, max_depth=17,
       min_child_weight=3, missing=-1, n_estimators=300, nthread=-1,
       objective='reg:linear', reg_alpha=100, reg_lambda=100,
       scale_pos_weight=1, seed=0, silent=False, subsample=0.9)

In [None]:
# Score the test frame using the same predictor columns the model was fitted on.
test_features = df_test[predictors]
preds = xgb_reg.predict(test_features)

In [None]:
# Build the submission: one row per test ID with the prediction rounded to a
# whole number. `columns=` pins the ID-then-Footfall order so the CSV layout
# does not depend on how the pandas/Python version orders dict keys.
temp_ = pd.DataFrame(
    {'ID': df_test.ID, 'Footfall': np.round(preds)},
    columns=['ID', 'Footfall'],
)

temp_.to_csv('sol_xgb_test.csv', index=False)