In [None]:
# %%capture
# !pip install -q kaggle
from google.colab import files
files.upload()

# Load and Extract Dataset

In [None]:
!mkdir ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json
!kaggle competitions download -c tabular-playground-series-jul-2021

Downloading test.csv to /content
  0% 0.00/151k [00:00<?, ?B/s]
100% 151k/151k [00:00<00:00, 34.0MB/s]
Downloading train.csv to /content
  0% 0.00/580k [00:00<?, ?B/s]
100% 580k/580k [00:00<00:00, 37.8MB/s]
Downloading sample_submission.csv to /content
  0% 0.00/76.9k [00:00<?, ?B/s]
100% 76.9k/76.9k [00:00<00:00, 79.7MB/s]


# Import Libraries

In [None]:
%%capture

# !pip install tpot
# !pip install mljar-supervised


In [None]:
# Warning Libraries 
import warnings
warnings.filterwarnings("ignore")
# warnings.simplefilter(action='ignore', category=FutureWarning)

# Scientific and Data Manipulation Libraries 
import pandas as pd
import numpy as np
import math
import gc
import os


# Data Preprocessing, Machine Learning and Metrics Libraries 
from sklearn.preprocessing            import LabelEncoder, OneHotEncoder 
from sklearn.preprocessing            import StandardScaler, MinMaxScaler, RobustScaler, MaxAbsScaler

from sklearn.ensemble                 import RandomForestRegressor
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.svm import LinearSVR
from sklearn.neural_network import MLPRegressor

from sklearn.metrics                  import mean_squared_log_error, mean_squared_error
from sklearn.model_selection import KFold, StratifiedKFold, RepeatedKFold, train_test_split, cross_val_score
from sklearn.pipeline import Pipeline

# model visualization
# !pip install shap
# import shap




# Boosting Algorithms 
from xgboost                          import XGBRegressor
!pip install catboost
from catboost                         import CatBoostRegressor
from lightgbm                         import LGBMRegressor

# Tuning
from sklearn.model_selection import GridSearchCV
!pip install optuna

import optuna




# Data Visualization Libraries 
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.io as pio
import plotly.graph_objects as go
import plotly.express as px


SAMPLE_RATE = 0.4
RANDOM_SEED = 1
EARLY_STOPPING_ROUND = 100

# EDA

In [None]:
# Load the competition files; the `date_time` column is parsed as datetimes on read.
date_cols = ['date_time']
train = pd.read_csv('/content/train.csv', parse_dates=date_cols)
test = pd.read_csv('/content/test.csv', parse_dates=date_cols)

sub = pd.read_csv('/content/sample_submission.csv')

# First 5 rows of the train and test data.
for caption, frame in (('Train Head :', train), ('Test Head :', test)):
    display(caption, frame.head())

# Column dtypes and non-null counts of the train and test data.
for frame in (train, test):
    frame.info()

# Descriptive statistics of the train and test data.
for caption, frame in (('Train Description :', train), ('Test  Description :', test)):
    display(caption, frame.describe())

'Train Head :'

Unnamed: 0,date_time,deg_C,relative_humidity,absolute_humidity,sensor_1,sensor_2,sensor_3,sensor_4,sensor_5,target_carbon_monoxide,target_benzene,target_nitrogen_oxides
0,2010-03-10 18:00:00,13.1,46.0,0.7578,1387.2,1087.8,1056.0,1742.8,1293.4,2.5,12.0,167.7
1,2010-03-10 19:00:00,13.2,45.3,0.7255,1279.1,888.2,1197.5,1449.9,1010.9,2.1,9.9,98.9
2,2010-03-10 20:00:00,12.6,56.2,0.7502,1331.9,929.6,1060.2,1586.1,1117.0,2.2,9.2,127.1
3,2010-03-10 21:00:00,11.0,62.4,0.7867,1321.0,929.0,1102.9,1536.5,1263.2,2.2,9.7,177.2
4,2010-03-10 22:00:00,11.9,59.0,0.7888,1272.0,852.7,1180.9,1415.5,1132.2,1.5,6.4,121.8


'Test Head :'

Unnamed: 0,date_time,deg_C,relative_humidity,absolute_humidity,sensor_1,sensor_2,sensor_3,sensor_4,sensor_5
0,2011-01-01 00:00:00,8.0,41.3,0.4375,1108.8,745.7,797.1,880.0,1273.1
1,2011-01-01 01:00:00,5.1,51.7,0.4564,1249.5,864.9,687.9,972.8,1714.0
2,2011-01-01 02:00:00,5.8,51.5,0.4689,1102.6,878.0,693.7,941.9,1300.8
3,2011-01-01 03:00:00,5.0,52.3,0.4693,1139.7,916.2,725.6,1011.0,1283.0
4,2011-01-01 04:00:00,4.5,57.5,0.465,1022.4,838.5,871.5,967.0,1142.3


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7111 entries, 0 to 7110
Data columns (total 12 columns):
 #   Column                  Non-Null Count  Dtype         
---  ------                  --------------  -----         
 0   date_time               7111 non-null   datetime64[ns]
 1   deg_C                   7111 non-null   float64       
 2   relative_humidity       7111 non-null   float64       
 3   absolute_humidity       7111 non-null   float64       
 4   sensor_1                7111 non-null   float64       
 5   sensor_2                7111 non-null   float64       
 6   sensor_3                7111 non-null   float64       
 7   sensor_4                7111 non-null   float64       
 8   sensor_5                7111 non-null   float64       
 9   target_carbon_monoxide  7111 non-null   float64       
 10  target_benzene          7111 non-null   float64       
 11  target_nitrogen_oxides  7111 non-null   float64       
dtypes: datetime64[ns](1), float64(11)
memory usage: 

'Train Description :'

Unnamed: 0,deg_C,relative_humidity,absolute_humidity,sensor_1,sensor_2,sensor_3,sensor_4,sensor_5,target_carbon_monoxide,target_benzene,target_nitrogen_oxides
count,7111.0,7111.0,7111.0,7111.0,7111.0,7111.0,7111.0,7111.0,7111.0,7111.0,7111.0
mean,20.878034,47.561004,1.110309,1091.5721,938.06497,883.903305,1513.238349,998.335565,2.086219,10.237083,204.066784
std,7.937917,17.398731,0.39895,218.537554,281.978988,310.456355,350.18031,381.537695,1.447109,7.694426,193.927723
min,1.3,8.9,0.1988,620.3,364.0,310.6,552.9,242.7,0.1,0.1,1.9
25%,14.9,33.7,0.8559,930.25,734.9,681.05,1320.35,722.85,1.0,4.5,76.45
50%,20.7,47.3,1.0835,1060.5,914.2,827.8,1513.1,928.7,1.7,8.5,141.0
75%,25.8,60.8,1.40415,1215.8,1124.1,1008.85,1720.4,1224.7,2.8,14.2,260.0
max,46.1,90.8,2.231,2088.3,2302.6,2567.4,2913.8,2594.6,12.5,63.7,1472.3


'Test  Description :'

Unnamed: 0,deg_C,relative_humidity,absolute_humidity,sensor_1,sensor_2,sensor_3,sensor_4,sensor_5
count,2247.0,2247.0,2247.0,2247.0,2247.0,2247.0,2247.0,2247.0
mean,10.808144,51.031242,0.627053,1106.53449,836.459769,828.321495,1104.850601,1029.851535
std,6.444497,16.665047,0.266588,205.341455,272.816585,339.511779,293.112225,434.863287
min,-1.8,9.8,0.1847,665.9,356.2,320.1,523.4,218.8
25%,5.6,36.9,0.41335,951.5,640.7,597.05,899.45,688.55
50%,9.8,50.6,0.5964,1080.4,800.8,757.1,1076.2,973.1
75%,14.2,63.55,0.80495,1222.1,1016.1,944.95,1288.35,1324.0
max,30.9,88.8,1.393,1882.9,1776.1,1975.0,2211.4,2593.8


In [None]:
# Show every fully duplicated row (keep=False marks all copies, not just the repeats).
for frame in (train, test):
    display(frame[frame.duplicated(keep=False)])

Unnamed: 0,date_time,deg_C,relative_humidity,absolute_humidity,sensor_1,sensor_2,sensor_3,sensor_4,sensor_5,target_carbon_monoxide,target_benzene,target_nitrogen_oxides


Unnamed: 0,date_time,deg_C,relative_humidity,absolute_humidity,sensor_1,sensor_2,sensor_3,sensor_4,sensor_5


In [None]:
# Total number of missing cells, reported as (train, test).
tuple(frame.isna().sum().sum() for frame in (train, test))

(0, 0)

# Data Transformation

In [None]:
# Derive calendar features from `date_time`, identically for train and test.
# NOTE(review): `dt.dayofweek` and `dt.weekday` appear to be aliases in pandas, so
# `day_of_week` and `weekday` presumably hold the same values — confirm before
# treating them as two independent features.
for frame in (train, test):
    timestamps = frame.date_time.dt
    frame['hour'] = timestamps.hour
    # frame['day'] = timestamps.day
    frame['day_of_week'] = timestamps.dayofweek
    # frame['week'] = timestamps.week
    frame['weekday'] = timestamps.weekday
    # frame['month'] = timestamps.month

In [None]:
# Column groups: the three regression targets and everything that is not a model input.
target_cols = ['target_carbon_monoxide', 'target_benzene', 'target_nitrogen_oxides']
not_features = ['date_time'] + target_cols

# feature list
features = [feature for feature in train.columns if feature not in not_features]
column_list = features

# test dataset: drop the raw timestamp. `errors='ignore'` keeps the cell re-runnable
# once `date_time` has already been removed by an earlier execution (the plain drop
# raised KeyError on the second run).
test = test.drop(columns='date_time', errors='ignore')

# Scaling features
# NOTE(review): the scaler is fitted on every training row here, before the
# train/validation split performed further down, so validation rows contribute to
# the scaling statistics.
scaler = StandardScaler() # StandardScaler()  RobustScaler()  MinMaxScaler() MaxAbsScaler()
train[features] = scaler.fit_transform(train[features])
test[features] = scaler.transform(test[features])


# ML Dataset: X holds the scaled feature columns, y the three target columns.
X = train[features].copy()
y = train[target_cols].copy()


# Displays correlation between features as a heatmap - lighter color means higher correlation
# plt.figure(figsize=(12,8))
# sns.heatmap(train.corr(), annot = True)

In [None]:
X.hist()

In [None]:
X.shape , y.shape, test.shape

((7111, 11), (7111, 3), (2247, 11))

# Modeling

In [None]:
# X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.30, random_state=30)
# X_train.shape, X_val.shape, y_train.shape, y_val.shape


# Hold out 20% of the rows as the validation split, then take 10% of what remains
# as the evaluation split that is passed to `eval_set` during fitting.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_SEED
)
X_train, X_eval, y_train, y_eval = train_test_split(
    X_train, y_train, test_size=0.1, random_state=RANDOM_SEED
)

# Shapes in the order: X_train, y_train, X_valid, y_valid, X_eval, y_eval.
tuple(part.shape for part in (X_train, y_train, X_valid, y_valid, X_eval, y_eval))



((4977, 11), (2134, 11), (4977, 3), (2134, 3))

In [None]:
y_train.target_carbon_monoxide, y_train.target_benzene,  y_train.target_nitrogen_oxides

In [None]:
from numpy import absolute, mean, std

# Random-forest hyperparameters; in the visible cells they are referenced only by
# commented-out experiments.
rf_params = dict(
    bootstrap=False,
    max_depth=None,
    max_features='sqrt',
    min_samples_leaf=1,
    min_samples_split=2,
    n_estimators=400,
)

# define model
# model = CatBoostRegressor()
# model = LGBMRegressor()
# model = KNeighborsRegressor()
# model = MLPRegressor(solver='lbfgs', alpha=1e-5,
#                    hidden_layer_sizes=(5, 2), random_state=1)
# model = RandomForestRegressor(**rf_params)

# model = MultiOutputRegressor(XGBRegressor(objective='reg:linear'))
# model = MultiOutputRegressor(Ridge(random_state=123))

# model.fit(X_train, y_train)




# define base model
# model = RandomForestRegressor(**rf_params)
# define the direct multioutput wrapper model
# wrapper = MultiOutputRegressor(model)

# fit the model on the whole dataset
# wrapper.fit(X_train, y_train)

# define the evaluation procedure
# cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)



# # evaluate the model and collect the scores
# n_scores = cross_val_score(wrapper, X_train, y_train, scoring='neg_mean_squared_log_error', cv=cv, n_jobs=-1)

# https://scikit-learn.org/stable/modules/model_evaluation.html

# # force the scores to be positive
# n_scores = absolute(n_scores)

# # summarize performance
# print('MAE: %.3f (%.3f)' % (mean(n_scores), std(n_scores)))


# fit the model on the whole dataset
# model.fit(X_train, y_train.target_carbon_monoxide)

# y_pred = model.predict(X_val)
# y_pred = wrapper.predict(X_val)

# print("MSE : ", mean_squared_error(y_val.target_carbon_monoxide, y_pred))
# print("MSLE : ", mean_squared_log_error(y_val.target_carbon_monoxide, y_pred))

# print('Predicted: ', y_pred[0] )
# print('Actual : ', y_val.target_carbon_monoxide )



def predict_col(model, multi_single=None, target_col=None):
  """Fit `model`, print its error on the validation split and predict `test`.

  Reads the notebook-level frames `X_train`/`y_train`, `X_eval`/`y_eval`,
  `X_valid`/`y_valid` and `test`.

  Args:
    model: Regressor exposing `fit` and `predict`.
    multi_single: 'm' wraps `model` in `MultiOutputRegressor` and fits every
      column of `y_train` at once; any other value fits only the column named
      by `target_col`.
    target_col: Name of the target column. Required unless `multi_single == 'm'`.

  Returns:
    Predictions for `test` produced by the estimator that was actually fitted.

  Raises:
    ValueError: If single-target mode is requested without `target_col`.
  """
  import inspect

  if multi_single == 'm':
    # The wrapper is the object that gets fitted, so it (not the bare `model`)
    # must be used for every prediction below.
    estimator = MultiOutputRegressor(model)
    estimator.fit(X_train, y_train)
    y_true = y_valid

  else:
    if target_col is None:
      raise ValueError("target_col is required unless multi_single == 'm'")

    # Pass the early-stopping arguments only to estimators whose `fit` declares
    # them; e.g. RandomForestRegressor.fit rejects `eval_set` with a TypeError.
    fit_signature = inspect.signature(model.fit).parameters
    fit_kwargs = {}
    if 'eval_set' in fit_signature:
      fit_kwargs['eval_set'] = [(X_eval.copy(), y_eval[target_col].copy())]
      if 'early_stopping_rounds' in fit_signature:
        fit_kwargs['early_stopping_rounds'] = EARLY_STOPPING_ROUND

    estimator = model
    estimator.fit(X_train.copy(), y_train[target_col].copy(), **fit_kwargs)
    y_true = y_valid[target_col]

  # predict on the validation split
  y_pred = estimator.predict(X_valid.copy())

  print("MSE : ", mean_squared_error(y_true, y_pred))
  # print("MSLE : ", mean_squared_log_error(y_true, y_pred))

  # Show the first prediction next to the first actual value so the two lines
  # are directly comparable.
  print('Predicted: ', y_pred[0] )
  print('Actual : ', np.asarray(y_true)[0] )

  return estimator.predict(test)

In [None]:
# A single estimator instance is shared by all three targets; `predict_col` fits it
# again on every call.
model = RandomForestRegressor()

# Per-target CatBoost settings (referenced only by the commented-out lines below).
cat_params_target_carbon_monoxide = dict(learning_rate=0.006, depth=10, l2_leaf_reg=1.0, min_child_samples=8)
cat_params_target_benzene = dict(learning_rate=0.003, depth=13, l2_leaf_reg=2.0, min_child_samples=32)
cat_params_target_nitrogen_oxides = dict(learning_rate=0.004, depth=15, l2_leaf_reg=2.5, min_child_samples=4)

# model_target_carbon_monoxide = CatBoostRegressor(**cat_params_target_carbon_monoxide)
# model_target_benzene = CatBoostRegressor(**cat_params_target_benzene)
# model_target_nitrogen_oxides = CatBoostRegressor(**cat_params_target_nitrogen_oxides)

model_target_carbon_monoxide = model_target_benzene = model_target_nitrogen_oxides = model

# model = RandomForestRegressor(**rf_params)

# Scores noted by the author after earlier runs (presumably the printed validation MSE):
#   target_carbon_monoxide : 0.17301131771321457
#                            0.1439158839451627
#                            0.13770833322593296 with base catboostregressor
#                            lgbm 0.149489749
#   target_benzene         : 1.9593846973113866
#                            1.815840539487549
#                            1.1844364920274717
#                            lgbm 1.3413774029580792
#   target_nitrogen_oxides : 4593.495173194585
#                            4826.898555030105
#                            4244.278434534288
#                            lgbm 4944.245854799639
for target_name, target_model in (
    ('target_carbon_monoxide', model_target_carbon_monoxide),
    ('target_benzene', model_target_benzene),
    ('target_nitrogen_oxides', model_target_nitrogen_oxides),
):
    sub[target_name] = predict_col(target_model, multi_single='s', target_col=target_name)

## Lazy Predict

In [None]:
!pip install lazypredict

from lazypredict.Supervised import LazyRegressor

X_train, X_val, y_train, y_val

reg = LazyRegressor(verbose=0, ignore_warnings=False, custom_metric=None)
models, predictions = reg.fit(X_train, X_val, y_train.target_nitrogen_oxides, y_val.target_nitrogen_oxides)

print(models)

"""
target_carbon_monoxide

Model                                        Adjusted R-Squared  R-Squared  RMSE  Time Taken                                                  
HistGradientBoostingRegressor                0.91       0.91  0.42        0.92
LGBMRegressor                                0.91       0.91  0.42        0.26
ExtraTreesRegressor                          0.91       0.91  0.43        1.72
XGBRegressor                                 0.90       0.90  0.44        0.74
RandomForestRegressor                        0.90       0.90  0.44        3.79
GradientBoostingRegressor                    0.90       0.90  0.45        1.52

target_benzene

Model                                        Adjusted R-Squared  R-Squared  RMSE  Time Taken                                                 
MLPRegressor                                 0.97       0.97  1.24        5.96
RandomForestRegressor                        0.97       0.97  1.32        3.69
HistGradientBoostingRegressor                0.97       0.97  1.33        0.63
XGBRegressor                                 0.97       0.97  1.34        0.74
LGBMRegressor                                0.97       0.97  1.34        0.24
GradientBoostingRegressor                    0.97       0.97  1.35        1.50



target_nitrogen_oxides

Model                                        Adjusted R-Squared  R-Squared   RMSE  Time Taken                              
ExtraTreesRegressor                          0.86       0.86  70.05        1.80
LGBMRegressor                                0.86       0.86  71.02        0.26
RandomForestRegressor                        0.86       0.86  71.63        4.18
HistGradientBoostingRegressor                0.85       0.85  72.97        2.76
XGBRegressor                                 0.84       0.85  74.37        0.76
BaggingRegressor                             0.84       0.84  75.13        0.44

"""

In [None]:
# `get_all_params` is a CatBoost-specific method; fall back to the scikit-learn
# `get_params` API so the cell also works for the RandomForestRegressor that is
# assigned to `model` above.
getattr(model, 'get_all_params', model.get_params)()

In [None]:
sub

Unnamed: 0,date_time,target_carbon_monoxide,target_benzene,target_nitrogen_oxides
0,2011-01-01 00:00:00,1.300887,5.375317,284.593477
1,2011-01-01 01:00:00,1.852405,8.171233,321.641418
2,2011-01-01 02:00:00,1.739569,8.053723,364.275842
3,2011-01-01 03:00:00,1.906957,8.272942,383.292585
4,2011-01-01 04:00:00,1.009719,6.678156,218.295299
...,...,...,...,...
2242,2011-04-04 10:00:00,2.600252,12.472531,558.323880
2243,2011-04-04 11:00:00,2.374553,10.732311,386.739179
2244,2011-04-04 12:00:00,2.536309,12.757674,343.933582
2245,2011-04-04 13:00:00,2.101466,10.702493,345.118898


# Remarks

0.09185886557786073 baseline score with decision tree regressor 75/25 split

1
---
Train/test split 70/30 (random state 42) with RandomForestRegressor using the best parameters found

MSE :  1970.3011361792414 , 
MSLE :  0.041923979911548594

2
---
Added the hour, day_of_week and weekday features

MSE :  1625.0299090453007
MSLE :  0.036872675748131384

3
---
CatBoost base model with the hour, day_of_week and weekday features

# Hyperparameter Tuning

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# https://towardsdatascience.com/hyperparameter-tuning-the-random-forest-in-python-using-scikit-learn-28d2aa77dd74


# ---------------------------------------------------------------------------
# RandomForestRegressor Tuning
# ---------------------------------------------------------------------------

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]

# Number of features to consider at every split.
# 1.0 (all features) stands in for 'auto', which current scikit-learn releases no
# longer accept for forests.
max_features = [1.0, 'sqrt']

# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)

# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]

# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

# Use the random grid to search for best hyperparameters
# First create the base model to tune (seeded so the search is repeatable)
rf = RandomForestRegressor(random_state=RANDOM_SEED)
# Random search of parameters, using 3 fold cross validation,
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
rf_random.fit(X_train, y_train)

# Printed explicitly: a bare expression in the middle of a cell is not displayed.
print(rf_random.best_params_)


# ---------------------------------------------------------------------------
# CatBoostRegressor Tuning
# ---------------------------------------------------------------------------

# Creating the hyperparameter grid
param_dist =   {'depth'         : [6,8,10],
                  'learning_rate' : [0.01, 0.05, 0.1],
                  'iterations'    : [30, 50, 100]
                 }

# Base model to tune. `y_train` holds three target columns, so a multi-target loss
# is required for CatBoost to fit it in one go.
cbc = CatBoostRegressor(loss_function='MultiRMSE',
                        random_state=RANDOM_SEED,
                        logging_level='Silent')

# Instantiate RandomizedSearchCV object. This is a regression task, so the search is
# scored with (negated) mean squared error rather than the classification-only
# 'accuracy'.
rscv = RandomizedSearchCV(cbc, param_dist, scoring='neg_mean_squared_error', cv=5,
                          random_state=RANDOM_SEED)

# Fit the model
rscv.fit(X_train, y_train)

# Print the tuned parameters and score
print(rscv.best_params_)
print(rscv.best_score_)

# Tuning with Optuna

In [None]:
target_col = 'target_nitrogen_oxides'

# target_carbon_monoxide
# target_benzene
# target_nitrogen_oxides
def objective(trial):
    """Optuna objective: train one CatBoostRegressor and score it on the validation split.

    The searched hyperparameters are `learning_rate`, `depth`, `l2_leaf_reg` and
    `min_child_samples`; every other CatBoost setting is fixed. The model is fitted
    on `X_train` for the notebook-level `target_col`, with `X_eval` as the
    evaluation set.

    Args:
        trial: The Optuna trial used to sample hyperparameters.

    Returns:
        Mean squared error of the fitted model on `X_valid` for `target_col`.
    """
    param = {
        # `suggest_float(..., step=...)` replaces the deprecated
        # `suggest_discrete_uniform` and samples from the same stepped grid.
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.02, step=0.001),
        'depth': trial.suggest_int('depth', 9, 15),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1.0, 5.5, step=0.5),
        'min_child_samples': trial.suggest_categorical('min_child_samples', [1, 4, 8, 16, 32]),
        # Fixed settings shared by every trial.
        'grow_policy': 'Depthwise',
        'iterations': 10000,
        'use_best_model': True,
        'eval_metric': 'RMSE',
        'od_type': 'iter',
        # NOTE(review): `early_stopping_rounds` passed to fit() below presumably takes
        # precedence over this `od_wait` value — confirm against the CatBoost docs.
        'od_wait': 20,
        'random_state': RANDOM_SEED,
        'logging_level': 'Silent',
    }

    regressor = CatBoostRegressor(**param)

    regressor.fit(X_train.copy(), y_train[target_col].copy(),
                  eval_set=[(X_eval.copy(), y_eval[target_col].copy())],
                  early_stopping_rounds=EARLY_STOPPING_ROUND)
    loss = mean_squared_error(y_valid[target_col], regressor.predict(X_valid.copy()))
    return loss

In [None]:
%%time
study = optuna.create_study(study_name=f'catboost-seed{RANDOM_SEED}')
study.optimize(objective, n_trials=10000, n_jobs=-1, timeout=24000)

[32m[I 2021-07-07 18:06:42,788][0m A new study created in memory with name: catboost-seed1[0m
Custom logger is already specified. Specify more than one logger at same time is not thread safe.[32m[I 2021-07-07 18:12:12,627][0m Trial 1 finished with value: 4068.702387696062 and parameters: {'learning_rate': 0.001, 'depth': 9, 'l2_leaf_reg': 4.5, 'min_child_samples': 32}. Best is trial 1 with value: 4068.702387696062.[0m
[32m[I 2021-07-07 18:14:01,429][0m Trial 2 finished with value: 3720.216268297722 and parameters: {'learning_rate': 0.018000000000000002, 'depth': 10, 'l2_leaf_reg': 5.5, 'min_child_samples': 1}. Best is trial 2 with value: 3720.216268297722.[0m
[32m[I 2021-07-07 18:15:22,350][0m Trial 3 finished with value: 3752.4395814555764 and parameters: {'learning_rate': 0.012, 'depth': 10, 'l2_leaf_reg': 1.0, 'min_child_samples': 8}. Best is trial 2 with value: 3720.216268297722.[0m
[32m[I 2021-07-07 18:18:15,129][0m Trial 4 finished with value: 3606.9100184128897 and

In [None]:
study.best_value , study.best_params



# Save Result

## Multiple Output

In [None]:
# Predict the test set with whatever estimator `model` currently refers to, show
# the raw predictions, and write them into every column of `sub` after the first.
# NOTE(review): this presumably requires `model` to be a fitted multi-output
# estimator returning one column per target — confirm before running, because the
# assignment replaces the per-target predictions already stored in `sub`.
test_preds = model.predict(test)

display(test_preds)

sub.iloc[:, 1:] = test_preds

array([[  1.4    ,   4.1    , 186.5    ],
       [  2.5095 ,   9.56175, 358.5295 ],
       [  2.111  ,   8.61975, 333.07825],
       ...,
       [  2.774  ,  13.924  , 359.981  ],
       [  2.11025,   9.9145 , 253.9685 ],
       [  2.3395 ,  11.16125, 284.69125]])

## Single Column

In [None]:
test_preds = model.predict(test)
test_preds

array([1.4    , 2.588  , 2.002  , ..., 2.53175, 2.06125, 2.279  ])

In [None]:
sub.to_csv("submission.csv", index=False)

In [None]:
!kaggle competitions submit -c tabular-playground-series-jul-2021 -f submission.csv -m " "

100% 166k/166k [00:03<00:00, 48.8kB/s]
Successfully submitted to Tabular Playground Series - Jul 2021