In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, AdaBoostRegressor, GradientBoostingRegressor, BaggingRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split, KFold, GridSearchCV

In [2]:
train = pd.read_csv('Train.csv')
train.head()

Unnamed: 0,Holiday,Temperature,Rainfall_last_hour,Snowfall_last_hour,Cloud_Cover,Weather,Weather_Desc,TimeStamp,Date,Traffic_Vol
0,,289.28,0.0,0.0,40,Cloudy skies,Partly cloudy skies,02/10/08 9:00,02/10/08,5555
1,,290.26,0.0,0.0,75,Cloudy skies,Fragmented clouds,02/10/08 10:00,02/10/08,4525
2,,290.28,0.0,0.0,90,Cloudy skies,Full cloud cover,02/10/08 11:00,02/10/08,4772
3,,290.33,0.0,0.0,90,Cloudy skies,Full cloud cover,02/10/08 12:00,02/10/08,5031
4,,292.14,0.0,0.0,75,Cloudy skies,Fragmented clouds,02/10/08 13:00,02/10/08,4928


In [3]:
train.shape

(38373, 10)

In [4]:
train.isnull().sum()

Holiday               38323
Temperature               0
Rainfall_last_hour        0
Snowfall_last_hour        0
Cloud_Cover               0
Weather                   0
Weather_Desc              0
TimeStamp                 0
Date                      0
Traffic_Vol               0
dtype: int64

In [5]:
train.duplicated().sum()

0

In [6]:
test = pd.read_csv('Test.csv')
test.head()

Unnamed: 0,Holiday,Temperature,Rainfall_last_hour,Snowfall_last_hour,Cloud_Cover,Weather,Weather_Desc,TimeStamp,Date,Traffic_Vol
0,,297.65,0.0,0,1,Clear skies,Clear skies,01/07/14 0:00,01/07/14,
1,,297.45,0.0,0,1,Clear skies,Clear skies,01/07/14 1:00,01/07/14,
2,,296.75,0.0,0,1,Rainfall,Soft rain,01/07/14 2:00,01/07/14,
3,,296.42,0.0,0,1,Rainfall,Steady rain,01/07/14 3:00,01/07/14,
4,,295.56,0.0,0,40,Rainfall,Soft rain,01/07/14 4:00,01/07/14,


In [7]:
test.shape

(2208, 10)

In [8]:
test.isnull().sum()

Holiday               2205
Temperature              0
Rainfall_last_hour       0
Snowfall_last_hour       0
Cloud_Cover              0
Weather                  0
Weather_Desc             0
TimeStamp                0
Date                     0
Traffic_Vol           2208
dtype: int64

In [9]:
test.duplicated().sum()

0

In [10]:
train.tail()

Unnamed: 0,Holiday,Temperature,Rainfall_last_hour,Snowfall_last_hour,Cloud_Cover,Weather,Weather_Desc,TimeStamp,Date,Traffic_Vol
38368,,301.38,0.0,0.0,90,Cloudy skies,Full cloud cover,30/06/14 19:00,30/06/14,3252
38369,,300.28,0.0,0.0,91,Cloudy skies,Full cloud cover,30/06/14 20:00,30/06/14,2941
38370,,300.48,0.0,0.0,91,Cloudy skies,Full cloud cover,30/06/14 21:00,30/06/14,2703
38371,,299.08,0.0,0.0,75,Cloudy skies,Fragmented clouds,30/06/14 22:00,30/06/14,2762
38372,,298.22,0.0,0.0,40,Cloudy skies,Partly cloudy skies,30/06/14 23:00,30/06/14,2023


In [11]:
combined = pd.concat([train, test], axis = 0)
combined.head()

Unnamed: 0,Holiday,Temperature,Rainfall_last_hour,Snowfall_last_hour,Cloud_Cover,Weather,Weather_Desc,TimeStamp,Date,Traffic_Vol
0,,289.28,0.0,0.0,40,Cloudy skies,Partly cloudy skies,02/10/08 9:00,02/10/08,5555.0
1,,290.26,0.0,0.0,75,Cloudy skies,Fragmented clouds,02/10/08 10:00,02/10/08,4525.0
2,,290.28,0.0,0.0,90,Cloudy skies,Full cloud cover,02/10/08 11:00,02/10/08,4772.0
3,,290.33,0.0,0.0,90,Cloudy skies,Full cloud cover,02/10/08 12:00,02/10/08,5031.0
4,,292.14,0.0,0.0,75,Cloudy skies,Fragmented clouds,02/10/08 13:00,02/10/08,4928.0


In [12]:
combined.shape

(40581, 10)

In [13]:
combined[combined['Holiday'].notna()]

Unnamed: 0,Holiday,Temperature,Rainfall_last_hour,Snowfall_last_hour,Cloud_Cover,Weather,Weather_Desc,TimeStamp,Date,Traffic_Vol
126,1.0,274.08,0.0,0.0,20,Cloudy skies,Scattered clouds,08/10/08 0:00,08/10/08,462.0
926,1.0,288.62,0.0,0.0,87,Clear skies,Clear skies,12/11/08 0:00,12/11/08,1004.0
1164,1.0,279.24,0.0,0.0,21,Light fog,Fine fog,22/11/08 0:00,22/11/08,921.0
1938,1.0,265.1,0.0,0.0,90,Cloudy skies,Full cloud cover,25/12/08 0:00,25/12/08,810.0
2103,1.0,263.89,0.0,0.0,59,Cloudy skies,Fragmented clouds,01/01/09 0:00,01/01/09,1442.0
3066,1.0,259.16,0.0,0.0,21,Cloudy skies,Scattered clouds,18/02/09 0:00,18/02/09,562.0
5236,1.0,287.17,0.0,0.0,90,Cloudy skies,Full cloud cover,27/05/09 0:00,27/05/09,869.0
6075,1.0,291.08,0.0,0.0,1,Clear skies,Clear skies,04/07/09 0:00,04/07/09,1066.0
7178,1.0,297.82,0.0,0.0,12,Cloudy skies,Scattered clouds,22/08/09 0:00,22/08/09,665.0
7340,1.0,289.78,0.0,0.0,0,Clear skies,Clear skies,02/09/09 0:00,02/09/09,1044.0


In [14]:
train['Weather'].value_counts(), test['Weather'].value_counts() 

(Weather
 Cloudy skies          14477
 Clear skies           12682
 Rainfall               4204
 Light fog              2889
 Snowfall               2297
 Airborne particles      736
 Light rain              497
 Stormy weather          395
 Dense fog               182
 Airborne smoke           13
 Sudden windstorm          1
 Name: count, dtype: int64,
 Weather
 Clear skies           687
 Cloudy skies          646
 Rainfall              565
 Light fog             183
 Stormy weather         58
 Airborne particles     30
 Dense fog              19
 Light rain             18
 Airborne smoke          2
 Name: count, dtype: int64)

In [15]:
train['Weather_Desc'].value_counts(), test['Weather_Desc'].value_counts() 

(Weather_Desc
 Clear skies                            12682
 Full cloud cover                        4947
 Fragmented clouds                       4455
 Partly cloudy skies                     3232
 Fine fog                                2889
 Soft rain                               2530
 Scattered clouds                        1843
 Flurries                                1588
 Steady rain                             1182
 Smoky air                                736
 Deep snow                                500
 Intense rainfall                         365
 Mild drizzle                             345
 Approaching thunderstorm                 290
 Snowfall                                 192
 Dense fog                                182
 Light rain                               135
 Nearby rain showers                       98
 Stormy weather                            53
 Storm with mild rain                      21
 Torrential downpour                       18
 Strong drizzle     

In [16]:
combined.info()

<class 'pandas.core.frame.DataFrame'>
Index: 40581 entries, 0 to 2207
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Holiday             53 non-null     float64
 1   Temperature         40581 non-null  float64
 2   Rainfall_last_hour  40581 non-null  float64
 3   Snowfall_last_hour  40581 non-null  float64
 4   Cloud_Cover         40581 non-null  int64  
 5   Weather             40581 non-null  object 
 6   Weather_Desc        40581 non-null  object 
 7   TimeStamp           40581 non-null  object 
 8   Date                40581 non-null  object 
 9   Traffic_Vol         38373 non-null  float64
dtypes: float64(5), int64(1), object(4)
memory usage: 3.4+ MB


combined['Holiday'] = combined['Holiday'].fillna(0.0)

In [18]:
weather_mapping = {
    'Clear skies': 'Clear skies',
    'Cloudy skies': 'Cloudy skies',
    'Rainfall': 'Rain',
    'Light rain': 'Rain',
    'Stormy weather': 'Rain',
    'Snowfall': 'Snow',
    'Airborne particles': 'Fog',
    'Light fog': 'Fog',
    'Dense fog': 'Fog',
    'Airborne smoke': 'Fog',
    'Sudden windstorm': 'Other'
}

In [19]:
weather_desc_mapping = {
    'Clear skies': 'Clear skies',
    'Full cloud cover': 'Cloudy skies',
    'Fragmented clouds': 'Cloudy skies',
    'Partly cloudy skies': 'Cloudy skies',
    'Scattered clouds': 'Cloudy skies',
    'Fine fog': 'Fog',
    'Dense fog': 'Fog',
    'Smoky air': 'Fog',
    'Soft rain': 'Rain',
    'Steady rain': 'Rain',
    'Intense rainfall': 'Rain',
    'Mild drizzle': 'Rain',
    'Stormy weather': 'Rain',
    'Approaching thunderstorm': 'Rain',
    'Snowfall': 'Snow',
    'Flurries': 'Snow',
    'Deep snow': 'Snow',
    'Other': 'Other'
}

In [20]:
combined['Weather'] = combined['Weather'].map(weather_mapping).fillna('Other')

In [21]:
combined['Weather_Desc'] = combined['Weather_Desc'].map(weather_desc_mapping).fillna('Other')

In [22]:
combined['Weather'].value_counts(), combined['Weather_Desc'].value_counts()

(Weather
 Cloudy skies    15123
 Clear skies     13369
 Rain             5737
 Fog              4054
 Snow             2297
 Other               1
 Name: count, dtype: int64,
 Weather_Desc
 Cloudy skies    15123
 Clear skies     13369
 Rain             5376
 Fog              4039
 Snow             2280
 Other             394
 Name: count, dtype: int64)

In [23]:
combined['TimeStamp'] = pd.to_datetime(combined['TimeStamp'], format='%d/%m/%y %H:%M')

In [24]:
combined['Hour'] = combined['TimeStamp'].dt.hour
combined['Day'] = combined['TimeStamp'].dt.day
combined['Month'] = combined['TimeStamp'].dt.month
combined['Day_of_week'] = combined['TimeStamp'].dt.dayofweek

In [25]:
combined = combined.drop(columns=['TimeStamp', 'Date', 'Holiday'])

In [26]:
le = LabelEncoder()

x = ['Weather', 'Weather_Desc']
for i in x:
    combined[i] = le.fit_transform(combined[i])

In [27]:
combined.head()

Unnamed: 0,Temperature,Rainfall_last_hour,Snowfall_last_hour,Cloud_Cover,Weather,Weather_Desc,Traffic_Vol,Hour,Day,Month,Day_of_week
0,289.28,0.0,0.0,40,1,1,5555.0,9,2,10,3
1,290.26,0.0,0.0,75,1,1,4525.0,10,2,10,3
2,290.28,0.0,0.0,90,1,1,4772.0,11,2,10,3
3,290.33,0.0,0.0,90,1,1,5031.0,12,2,10,3
4,292.14,0.0,0.0,75,1,1,4928.0,13,2,10,3


In [28]:
combined.info()

<class 'pandas.core.frame.DataFrame'>
Index: 40581 entries, 0 to 2207
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Temperature         40581 non-null  float64
 1   Rainfall_last_hour  40581 non-null  float64
 2   Snowfall_last_hour  40581 non-null  float64
 3   Cloud_Cover         40581 non-null  int64  
 4   Weather             40581 non-null  int32  
 5   Weather_Desc        40581 non-null  int32  
 6   Traffic_Vol         38373 non-null  float64
 7   Hour                40581 non-null  int32  
 8   Day                 40581 non-null  int32  
 9   Month               40581 non-null  int32  
 10  Day_of_week         40581 non-null  int32  
dtypes: float64(4), int32(6), int64(1)
memory usage: 2.8 MB


In [29]:
newtrain = combined.iloc[0:38373, :]
newtest = combined.iloc[38373: , :]
newtest = newtest.drop('Traffic_Vol', axis = 1)

In [30]:
newtrain.shape, newtest.shape

((38373, 11), (2208, 10))

In [31]:
x = newtrain.drop('Traffic_Vol', axis = 1)
y = newtrain['Traffic_Vol']

In [32]:
sc = StandardScaler()
x = sc.fit_transform(x)

In [33]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state =  42)

In [34]:
models = {'Linear Regression': LinearRegression(), 'Random Forest' : RandomForestRegressor(), 'Bagging' : BaggingRegressor(),
          'Extra Tree': ExtraTreesRegressor(), 'LightGBM': LGBMRegressor(), 'Gradient Boosting': GradientBoostingRegressor(),
          'Adaboost': AdaBoostRegressor(), 'XGB': XGBRegressor() }

In [35]:
def evaluate_models(x_train, x_test, y_train, y_test, models):
    results = {}
    for name, model in models.items():
        predictions = model.fit(x_train, y_train).predict(x_test)
        rmse = np.sqrt(mean_squared_error(y_test, predictions))
        results[name] = rmse
    return results

In [36]:
result = evaluate_models(x_train, x_test, y_train, y_test, models)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002487 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 673
[LightGBM] [Info] Number of data points in the train set: 30698, number of used features: 10
[LightGBM] [Info] Start training from score 3296.553033


In [37]:
best_model_name = min(result, key = result.get)
best_model = models[best_model_name]

In [38]:
print(f"Best model is {best_model_name} with rmse {result[best_model_name]}")

Best model is LightGBM with rmse 364.5691005990524


In [39]:
y_pred = best_model.fit(x_train, y_train).predict(x_test)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001126 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 673
[LightGBM] [Info] Number of data points in the train set: 30698, number of used features: 10
[LightGBM] [Info] Start training from score 3296.553033


In [40]:
print(np.sqrt(mean_squared_error(y_test, y_pred)))

364.5691005990524


In [41]:
x_train = newtrain.drop('Traffic_Vol', axis = 1)
y_train = newtrain['Traffic_Vol']
x_test = newtest
sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)
y_pred = best_model.fit(x_train, y_train).predict(x_test)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002918 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 678
[LightGBM] [Info] Number of data points in the train set: 38373, number of used features: 10
[LightGBM] [Info] Start training from score 3293.947567


In [42]:
solution = pd.DataFrame({'Traffic_Vol' : y_pred})
solution.head()

Unnamed: 0,Traffic_Vol
0,1398.393962
1,872.274894
2,674.889647
3,476.196086
4,475.224475


In [43]:
solution.to_csv('Solution.csv', index = False)