In [25]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [26]:
# Load the labelled training data and preview its first five rows.
train = pd.read_csv(filepath_or_buffer='train.csv')
train.head(n=5)

Unnamed: 0,Date,Weather,Year,Month,Hour,Holiday,Normalized_Temperature,Normalized_Feeling_Temperature,Normalized_Humidity,Windspeed,Count_of_Casual_Users,Count_of_Registered_Users,Count_of_Rented_Bikes
0,17/12/13,"Clear, Few clouds, Partly cloudy",2013,12,14,0,0.281667,0.269267,0.453371,0.363818,30,207,237
1,28/02/14,"Clear, Few clouds, Partly cloudy",2014,2,21,0,0.35,0.3535,0.461124,0.110118,10,179,189
2,28/03/14,"Clear, Few clouds, Partly cloudy",2014,3,22,0,0.5725,0.546411,0.645618,0.141042,21,152,173
3,12/05/14,"Mist + Cloudy, Mist + Broken clouds, Mist + Fe...",2014,5,21,0,0.602917,0.609989,0.606742,0.138794,74,156,230
4,15/05/13,"Mist + Cloudy, Mist + Broken clouds, Mist + Fe...",2013,5,1,0,0.520417,0.511111,1.007865,0.112365,4,43,47


In [27]:
# (rows, columns) of the training frame.
train.shape

(12165, 13)

In [28]:
# Count rows that repeat an earlier row across all columns (defaults spelled out).
train.duplicated(subset=None, keep='first').sum()

0

In [29]:
# Per-column count of missing values in the training frame.
train.isna().sum()

Date                              0
Weather                           0
Year                              0
Month                             0
Hour                              0
Holiday                           0
Normalized_Temperature            0
Normalized_Feeling_Temperature    0
Normalized_Humidity               0
Windspeed                         0
Count_of_Casual_Users             0
Count_of_Registered_Users         0
Count_of_Rented_Bikes             0
dtype: int64

In [30]:
# Load the unlabelled test data and preview its first five rows.
test = pd.read_csv(filepath_or_buffer='test.csv')
test.head(n=5)

Unnamed: 0,Date,Weather,Year,Month,Hour,Holiday,Normalized_Temperature,Normalized_Feeling_Temperature,Normalized_Humidity,Windspeed
0,08/08/14,"Mist + Cloudy, Mist + Broken clouds, Mist + Fe...",2014,8,14,0,0.8025,0.742956,0.522247,0.257071
1,27/07/14,"Clear, Few clouds, Partly cloudy",2014,7,20,0,0.78125,0.749067,0.593371,0.105624
2,22/08/14,"Clear, Few clouds, Partly cloudy",2014,8,2,0,0.622083,0.601456,0.782247,0.001124
3,10/02/14,"Mist + Cloudy, Mist + Broken clouds, Mist + Fe...",2014,2,22,0,0.329583,0.351833,0.671236,0.011236
4,24/06/14,"Mist + Cloudy, Mist + Broken clouds, Mist + Fe...",2014,6,4,0,0.6725,0.623422,0.65,0.011236


In [31]:
# (rows, columns) of the test frame.
test.shape

(5214, 10)

In [32]:
# Count rows that repeat an earlier row across all columns (defaults spelled out).
test.duplicated(subset=None, keep='first').sum()

0

In [33]:
# Per-column count of missing values in the test frame.
test.isna().sum()

Date                              0
Weather                           0
Year                              0
Month                             0
Hour                              0
Holiday                           0
Normalized_Temperature            0
Normalized_Feeling_Temperature    0
Normalized_Humidity               0
Windspeed                         0
dtype: int64

In [34]:
# Columns present in train but absent from test (i.e. the target columns).
set(train.columns).difference(test.columns)

{'Count_of_Casual_Users', 'Count_of_Registered_Users', 'Count_of_Rented_Bikes'}

In [35]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

In [36]:
# 'Date' is stored as day/month/two-digit-year text (e.g. 17/12/13); parse it
# into real datetimes in both frames so the .dt accessor can be used later.
date_format = '%d/%m/%y'
for frame in (train, test):
    frame['Date'] = pd.to_datetime(frame['Date'], format=date_format)

In [37]:
# Derive calendar features from the parsed 'Date' column, identically for the
# training and test frames.
for df in (train, test):
    date_accessor = df['Date'].dt
    iso_calendar = date_accessor.isocalendar()  # frame with year / week / day
    df['Day_of_Week'] = date_accessor.dayofweek
    df['Day_of_Month'] = date_accessor.day
    df['Week_of_Year'] = iso_calendar.week


In [38]:
# Lag / rolling features built from the target, always shifted by one row so a
# row never sees its own count.
# NOTE(review): shift() follows the frame's current row order, and the preview
# of train shows rows that are not in chronological order - so "previous row"
# is presumably not "previous hour". Confirm whether sorting was intended.
previous_counts = train['Count_of_Rented_Bikes'].shift(1)
train['Lag_1'] = previous_counts
for window in (3, 7):
    train[f'Rolling_Mean_{window}'] = previous_counts.rolling(window=window).mean()

In [39]:
# Discard every row that still has a missing value - after the lag cell these
# are the leading rows whose shifted / rolling windows are incomplete.
train = train.dropna()

In [40]:
# Model inputs, grouped by origin. The concatenation order is the column order
# handed to the model.
raw_calendar_features = ['Year', 'Month', 'Hour', 'Holiday']
weather_features = [
    'Normalized_Temperature',
    'Normalized_Feeling_Temperature',
    'Normalized_Humidity',
    'Windspeed',
]
derived_date_features = ['Day_of_Week', 'Day_of_Month', 'Week_of_Year']
lag_features = ['Lag_1', 'Rolling_Mean_3', 'Rolling_Mean_7']

features = (
    raw_calendar_features
    + weather_features
    + derived_date_features
    + lag_features
)


In [41]:
# Split the training frame into the feature matrix and the target
# (total rented bikes).
X, y = train[features], train['Count_of_Rented_Bikes']

In [42]:
# Hold out the last 20% of rows (no shuffling, so the split follows row order)
# as a validation set.
split_parts = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    shuffle=False,
)
X_train, X_val, y_train, y_val = split_parts

In [43]:
# Fit a 100-tree random forest (seeded for repeatability) on the training split
# and report the root-mean-squared error on the validation split.
rf_model = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)
y_pred = rf_model.predict(X_val)
mse = mean_squared_error(y_val, y_pred)
rmse = np.sqrt(mse)
print(f'RMSE: {rmse}')

RMSE: 53.345933858651726


In [44]:
# The test frame has no target column, so its lag features are seeded from the
# tail of the training target: the last observed count, and the mean of the
# last 3 / last 7 counts.
# NOTE(review): each value is a single scalar broadcast to every test row, so
# these three columns are constant across the test set - confirm this is the
# intended approximation.
train_counts = train['Count_of_Rented_Bikes']
test['Lag_1'] = train_counts.iloc[-1]
for window in (3, 7):
    test[f'Rolling_Mean_{window}'] = train_counts.rolling(window=window).mean().iloc[-1]

In [45]:
# Forward-fill any gaps in the test frame's lag features.
# Done as one column-wise assignment because:
#   * `fillna(method='ffill')` is deprecated in favour of `.ffill()`, and
#   * `test[col].fillna(..., inplace=True)` is a chained in-place update on a
#     column selection, which does not write back to `test` under pandas
#     Copy-on-Write.
lag_columns = ['Lag_1', 'Rolling_Mean_3', 'Rolling_Mean_7']
test[lag_columns] = test[lag_columns].ffill()

In [52]:
# Score the test rows with the fitted forest.
test_predictions = rf_model.predict(test[features])

# Store the predictions on the test frame as whole-bike counts.
# The test frame has no 'Count_of_Rented_Bikes' column before this point, so
# the column must be created from the predictions; casting a not-yet-existing
# column first raises KeyError on a fresh kernel. Rounding to the nearest
# integer (rather than truncating) keeps the stored counts closest to the
# model output. `test_predictions` itself is left as raw floats.
test['Count_of_Rented_Bikes'] = np.rint(test_predictions).astype(int)

In [53]:
# Write the raw model predictions to 'RF.csv' as a single-column file without
# the row index.
a = pd.DataFrame({'Count_of_Rented_Bikes': test_predictions})
a.to_csv(path_or_buf='RF.csv', index=False)