In [None]:
import pandas as pd
import numpy as np
pd.set_option('display.max_rows',None)
from datetime import datetime as dt
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
!pip install catboost
from catboost import CatBoostRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.metrics import make_scorer
import warnings
warnings.filterwarnings("ignore", category=UserWarning, module="lightgbm")


In [None]:
# Training data (features + target) and the unlabeled test data.
df = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
# Untouched second copy of the test file: `test` loses its `datetime` column
# during feature engineering, but the submission still needs it.
test0 = pd.read_csv('test.csv')

# Data Fields

**datetime** - hourly date + timestamp  

**season**-  1 = spring, 2 = summer, 3 = fall, 4 = winter

**holiday** - whether the day is considered a holiday

**workingday** - whether the day is neither a weekend nor holiday

**weather** - weather situation, coded as:

- 1: Clear, Few clouds, Partly cloudy
- 2: Mist + Cloudy, Mist + Broken clouds, Mist + Few clouds, Mist
- 3: Light Snow, Light Rain + Thunderstorm + Scattered clouds, Light Rain + Scattered clouds
- 4: Heavy Rain + Ice Pellets + Thunderstorm + Mist, Snow + Fog

**temp** - temperature in Celsius

**atemp**- "feels like" temperature in Celsius

**humidity** - relative humidity

**windspeed** - wind speed

**casual** - number of non-registered user rentals initiated

**registered** - number of registered user rentals initiated

**count** - number of total rentals


In [None]:
df.head()

In [None]:
test.head()

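Remove `casual` and `registered`: they are breakdowns of the target `count` (non-registered and registered rentals), so they must not be used as features.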

In [None]:
# Discard the two per-user-type rental columns; only the total `count` is modelled.
df = df.drop(columns=['casual', 'registered'])


In [None]:
df.info()

Convert datetime to hour,weekday,month,year

In [None]:
# Derive calendar features from the timestamp, identically for train and test,
# then discard the raw `datetime` column.
for frame in (df, test):
    stamps = pd.to_datetime(frame['datetime'])
    frame['hour'] = stamps.dt.hour
    frame['day'] = stamps.dt.strftime('%a')    # abbreviated weekday name
    frame['month'] = stamps.dt.strftime('%b')  # abbreviated month name
    frame['year'] = stamps.dt.year
    frame.drop(columns=['datetime'], inplace=True)

convert season,weather,holiday,workingday to string

In [None]:
# Readable labels for the integer-coded categorical columns.
# The same table is applied to train and test so both frames always share
# identical category names (the season label used to read 'Fall,' with a
# stray comma).
label_maps = {
    'season': {1: 'Spring', 2: 'Summer', 3: 'Fall', 4: 'Winter'},
    'weather': {1: 'Clear', 2: 'Cloudy', 3: 'Light Rain', 4: 'Heavy Rain'},
    'holiday': {0: 'No', 1: 'Yes'},
    'workingday': {0: 'No', 1: 'Yes'},
}
for frame in (df, test):
    for column, labels in label_maps.items():
        # Series.map turns any code missing from `labels` into NaN.
        frame[column] = frame[column].map(labels)

In [None]:
df.head()

In [None]:
df.info()

`hour` and `year` are still numeric. Let's convert them to `object` so they are treated as categorical features.

In [None]:
# Treat hour and year as categories rather than magnitudes, in both frames.
for frame in (df, test):
    for column in ('hour', 'year'):
        frame[column] = frame[column].astype(object)


In [None]:
df.info()

In [None]:
# Columns handled as categories (one-hot encoded later on).
categorical = [
    'season', 'holiday', 'workingday', 'weather',
    'hour', 'day', 'month', 'year',
]
# Continuous columns; note that the target `count` is included here.
numeric = [
    'temp', 'atemp', 'humidity', 'windspeed',
    'count',
]

 Plot categorical

In [None]:
# One bar chart per categorical feature: seaborn's default bar height
# (an aggregate of `count` per category level) against the feature's levels.
fig = plt.figure(figsize=(20, 20))
for position, feature in enumerate(categorical, start=1):
    ax = fig.add_subplot(3, 4, position)
    sns.barplot(x=feature, y='count', data=df, ax=ax)
    ax.set_xlabel(f'{feature}')
    # The y-axis carries the target, so label it as such
    # (it was previously labelled 'Categorical Variable').
    ax.set_ylabel('count')
    ax.set_title(f'{feature} vs count')
plt.show()

Plot Numeric

In [None]:
# One scatter plot per numeric column against the target `count`.
fig = plt.figure(figsize=(20, 20))
for position, feature in enumerate(numeric, start=1):
    ax = fig.add_subplot(3, 4, position)
    sns.scatterplot(x=feature, y='count', data=df, ax=ax)
    ax.set_xlabel(f'{feature}')
    # The y-axis carries the target, so label it as such
    # (it was previously labelled 'Numeric Variable').
    ax.set_ylabel('count')
    ax.set_title(f'{feature} vs count')
plt.show()

Checking for outliers

In [None]:
# Box plot of every numeric column to expose extreme values.
fig = plt.figure(figsize=(20, 20))
for position, feature in enumerate(numeric, start=1):
    ax = fig.add_subplot(3, 4, position)
    sns.boxplot(data=df[feature], ax=ax)
    ax.set_xlabel(feature)
    ax.set_ylabel('Value')
plt.show()

removing outliers

In [None]:
# Keep rows with count below 600 and windspeed of at most 30.
# NOTE(review): the thresholds look hand-picked from the box plots - confirm.
within_limits = (df['count'] < 600) & (df['windspeed'] <= 30)
# .copy() gives an independent frame: later cells assign columns on `df`,
# which on a filtered selection can trigger pandas' SettingWithCopyWarning.
df = df[within_limits].copy()

In [None]:
# Same box plots as before, redrawn on the filtered data.
fig = plt.figure(figsize=(20, 20))
for position, feature in enumerate(numeric, start=1):
    ax = fig.add_subplot(3, 4, position)
    sns.boxplot(data=df[feature], ax=ax)
    ax.set_xlabel(feature)
    ax.set_ylabel('Value')
plt.show()

Distribution of count

In [None]:
# Histogram + KDE of the raw target.
sns.displot(df['count'], kde=True, color='blue')
ax = plt.gca()  # decorate the axes displot has just drawn on
ax.set_title('Distribution of Count')
ax.set_xlabel('Value')
ax.set_ylabel('Frequency')
ax.grid(True)
plt.show()

The distribution of `count` is right-skewed.

Transformation of the count distribution (log1p)

In [None]:
# Replace the target by log(1 + count) to reduce the skew, then re-plot it.
# Running this cell twice applies the transform twice.
df['count'] = np.log1p(df['count'])
ax = sns.histplot(df['count'], kde=True, color='blue')
ax.set_title('Log-transformed Distribution of Target Variable')
ax.set_xlabel('Log-transformed Target Variable Value')
ax.set_ylabel('Frequency')
ax.grid(True)
plt.show()


checking correlation

In [None]:
# Pairwise correlation between the numeric columns (target included).
corr_matrix = df[numeric].corr()
ax = sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt=".2f", linewidths=.5)
ax.set_title('Correlation Matrix')
plt.show()

In [None]:
clean_df=df.copy()

In [None]:
clean_df.head()

convert categorical into numeric

In [None]:
clean_df.info()

In [None]:
test.info()

In [None]:
# One-hot encode the categorical columns of train and test.
final_df = pd.get_dummies(clean_df, columns=categorical, drop_first=True)
test = pd.get_dummies(test, columns=categorical, drop_first=True)

# The two frames are encoded independently, so a category level present in
# only one of them would yield different dummy columns. Force test onto the
# training feature columns (everything except the target): levels unseen in
# test become all-zero columns, levels unseen in train are dropped.
# NOTE(review): this does not help if the *first* (dropped) level differs
# between the frames - confirm both frames contain every level.
feature_columns = [column for column in final_df.columns if column != 'count']
test = test.reindex(columns=feature_columns, fill_value=0)

In [None]:
# Convert only the one-hot indicator columns to int.
# Casting the whole frame with astype(int) would also truncate the
# log-transformed target `count` and the continuous weather columns to
# whole numbers, destroying most of their information.
# get_dummies emits bool or uint8 indicators depending on the pandas version.
train_dummy_cols = final_df.select_dtypes(include=['bool', 'uint8']).columns
final_df = final_df.astype({column: int for column in train_dummy_cols})
test_dummy_cols = test.select_dtypes(include=['bool', 'uint8']).columns
test = test.astype({column: int for column in test_dummy_cols})

splitting and training

In [None]:
# Report the dimensions of the encoded train frame, the pre-encoding train
# frame and the encoded test frame.
for label, frame in (('final_df', final_df), ('train', df), ('test', test)):
    print(f'{label} shape:', frame.shape)

In [None]:
test.info()

In [None]:
final_df.info()

In [None]:
final_df.head()

Scaling using MinMaxScaler

In [None]:
from sklearn.preprocessing import MinMaxScaler
def scaling(df, scaler=None, fit=True):
  """Min-max scale the `temp`, `humidity` and `windspeed` columns of `df`.

  The frame is modified in place and also returned.

  Parameters
  ----------
  df : DataFrame containing the three columns above.
  scaler : optional scaler object exposing `fit_transform` / `transform`.
      When omitted, a fresh MinMaxScaler is created (original behaviour).
  fit : when True (default) the scaler is fitted on `df` before transforming;
      when False the already-fitted `scaler` is only applied. Use
      `fit=False` with the scaler fitted on the training frame so that test
      data is scaled with the training statistics.

  Returns
  -------
  The same `df` object with the three columns rescaled.

  Raises
  ------
  ValueError if `fit=False` is requested without supplying a scaler.
  """
  num_cols=['temp', 'humidity', 'windspeed']
  if scaler is None:
    if not fit:
      raise ValueError('fit=False requires an already fitted scaler')
    scaler=MinMaxScaler()
  if fit:
    df[num_cols]=scaler.fit_transform(df[num_cols])
  else:
    df[num_cols]=scaler.transform(df[num_cols])
  return df

In [None]:
# Scale train and test with the SAME min/max, learned from the training data.
# Fitting a separate scaler on the test frame would map identical raw values
# to different scaled values in the two frames.
scaled_cols = ['temp', 'humidity', 'windspeed']
# Fit before calling scaling(), which rescales final_df in place.
train_scaler = MinMaxScaler().fit(final_df[scaled_cols])
scaled_df = scaling(final_df)
# Rescale test in place as well, mirroring what scaling() does to final_df.
test[scaled_cols] = train_scaler.transform(test[scaled_cols])
scaled_test = test

In [None]:
# Target is the (log-transformed) `count`; everything else is a feature.
y = scaled_df['count']
X = scaled_df.drop(columns='count')


In [None]:
# 80/20 hold-out split with a fixed seed so the split is reproducible.
# X_test / y_test are a validation slice of the training data, not test.csv.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=0,
)

In [None]:
def rmsle(y, pred):
  """Root mean squared logarithmic error between `y` and `pred`.

  Both inputs are passed through log1p before the squared difference is
  averaged and square-rooted.
  """
  log_gap = np.log1p(y) - np.log1p(pred)
  return np.sqrt(np.mean(log_gap ** 2))

# Validation RMSLE of every evaluated model, keyed by estimator class.
rmsles = {}

def evaluate(reg_cls, params=None):
    """Fit one regressor on the training split and score it on the hold-out split.

    Parameters
    ----------
    reg_cls : estimator class, instantiated with its default arguments.
    params : optional parameter grid. When given (and non-empty) the
        estimator is wrapped in GridSearchCV with refit=True.

    Returns
    -------
    (model, pred_exp) : the fitted estimator (the grid search's
        best_estimator_ when a grid was searched) and its hold-out
        predictions mapped back with expm1.

    Side effects: reads the module-level X_train / y_train / X_test / y_test
    and stores the hold-out RMSLE in `rmsles[reg_cls]`.
    """
    print('Training model with', reg_cls, '...')
    reg = reg_cls()
    if params:
        reg = GridSearchCV(reg, param_grid=params, refit=True)
    reg.fit(X_train, y_train)
    pred = reg.predict(X_test)
    # The target was log1p-transformed, so undo that before scoring.
    y_test_exp = np.expm1(y_test)
    pred_exp = np.expm1(pred)
    rmsles[reg_cls] = rmsle(y_test_exp, pred_exp)
    # Decide on what was actually fitted rather than on a hard-coded class
    # list: only a grid search exposes best_estimator_.
    fitted = reg.best_estimator_ if params else reg
    return fitted, pred_exp

# Hyper-parameter grids for the three boosted-tree libraries. They share the
# same search space; CatBoost merely uses different argument names.
params_xgb = dict(
    n_estimators=list(range(100, 600, 100)),
    learning_rate=[0.01, 0.1],
    max_depth=[4, 5, 6],
)

params_lgbm = dict(
    n_estimators=list(range(100, 600, 100)),
    learning_rate=[0.01, 0.1],
    max_depth=[4, 5, 6],
)

params_cb = dict(
    iterations=list(range(100, 600, 100)),
    learning_rate=[0.01, 0.1],
    depth=[4, 5, 6],
)

# Linear baselines (default hyper-parameters).
Ir_reg, pred_Ir = evaluate(LinearRegression)
rg_reg, pred_rg = evaluate(Ridge)
ls_reg, pred_ls = evaluate(Lasso)
# Boosted-tree libraries, each tuned over its grid.
xg_reg, pred_xg = evaluate(XGBRegressor, params_xgb)
lg_reg, pred_1g = evaluate(LGBMRegressor, params_lgbm)
cb_reg, pred_cb = evaluate(CatBoostRegressor, params_cb)
# scikit-learn ensembles (default hyper-parameters).
rf_reg, pred_rf = evaluate(RandomForestRegressor)
gb_reg, pred_gb = evaluate(GradientBoostingRegressor)

# Estimator class with the lowest hold-out RMSLE.
best_model = min(rmsles, key=lambda model_cls: rmsles[model_cls])
print('Best model:', best_model)
print('RMSLE:', rmsles[best_model])

In [None]:
# Predict on the prepared test features with the tuned LightGBM model.
# NOTE(review): the model is hard-coded here rather than taken from
# `best_model` - confirm that is intended.
predictions = lg_reg.predict(scaled_test)
# The model was trained on log1p(count); map predictions back to counts.
original_predictions = np.expm1(predictions)
submission_columns = {
    'datetime': test0['datetime'],  # timestamps from the untouched test file
    'count': original_predictions,
}
predictions_df = pd.DataFrame(submission_columns)
predictions_df.to_csv('predictions.csv', index=False)