# Define Problem statement

# Import Libraries

In [None]:
# basic libs
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [None]:
# statistical imports
import pingouin

In [None]:
# pipeline related imports
from sklearn.base import BaseEstimator, TransformerMixin, clone
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [None]:
# for transformer map
from collections import defaultdict

In [None]:
# preprocessing imports
from sklearn.impute import SimpleImputer

In [None]:
#
from sklearn.model_selection import train_test_split

In [None]:
# model import
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.neighbors import KNeighborsRegressor

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [None]:
from sklearn.metrics import root_mean_squared_error

In [None]:
# import for hyperparameter tuning
from sklearn.model_selection import GridSearchCV

In [None]:
import pickle

# Load Data

In [None]:
# Read the raw daily data; the path is relative, so 'day.csv' must be in the working directory.
dataset_day = pd.read_csv('day.csv')
# Work on a copy so the raw frame stays untouched.
dfd = dataset_day.copy()
# Show the working frame as the cell output.
dfd

In [None]:
# Rebuild the working frame from the untouched raw frame instead of mutating
# `dfd` in place. The in-place version could only run once: a second execution
# raised KeyError because 'instant' / 'dteday' had already been removed.
dfd = dataset_day.drop(columns='instant').set_index('dteday')

In [None]:
dfd

# Basic check

In [None]:
dfd.head()

In [None]:
dfd.shape

In [None]:
dfd.info()

In [None]:
 dfd.describe()

In [None]:
dfd.columns

In [None]:
dfd.season.value_counts()

In [None]:
dfd.holiday.value_counts()

In [None]:
dfd.columns

# EDA

In [None]:
# season vs cnt: total rentals per season code, labelled for plotting
season_cnt_grpby = dfd.groupby('season', as_index=False)['cnt'].sum()
season_labels = {1:'winter',2:'spring',3:'summer',4:'fall'}
season_cnt_grpby['season name'] = season_cnt_grpby['season'].map(season_labels)
# Highest total first in the printed table; the plot keeps season-code order.
print(season_cnt_grpby.sort_values(by='cnt',ascending=False))
sns.barplot(data=season_cnt_grpby, x='season name', y='cnt')
plt.show()

In [None]:
# 
# workingday vs cnt: total rentals on working vs non-working days
workingday_cnt_grpby = dfd.groupby('workingday', as_index=False)['cnt'].sum()
workingday_cnt_grpby['Work_Day'] = workingday_cnt_grpby['workingday'].map({0:'No',1:'Yes'})
# Highest total first in the printed table.
print(workingday_cnt_grpby.sort_values(by='cnt',ascending=False))
sns.barplot(data=workingday_cnt_grpby, x='Work_Day', y='cnt')
plt.ylabel('CNT')
plt.show()

In [None]:
# weathersit vs cnt: total rentals per weather-situation code
weather_labels = {
    1: "Clear or Partly Cloudy",
    2: "Mist or Cloudy",
    3: "Light Rain or Snow",
    4: "Heavy Rain or Snow"
}
weathersit_cnt_grpby = dfd.groupby('weathersit', as_index=False)['cnt'].sum()
weathersit_cnt_grpby['weather_condition'] = weathersit_cnt_grpby['weathersit'].map(weather_labels)
# Highest total first in the printed table.
print(weathersit_cnt_grpby.sort_values(by='cnt',ascending=False))
sns.barplot(data=weathersit_cnt_grpby, x='weather_condition', y='cnt')
plt.show()

In [None]:
# temp vs cnt
# Rescale the stored temperature with x*47 - 8 before plotting
# (presumably undoing the dataset's normalisation; confirm against the data dictionary).
real_temp = dfd['temp'] * 47 - 8
sns.scatterplot(x=real_temp, y=dfd['cnt'])
plt.title('Real Temp')
plt.show()

In [None]:
# atemp vs cnt
# Rescale the stored "feels like" temperature with x*66 - 16 before plotting
# (presumably undoing the dataset's normalisation; confirm against the data dictionary).
real_atemp = dfd['atemp'] * 66 - 16
sns.scatterplot(x=real_atemp, y=dfd['cnt'])
plt.title('Real ATemp')
plt.show()

In [None]:
# Reshape casual/registered counts from wide to long so both user types can be
# drawn side by side, grouped by the workingday flag.
melted = pd.melt(
    dfd,
    id_vars='workingday',               # Column to keep as-is
    value_vars=['casual', 'registered'],# Columns to "melt" (convert from wide → long)
    var_name='user_type',               # New column name for former column headers
    value_name='count'                  # New column name for actual values
)
# sns.barplot aggregates each group with the mean by default, so the bars show
# the average daily rentals per group, not a total; the y-label reflects that.
sns.barplot(x='workingday', y='count', hue='user_type', data=melted)
plt.title("Casual vs Registered Rentals on Working and Non-Working Days")
plt.xlabel("Working Day (0 = No, 1 = Yes)")
plt.ylabel("Average Daily Rentals")
plt.show()

In [None]:
# Statistical EDA

In [None]:
# lists for categoric and numeric
# Columns treated as numeric in the statistical checks that follow.
# NOTE(review): 'atemp', 'casual' and 'registered' are listed here but are
# dropped from the feature matrix X later in the notebook.
numeric = ['temp', 'atemp', 'hum', 'windspeed', 'casual', 'registered']
print('numeric cols: ',numeric)

In [None]:
# Every column that is not listed in `numeric` is treated as categorical.
# NOTE(review): this also picks up the target column 'cnt', since it is not in `numeric`.
categoric = list(filter(lambda col: col not in numeric, dfd.columns))
print(categoric)

In [None]:
# Parse the index values with pd.to_datetime and store the result back as the index.
dfd.index = pd.to_datetime(dfd.index)

In [1]:
# normality
# Bucket every numeric column by the p-value reported by pingouin.normality:
#   p <= 0.01         -> non_normal
#   0.01 < p <= 0.05  -> uncertain
#   anything else     -> normal
normal, non_normal, uncertain = [], [], []
normal_df = pingouin.normality(dfd[numeric])['pval']
for col in numeric:
    p = normal_df[col]
    if p <= 0.01:
        bucket = non_normal
    elif p <= 0.05:
        bucket = uncertain
    else:
        bucket = normal
    bucket.append(col)
print('normal cols:',normal)
print('non_normal:',non_normal)
print('uncertain:',uncertain)

NameError: name 'pingouin' is not defined

In [None]:
# skewness
# Bucket each numeric column by its sample skewness:
#   (-0.5, 0.5)    -> fairly_skew (roughly symmetric)
#   [0.5, 1.0]     -> mod_positive
#   > 1.0          -> heavily_positive
#   [-1.0, -0.5]   -> mod_negative
#   < -1.0         -> heavily_negative
# The moderate ranges are closed on both ends so that a skew of exactly -0.5
# is classified (it previously matched no branch and was silently dropped).
fairly_skew = []
mod_positive = []
mod_negative = []
heavily_positive = []
heavily_negative = []
for col in numeric:
    _skew = dfd[col].skew()
    if (-0.5<_skew<0.5):
        fairly_skew.append(col)
    elif (0.5<=_skew<=1.0):
        mod_positive.append(col)
    elif (_skew>1.0):
        heavily_positive.append(col)
    elif (-1.0<=_skew<=-0.5):
        mod_negative.append(col)
    elif (_skew<-1.0):
        heavily_negative.append(col)
print('fairly_skew:',fairly_skew)
print('mod_positive',mod_positive)
print('mod_negative',mod_negative)
print('heavily_positive',heavily_positive)
print('heavily_negative',heavily_negative)

In [None]:
(dfd['windspeed']<=0).any()

In [None]:
# finding outliers iqr
# A column is recorded when it has at least one value outside the fences
# [Q1 - 1.5*IQR, Q3 + 1.5*IQR] and those values make up less than 5% of its rows.
# NOTE(review): despite the list's name, every column in `numeric` is scanned,
# not only the ones flagged as non-normal.
non_normal_outliers = []
for col in numeric:
    values = dfd[col]
    q1, q3 = np.quantile(values, [0.25, 0.75])
    iqr = q3 - q1
    ll, ul = q1 - 1.5 * iqr, q3 + 1.5 * iqr

    outliers = dfd[(values < ll) | (values > ul)]
    outlier_ratio = len(outliers) / len(values)

    if len(outliers) > 0 and outlier_ratio < 0.05:
        non_normal_outliers.append(col)

print('Non-normal cols with outliers:', non_normal_outliers)

In [None]:
# cols for scaling
# Print each numeric column's value range (max - min), presumably to help
# choose a scaler per column.
# NOTE(review): standard_scaling / minmax_scaling are created empty and never
# filled in this cell; they look like placeholders. Confirm or remove.
standard_scaling = []
minmax_scaling = []
for col in numeric:
    print(f'{col} {dfd[col].max()-dfd[col].min()}')

In [None]:
dfd.duplicated().sum()

In [None]:
dfd.isnull().sum()

# Custom Transformers

In [None]:
# duplicate handling

class duplicate_handler(BaseEstimator,TransformerMixin):
    """Stateless transformer that removes duplicated rows from a DataFrame.

    ``fit`` learns nothing and ``set_output`` is a no-op that returns ``self``.

    NOTE(review): ``transform`` removes rows from ``X`` only and ignores ``y``,
    so whenever duplicates exist the row count of ``X`` no longer matches the
    target. Confirm this is acceptable wherever the transformer is used.
    """

    def __init__(self):
        pass

    def set_output(self,transform=None):
        # Output configuration is ignored on purpose; transform always returns
        # whatever pandas produces.
        return self

    def fit(self,X,y=None):
        return self

    def transform(self,X,y=None):
        """Return a new frame without duplicated rows and with a fresh 0..n-1 index."""
        deduplicated = X.drop_duplicates()
        return deduplicated.reset_index(drop=True)

In [None]:
# square root transformation
class sqrt_transformation(BaseEstimator, TransformerMixin):
    """Stateless transformer that applies a square root to every column.

    Values below zero are clipped to 0 before the root is taken. ``fit``
    learns nothing and ``set_output`` is a no-op that returns ``self``.
    """

    def __init__(self):
        pass

    def fit(self, X, y=None):
        return self

    def set_output(self, transform=None):
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with each column replaced by sqrt(max(value, 0))."""
        result = X.copy()
        for name in result.columns:
            non_negative = result[name].clip(lower=0)
            result[name] = np.sqrt(non_negative)
        return result

In [None]:
class iqr_outlier_handler(BaseEstimator, TransformerMixin):
    """Replace IQR outliers in numeric columns with the median learned in ``fit``.

    ``fit`` stores, per numeric column of ``X``, the lower fence
    ``Q1 - 1.5*IQR`` (``ll_``), the upper fence ``Q3 + 1.5*IQR`` (``ul_``) and
    the median (``median_``). ``transform`` overwrites every value outside the
    fences with that median. ``set_output`` is a no-op that returns ``self``.
    """

    def __init__(self):
        # Kept so that transform() before fit() still returns an unchanged
        # copy, as it always has; fit() replaces these with fresh dictionaries.
        self.ll_ = {}
        self.ul_ = {}
        self.median_ = {}

    def set_output(self, transform=None):
        return self

    def fit(self, X, y=None):
        """Learn fences and medians for every numeric column of ``X``.

        State from any previous fit is discarded, so re-fitting on a frame
        with different columns cannot leave stale entries that transform()
        would then look up and fail on.
        """
        lower, upper, medians = {}, {}, {}
        for col in X.select_dtypes(include=np.number).columns:
            q1 = X[col].quantile(0.25)
            q3 = X[col].quantile(0.75)
            iqr = q3 - q1
            lower[col] = q1 - 1.5 * iqr
            upper[col] = q3 + 1.5 * iqr
            medians[col] = X[col].median()
        self.ll_, self.ul_, self.median_ = lower, upper, medians
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with out-of-fence values set to the fitted median."""
        X = X.copy()
        for col in self.ll_:
            ll = self.ll_[col]
            ul = self.ul_[col]
            median = self.median_[col]
            mask = (X[col] < ll) | (X[col] > ul)
            # Cast first so assigning a float median into an integer column is well defined.
            X[col] = X[col].astype(float)
            X.loc[mask, col] = median
        return X


In [None]:
# Column cleaner
class ColumnNameCleaner(BaseEstimator, TransformerMixin):
    """Strip any ``prefix__`` from column names, keeping the text after the last ``__``.

    The cleaned names are computed in ``fit`` and applied in ``transform``.
    ``set_output`` is a no-op that returns ``self``.
    """

    def set_output(self, transform=None):
        return self

    def fit(self, X, y=None):
        """Remember the cleaned version of ``X``'s column names."""
        self.columns_ = [col.split('__')[-1] for col in X.columns]
        return self

    def transform(self, X,y=None):
        """Return a copy of ``X`` carrying the column names learned in ``fit``."""
        # Rename on a copy so the caller's DataFrame is not modified in place.
        X = X.copy()
        X.columns = self.columns_
        return X

In [None]:
# class to clean column
class Column_Filterer(BaseEstimator,TransformerMixin):
    """Keep only the fixed set of feature columns, in a fixed order.

    ``fit`` learns nothing and ``set_output`` is a no-op that returns ``self``.
    """

    def __init__(self):
        pass

    def set_output(self,transform=None):
        return self

    def fit(self,X,y=None):
        return self

    def transform(self,X,y=None):
        """Return ``X`` restricted to the permitted columns.

        Raises:
            ValueError: if any permitted column is absent from ``X``; the
                message lists the missing names.
        """
        col_permitted = ['season', 'yr', 'holiday', 'weekday', 'workingday', 'weathersit','temp', 'hum', 'windspeed']
        input_missing_cols = [ col for col in col_permitted if col not in X.columns ]
        if input_missing_cols:
            raise ValueError(f"Input is missing required column(s): {input_missing_cols}")
        return X[col_permitted]

# Split into X, y and train/test sets

In [None]:
# Target: the 'cnt' column.
y = dfd['cnt']
# Features: every column except the target and 'mnth', 'atemp', 'casual', 'registered'.
X = dfd.drop(columns=['mnth', 'atemp', 'casual', 'registered','cnt'])

In [None]:
xtrain, xtest, ytrain, ytest = train_test_split(X, y, test_size=0.2, random_state=42)

# Transformations list

In [None]:
# Column lists that feed the `metadata` dictionary built in the next cell.
#duplicate handling
# NOTE(review): the variable name is spelled 'dupicate' (sic); it is kept as-is
# because renaming would change a notebook-level name.
dupicate_handling_list = numeric+categoric
#missing values numeric
missing_numeric_list = numeric
#missing values categoric
missing_categoric_list = categoric
#transformations
# Only 'windspeed' receives the square-root transformation.
square_root_transformation_list = ['windspeed']
#iqr outliers
# Reuses the columns flagged by the IQR scan earlier in the notebook.
iqr_outliers = non_normal_outliers

In [None]:
# Single lookup table: preprocessing concern -> list of columns it applies to.
metadata = {
'duplicate handling':numeric+categoric,
'missing values numeric':missing_numeric_list,
'missing values categoric':missing_categoric_list,
'transformations':square_root_transformation_list,
'iqr outliers':iqr_outliers
}

# Create transformation map

In [None]:
# Pull the per-concern column lists back out of `metadata` under short names.
num_cols = metadata['missing values numeric']
cat_cols = metadata['missing values categoric']
outlier_cols = metadata['iqr outliers']
sqrt_cols = metadata['transformations']

In [None]:
# Group columns by the exact chain of preprocessing steps they need. The chain,
# joined with '-> ', is the dictionary key; the value is the list of columns.
transformation_map = defaultdict(list)
for col in numeric:
    # Every numeric column is imputed; sqrt and IQR handling are opt-in per column.
    step = ['missing_handle_numeric']
    step += ['square_root_transformation'] if col in sqrt_cols else []
    step += ['iqr_outlier_handling'] if col in outlier_cols else []
    transformation_map['-> '.join(step)].append(col)
for col in categoric:
    transformation_map['missing_handle_categoric'].append(col)

# Freeze into a plain dict so later lookups of unknown keys fail loudly.
transformation_map = dict(transformation_map)

for chain, columns in transformation_map.items():
    print(f"{chain}: {columns}")

In [None]:
# Keep only columns that exist in X, dropping repeats while preserving order.
X_cols = set(X.columns)

for step_key, listed_cols in transformation_map.items():
    unique_cols = dict.fromkeys(listed_cols)
    transformation_map[step_key] = [col for col in unique_cols if col in X_cols]

In [None]:
# Registry: each step name used in the transformation_map keys -> the
# transformer instance that implements that step.
step_name_to_transformer = {
'missing_handle_numeric':SimpleImputer(strategy='median'),
'missing_handle_categoric':SimpleImputer(strategy='most_frequent'),
'square_root_transformation':sqrt_transformation(),
'iqr_outlier_handling':iqr_outlier_handler()
}

# step templates

In [None]:
# Build one Pipeline per step chain in transformation_map.
# Each chain key looks like 'a-> b-> c'; it is split on '->' and every step
# name is looked up in step_name_to_transformer.
step_templates = {}
for step in transformation_map.keys():
    step_list = [s.strip() for s in step.split('->')]
    pipeline_steps = []
    for step_name in step_list:
        if step_name not in step_name_to_transformer:
            raise ValueError(f"No transformer found for step: {step_name}")
        # clone() gives every pipeline its own unfitted copy. The registry holds
        # one instance per step name, and several chains reuse the same step,
        # so without cloning they would all share (and refit) the same object.
        pipeline_steps.append((step_name, clone(step_name_to_transformer[step_name])))
    # Final step of every chain: restore plain column names.
    pipeline_steps.append(('clean_names', ColumnNameCleaner()))
    step_templates[step] = Pipeline(pipeline_steps)

In [None]:
step_templates

In [None]:
# One ColumnTransformer entry per step chain: (chain string as the entry name,
# the pipeline built for that chain, the columns the chain applies to).
preprocessor = ColumnTransformer([
    (step_key, step_templates[step_key], cols)
    for step_key, cols in transformation_map.items()
])

In [None]:
preprocessor = preprocessor.set_output(transform="pandas")

In [None]:
preprocessor

In [None]:
# Collect every pair of columns whose absolute correlation exceeds 0.9.
# Only the strict upper triangle is inspected, so each pair is reported once
# and the diagonal is skipped.
multicollinear_cols = []
corr_ = dfd.corr().abs()
mask = np.triu(np.ones(corr_.shape),k=1).astype(bool)
corr_ = corr_.where(mask)
for col in corr_.columns:
    column_values = corr_[col]
    # Masked-out cells are NaN and never satisfy the comparison.
    for row in column_values.index[column_values > 0.9]:
        multicollinear_cols.append((row, col, round(corr_.loc[row, col], 5)))
print('multicollinear cols are:\n',multicollinear_cols)

In [None]:
xtrain_proc = preprocessor.fit_transform(xtrain, ytrain)
xtest_proc = preprocessor.transform(xtest)

In [None]:
# Baseline comparison: fit each candidate behind the shared preprocessor and
# report test-set metrics.
# The stochastic models are seeded with the same random_state=42 used for the
# train/test split, so a fresh "Restart & Run All" reproduces these numbers.
models = [
    LinearRegression(),
    SVR(),
    DecisionTreeRegressor(random_state=42),
    RandomForestRegressor(random_state=42),
    GradientBoostingRegressor(random_state=42),
    XGBRegressor(random_state=42)
]

for model in models:
    temp_pipeline = Pipeline([
        ("preprocessor", preprocessor),
        ("model", model)
    ])

    temp_pipeline.fit(xtrain, ytrain)
    ypred = temp_pipeline.predict(xtest)

    mae = mean_absolute_error(ytest, ypred)
    mse = mean_squared_error(ytest, ypred)
    rmse = root_mean_squared_error(ytest, ypred)
    r2 = r2_score(ytest, ypred)
    # Adjusted R² penalises R² by the number of input features (columns of xtrain).
    adj_r2 = 1-(1-r2)*(len(ytest)-1)/(len(ytest)-xtrain.shape[1]-1)

    print(f"{model.__class__.__name__}")
    print(f"  MAE  : {mae:.2f}")
    print(f"  MSE  : {mse:.2f}")
    print(f"  RMSE : {rmse:.2f}")
    print(f"  R²   : {r2:.3f}")
    print(f"  Adj R²: {adj_r2:.3f}")
    print("----------------------------------------------------------")

In [None]:
# KNN regressor

In [None]:
# find optimal k
# Score each K on the preprocessed features (xtrain_proc / xtest_proc), i.e.
# the same inputs the final KNN model is trained on. Previously K was tuned on
# the raw xtrain / xtest, so the chosen K did not correspond to the final model.
# NOTE(review): K is still selected against the test split, which makes the
# reported test error optimistic; consider cross-validating on the training split.
rmse_val = []
for K in range(1,20):

    model = KNeighborsRegressor(n_neighbors = K)

    model.fit(xtrain_proc, ytrain)  #fit the model
    pred=model.predict(xtest_proc) #make prediction on test set
    error = np.sqrt(mean_squared_error(ytest,pred)) #calculate rmse
    rmse_val.append(error) #store rmse values
print('rmse_val :',rmse_val)

In [None]:
# Plot the test RMSE obtained for each K.
# The y-axis shows RMSE (it was labelled 'Error Rate') and the x-axis is now
# labelled so the figure can be read on its own.
plt.figure(figsize=(10,6))
plt.plot(range(1,20),rmse_val,color='blue', linestyle='dashed',
         marker='o',markerfacecolor='red', markersize=10)
plt.title('RMSE value vs. K Value')
plt.xticks(ticks=range(1, 20))  # Force x-axis ticks to show only integers
plt.xlabel('K')
plt.ylabel('RMSE')
plt.show()

In [None]:
# Final KNN model, trained and evaluated on the preprocessed features.
# NOTE(review): n_neighbors=9 is hard-coded, presumably read off the
# RMSE-vs-K plot above; confirm it still matches the plotted minimum.
knnr = KNeighborsRegressor(n_neighbors=9)
knnr.fit(xtrain_proc,ytrain)
knnr_ypred = knnr.predict(xtest_proc)

In [None]:
# Evaluate the KNN predictions on the test split
n_rows, n_features = xtest_proc.shape
mae = mean_absolute_error(ytest, knnr_ypred)
mse = mean_squared_error(ytest, knnr_ypred)
rmse = np.sqrt(mse)
r2 = r2_score(ytest, knnr_ypred)
# Adjusted R² uses the row and column counts of the preprocessed test features.
adj_r2 = 1 - (1 - r2) * (n_rows - 1) / (n_rows - n_features - 1)

# Report
print(f"MAE         : {mae:.2f}")
print(f"MSE         : {mse:.2f}")
print(f"RMSE        : {rmse:.2f}")
print(f"R²          : {r2:.3f}")
print(f"Adjusted R² : {adj_r2:.3f}")

In [None]:
# End-to-end gradient boosting pipeline: column filter -> duplicate removal ->
# shared preprocessor -> model.
gb_steps = [
    ('keep cols consistent', Column_Filterer()),
    ('duplicate_remover', duplicate_handler()),
    ('pre-processing_steps', preprocessor),
    ('model', GradientBoostingRegressor(random_state=42)),
]
gb_pipeline = Pipeline(gb_steps)
gb_pipeline.fit(xtrain,ytrain)
ypred_gb_pipeline = gb_pipeline.predict(xtest)

# Evaluate on the test split
n_rows, n_features = xtest.shape
mae = mean_absolute_error(ytest, ypred_gb_pipeline)
mse = mean_squared_error(ytest, ypred_gb_pipeline)
rmse = np.sqrt(mse)
r2 = r2_score(ytest, ypred_gb_pipeline)
adj_r2 = 1 - (1 - r2) * (n_rows - 1) / (n_rows - n_features - 1)

print(f"MAE       : {mae:.2f}")
print(f"MSE       : {mse:.2f}")
print(f"RMSE      : {rmse:.2f}")
print(f"R²        : {r2:.3f}")
print(f"Adjusted R² : {adj_r2:.3f}")

In [None]:
# Hyperparameter search over the 'model' step of gb_pipeline.
# 3 x 4 x 3 = 36 parameter combinations, each evaluated with 5-fold CV.
param_grid = {
    'model__n_estimators': [100,200,500],
    'model__learning_rate': [0.001,0.01,0.1,0.2],
    'model__max_depth': [2,3,4],
}
gcv_gb_pipeline = GridSearchCV(
    estimator=gb_pipeline,
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
    verbose=0,
    scoring='neg_root_mean_squared_error',
)
gcv_gb_pipeline.fit(xtrain, ytrain)

In [None]:
# predict
ypred_gcv_gb_pipeline = gcv_gb_pipeline.predict(xtest)

In [None]:
# Evaluate the tuned pipeline's predictions on the test split
n_rows, n_features = xtest.shape
mae = mean_absolute_error(ytest, ypred_gcv_gb_pipeline)
mse = mean_squared_error(ytest, ypred_gcv_gb_pipeline)
rmse = np.sqrt(mse)
r2 = r2_score(ytest, ypred_gcv_gb_pipeline)
adj_r2 = 1 - (1 - r2) * (n_rows - 1) / (n_rows - n_features - 1)
# Print metrics
print(f"MAE         : {mae:.2f}")
print(f"MSE         : {mse:.2f}")
print(f"RMSE        : {rmse:.2f}")
print(f"R²          : {r2:.3f}")
print(f"Adjusted R² : {adj_r2:.3f}")