In [18]:
import pandas as pd
import numpy as np

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, TargetEncoder
from sklearn.pipeline import Pipeline

from sklearn.model_selection import train_test_split

from sklearn.metrics import mean_squared_error, r2_score

from sklearn.ensemble import RandomForestRegressor
import mlflow
import mlflow.sklearn

import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression

In [19]:
# !wget https://archive.ics.uci.edu/static/public/275/bike+sharing+dataset.zip
# !unzip bike+sharing+dataset.zip

In [20]:
df = pd.read_csv('hour.csv')
df.head()

Unnamed: 0,instant,dteday,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
0,1,2011-01-01,1,0,1,0,0,6,0,1,0.24,0.2879,0.81,0.0,3,13,16
1,2,2011-01-01,1,0,1,1,0,6,0,1,0.22,0.2727,0.8,0.0,8,32,40
2,3,2011-01-01,1,0,1,2,0,6,0,1,0.22,0.2727,0.8,0.0,5,27,32
3,4,2011-01-01,1,0,1,3,0,6,0,1,0.24,0.2879,0.75,0.0,3,10,13
4,5,2011-01-01,1,0,1,4,0,6,0,1,0.24,0.2879,0.75,0.0,0,1,1


In [21]:
df.columns

Index(['instant', 'dteday', 'season', 'yr', 'mnth', 'hr', 'holiday', 'weekday',
       'workingday', 'weathersit', 'temp', 'atemp', 'hum', 'windspeed',
       'casual', 'registered', 'cnt'],
      dtype='object')

In [22]:
df.shape

(17379, 17)

In [23]:
mlflow.set_tracking_uri("http://localhost:5000")

In [24]:
dataset_description = {
    "dataset_name": "Bike Sharing Dataset",
    "num_rows": 17379,  # Example
    "num_features": 18  # Example
}

In [25]:
def feature_engineering(df):

    df = df.drop(['instant', 'casual', 'registered'], axis=1)


    df['dteday'] = pd.to_datetime(df.dteday)

    df['season'] = df.season.astype('category')
    df['holiday'] = df.holiday.astype('category')
    df['weekday'] = df.weekday.astype('category')
    df['weathersit'] = df.weathersit.astype('category')
    df['workingday'] = df.workingday.astype('category')
    df['mnth'] = df.mnth.astype('category')
    df['yr'] = df.yr.astype('category')


    df['hour_temp_combination'] = df['hr'] * df['temp']
    df['week_of_year'] = df['dteday'].dt.isocalendar().week
    df['day_night'] = df['hr'].apply(lambda x: 'day' if 6 <= x <= 18 else 'night')
    df['hour_categorical'] = pd.cut(df['hr'], bins=[0, 6, 12, 18, 24], labels=['Night', 'Morning', 'Afternoon', 'Evening'])
    df['temp_hum_combination'] = df['temp'] * df['hum']

    df = df.drop(columns=['dteday'])

    return df

In [26]:
def filter_columns(filter_list, df):
    return df.filter(items=filter_list)

In [27]:
def plot_correlation_matrix_num_features(df):

    numerical_cols = df.select_dtypes(include=['float64', 'int64','UInt32']).columns
    corr_matrix = df[numerical_cols].corr()
    plt.figure(figsize=(12, 8))
    sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='coolwarm', linewidths=0.5)
    plt.title('Correlation Matrix')
    plt.show()

In [28]:
def input_output_features(df):

    X = df.drop(columns=['cnt']) # Features
    y = df['cnt'] # Target

    return X,y

In [29]:

def data_preprocessing(X, numerical_features, categorical_features, y):


    # Numerical features
    numerical_pipeline = Pipeline([
                                    ('imputer', SimpleImputer(strategy='mean')), # Impute missing values with mean
                                    ('scaler', MinMaxScaler()) # Normalize using MinMaxScaler
                                  ])

    X[numerical_features] = numerical_pipeline.fit_transform(X[numerical_features])

    # Categorical features for One Hot Encoding
    categorical_pipeline = Pipeline([
                                      ('imputer', SimpleImputer(strategy='most_frequent')),
                                      ('onehot', OneHotEncoder(sparse_output=False, drop='first'))
                                    ])
    
    # Categorical features for Target Encoding
    # categorical_pipeline = Pipeline([
    #                                   ('imputer', SimpleImputer(strategy='most_frequent')),
    #                                   ('target_encoder', TargetEncoder(smooth="auto"))
    #                                 ])

    # Transforming above OHE
    X_encoded = categorical_pipeline.fit_transform(X[categorical_features])
    X_encoded = pd.DataFrame(X_encoded,columns=categorical_pipeline.named_steps['onehot'].get_feature_names_out(categorical_features))
    # Encoded categorical features + Numerical features
    X = pd.concat([X.drop(columns=categorical_features), X_encoded], axis=1)

    # Transforming categorical features using Target Encoder
    # X.columns = X.columns.astype(str)
    # X_encoded = categorical_pipeline.fit_transform(X[categorical_features], y=y)
    # print(X_encoded.shape)
    # X_encoded = pd.DataFrame(categorical_pipeline.named_steps['target_encoder'].get_feature_names_out(categorical_features))
    # X = pd.concat([X.drop(columns=categorical_features), X_encoded], axis=1)
    # X = X.drop(columns=0)
    # print(X.columns)
    return X, numerical_pipeline, categorical_pipeline


In [30]:
def evaluation_metrics(y_test, y_pred):
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    return mse, r2

In [31]:
class LinearRegressionFromScratch:

    def __init__(self, learning_rate, n_iterations):
        
        self.learning_rate = learning_rate
        self.n_iterations = n_iterations
        self.bias = None
        self.weights = None

    def fit(self, X, y):
        n_samples, n_features = X.shape

        self.weights = np.zeros(n_features)
        self.bias = 0

        # Gradient descent iterations
        for _ in range(self.n_iterations):
            # Predict using current weights
            y_predicted = X @ self.weights + self.bias

            # Calculate gradients
            dw = (1 / n_samples) * X.T @ (y_predicted - y)
            db = (1 / n_samples) * np.sum(y_predicted - y)

            # Update weights and bias
            self.weights -= self.learning_rate * dw
            self.bias -= self.learning_rate * db

    def predict(self, X):
        print(self.weights, self.bias)
        return X @ self.weights + self.bias
        

In [32]:
df = feature_engineering(df)

filter_list = ['season', 'yr', 'mnth', 'hr', 'holiday', 'weekday', 'workingday',
                'weathersit', 'temp', 'atemp', 'hum', 'windspeed', 'cnt',
                'hour_temp_combination', 'week_of_year', 'day_night',
                 'temp_hum_combination'] #'hour_categorical','day_night'

filtered_df = filter_columns(filter_list, df)
print(filtered_df.columns)

# plot_correlation_matrix_num_features(df)

X,y = input_output_features(filtered_df)


numerical_features = ['temp', 'hum','windspeed']
categorical_features = ['season', 'weathersit', 'day_night']
X, numerical_pipeline, categorical_pipeline = data_preprocessing(X, numerical_features, categorical_features,y )

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,random_state=42)


# Start an MLflow experiment
with mlflow.start_run(run_name="Linear Regressor Scratch Model OneHot encoding and old features"):

    for key, value in dataset_description.items():
        mlflow.log_param(key, value)

    # model = RandomForestRegressor(n_estimators=100, random_state=42)
    model = LinearRegressionFromScratch(learning_rate=0.0001, n_iterations=500)
    # model = LinearRegression()
    model.fit(X_train, y_train)

    #### Linear Regression ########
    # params = {
    #             "fit_intercept": model.fit_intercept
    # }

    # # Log Parameters
    # mlflow.log_param("fit_intercept", model.fit_intercept)

    #### Random Forest Regression ########
    # params = {
    # "n_estimators": model.n_estimators,
    # "max_depth": model.max_depth,
    # "min_samples_split": model.min_samples_split,
    # "min_samples_leaf": model.min_samples_leaf,
    # "max_features": model.max_features,
    # "bootstrap": model.bootstrap,
    # "random_state": model.random_state
    # }
    # # Log Parameters
    # mlflow.log_param("n_estimators", model.n_estimators)
    # mlflow.log_param("max_depth", model.max_depth)
    # mlflow.log_param("min_samples_split", model.min_samples_split)
    # mlflow.log_param("min_samples_leaf", model.min_samples_leaf)
    # mlflow.log_param("max_features", model.max_features)
    # mlflow.log_param("bootstrap", model.bootstrap)
    # mlflow.log_param("random_state", model.random_state)

    y_pred = model.predict(X_test)
    print("Number of NaN in y_test:", np.isnan(y_test).sum())
    print(y_pred)

    mse, r2 = evaluation_metrics(y_test, y_pred)
    print(f"Mean Squared Error: {mse}")
    print(f"R-squared: {r2}")

    # Log metrics
    mlflow.log_metric("mse", mse)
    mlflow.log_metric("r2", r2)

    # Log the model
    mlflow.sklearn.log_model(model, "model")

    print(f"Model saved in run {mlflow.active_run().info.run_uuid}")

Index(['season', 'yr', 'mnth', 'hr', 'holiday', 'weekday', 'workingday',
       'weathersit', 'temp', 'atemp', 'hum', 'windspeed', 'cnt',
       'hour_temp_combination', 'week_of_year', 'day_night',
       'temp_hum_combination'],
      dtype='object')
yr                       1.399683
mnth                     0.939368
hr                       6.890845
holiday                 -0.036714
weekday                   1.70623
workingday               0.441422
temp                     0.655328
atemp                    0.593254
hum                      -0.11988
windspeed                  0.1951
hour_temp_combination    9.556088
week_of_year             1.426851
temp_hum_combination     0.150071
season_2                 0.543909
season_3                 0.383733
season_4                -0.174228
weathersit_2             0.023242
weathersit_3            -0.274426
weathersit_4            -0.000314
day_night_night         -1.686037
dtype: object 0.4941820827598714
Number of NaN in y_test: 0
12830  

2024/08/27 17:18:38 INFO mlflow.tracking._tracking_service.client: 🏃 View run Linear Regressor Scratch Model OneHot encoding and old features at: http://localhost:5000/#/experiments/0/runs/107c65b34c74482dbf74bf0cd20c5a46.
2024/08/27 17:18:38 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/0.


Model saved in run 107c65b34c74482dbf74bf0cd20c5a46


In [34]:
# Final Pipeline

final_pipeline = Pipeline([
('num_preprocess', numerical_pipeline),
('cat_preprocess', categorical_pipeline),
('model', LinearRegressionFromScratch(learning_rate=0.0001, n_iterations=500))
])

In [35]:
final_pipeline