In [97]:
# FOR GOOGLE COLAB
# !wget https://archive.ics.uci.edu/static/public/275/bike+sharing+dataset.zip
# !unzip bike+sharing+dataset.zip

--2024-08-27 13:44:55--  https://archive.ics.uci.edu/static/public/275/bike+sharing+dataset.zip
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified
Saving to: ‘bike+sharing+dataset.zip.1’

bike+sharing+datase     [  <=>               ] 273.43K   961KB/s    in 0.3s    

2024-08-27 13:44:56 (961 KB/s) - ‘bike+sharing+dataset.zip.1’ saved [279992]



In [209]:
import pandas as pd

In [210]:
# Load the hourly records and build the modelling frame in a single chain.
# The cell starts from the CSV and mutates nothing in place, so it is safe to re-run.
df = (
    pd.read_csv('./hour.csv')
    # Hours 6..18 (inclusive) are labelled 'day', every other hour 'night'.
    .assign(day_night=lambda frame: frame['hr'].between(6, 18).map({True: 'day', False: 'night'}))
    # 'dteday' is dropped together with the other unused columns; it was previously
    # parsed to datetime first, but that result was never read.
    # NOTE(review): 'casual' and 'registered' presumably add up to the target 'cnt',
    # which would make them leak the answer - confirm against the dataset docs.
    .drop(columns=['instant', 'casual', 'registered', 'dteday'])
    # Categories
    .astype({
        'season': 'category',
        'holiday': 'category',
        'weekday': 'category',
        'weathersit': 'category',
        'workingday': 'category',
        'mnth': 'category',
        'yr': 'category',
        'hr': 'category',
        'day_night': 'category',
    })
)

In [211]:
# Pull out the target column, then keep every remaining column as a feature
y = df['cnt']  # Target
X = df.drop(columns='cnt')  # Features

In [212]:
# PIPELINE
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, FunctionTransformer
from sklearn.pipeline import Pipeline
from category_encoders import TargetEncoder
from sklearn import set_config

Question 1

In [213]:
# FEATURE ENGINEERING (commented out because performance deteriorated after feature engineering; proceeding with the original features)
# X['wind_temp'] = X['windspeed']*X['temp']
# X['hum_atemp'] = X['hum'] * X['atemp']
# X.drop(['windspeed', 'atemp', 'hum', 'temp'], axis=1, inplace=True)

In [214]:
# Numerical columns that are imputed and rescaled.
# (With the engineered features enabled this list would be ['wind_temp', 'hum_atemp'].)
numerical_features = ['windspeed', 'atemp', 'hum', 'temp']

# Step 1 fills missing values with the column mean, step 2 applies min-max scaling.
numerical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', MinMaxScaler()),
])

# The transformed values overwrite the original columns of X.
# NOTE(review): the pipeline is fitted on every row of X, i.e. before the
# train/test split further down - confirm this is intended.
X[numerical_features] = numerical_pipeline.fit_transform(X[numerical_features])

In [215]:
# Categorical features (one-hot encoding)
categorical_features = ['season', 'weathersit', 'day_night']
categorical_pipeline = Pipeline([
    # Fill gaps with the most frequent level before encoding.
    ('imputer', SimpleImputer(strategy='most_frequent')),
    # Dense output so the result can go straight into a DataFrame;
    # drop='first' omits one indicator column per feature.
    ('onehot', OneHotEncoder(sparse_output=False, drop='first')),
])

# Encode and wrap the result in a DataFrame that carries X's own index.
# Without index=X.index the frame would get a fresh 0..n-1 index and the
# concat below would misalign rows (introducing NaNs) whenever X is indexed
# by anything other than a default RangeIndex.
X_encoded = pd.DataFrame(
    categorical_pipeline.fit_transform(X[categorical_features]),
    columns=categorical_pipeline.named_steps['onehot'].get_feature_names_out(categorical_features),
    index=X.index,
)

# Encoded categorical features + remaining (numerical and other) features.
# NOTE(review): this rebinds X without its raw categorical columns, so the
# cell cannot be re-run without first re-running the cell that creates X.
X = pd.concat([X.drop(columns=categorical_features), X_encoded], axis=1)

Question 2

In [187]:
# Categorical features (target / mean encoding)
categorical_features = ['season', 'weathersit', 'day_night']

# Work on a copy of df: it still holds the raw categorical columns and the
# target, both of which are needed to compute the per-level means.
X_encoded = df.copy()

# Mean target value for each level of every categorical feature.
# observed=True restricts the groups to levels that actually occur, which is
# all the mapping below needs, and avoids relying on the deprecated default
# for categorical group keys.
# NOTE(review): the means are computed on all rows, before any train/test
# split, so the encoding has seen the evaluation targets - confirm intended.
target_means = {
    col: X_encoded.groupby(col, observed=True)[y.name].mean()
    for col in categorical_features
}

# Target encoding: replace each level by its mean target value. The explicit
# float cast turns the mapped (categorical) result into a plain numeric column.
for col in categorical_features:
    X_encoded[col] = X_encoded[col].map(target_means[col]).astype(float)

# Levels without a mapped mean become NaN; fill them with the column mean.
imputer = SimpleImputer(strategy='mean')
X_encoded[categorical_features] = imputer.fit_transform(X_encoded[categorical_features])

# Combining: remaining features first, encoded categorical features last.
# The target column is dropped here; keeping it would hand the answer to any
# model trained on X_final.
X_final = pd.concat(
    [
        X_encoded.drop(columns=categorical_features + [y.name]),
        X_encoded[categorical_features],
    ],
    axis=1,
)

In [216]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

In [217]:
# Split X / y into training and evaluation parts (20% held out, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=32,
)

In [218]:
# Fit a linear regression on the training split of X.
model = LinearRegression()
model.fit(X_train, y_train)

# Disabled: the same fit for the target-encoded features.
# NOTE(review): X_train_encoded / y_train_encoded are not defined in any visible cell.
#model_encoded = LinearRegression()
#model_encoded.fit(X_train_encoded, y_train_encoded)

In [219]:
# Score the fitted model on the held-out split.
y_pred = model.predict(X_test)
mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)

print(f'Mean Squared Error: {mse}')
print(f'R-squared: {r2}')

Mean Squared Error: 15387.412761263442
R-squared: 0.526788819252584


### COMPARISON RESULTS
* Before Feature Engineering
  * Mean Squared Error: 14896.150620843286
  * R-squared: 0.5295765950245785

* After Feature Engineering
  * Mean Squared Error: 15759.859746064763
  * R-squared: 0.5023004887379983

---
* One Hot Encoding
  * Mean Squared Error: 14896.150620843286
  * R-squared: 0.5295765950245785

* Target Encoding
  * Mean Squared Error: 13896.148748489446
  * R-squared: 0.5587181786464568

In [256]:
from sklearn.compose import ColumnTransformer

# Categorical branch: fill gaps with the most frequent level, then target-encode.
# TargetEncoder is the name imported from category_encoders; the previous
# `ce.TargetEncoder()` referenced an alias that is never imported.
categorical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('target_encoder', TargetEncoder()),
])

# Route each branch to its own columns. Chaining the two pipelines as
# consecutive steps would run the numeric imputer/scaler and then the
# categorical imputer/encoder over *every* column.
preprocessor = ColumnTransformer(
    transformers=[
        ('num_preprocess', numerical_pipeline, numerical_features),
        ('cat_preprocess', categorical_pipeline, categorical_features),
    ],
    remainder='passthrough',  # leave all other feature columns untouched
)

# Preprocessing + model in one estimator. Fit it on a frame that still has
# the raw categorical columns listed in categorical_features.
final_pipeline = Pipeline([
    ('preprocess', preprocessor),
    ('model', LinearRegression()),
])

In [257]:
# Render final_pipeline (the target-encoder pipeline) as a diagram
from sklearn import set_config
# Switch scikit-learn's estimator display to 'diagram' mode
set_config(display='diagram')
display(final_pipeline)

In [220]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error, r2_score

def train_linear_regressor(X, y):
    """Fit ordinary least squares with an intercept and report the in-sample fit.

    NOTE(review): a later cell redefines ``train_linear_regressor`` with a
    different return value (the coefficient vector); whichever cell ran last
    is the definition in effect.

    Parameters
    ----------
    X : pandas.DataFrame or array-like of shape (n_samples, n_features)
        Feature matrix; must be convertible to floats.
    y : pandas.Series or array-like of shape (n_samples,)
        Target values.

    Returns
    -------
    dict
        ``{'Mean Squared Error': ..., 'R-squared': ...}`` computed on the same
        rows that were used for fitting.
    """
    # np.asarray accepts DataFrames/Series as well as plain arrays and gives
    # the solver a numeric dtype.
    X = np.asarray(X, dtype=float)
    y = np.asarray(y, dtype=float)

    # A leading column of ones gives the model an intercept term.
    X = np.c_[np.ones(X.shape[0]), X]

    # Solve the least-squares problem directly rather than forming
    # inv(X'X): an explicit inverse raises (or returns garbage) when the
    # features are collinear, whereas lstsq returns the minimum-norm solution.
    beta, *_ = np.linalg.lstsq(X, y, rcond=None)

    y_pred = X @ beta
    mse = mean_squared_error(y, y_pred)
    r2 = r2_score(y, y_pred)

    return {
        'Mean Squared Error': mse,
        'R-squared': r2
    }


In [223]:
train_linear_regressor(X,y)

{'Mean Squared Error': 15428.575004076132, 'R-squared': 0.5310401937043101}

In [236]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

def train_linear_regressor(X_train, y_train):
    """Fit ordinary least squares with an intercept and return the coefficients.

    Parameters
    ----------
    X_train : pandas.DataFrame or array-like of shape (n_samples, n_features)
        Training features; must be convertible to floats.
    y_train : pandas.Series or array-like of shape (n_samples,)
        Training targets.

    Returns
    -------
    numpy.ndarray
        Coefficient vector of length ``n_features + 1``; element 0 is the
        intercept, the rest follow the column order of ``X_train``.
    """
    # np.asarray accepts DataFrames/Series as well as plain arrays and gives
    # the solver a numeric dtype.
    X_train = np.asarray(X_train, dtype=float)
    y_train = np.asarray(y_train, dtype=float)

    # A leading column of ones gives the model an intercept term.
    X_train = np.c_[np.ones(X_train.shape[0]), X_train]

    # Solve the least-squares problem directly rather than forming
    # inv(X'X): an explicit inverse raises (or returns garbage) when the
    # features are collinear, whereas lstsq returns the minimum-norm solution.
    beta, *_ = np.linalg.lstsq(X_train, y_train, rcond=None)

    return beta

def evaluate_linear_regressor(X_test, y_test, beta):
    """Score a coefficient vector on the given features and targets.

    Parameters
    ----------
    X_test : pandas.DataFrame or numpy.ndarray
        Features to predict on (without an intercept column).
    y_test : pandas.Series or numpy.ndarray
        True target values for the same rows.
    beta : numpy.ndarray
        Coefficients with the intercept in position 0.

    Returns
    -------
    dict
        ``{'Mean Squared Error': ..., 'R-squared': ...}``.
    """
    features = X_test.values if isinstance(X_test, pd.DataFrame) else X_test
    targets = y_test.values if isinstance(y_test, pd.Series) else y_test

    # The leading column of ones pairs with the intercept in beta[0].
    design = np.c_[np.ones(features.shape[0]), features]
    predictions = design @ beta

    return {
        'Mean Squared Error': mean_squared_error(targets, predictions),
        'R-squared': r2_score(targets, predictions),
    }

def linear_rsr(X, y, test_size=0.2, random_state=42):
    """Split the data, fit the hand-written linear regressor and print test metrics.

    Parameters
    ----------
    X : pandas.DataFrame or numpy.ndarray
        Feature matrix.
    y : pandas.Series or numpy.ndarray
        Target values.
    test_size : float, default 0.2
        Passed to ``train_test_split`` as the evaluation share.
    random_state : int, default 42
        Seed passed to ``train_test_split``. Exposed so the split can be
        matched to other experiments; the default keeps the previous behaviour.

    Returns
    -------
    None
        The metrics are printed, not returned.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    beta = train_linear_regressor(X_train, y_train)
    metrics = evaluate_linear_regressor(X_test, y_test, beta)
    print(f"Mean Squared Error: {metrics['Mean Squared Error']}")
    print(f"R-squared: {metrics['R-squared']}")

In [237]:
linear_rsr(X,y)

Mean Squared Error: 14896.15062084341
R-squared: 0.5295765950245745
