In [42]:
import numpy as np
import pandas as pd
from pandas import DataFrame
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor

#### Define functions

In [116]:
def explore_df(df: DataFrame) -> None:
    """Print a quick exploratory summary of ``df`` to stdout.

    Three sections are written: descriptive statistics (``describe``), the
    column dtypes / non-null counts (``info``) and the pairwise correlation
    of the numeric (and boolean) columns.

    Args:
        df: The frame to summarise. It is not modified.
    """
    print('Exploring field values summary...')
    print(df.describe())

    print('\nExploring field types...')
    # DataFrame.info() writes its report itself and returns None, so wrapping
    # it in print() would append a stray "None" line to the output.
    df.info()

    print('\nExploring field correlation...')
    # Correlation is only defined for numeric data. Selecting those columns
    # explicitly keeps this working on pandas versions where corr() no longer
    # drops non-numeric columns (e.g. date strings) silently.
    print(df.select_dtypes(include=['number', 'bool']).corr())

def fix_col_types(df: DataFrame, has_hr: bool = False) -> DataFrame:
    """Return a copy of ``df`` with the discrete columns cast to categorical.

    The columns ``season``, ``holiday``, ``weekday`` and ``weathersit`` are
    always converted; ``hr`` is converted as well when ``has_hr`` is true.
    The list of converted columns and the resulting ``info()`` report are
    printed to stdout.

    Args:
        df: Source frame. It is left untouched so that re-running the calling
            cell always starts from the same input.
        has_hr: Whether ``df`` carries an ``hr`` column that should also be
            made categorical.

    Returns:
        A new DataFrame with the selected columns as ``category`` dtype.

    Raises:
        KeyError: If one of the columns to convert is missing from ``df``.
    """
    cols = ['season', 'holiday', 'weekday', 'weathersit']
    if has_hr:
        cols.append('hr')
    print('Converting the following columns to categorical...', cols)

    # astype() builds a new frame instead of assigning into the caller's one.
    converted = df.astype({col: 'category' for col in cols})

    # info() prints by itself and returns None; do not wrap it in print().
    converted.info()
    return converted

def split_columns(df: DataFrame, has_hr: bool = False) -> tuple:
    """Split ``df`` into the feature frame and the two target frames.

    Args:
        df: Frame holding the feature columns plus ``casual`` and
            ``registered``.
        has_hr: When true, ``hr`` is appended as the last feature column.

    Returns:
        ``(features, casual, registered)`` - three DataFrames; the two
        targets are single-column frames. Their shapes are printed as a
        side effect.
    """
    feature_names = ['season', 'holiday', 'weekday', 'weathersit', 'atemp', 'hum', 'windspeed']
    if has_hr:
        # Keep 'hr' at the end so the column order is stable.
        feature_names = feature_names + ['hr']

    features: DataFrame = df[feature_names]
    # Double brackets keep each target two-dimensional (n rows x 1 column).
    casual: DataFrame = df[['casual']]
    registered: DataFrame = df[['registered']]

    print('\nfeatures:', features.shape, ' casual:', casual.shape, ' registered:', registered.shape)
    return features, casual, registered

def mape(actual: DataFrame, predicted: DataFrame) -> float:
    """Mean absolute percentage error, expressed in percent.

    Both inputs are flattened to one-dimensional float arrays, so a
    single-column DataFrame, a Series or an ``(n, 1)`` / ``(n,)`` array are
    all accepted. Rows whose actual value is zero (the percentage error is
    undefined there) or where either value is NaN are left out of the mean.

    Args:
        actual: Observed target values.
        predicted: Predicted values, with the same number of elements.

    Returns:
        The MAPE as a plain Python ``float``; ``nan`` when no row is usable.

    Raises:
        ValueError: If the two inputs do not hold the same number of values.
    """
    y_true = np.asarray(actual, dtype=float).ravel()
    y_pred = np.asarray(predicted, dtype=float).ravel()
    if y_true.shape != y_pred.shape:
        raise ValueError(
            'actual and predicted must have the same number of values, got %d and %d'
            % (y_true.size, y_pred.size)
        )

    # Filter before dividing so no division by zero is ever evaluated.
    usable = (y_true != 0) & ~np.isnan(y_true) & ~np.isnan(y_pred)
    if not usable.any():
        return float('nan')

    # Divide by |actual| so that negative targets still give a positive error.
    ratios = np.abs(y_true[usable] - y_pred[usable]) / np.abs(y_true[usable])
    return float(ratios.mean() * 100)

def run_regr_for_model(_model, _x_train: DataFrame, _x_test: DataFrame, _y_train: DataFrame, _y_test: DataFrame) -> None:
    """Fit ``_model`` on the training split and report its test-set errors.

    The estimator is fitted on ``(_x_train, _y_train)``, used to predict
    ``_x_test``, and the mean squared error and MAPE against ``_y_test`` are
    printed with two decimals. Nothing is returned.
    """
    fitted = _model.fit(_x_train, _y_train)
    predictions = fitted.predict(_x_test)

    mse_value = mean_squared_error(_y_test, predictions)
    print('MSE: %.2f' % mse_value)

    mape_value = mape(_y_test, predictions)
    # The percent sign is passed as a second print argument, so it is
    # separated from the number by a space.
    print('MAPE: %.2f' % mape_value, '%')

def run_all_models(x: DataFrame, y: DataFrame) -> None:
    """Evaluate every candidate regressor on one train/test split of ``x``/``y``.

    The data is split 80/20 with a fixed ``random_state`` so that both
    models see exactly the same rows; the split shapes are printed first,
    followed by the error report of a default ``LinearRegression`` and a
    default ``KNeighborsRegressor``.
    """
    split = train_test_split(x, y, test_size=0.2, random_state=1)
    x_train, x_test, y_train, y_test = split
    print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)

    # (banner, estimator class) pairs, evaluated in this order.
    candidates = (
        ('\nExecuting Linear Regression =>', LinearRegression),
        ('\nExecuting KNN Regression =>', KNeighborsRegressor),
    )
    for banner, estimator_cls in candidates:
        print(banner)
        run_regr_for_model(estimator_cls(), x_train, x_test, y_train, y_test)

## Day dataset
#### Load dataset & explore

In [113]:
# Read the day-level CSV (path is relative to the notebook's working
# directory), show its (rows, columns) shape, then print the exploratory
# summary produced by explore_df.
days: DataFrame = pd.read_csv('./day.csv')
print(days.shape)
explore_df(days)

(731, 16)
Exploring field values summary...
          instant      season          yr        mnth     holiday     weekday  \
count  731.000000  731.000000  731.000000  731.000000  731.000000  731.000000   
mean   366.000000    2.496580    0.500684    6.519836    0.028728    2.997264   
std    211.165812    1.110807    0.500342    3.451913    0.167155    2.004787   
min      1.000000    1.000000    0.000000    1.000000    0.000000    0.000000   
25%    183.500000    2.000000    0.000000    4.000000    0.000000    1.000000   
50%    366.000000    3.000000    1.000000    7.000000    0.000000    3.000000   
75%    548.500000    3.000000    1.000000   10.000000    0.000000    5.000000   
max    731.000000    4.000000    1.000000   12.000000    1.000000    6.000000   

       workingday  weathersit        temp       atemp         hum   windspeed  \
count  731.000000  731.000000  731.000000  731.000000  731.000000  731.000000   
mean     0.683995    1.395349    0.495385    0.474354    0.62789

#### Fix datatype of columns & split the data into feature-set & target variables

In [117]:
# Rebind `days` to the frame returned by fix_col_types, in which season,
# holiday, weekday and weathersit are categorical (no 'hr' column is
# requested for the day-level data).
days = fix_col_types(days)
# Splitting the data into feature-set & target variables
d_features, d_casual, d_registered = split_columns(days)

Converting the following columns to categorical... ['season', 'holiday', 'weekday', 'weathersit']
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 731 entries, 0 to 730
Data columns (total 16 columns):
instant       731 non-null int64
dteday        731 non-null object
season        731 non-null category
yr            731 non-null int64
mnth          731 non-null int64
holiday       731 non-null category
weekday       731 non-null category
workingday    731 non-null int64
weathersit    731 non-null category
temp          731 non-null float64
atemp         731 non-null float64
hum           731 non-null float64
windspeed     731 non-null float64
casual        731 non-null int64
registered    731 non-null int64
cnt           731 non-null int64
dtypes: category(4), float64(4), int64(7), object(1)
memory usage: 69.2+ KB
None

features: (731, 7)  casual: (731, 1)  registered: (731, 1)


#### Running models for 'casual' target variable

In [118]:
# Day-level data, 'casual' target: evaluate both regressors on one 80/20 split.
run_all_models(d_features, d_casual)

(584, 7) (147, 7) (584, 1) (147, 1)

Executing Linear Regression =>
MSE: 240679.10
MAPE: 99.35 %

Executing KNN Regression =>
MSE: 183438.66
MAPE: 81.81 %


#### Running models for 'registered' target variable

In [119]:
# Day-level data, 'registered' target: evaluate both regressors on one 80/20 split.
run_all_models(d_features, d_registered)

(584, 7) (147, 7) (584, 1) (147, 1)

Executing Linear Regression =>
MSE: 1345423.08
MAPE: 42.10 %

Executing KNN Regression =>
MSE: 1494425.71
MAPE: 44.86 %


## Hour dataset
#### Load dataset & explore

In [120]:
# Read the hour-level CSV (path is relative to the notebook's working
# directory), show its (rows, columns) shape, then print the exploratory
# summary produced by explore_df.
hours: DataFrame = pd.read_csv('./hour.csv')
print(hours.shape)
explore_df(hours)

(17379, 17)
Exploring field values summary...
          instant        season            yr          mnth            hr  \
count  17379.0000  17379.000000  17379.000000  17379.000000  17379.000000   
mean    8690.0000      2.501640      0.502561      6.537775     11.546752   
std     5017.0295      1.106918      0.500008      3.438776      6.914405   
min        1.0000      1.000000      0.000000      1.000000      0.000000   
25%     4345.5000      2.000000      0.000000      4.000000      6.000000   
50%     8690.0000      3.000000      1.000000      7.000000     12.000000   
75%    13034.5000      3.000000      1.000000     10.000000     18.000000   
max    17379.0000      4.000000      1.000000     12.000000     23.000000   

            holiday       weekday    workingday    weathersit          temp  \
count  17379.000000  17379.000000  17379.000000  17379.000000  17379.000000   
mean       0.028770      3.003683      0.682721      1.425283      0.496987   
std        0.167165    

#### Fix datatype of columns & split the data into feature-set & target variables

In [121]:
# Rebind `hours` to the frame returned by fix_col_types; passing True for
# has_hr makes the 'hr' column categorical as well.
hours = fix_col_types(hours, True)
# Splitting the data into feature-set & target variables
# (has_hr=True adds 'hr' as the last feature column)
h_features, h_casual, h_registered = split_columns(hours, True)

Converting the following columns to categorical... ['season', 'holiday', 'weekday', 'weathersit', 'hr']
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17379 entries, 0 to 17378
Data columns (total 17 columns):
instant       17379 non-null int64
dteday        17379 non-null object
season        17379 non-null category
yr            17379 non-null int64
mnth          17379 non-null int64
hr            17379 non-null category
holiday       17379 non-null category
weekday       17379 non-null category
workingday    17379 non-null int64
weathersit    17379 non-null category
temp          17379 non-null float64
atemp         17379 non-null float64
hum           17379 non-null float64
windspeed     17379 non-null float64
casual        17379 non-null int64
registered    17379 non-null int64
cnt           17379 non-null int64
dtypes: category(5), float64(4), int64(7), object(1)
memory usage: 1.6+ MB
None

features: (17379, 8)  casual: (17379, 1)  registered: (17379, 1)


#### Running models for 'casual' target variable

In [122]:
# Hour-level data, 'casual' target: evaluate both regressors on one 80/20 split.
run_all_models(h_features, h_casual)

(13903, 8) (3476, 8) (13903, 1) (3476, 1)

Executing Linear Regression =>
MSE: 1640.89
MAPE: 284.98 %

Executing KNN Regression =>
MSE: 655.70
MAPE: 87.57 %


#### Running models for 'registered' target variable

In [123]:
# Hour-level data, 'registered' target: evaluate both regressors on one 80/20 split.
run_all_models(h_features, h_registered)


(13903, 8) (3476, 8) (13903, 1) (3476, 1)

Executing Linear Regression =>
MSE: 16774.86
MAPE: 334.86 %

Executing KNN Regression =>
MSE: 5181.20
MAPE: 51.09 %
