# STILL IN PROGRESS... estimated completion date: 04/20/2023

# BIKE SHARING DEMAND

__Data Fields__ <br>
__datetime__ - hourly date + timestamp <br>
__season__ -  1 = spring, 2 = summer, 3 = fall, 4 = winter <br> 
__holiday__ - whether the day is considered a holiday <br>
__workingday__ - whether the day is neither a weekend nor holiday <br>
__weather__ - 1: Clear, Few clouds, Partly cloudy <br>
2: Mist + Cloudy, Mist + Broken clouds, Mist + Few clouds, Mist <br>
3: Light Snow, Light Rain + Thunderstorm + Scattered clouds, Light Rain + Scattered clouds <br>
4: Heavy Rain + Ice Pellets + Thunderstorm + Mist, Snow + Fog <br>
__temp__ - temperature in Celsius <br>
__atemp__ - "feels like" temperature in Celsius <br>
__humidity__ - relative humidity <br>
__windspeed__ - wind speed <br>
__casual__ - number of non-registered user rentals initiated <br>
__registered__ - number of registered user rentals initiated <br>
__count__ - number of total rentals

In [None]:
# importing useful libraries

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib as mtl
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore') 
%matplotlib inline

In [None]:
# Custom colour palettes (hex codes taken from http://colorbrewer2.org/)

# single-colour palettes, kept as lists so they can be concatenated (e.g. blue + ach)
blue = ['#4393c3']
ach = ['#4d4d4d']

# two 10-colour palettes
div1 = [
    '#543005', '#8c510a', '#bf812d', '#dfc27d', '#f6e8c3',
    '#c7eae5', '#80cdc1', '#35978f', '#01665e', '#003c30',
]
div2 = [
    '#67001f', '#b2182b', '#d6604d', '#f4a582', '#fddbc7',
    '#e0e0e0', '#bababa', '#878787', '#4d4d4d', '#1a1a1a',
]

In [None]:
# Read the training data; the 'datetime' column is parsed into timestamps on load

bikedf = pd.read_csv(
    './dataset/train.csv',
    sep=r',',
    parse_dates=['datetime'],
)
bikedf.head()

In [None]:
# accessing information about our train dataset
# (DataFrame.info() prints a summary of the frame's columns and their dtypes)

bikedf.info()

### Dealing with datetime column
Converting the datetime column into its components `[year, month, day, dayofweek, hour]`

In [None]:
def datetransform(df):
    """ Function Name: datetransform
            This function returns a new dataframe in which the datetime column
            is replaced by 5 separate components [year, month, day, dayofweek, hour],
            placed in front of the remaining columns.

            df : dataframe with a datetime-typed 'datetime' column.
                 It is NOT modified, so the function can be called again
                 on the same input (e.g. when a notebook cell is re-run).

            Returns the transformed dataframe.
    """

    stamps = df['datetime'].dt

    dtime_df = pd.DataFrame({
        'year': stamps.year,
        'month': stamps.month,
        'day': stamps.day,  # day on the calendar
        'dayofweek': stamps.dayofweek,
        'hour': stamps.hour,
    })

    # drop() without inplace returns a new frame and leaves the caller's data intact
    remaining = df.drop(columns='datetime')

    # both frames share the input's index, so a column-wise concat lines them up row by row
    transformed_df = pd.concat([dtime_df, remaining], axis=1)
    return transformed_df

In [None]:
# transforming datetime column in dataset: `bikedf` is rebound to the returned
# frame, which carries year/month/day/dayofweek/hour instead of 'datetime'

bikedf = datetransform(bikedf)
bikedf.head()

### DATA EXPLORATION AND PREPROCESSING

In [None]:
# Work on a copy so that changes made to `eda_df` do not alter `bikedf`
eda_df = bikedf.copy()

# Human-readable descriptions of the raw input columns
cols_description = [
    'hourly date and timestamp',
    'Current Season',
    'Day is holiday or not',
    'Day is working day or not',
    'Current weather',
    'Temperature in Celsius',
    'Feels like temperature in Celsius',
    'Relative humidity',
    'Wind speed',
    'Number of non-registered user rentals',
    'Number of registed user rentals',
    'Total number of rentals',
]

# The feature groups are taken by POSITION from the column list:
#   first 9 columns -> categorical, last 3 -> targets, everything in between -> continuous
all_columns = eda_df.columns.to_list()
cat_cols = all_columns[:9]      # expected: ['year', 'month', 'day', 'dayofweek', 'hour', 'season', 'holiday', 'workingday', 'weather']
cont_cols = all_columns[9:-3]   # expected: ['temp', 'atemp', 'humidity', 'windspeed']
targets = all_columns[-3:]      # expected: casual, registered, count*

print(f'Continuous Features: {cont_cols}')
print(f'Categorical features: {cat_cols}')
print(f'Target features: {targets}')

In [None]:
# Heatmap of the pairwise correlations among the continuous features and the targets

numcols = cont_cols + targets
corr_mat = eda_df[numcols].corr().round(3)  # rounded to 3 decimals for the annotations
plt.figure(figsize=(10, 7))
sns.heatmap(corr_mat, annot=True)
plt.show()  # interesting features: temp, humidity, casual*, registered*
plt.clf()

In [None]:
# Heatmap of the pairwise correlations among the categorical features and the targets

catcols = cat_cols + targets
corr_mat = eda_df[catcols].corr().round(3)  # rounded to 3 decimals for the annotations
plt.figure(figsize=(12, 7))
sns.heatmap(corr_mat, annot=True)
plt.show()  # interesting features: year, hour, season, weather
plt.clf()

In [None]:
# # converting binary* description to actual representation

# months = {1:'Jan', 2:'Feb', 3:'Mar', 4:'Apr', 5:'May', 6:'Jun', 7:'Jul', 8:'Aug', 9:'Sep', 10:'Oct', 11:'Nov', 12:'Dec'}
# weekdays = {0:'Mon', 1:'Tue', 2:'Wed', 3:'Thu', 4:'Fri', 5:'Sat', 6:'Sun'}
# seasons = {1:'spring', 2:'summer', 3:'fall', 4:'winter'}
# weather = {1:'clear', 2:'cloudy', 3:'light rain', 4:'snowy'}
# eda_df['month'] = eda_df['month'].map(months)
# eda_df['dayofweek'] = eda_df['dayofweek'].map(weekdays)
# eda_df['season'] = eda_df['season'].map(seasons)
# eda_df['weather'] = eda_df['weather'].map(weather)

# eda_df.head()

In [None]:
# Per-season min / mean / std / max of each of ['temp', 'atemp', 'humidity', 'windspeed']

summary_stats = ['min', 'mean', 'std', 'max']
by_season = eda_df.groupby('season')
for feature in cont_cols:
    print(by_season[[feature]].agg(summary_stats))
    print()  # blank line between the tables

In [None]:
# Per-year min / mean / std / max of each of ['temp', 'atemp', 'humidity', 'windspeed']

summary_stats = ['min', 'mean', 'std', 'max']
by_year = eda_df.groupby('year')
for feature in cont_cols:
    print(by_year[[feature]].agg(summary_stats))
    print()  # blank line between the tables

In [None]:
# distribution of casual, registered, and count (one histogram + KDE curve per target)

fig, axs = plt.subplots(nrows=1, ncols=3, figsize=(15,8))
# pair every target with its own axis instead of indexing both sequences by position
for target, ax in zip(targets, axs):
    sns.histplot(data=eda_df, bins=15, x=target, ax=ax, kde=True)
fig.suptitle('Distribution of Number of Bike rentals')  # title typo ("retals") fixed
plt.show()
plt.clf()

In [None]:
# Scatter plots of every continuous feature against the first two targets
# (targets[0] on the left axis, targets[1] on the right axis)

for feature in cont_cols:
    fig, (left_ax, right_ax) = plt.subplots(nrows=1, ncols=2, figsize=(15, 8))
    sns.scatterplot(data=eda_df, x=feature, y=targets[0], ax=left_ax)
    sns.scatterplot(data=eda_df, x=feature, y=targets[1], ax=right_ax, color=ach)
    fig.suptitle(f'Scatter plot of {feature.title()}')
    plt.show()
plt.clf()

In [None]:
# Bar charts of selected categorical features, split by year,
# to explore the trend of bike rental in both years (2011 and 2012)

features_of_interest = ['month', 'hour', 'season', 'weather', 'holiday']
year_palette = blue + ach  # one colour per year
for feature in features_of_interest:
    fig, axs = plt.subplots(nrows=2, ncols=1, figsize=(15, 10))
    # upper axis shows targets[0], lower axis shows targets[1]
    for target, ax in zip(targets[:2], axs):
        sns.barplot(data=eda_df, x=feature, y=target, hue='year',
                    palette=year_palette, ax=ax, errorbar=None)
    fig.suptitle(f'Trend of bike rentals per {feature.title()}')
    plt.show()
plt.clf()

In [None]:
# trend of bike sharing during the days of the week when its a holiday
# (upper axis: targets[0], lower axis: targets[1]; bars are split by the 'holiday' flag)

fig, axs = plt.subplots(nrows=2, ncols=1, figsize=(15,10))
sns.barplot(data=eda_df, x='dayofweek', y=targets[0], hue='holiday', palette=blue+ach, ax=axs[0], errorbar=None)
sns.barplot(data=eda_df, x='dayofweek', y=targets[1], hue='holiday', palette=blue+ach, ax=axs[1], errorbar=None)
# The title is written out explicitly. It used to be built from `feature`, a loop
# variable left over from an earlier cell, so it depended on cell execution order
# and did not name the column actually plotted on the x axis.
fig.suptitle('Trend of bike rentals per day of week (holiday vs. non-holiday)')
plt.show()
plt.clf()

In [None]:
# trend of bike sharing during the days of the week when its a working day
# (upper axis: targets[0], lower axis: targets[1]; bars are split by the 'workingday' flag)

fig, axs = plt.subplots(nrows=2, ncols=1, figsize=(15,10))
sns.barplot(data=eda_df, x='dayofweek', y=targets[0], hue='workingday', palette=blue+ach, ax=axs[0], errorbar=None)
sns.barplot(data=eda_df, x='dayofweek', y=targets[1], hue='workingday', palette=blue+ach, ax=axs[1], errorbar=None)
# The title is written out explicitly. It used to be built from `feature`, a loop
# variable left over from an earlier cell, so it depended on cell execution order
# and did not name the columns actually plotted.
fig.suptitle('Trend of bike rentals per day of week (working day vs. non-working day)')
plt.show()
plt.clf()

In [None]:
# Total casual and registered rentals for every (season, weather) combination

group_season_weather = (
    eda_df
    .groupby(['season', 'weather'])[['casual', 'registered']]
    .sum()
)
group_season_weather.plot.bar(figsize=(15, 8), color=blue + ach)
plt.legend()
plt.show()
plt.clf()

In [None]:
# investigating the trend of rent during day of week and time by casual and registered bikers
# seven colours: one line per value of 'dayofweek'
palette = ['#8c510a','#4d4d4d','#542788','#f4a582','#4393c3','#1a9850','#fee08b']

fig, axs = plt.subplots(nrows=2, ncols=1, figsize=(15,10))
sns.lineplot(data=eda_df, x='hour', y=targets[0], linewidth=2.5, palette=palette, hue='dayofweek', ax=axs[0], err_style=None)
sns.lineplot(data=eda_df, x='hour', y=targets[1], linewidth=2.5, palette=palette, hue='dayofweek', ax=axs[1], err_style=None)
# The title is written out explicitly. It used to be built from `feature`, a loop
# variable left over from an earlier cell, so it named an unrelated column.
fig.suptitle('Hourly bike rental given day of week')
plt.show()
plt.clf()

In [None]:
# Box plots of the first two targets for every hour of the day
hour = eda_df['hour']
hour_palette = div2 + div1

fig, axs = plt.subplots(nrows=2, ncols=1, sharex=True, figsize=(15, 10))
# upper axis shows targets[0], lower axis shows targets[1]
for target, ax in zip(targets[:2], axs):
    sns.boxplot(data=eda_df, x=hour, y=target, ax=ax, palette=hour_palette)
fig.suptitle('Box plot of each hour bike was rented')
plt.show()
plt.clf()

#### Data Quality Report
__Continuous Features__ <br>
```['temp', 'atemp', 'humidity', 'windspeed']```

In [None]:
# Description of continuous features
cont_description = ['Temperature in Celsius', 'Feels like temperature in Celsius',
                    'Relative humidity', 'Wind speed', 'Number of non-registered user rentals',
                    'Number of registed user rentals', 'Total number of rentals']

# Data quality report: one row per continuous feature.
# Rows are collected in a list and turned into a frame once, instead of
# concatenating a one-row frame on every iteration.
cont_rows = []

for description, feature in zip(cont_description, cont_cols):
    col = eda_df[feature]
    n_rows = col.shape[0]  # row count, missing values included
    cont_rows.append({
        'Features': feature,
        'Description': description,
        'Count': n_rows,
        # NOTE: stored as a fraction of rows (0-1); it is not multiplied by 100
        '% of Missing': col.isnull().sum() / n_rows,
        'Card.': col.nunique(),
        'Min #': col.min(),
        # quantile() asks for the quartile directly; describe()[4] / describe()[6]
        # were positional lookups on a label-indexed Series
        'Q1': col.quantile(0.25),
        'Median': col.median(),
        'Q3': col.quantile(0.75),
        'Max #': col.max(),
        'Std. Dev.': col.std(),
    })

continuous_features_dqr = pd.DataFrame(cont_rows)

continuous_features_dqr

__Categorical Features__ <br>
```['year', 'month', 'day', 'dayofweek', 'hour', 'season', 'holiday', 'workingday', 'weather']```

In [None]:
# Description of categorical features
cat_description = ['Year of rental', 'Month of rental', 'Day of rental', 'Day of week',
                   'Hour of day', 'Current Season', 'Day is holiday or not', 
                    'Day is working day or not', 'Current weather']

# Data quality report: one row per categorical feature.
# Rows are collected in a list and turned into a frame once, instead of
# concatenating a one-row frame on every iteration.
cat_rows = []

for description, feature in zip(cat_description, cat_cols):
    col = eda_df[feature]
    n_rows = col.shape[0]  # row count, missing values included

    # Series.mode() returns all tied modes in sorted order; position 0 is reported
    modes = col.mode(dropna=True)
    first_mode = modes.iloc[0] if not modes.empty else np.nan
    first_freq = (col == first_mode).sum()

    # 2nd mode = mode of what remains once the 1st-mode rows are removed.
    # A single-valued column leaves nothing behind, so NaN / 0 is reported
    # instead of failing on the empty result.
    rest_modes = col[col != first_mode].mode(dropna=True)
    second_mode = rest_modes.iloc[0] if not rest_modes.empty else np.nan
    second_freq = (col == second_mode).sum()

    cat_rows.append({
        'Features': feature,
        'Description': description,
        'Count': n_rows,
        # NOTE: stored as a fraction of rows (0-1); it is not multiplied by 100
        '% of Missing': col.isnull().sum() / n_rows,
        'Card.': col.nunique(),
        '1st Mode': first_mode,
        '1st Mode Freq.': first_freq,
        '1st Mode %': np.around(first_freq / n_rows * 100, 2),
        '2nd Mode': second_mode,
        '2nd Mode Freq.': second_freq,
        '2nd Mode %': np.around(second_freq / n_rows * 100, 2),
    })

categorical_features_dqr = pd.DataFrame(cat_rows)

categorical_features_dqr

### Dealing with Missing Values and Outliers

__Missing Values__

In [None]:
# checking for missing values in the eda dataset
# (one boolean per column: True if that column contains at least one null)
eda_df.isnull().any()

__Outliers__

In [None]:
# investigating and removing potential outliers in the dataset

def remove_outlier(df, windtresh=29, k=2):
    """ Function Name: remove outliers
            This function takes in a dataframe and returns a new one without
            the rows whose windspeed is above `windtresh`, and without the rows
            whose count lies below the k-th or above the (100-k)-th percentile
            of the remaining counts. The input dataframe is not modified.

            df        : dataframe with 'windspeed' and 'count' columns
            windtresh : highest windspeed that is kept (default 29)
            k         : percentage trimmed from each tail of count (default 2),
                        must satisfy 0 <= k <= 50

            Raises ValueError when k is outside [0, 50].
    """
    if not 0 <= k <= 50:
        raise ValueError(f'k must be between 0 and 50, got {k}')

    # At 30mph or, the wind makes cycling quite difficult,
    # even for the more experienced cyclist.
    # NOTE(review): the data description gives no unit for windspeed - confirm it is mph.
    # Rows with a missing windspeed compare False and are therefore kept.
    calm = df[~(df['windspeed'] > windtresh)]

    # Percentiles are computed AFTER the windspeed filter, on the surviving rows only
    counts = calm['count']
    low_cut = counts.quantile(k / 100)
    high_cut = counts.quantile(1 - k / 100)

    # One mask built on the same frame it is applied to, rather than dropping
    # by index label with masks computed before an earlier drop
    outside = (counts < low_cut) | (counts > high_cut)

    # copy() hands the caller an independent frame that is safe to assign into
    return calm[~outside].copy()

# apply the outlier filter and print the shapes before / after to see how many rows were removed
outlier_removed = remove_outlier(eda_df)
print(eda_df.shape, outlier_removed.shape)

In [None]:
""" UNCOMMENT ME """

# # visualizing the change in distribution for the continuous columns

# for feature in numcols:
#     fig, axs = plt.subplots(nrows=1, ncols=2, figsize=(15,8))
#     sns.histplot(data=eda_df, x=feature, bins=20, kde=True, ax=axs[0])
#     sns.histplot(data=outlier_removed, x=feature, bins=20, kde=True, ax=axs[1])
#     plt.show()
# plt.clf()


# # visualizing the change in variation for the categorical columns

# for feature in cat_cols:
#     fig, axs = plt.subplots(nrows=1, ncols=2, figsize=(15,8))
#     sns.boxplot(data=eda_df, x=feature, y=targets[2], ax=axs[0], palette=div2+div1)
#     sns.boxplot(data=outlier_removed, x=feature, y=targets[2], ax=axs[1], palette=div2+div1)
#     plt.show()
# plt.clf()

### Normalizing Continuous Features

In [None]:
def normalize(df, cols):
    """ Function Name: Normalize
            This function takes a dataframe and specified numerical
            columns and returns a copy of the dataframe in which those
            columns are min-max scaled to [0, 1]. The input dataframe
            is left untouched.

            df   : dataframe containing the columns to scale
            cols : iterable of column names to scale

            A column whose values are all identical has no range to scale
            by; it is set to 0.0 instead of becoming NaN (0 / 0).
    """
    scaled = df.copy()  # never write into the caller's frame

    for feature in cols:
        minval, maxval = scaled[feature].min(), scaled[feature].max()
        span = maxval - minval
        if span == 0:
            # constant column: avoid the 0 / 0 division
            scaled[feature] = 0.0
        else:
            scaled[feature] = (scaled[feature] - minval) / span

    return scaled

# scale the continuous columns of the outlier-free data; from here on
# `eda_df` refers to the frame returned by normalize()
eda_df = normalize(outlier_removed, cont_cols)
eda_df.head()

### Feature Selection and Transformation

__Selection__ <br>
`keep features: {atemp, humidity, year, hour, season, workingday, weather}`

In [None]:
# columns retained for modelling: the selected features plus the 'count' target
keep = ['year', 'hour', 'season', 'workingday', 'weather', 'atemp', 'humidity', 'count']
# copy() makes the selection an independent frame, so the column assignments
# done in later cells do not write into a slice of the wider frame
eda_df = eda_df[keep].copy()
eda_df.head()

__Transformation__

In [None]:
# creating dummies for categorical features

# columns to one-hot encode
dummy = ['year', 'hour', 'season', 'workingday', 'weather']
# cast to object dtype before encoding
eda_df[dummy] = eda_df[dummy].astype('object')
# drop_first=True omits the first level of every encoded column
eda_df = pd.get_dummies(eda_df, columns=dummy, drop_first=True)
eda_df.head()

In [None]:
# binning the target feature into 4 bins (1. using the iqr | 2. using equal width binning)
# 1: few; 2: okay; 3: enough; 4: a_lot

# independent copies, so that binning one of them leaves eda_df's counts as they are
# NOTE(review): in the code visible here only eda_bin is passed to bin_target;
# no IQR-based binning of eda_iqr appears - TODO confirm whether it is still planned
eda_iqr = eda_df.copy()
eda_bin = eda_df.copy()

def bin_target(df, target='count'):
    """ Function Name: bin_target
            This function replaces the target column of a dataframe with the
            label of the equal-width bin each value falls in. The range
            [min, max] of the column is cut into 4 bins of equal width
            (1: few; 2: okay; 3: enough; 4: a_lot).

            df     : dataframe holding the target column (modified in place)
            target : name of the numeric column to bin (default 'count')

            Returns the same dataframe object that was passed in.
    """
    values = df[target].to_numpy()
    if values.size == 0:
        # nothing to bin; np.min / np.max raise on an empty array
        return df

    lower, upper = np.min(values), np.max(values)
    inc = (upper - lower) / 4

    # The three inner bin edges are measured from the column minimum.
    # Comparing against bare multiples of `inc` (i.e. measuring from 0)
    # misplaces every edge whenever the minimum is not 0.
    edges = lower + inc * np.arange(1, 4)

    # right=True keeps the "value <= edge" convention: a value sitting exactly
    # on an edge belongs to the lower bin. digitize yields 0..3, labels are 1..4.
    df[target] = np.digitize(values, edges, right=True) + 1

    return df

# replace the 'count' column of eda_bin with its bin labels
eda_bin = bin_target(eda_bin, 'count')

### MODEL SELECTION AND EVALUATION
Data sets used: `eda_df, eda_bin, eda_iqr`

In [None]:
# importing useful libraries

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, f1_score, ConfusionMatrixDisplay, confusion_matrix, r2_score, mean_squared_log_error

In [None]:
# Regression on the 'count' column of eda_df: 70 % of the rows for training, the rest for testing
X = eda_df.drop(columns=['count'])
y = eda_df['count']
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, train_size=0.70, random_state=11)
print(Xtest.shape, Xtrain.shape)

# Sweep the tree depth from 15 to 20 and print, per depth, the score on the
# test split followed by the score on the training split
for n in range(15, 21):
    dtrmdl = DecisionTreeRegressor(criterion='squared_error', max_depth=n, random_state=11)
    dtrmdl.fit(Xtrain, ytrain)
    test_score = dtrmdl.score(Xtest, ytest)
    train_score = dtrmdl.score(Xtrain, ytrain)
    print(n, ':', test_score, train_score)

In [None]:
# Classification on the binned 'count' column of eda_bin: 70 % of the rows for training
X = eda_bin.drop(columns='count')
y = eda_bin['count']
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, train_size=.70, random_state=11)
print(Xtest.shape, Xtrain.shape)

# Sweep the tree depth from 15 to 20 and print the test accuracy for every depth
for n in range(15,21):
    dtclf = DecisionTreeClassifier(criterion='entropy', max_depth=n, random_state=11)
    dtclf.fit(Xtrain, ytrain)
    ypred = dtclf.predict(Xtest)
    print(n, ':', accuracy_score(ytest, ypred))

# confusion matrix of the predictions left over from the last loop iteration (max_depth=20)
# labels= pins the matrix to all four bin labels, so that it always lines up with
# the four display labels even when a bin is absent from ytest and ypred
bin_labels = [1, 2, 3, 4]
conf_metrix = confusion_matrix(ytest, ypred, labels=bin_labels)
conf_metrix_display = ConfusionMatrixDisplay(confusion_matrix=conf_metrix, display_labels=['few', 'okay', 'enough', 'alot'])
conf_metrix_display.plot()
plt.show()