# Dependencies

In [1]:
# Uncomment to install dependencies
! pip install sklearn
! pip install pandas
! pip install xgboost

Collecting sklearn
  Using cached sklearn-0.0.tar.gz (1.1 kB)
Collecting scikit-learn
  Using cached scikit_learn-1.0.2-cp37-cp37m-macosx_10_13_x86_64.whl (7.8 MB)
Collecting threadpoolctl>=2.0.0
  Using cached threadpoolctl-3.1.0-py3-none-any.whl (14 kB)
Collecting joblib>=0.11
  Using cached joblib-1.1.0-py2.py3-none-any.whl (306 kB)
Collecting numpy>=1.14.6
  Using cached numpy-1.21.6-cp37-cp37m-macosx_10_9_x86_64.whl (16.9 MB)
Collecting scipy>=1.1.0
  Using cached scipy-1.7.3-cp37-cp37m-macosx_10_9_x86_64.whl (33.0 MB)
Using legacy setup.py install for sklearn, since package 'wheel' is not installed.
Installing collected packages: threadpoolctl, joblib, numpy, scipy, scikit-learn, sklearn
    Running setup.py install for sklearn ... [?25ldone
[?25hSuccessfully installed joblib-1.1.0 numpy-1.21.6 scikit-learn-1.0.2 scipy-1.7.3 sklearn-0.0 threadpoolctl-3.1.0
You should consider upgrading via the '/Users/abdurrafeymasood/Desktop/Coding/personal-projects/Bike-Sharing-Demand-Predict

# Imports

In [2]:
import pandas as pd

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.neighbors import KNeighborsClassifier

import xgboost as xgb

In [65]:
data_target = pd.read_csv('../../data/train.csv')
data_target.head()

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0,3,13,16
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0,8,32,40
2,2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0,5,27,32
3,2011-01-01 03:00:00,1,0,0,1,9.84,14.395,75,0.0,3,10,13
4,2011-01-01 04:00:00,1,0,0,1,9.84,14.395,75,0.0,0,1,1


# Data Processing

## Filtering

In [66]:
data_target['datetime'] = pd.to_datetime(
    data_target['datetime'],
    format='%Y-%m-%d %H:%M:%S', 
    errors = "coerce"
)

In [67]:
data_target.insert(1, 'year', data_target['datetime'].dt.year)
data_target.insert(2, 'month', data_target['datetime'].dt.month)
data_target.insert(3, 'day', data_target['datetime'].dt.day)
data_target.insert(4, 'hour', data_target['datetime'].dt.hour)
data_target.insert(5, 'second', data_target['datetime'].dt.second)

In [68]:
data_target.drop(columns=['casual', 'registered', 'datetime'], inplace=True)

In [69]:
data_target.head()

Unnamed: 0,year,month,day,hour,second,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,count
0,2011,1,1,0,0,1,0,0,1,9.84,14.395,81,0.0,16
1,2011,1,1,1,0,1,0,0,1,9.02,13.635,80,0.0,40
2,2011,1,1,2,0,1,0,0,1,9.02,13.635,80,0.0,32
3,2011,1,1,3,0,1,0,0,1,9.84,14.395,75,0.0,13
4,2011,1,1,4,0,1,0,0,1,9.84,14.395,75,0.0,1


## Train Test Split

In [70]:
X = data_target.loc[:, data_target.columns != 'count']
y = data_target['count']

In [71]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Testing Models

## Random Forest

In [72]:
clf = RandomForestClassifier(random_state=0, n_estimators=100, max_depth=2)
clf.fit(X_train, y_train)


In [73]:
print(f'R2: {r2_score(y_test, clf.predict(X_test))}')
print(f'MSE: {mean_squared_error(y_test, clf.predict(X_test))}')

R2: -1.1044383502355428
MSE: 69961.51101928375


## Linear Regression

In [74]:
clf = LinearRegression()
clf.fit(X_train, y_train)

In [75]:
print(f'R2: {r2_score(y_test, clf.predict(X_test))}')
print(f'MSE: {mean_squared_error(y_test, clf.predict(X_test))}')

R2: 0.3743724594970941
MSE: 20798.82647261319


## KNN

In [76]:
clf = KNeighborsClassifier(n_neighbors=2)
clf.fit(X_train, y_train)

In [77]:
print(f'R2: {r2_score(y_test, clf.predict(X_test))}')
print(f'MSE: {mean_squared_error(y_test, clf.predict(X_test))}')

R2: 0.350705869525197
MSE: 21585.616161616163


## XGBoost

In [145]:
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

In [146]:
param = {
    'max_depth':5, 
    'eta':0.8, 
    'objective':'reg:squaredlogerror' 
}

num_round = 100

clf = xgb.train(param, dtrain, num_round)

In [147]:
predictions = clf.predict(dtest)

In [148]:
print(f'R2: {r2_score(y_test, predictions)}')
print(f'MSE: {mean_squared_error(y_test, predictions)}')

R2: 0.25508943143947693
MSE: 24764.3600226611
