In [25]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, roc_curve, auc, precision_recall_curve
from sklearn.pipeline import Pipeline
from datetime import datetime

In [26]:
df=pd.read_csv('/Users/zabihbuda/Desktop/York Universiy files/Data Science /schulich_data_science_1/bikes_sharing.csv')
df.head()

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0,3,13,16
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0,8,32,40
2,2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0,5,27,32
3,2011-01-01 03:00:00,1,0,0,1,9.84,14.395,75,0.0,3,10,13
4,2011-01-01 04:00:00,1,0,0,1,9.84,14.395,75,0.0,0,1,1


In [27]:
df.isnull().sum()

datetime      0
season        0
holiday       0
workingday    0
weather       0
temp          0
atemp         0
humidity      0
windspeed     0
casual        0
registered    0
count         0
dtype: int64

## Data Engineering


In [28]:
df['datetime']=pd.to_datetime(df['datetime'])
df['year']=df['datetime'].dt.year
df['month']=df['datetime'].dt.month
df['week_days']=df['datetime'].dt.dayofweek
df['count_cat']=df['count'].apply(lambda x: 'low' if 0< x<300 else 'medium' if 300< x < 600 else 'high' )

In [29]:
df.describe()

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count,year,month,week_days
count,10886,10886.0,10886.0,10886.0,10886.0,10886.0,10886.0,10886.0,10886.0,10886.0,10886.0,10886.0,10886.0,10886.0,10886.0
mean,2011-12-27 05:56:22.399411968,2.506614,0.028569,0.680875,1.418427,20.23086,23.655084,61.88646,12.799395,36.021955,155.552177,191.574132,2011.501929,6.521495,3.013963
min,2011-01-01 00:00:00,1.0,0.0,0.0,1.0,0.82,0.76,0.0,0.0,0.0,0.0,1.0,2011.0,1.0,0.0
25%,2011-07-02 07:15:00,2.0,0.0,0.0,1.0,13.94,16.665,47.0,7.0015,4.0,36.0,42.0,2011.0,4.0,1.0
50%,2012-01-01 20:30:00,3.0,0.0,1.0,1.0,20.5,24.24,62.0,12.998,17.0,118.0,145.0,2012.0,7.0,3.0
75%,2012-07-01 12:45:00,4.0,0.0,1.0,2.0,26.24,31.06,77.0,16.9979,49.0,222.0,284.0,2012.0,10.0,5.0
max,2012-12-19 23:00:00,4.0,1.0,1.0,4.0,41.0,45.455,100.0,56.9969,367.0,886.0,977.0,2012.0,12.0,6.0
std,,1.116174,0.166599,0.466159,0.633839,7.79159,8.474601,19.245033,8.164537,49.960477,151.039033,181.144454,0.500019,3.444373,2.004585


In [30]:
df['weather'].value_counts()

weather
1    7192
2    2834
3     859
4       1
Name: count, dtype: int64

In [31]:
df=df[df['weather']!=4]
df.head()

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count,year,month,week_days,count_cat
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0,3,13,16,2011,1,5,low
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0,8,32,40,2011,1,5,low
2,2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0,5,27,32,2011,1,5,low
3,2011-01-01 03:00:00,1,0,0,1,9.84,14.395,75,0.0,3,10,13,2011,1,5,low
4,2011-01-01 04:00:00,1,0,0,1,9.84,14.395,75,0.0,0,1,1,2011,1,5,low


In [32]:
df['windspeed'].value_counts()

windspeed
0.0000     1313
8.9981     1120
11.0014    1057
12.9980    1042
7.0015     1034
15.0013     961
6.0032      871
16.9979     824
19.0012     676
19.9995     492
22.0028     372
23.9994     274
26.0027     235
27.9993     187
30.0026     111
31.0009      89
32.9975      80
35.0008      58
39.0007      27
36.9974      22
43.0006      12
40.9973      11
43.9989       8
46.0022       3
56.9969       2
47.9988       2
51.9987       1
50.0021       1
Name: count, dtype: int64

In [33]:
df=df[df['windspeed']<35]
df.head()

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count,year,month,week_days,count_cat
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0,3,13,16,2011,1,5,low
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0,8,32,40,2011,1,5,low
2,2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0,5,27,32,2011,1,5,low
3,2011-01-01 03:00:00,1,0,0,1,9.84,14.395,75,0.0,3,10,13,2011,1,5,low
4,2011-01-01 04:00:00,1,0,0,1,9.84,14.395,75,0.0,0,1,1,2011,1,5,low


In [34]:
df=df.drop(['count', 'datetime'], axis=1)
df.head()

Unnamed: 0,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,year,month,week_days,count_cat
0,1,0,0,1,9.84,14.395,81,0.0,3,13,2011,1,5,low
1,1,0,0,1,9.02,13.635,80,0.0,8,32,2011,1,5,low
2,1,0,0,1,9.02,13.635,80,0.0,5,27,2011,1,5,low
3,1,0,0,1,9.84,14.395,75,0.0,3,10,2011,1,5,low
4,1,0,0,1,9.84,14.395,75,0.0,0,1,2011,1,5,low


In [35]:
df.columns

Index(['season', 'holiday', 'workingday', 'weather', 'temp', 'atemp',
       'humidity', 'windspeed', 'casual', 'registered', 'year', 'month',
       'week_days', 'count_cat'],
      dtype='object')

In [37]:

categoricals=['season', 'holiday', 'workingday', 'weather', 'month','week_days']
numericals=['temp', 'atemp','humidity', 'windspeed', 'casual', 'registered']
target='count_cat'

In [39]:
# building pipeline
cat_transformer=OneHotEncoder(handle_unknown='ignore')
num_transformer=StandardScaler()
preprocessor=ColumnTransformer(transformers=[('cat', cat_transformer, categoricals), 
                                             ('num', num_transformer, numericals)])

In [40]:
X_train, X_test, y_train, y_test=train_test_split(df[categoricals + numericals], df[target], test_size=0.2, random_state=0)

In [41]:
X_train_transformed=preprocessor.fit_transform(X_train)
X_test_transformed=preprocessor.fit_transform(X_test)

In [42]:
X_train_transformed.shape

(8590, 36)