# Homework 10 Programming

## Task A: Data Manipulation

#### Import data

In [55]:
import pandas as pd
import numpy as np

In [56]:
df = pd.read_csv('DC_Bike_Rentals.csv')

df.head()

Unnamed: 0,hour,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,count
0,0,1,0,0,1,9.84,14.395,81,0.0,16
1,1,1,0,0,1,9.02,13.635,80,0.0,40
2,2,1,0,0,1,9.02,13.635,80,0.0,32
3,3,1,0,0,1,9.84,14.395,75,0.0,13
4,4,1,0,0,1,9.84,14.395,75,0.0,1


#### Show data types

In [57]:
df.dtypes

hour            int64
season          int64
holiday         int64
workingday      int64
weather         int64
temp          float64
atemp         float64
humidity        int64
windspeed     float64
count           int64
dtype: object

#### Seperate into predictors and target

In [58]:
y, X = df['count'], df.loc[:, df.columns != 'count']

#### Split into test and training datasets

In [59]:
#import module
from sklearn.model_selection import train_test_split

#split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.4, random_state=42)


In [60]:
#show results
X_train.shape

(6531, 9)

In [61]:
y_train.shape

(6531,)

In [62]:
X_test.shape

(4355, 9)

In [63]:
y_test.shape

(4355,)

## Task B: Predictive Modeling

### Normalize data

In [64]:
#import module
from sklearn.preprocessing import MinMaxScaler

#fit to training set
scaler = MinMaxScaler().fit(X_train)

In [65]:
# Transform training X
train_X_scale = scaler.transform(X_train)
train_X_scale = pd.DataFrame(train_X_scale)
train_X_scale.columns = X_train.columns

train_X_scale.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
hour,6531.0,0.499764,0.301422,0.0,0.217391,0.521739,0.73913,1.0
season,6531.0,0.502475,0.371898,0.0,0.333333,0.666667,1.0,1.0
holiday,6531.0,0.029398,0.168933,0.0,0.0,0.0,0.0,1.0
workingday,6531.0,0.686112,0.464107,0.0,0.0,1.0,1.0,1.0
weather,6531.0,0.138008,0.209136,0.0,0.0,0.0,0.333333,1.0
temp,6531.0,0.483818,0.194515,0.0,0.326531,0.489796,0.632653,1.0
atemp,6531.0,0.512886,0.190157,0.0,0.355856,0.525338,0.677928,1.0
humidity,6531.0,0.618331,0.191719,0.0,0.47,0.62,0.77,1.0
windspeed,6531.0,0.22455,0.143902,0.0,0.12284,0.228047,0.298225,1.0


In [66]:
# Transform test X
test_X_scale = scaler.transform(X_test)
test_X_scale = pd.DataFrame(test_X_scale)
test_X_scale.columns = X_test.columns

test_X_scale.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
hour,4355.0,0.504877,0.299593,0.0,0.26087,0.521739,0.782609,1.0
season,4355.0,0.501799,0.37234,0.0,0.333333,0.666667,1.0,1.0
holiday,4355.0,0.027325,0.163047,0.0,0.0,0.0,0.0,1.0
workingday,4355.0,0.67302,0.469164,0.0,0.0,1.0,1.0,1.0
weather,4355.0,0.141676,0.214459,0.0,0.0,0.0,0.333333,0.666667
temp,4355.0,0.482017,0.193034,0.0,0.326531,0.489796,0.632653,0.959184
atemp,4355.0,0.5113,0.188803,0.0,0.355856,0.525338,0.677928,0.982996
humidity,4355.0,0.619665,0.193562,0.0,0.47,0.62,0.78,1.0
windspeed,4355.0,0.224583,0.142271,0.0,0.12284,0.228047,0.298225,1.0


### k-NN Prediction

First, a prediction with default parameters will set a baseline to compare to the tuned model

In [67]:
#import regression module
from sklearn.neighbors import KNeighborsClassifier

# KNN: K=5, default distance measurement
knn = KNeighborsClassifier()

#fit data
knn.fit(train_X_scale, y_train)

In [68]:
#import rmse module
from sklearn.metrics import mean_squared_error

#predict data
pred_knn = knn.predict(test_X_scale)

#compare values
print("Root mean squared error is: " + str(mean_squared_error(y_test, pred_knn, squared=False)))

Root mean squared error is: 143.52854958682707


### Hyperparameter Tuning

now, the neighbors parameter needs to be tuned, and accuracy statistics need to be printed out to compare for each neighbor

In [73]:
#import other accuracy modules
from sklearn.metrics import mean_absolute_error, r2_score

#change hyperparameters, show accuracy statistics
for k in range(15):
    k = k + 1
    knn = KNeighborsClassifier(n_neighbors = k, 
                                         weights='uniform', 
                                         algorithm='auto')
    knn.fit(train_X_scale, y_train)
    pred_y = knn.predict(test_X_scale)
    print("the number of neighbors is " + str(k))
    print("Root mean squared error is: " + str(mean_squared_error(y_test, pred_y, squared=False)))
    print("Mean absolute error is: " + str(mean_absolute_error(y_test, pred_y)))
    print("R-squared score is: " + str(r2_score(y_test, pred_y)))
    print("")
    

the number of neighbors is 1
Root mean squared error is: 114.89294472927773
Mean absolute error is: 72.70470723306545
R-squared score is: 0.5915303133539932

the number of neighbors is 2
Root mean squared error is: 113.96718261904037
Mean absolute error is: 70.99586681974742
R-squared score is: 0.5980863692571262

the number of neighbors is 3
Root mean squared error is: 126.18865653412547
Mean absolute error is: 78.40642939150402
R-squared score is: 0.5072646232743265

the number of neighbors is 4
Root mean squared error is: 135.51223651646595
Mean absolute error is: 84.801607347876
R-squared score is: 0.431762178148158

the number of neighbors is 5
Root mean squared error is: 143.52854958682707
Mean absolute error is: 90.67554535017221
R-squared score is: 0.3625447485710279

the number of neighbors is 6
Root mean squared error is: 149.01211240730484
Mean absolute error is: 95.07256027554536
R-squared score is: 0.31290584571584734

the number of neighbors is 7
Root mean squared error i