## Linear Regression

## Part 1 - Data Preprocessing

### Importing the dataset

In [30]:
import pandas as pd
# Read the bike-sharing records from hour.csv into a pandas DataFrame
# and bind it to the variable `dataset`.
dataset = pd.read_csv(filepath_or_buffer='hour.csv')
# Preview the first five rows.
dataset.head(n=5)

Unnamed: 0,instant,dteday,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
0,1,2011-01-01,1,0,1,0,0,6,0,1,0.24,0.2879,0.81,0.0,3,13,16
1,2,2011-01-01,1,0,1,1,0,6,0,1,0.22,0.2727,0.8,0.0,8,32,40
2,3,2011-01-01,1,0,1,2,0,6,0,1,0.22,0.2727,0.8,0.0,5,27,32
3,4,2011-01-01,1,0,1,3,0,6,0,1,0.24,0.2879,0.75,0.0,3,10,13
4,5,2011-01-01,1,0,1,4,0,6,0,1,0.24,0.2879,0.75,0.0,0,1,1


In [31]:
# Data cleaning
#
# Builds the modelling frame in one non-mutating chain instead of several
# in-place edits. NOTE: `dataset` is rebound to the cleaned frame, so once
# this cell has run the raw columns (e.g. 'dteday') are gone; re-run the
# loading cell before re-running this one.

# Final column layout: the features first and the target ('total counts')
# last, so later cells can split inputs/output positionally with iloc.
order = ['season', 'yr', 'mnth', 'day', 'hr', 'holiday', 'weekday', 'workingday',
         'weathersit', 'temp', 'atemp', 'hum', 'windspeed', 'total counts']

dataset = (
    dataset
    # Parse the date column and keep the day of the month as a new feature
    # (year and month already exist as 'yr' and 'mnth').
    .assign(day=lambda frame: pd.to_datetime(frame['dteday']).dt.day)
    # Drop the row id, the raw date, and the casual/registered counts.
    .drop(columns=['instant', 'dteday', 'casual', 'registered'])
    # 'cnt' holds the total number of bikes rented in that hour.
    .rename(columns={'cnt': 'total counts'})
    # Reorder the columns.
    .loc[:, order]
)

In [32]:
# data points collected from a bike sharing system, one row per hour
# 14 columns: season, yr, mnth, day, hr, holiday, weekday, workingday, weathersit, temp, atemp, hum, windspeed, total counts
# independent variables (13): season, yr, mnth, day, hr, holiday, weekday, workingday, weathersit, temp, atemp, hum, windspeed
# dependent variable: total counts (total count of rented bikes per hour)

# season : season (1:spring, 2:summer, 3:fall, 4:winter)
# yr : year (0: 2011, 1:2012)
# mnth : month ( 1 to 12)
# day : day of the month (1 to 31), extracted from dteday
# hr : hour (0 to 23)
# holiday : whether the day is a holiday or not (extracted from http://dchr.dc.gov/page/holiday-schedule)
# weekday : day of the week
# workingday : 1 if the day is neither a weekend nor a holiday, otherwise 0.
# weathersit : 	
#             1: Clear, Few clouds, Partly cloudy, Partly cloudy 
#             2: Mist + Cloudy, Mist + Broken clouds, Mist + Few clouds, Mist
#             3: Light Snow, Light Rain + Thunderstorm + Scattered clouds, Light Rain + Scattered clouds
#             4: Heavy Rain + Ice Pellets + Thunderstorm + Mist, Snow + Fog

# temp : Normalized temperature in Celsius. The values are divided by 41 (max)
# atemp: Normalized feeling temperature in Celsius. The values are divided by 50 (max)
# hum: Normalized humidity. The values are divided by 100 (max)
# windspeed: Normalized wind speed. The values are divided by 67 (max)
# total counts: count of total rental bikes including both casual and registered

In [33]:
# Show the frame's dimensions, then preview its first 10 rows.
# The shape is printed explicitly: a bare `dataset.shape` that is not the
# last expression of the cell is evaluated and silently discarded.
print(dataset.shape)
dataset.head(10)

Unnamed: 0,season,yr,mnth,day,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,total counts
0,1,0,1,1,0,0,6,0,1,0.24,0.2879,0.81,0.0,16
1,1,0,1,1,1,0,6,0,1,0.22,0.2727,0.8,0.0,40
2,1,0,1,1,2,0,6,0,1,0.22,0.2727,0.8,0.0,32
3,1,0,1,1,3,0,6,0,1,0.24,0.2879,0.75,0.0,13
4,1,0,1,1,4,0,6,0,1,0.24,0.2879,0.75,0.0,1
5,1,0,1,1,5,0,6,0,2,0.24,0.2576,0.75,0.0896,1
6,1,0,1,1,6,0,6,0,1,0.22,0.2727,0.8,0.0,2
7,1,0,1,1,7,0,6,0,1,0.2,0.2576,0.86,0.0,3
8,1,0,1,1,8,0,6,0,1,0.24,0.2879,0.75,0.0,8
9,1,0,1,1,9,0,6,0,1,0.32,0.3485,0.76,0.0,14


In [34]:
# Summary statistics (count, mean, std, min, quartiles, max) for every column.
dataset.describe()

Unnamed: 0,season,yr,mnth,day,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,total counts
count,17379.0,17379.0,17379.0,17379.0,17379.0,17379.0,17379.0,17379.0,17379.0,17379.0,17379.0,17379.0,17379.0,17379.0
mean,2.50164,0.502561,6.537775,15.683411,11.546752,0.02877,3.003683,0.682721,1.425283,0.496987,0.475775,0.627229,0.190098,189.463088
std,1.106918,0.500008,3.438776,8.789373,6.914405,0.167165,2.005771,0.465431,0.639357,0.192556,0.17185,0.19293,0.12234,181.387599
min,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.02,0.0,0.0,0.0,1.0
25%,2.0,0.0,4.0,8.0,6.0,0.0,1.0,0.0,1.0,0.34,0.3333,0.48,0.1045,40.0
50%,3.0,1.0,7.0,16.0,12.0,0.0,3.0,1.0,1.0,0.5,0.4848,0.63,0.194,142.0
75%,3.0,1.0,10.0,23.0,18.0,0.0,5.0,1.0,2.0,0.66,0.6212,0.78,0.2537,281.0
max,4.0,1.0,12.0,31.0,23.0,1.0,6.0,1.0,4.0,1.0,1.0,1.0,0.8507,977.0


### Getting the inputs and output

In [35]:
# Feature matrix as a NumPy array.
# iloc[rows, columns]: take all rows and every column except the last one
# (the last column is the target).
X = dataset.iloc[:, :-1].to_numpy()
X

array([[ 1.    ,  0.    ,  1.    , ...,  0.2879,  0.81  ,  0.    ],
       [ 1.    ,  0.    ,  1.    , ...,  0.2727,  0.8   ,  0.    ],
       [ 1.    ,  0.    ,  1.    , ...,  0.2727,  0.8   ,  0.    ],
       ...,
       [ 1.    ,  1.    , 12.    , ...,  0.2576,  0.6   ,  0.1642],
       [ 1.    ,  1.    , 12.    , ...,  0.2727,  0.56  ,  0.1343],
       [ 1.    ,  1.    , 12.    , ...,  0.2727,  0.65  ,  0.1343]])

In [36]:
# Target vector as a NumPy array: all rows of the last column only.
y = dataset.iloc[:, -1].to_numpy()
y

array([16, 40, 32, ..., 90, 61, 49])

### Creating the Training Set and the Test Set

In [37]:
# scikit-learn is the library, model_selection is one of its modules, and
# train_test_split is a function inside that module.
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the test set; the fixed random_state makes the
# split reproducible from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=0,
    test_size=0.2,
)

In [38]:
# Inspect the training features (NumPy abbreviates large arrays with "...").
X_train

array([[ 1.    ,  1.    ,  1.    , ...,  0.2727,  0.61  ,  0.2985],
       [ 1.    ,  0.    , 12.    , ...,  0.303 ,  0.65  ,  0.1642],
       [ 1.    ,  0.    ,  3.    , ...,  0.4545,  0.94  ,  0.3284],
       ...,
       [ 1.    ,  1.    ,  2.    , ...,  0.2121,  0.6   ,  0.3582],
       [ 2.    ,  1.    ,  3.    , ...,  0.4242,  0.77  ,  0.2537],
       [ 2.    ,  0.    ,  4.    , ...,  0.6212,  0.47  ,  0.3582]])

In [39]:
# Inspect the test features (NumPy abbreviates large arrays with "...").
X_test

array([[ 2.    ,  0.    ,  5.    , ...,  0.5303,  0.88  ,  0.2239],
       [ 4.    ,  0.    , 10.    , ...,  0.4394,  0.88  ,  0.    ],
       [ 4.    ,  1.    , 10.    , ...,  0.4394,  0.51  ,  0.1343],
       ...,
       [ 3.    ,  0.    ,  8.    , ...,  0.5909,  0.73  ,  0.1045],
       [ 1.    ,  1.    , 12.    , ...,  0.2424,  0.7   ,  0.1045],
       [ 4.    ,  1.    , 11.    , ...,  0.2879,  0.56  ,  0.2239]])

In [40]:
# Inspect the training targets (rental counts).
y_train

array([157, 164,  79, ...,   6,  69, 530])

In [41]:
# Inspect the test targets (rental counts).
y_test

array([  7,   5, 743, ...,  34,   7, 371])

## Part 2 - Building and training the model

### Building the model

In [42]:
# linear_model is a scikit-learn module, and LinearRegression is a class inside it.
# A class is a pre-coded blueprint from which objects are created; calling it
# below creates one linear-regression model object, which is trained in a later cell.
from sklearn.linear_model import LinearRegression
model = LinearRegression()

### Training the Model

In [43]:
# Sanity checks before fitting: array dimensions first, then element types.
print(X_train.shape, y_train.shape, sep='\n')
print(X_train.dtype, y_train.dtype, sep='\n')

# Total number of missing entries in the training feature matrix.
pd.isnull(X_train).sum()

(13903, 13)
(13903,)
float64
int64


np.int64(0)

In [44]:
# fit is a method of the LinearRegression class (a function attached to the object);
# here it is given the training features and the training targets.
model.fit(X_train, y_train)

### Inference

In [45]:
# Predict the rental counts for the test-set features, then display them.
y_pred = model.predict(X_test)
y_pred

array([ 63.49167535,  58.27855639, 331.141635  , ...,  80.96561213,
        65.82703159, 277.22351322])

#### Making the prediction of a single data point - season, yr, mnth, day, hr, holiday, weekday, workingday, weathersit, temp, atemp, hum, and windspeed.

In [46]:
# Feature values of the 10th row of the dataset (index 9): 1 January 2011, hour 9.
# Its actual total count is 14 bikes rented during that hour, which can be
# compared with the model's prediction below.
# Order: season, yr, mnth, day, hr, holiday, weekday, workingday, weathersit,
#        temp, atemp, hum, windspeed
single_row = [[1, 0, 1, 1, 9, 0, 6, 0, 1, 0.32, 0.3485, 0.76, 0.0]]
# predict expects a 2-D input (one inner list per sample), hence the double brackets.
model.predict(single_row)

array([29.24730738])

## Part 3: Evaluating the Model

### R-Squared

In [47]:
# R-squared of the predictions against the true targets of the test set.
from sklearn.metrics import r2_score

r2 = r2_score(y_true=y_test, y_pred=y_pred)
r2

0.4028680681854181

### Adjusted R-Squared

In [48]:
# Adjusted R-squared = 1 - (1 - R^2) * (n - 1) / (n - k - 1)
#   n: number of rows in the test set, k: number of feature columns
n, k = X_test.shape
adj_r2 = 1 - (1 - r2) * (n - 1) / (n - k - 1)
adj_r2

0.4006258050099156