## Linear Regression

## Part 1 - Data Preprocessing

### Importing the dataset

In [64]:
import pandas as pd
# `dataset` is the variable that holds the loaded data; `pd.read_csv` is the pandas
# function that reads the CSV file 'day.csv' into a DataFrame.
dataset = pd.read_csv('day.csv')
# Preview the first rows of the DataFrame.
dataset.head()

Unnamed: 0,instant,dteday,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
0,1,2011-01-01,1,0,1,0,6,0,2,0.344167,0.363625,0.805833,0.160446,331,654,985
1,2,2011-01-02,1,0,1,0,0,0,2,0.363478,0.353739,0.696087,0.248539,131,670,801
2,3,2011-01-03,1,0,1,0,1,1,1,0.196364,0.189405,0.437273,0.248309,120,1229,1349
3,4,2011-01-04,1,0,1,0,2,1,1,0.2,0.212122,0.590435,0.160296,108,1454,1562
4,5,2011-01-05,1,0,1,0,3,1,1,0.226957,0.22927,0.436957,0.1869,82,1518,1600


In [65]:
# Data Cleaning

# Final column order: the independent variables first, the dependent variable
# ('total counts') last, so it can later be selected by position.
order = ['season', 'yr', 'mnth', 'day', 'holiday', 'weekday', 'workingday', 'weathersit', 'temp', 'atemp', 'hum', 'windspeed', 'total counts']

# Build the cleaned frame in a single chain instead of mutating `dataset` in
# place step by step: if any step raises (e.g. a missing column), `dataset`
# is left exactly as it was rather than half-cleaned.
dataset = (
    dataset
    # Parse the date column (named 'dteday') and extract the day of the month,
    # since it is missing in the dataset.
    .assign(day=lambda frame: pd.to_datetime(frame['dteday']).dt.day)
    # Drop the original instant, dteday, casual and registered columns.
    .drop(columns=['instant', 'dteday', 'casual', 'registered'])
    # Rename the cnt column: it holds the total counts of bikes rented per day.
    .rename(columns={'cnt': 'total counts'})
    # Change the order of the columns.
    .loc[:, order]
)

In [66]:
# Data points collected from a bike-sharing service, one row per day
# 13 columns: season, yr, mnth, day, holiday, weekday, workingday, weathersit, temp, atemp, hum, windspeed, total counts
# independent variables: season, yr, mnth, day, holiday, weekday, workingday, weathersit, temp, atemp, hum, windspeed
# dependent variable: total counts (total count of rented bikes per day)

# season : season (1:spring, 2:summer, 3:fall, 4:winter)
# yr : year (0: 2011, 1:2012)
# mnth : month ( 1 to 12)
# day : day (1 to 31)
# holiday : whether the day is a holiday or not (extracted from http://dchr.dc.gov/page/holiday-schedule)
# weekday : day of the week
# workingday : 1 if the day is neither a weekend nor a holiday, otherwise 0.
# weathersit : weather situation
#             1: Clear, Few clouds, Partly cloudy
#             2: Mist + Cloudy, Mist + Broken clouds, Mist + Few clouds, Mist
#             3: Light Snow, Light Rain + Thunderstorm + Scattered clouds, Light Rain + Scattered clouds
#             4: Heavy Rain + Ice Pellets + Thunderstorm + Mist, Snow + Fog

# temp : Normalized temperature in Celsius. The values are divided by 41 (max)
# atemp : Normalized feeling temperature in Celsius. The values are divided by 50 (max)
# hum : Normalized humidity. The values are divided by 100 (max)
# windspeed : Normalized wind speed. The values are divided by 67 (max)
# total counts : count of total rental bikes including both casual and registered

In [67]:
# Show the dimensions of the dataset and display its first 10 rows.
# Only the last expression of a cell is rendered automatically, so the shape
# has to be printed explicitly; as a bare expression it was silently discarded.
print(dataset.shape)
dataset.head(10)

Unnamed: 0,season,yr,mnth,day,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,total counts
0,1,0,1,1,0,6,0,2,0.344167,0.363625,0.805833,0.160446,985
1,1,0,1,2,0,0,0,2,0.363478,0.353739,0.696087,0.248539,801
2,1,0,1,3,0,1,1,1,0.196364,0.189405,0.437273,0.248309,1349
3,1,0,1,4,0,2,1,1,0.2,0.212122,0.590435,0.160296,1562
4,1,0,1,5,0,3,1,1,0.226957,0.22927,0.436957,0.1869,1600
5,1,0,1,6,0,4,1,1,0.204348,0.233209,0.518261,0.089565,1606
6,1,0,1,7,0,5,1,2,0.196522,0.208839,0.498696,0.168726,1510
7,1,0,1,8,0,6,0,2,0.165,0.162254,0.535833,0.266804,959
8,1,0,1,9,0,0,0,1,0.138333,0.116175,0.434167,0.36195,822
9,1,0,1,10,0,1,1,1,0.150833,0.150888,0.482917,0.223267,1321


In [68]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
dataset.describe()

Unnamed: 0,season,yr,mnth,day,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,total counts
count,731.0,731.0,731.0,731.0,731.0,731.0,731.0,731.0,731.0,731.0,731.0,731.0,731.0
mean,2.49658,0.500684,6.519836,15.738714,0.028728,2.997264,0.683995,1.395349,0.495385,0.474354,0.627894,0.190486,4504.348837
std,1.110807,0.500342,3.451913,8.809949,0.167155,2.004787,0.465233,0.544894,0.183051,0.162961,0.142429,0.077498,1937.211452
min,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.05913,0.07907,0.0,0.022392,22.0
25%,2.0,0.0,4.0,8.0,0.0,1.0,0.0,1.0,0.337083,0.337842,0.52,0.13495,3152.0
50%,3.0,1.0,7.0,16.0,0.0,3.0,1.0,1.0,0.498333,0.486733,0.626667,0.180975,4548.0
75%,3.0,1.0,10.0,23.0,0.0,5.0,1.0,2.0,0.655417,0.608602,0.730209,0.233214,5956.0
max,4.0,1.0,12.0,31.0,1.0,6.0,1.0,3.0,0.861667,0.840896,0.9725,0.507463,8714.0


### Getting the inputs and output

In [69]:
# iloc indexes as [rows, columns]: keep every row and every column except the
# last one (the dependent variable), then convert to a NumPy array.
feature_frame = dataset.iloc[:, :-1]
X = feature_frame.to_numpy()
X

array([[ 1.      ,  0.      ,  1.      , ...,  0.363625,  0.805833,
         0.160446],
       [ 1.      ,  0.      ,  1.      , ...,  0.353739,  0.696087,
         0.248539],
       [ 1.      ,  0.      ,  1.      , ...,  0.189405,  0.437273,
         0.248309],
       ...,
       [ 1.      ,  1.      , 12.      , ...,  0.2424  ,  0.752917,
         0.124383],
       [ 1.      ,  1.      , 12.      , ...,  0.2317  ,  0.483333,
         0.350754],
       [ 1.      ,  1.      , 12.      , ...,  0.223487,  0.5775  ,
         0.154846]])

In [70]:
# Dependent variable: every row of the last column, converted to a NumPy array.
target_series = dataset.iloc[:, -1]
y = target_series.to_numpy()
y

array([ 985,  801, 1349, 1562, 1600, 1606, 1510,  959,  822, 1321, 1263,
       1162, 1406, 1421, 1248, 1204, 1000,  683, 1650, 1927, 1543,  981,
        986, 1416, 1985,  506,  431, 1167, 1098, 1096, 1501, 1360, 1526,
       1550, 1708, 1005, 1623, 1712, 1530, 1605, 1538, 1746, 1472, 1589,
       1913, 1815, 2115, 2475, 2927, 1635, 1812, 1107, 1450, 1917, 1807,
       1461, 1969, 2402, 1446, 1851, 2134, 1685, 1944, 2077,  605, 1872,
       2133, 1891,  623, 1977, 2132, 2417, 2046, 2056, 2192, 2744, 3239,
       3117, 2471, 2077, 2703, 2121, 1865, 2210, 2496, 1693, 2028, 2425,
       1536, 1685, 2227, 2252, 3249, 3115, 1795, 2808, 3141, 1471, 2455,
       2895, 3348, 2034, 2162, 3267, 3126,  795, 3744, 3429, 3204, 3944,
       4189, 1683, 4036, 4191, 4073, 4400, 3872, 4058, 4595, 5312, 3351,
       4401, 4451, 2633, 4433, 4608, 4714, 4333, 4362, 4803, 4182, 4864,
       4105, 3409, 4553, 3958, 4123, 3855, 4575, 4917, 5805, 4660, 4274,
       4492, 4978, 4677, 4679, 4758, 4788, 4098, 39

### Creating the Training Set and the Test Set

In [71]:
# scikit-learn (imported as `sklearn`) is the library, `model_selection` is one
# of its modules, and `train_test_split` is a function inside that module.
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the test set; the fixed random_state makes the
# shuffle, and therefore the split, reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [72]:
# Inspect the training features (NumPy abbreviates the large array with "...").
X_train

array([[ 4.       ,  1.       , 11.       , ...,  0.323225 ,  0.662917 ,
         0.342046 ],
       [ 3.       ,  0.       ,  9.       , ...,  0.555361 ,  0.939565 ,
         0.192748 ],
       [ 4.       ,  0.       , 12.       , ...,  0.310604 ,  0.612917 ,
         0.0957833],
       ...,
       [ 4.       ,  0.       ,  9.       , ...,  0.575158 ,  0.84875  ,
         0.148629 ],
       [ 1.       ,  1.       ,  3.       , ...,  0.35967  ,  0.476957 ,
         0.222587 ],
       [ 2.       ,  0.       ,  4.       , ...,  0.417283 ,  0.819167 ,
         0.250617 ]])

In [73]:
# Inspect the test features (NumPy abbreviates the large array with "...").
X_test

array([[ 4.       ,  1.       , 12.       , ...,  0.469054 ,  0.73375  ,
         0.174129 ],
       [ 1.       ,  0.       ,  2.       , ...,  0.177878 ,  0.437826 ,
         0.277752 ],
       [ 4.       ,  0.       , 10.       , ...,  0.318812 ,  0.585833 ,
         0.229479 ],
       ...,
       [ 4.       ,  0.       , 11.       , ...,  0.380671 ,  0.64375  ,
         0.0988958],
       [ 2.       ,  0.       ,  6.       , ...,  0.587754 ,  0.471667 ,
         0.167912 ],
       [ 2.       ,  0.       ,  5.       , ...,  0.550512 ,  0.787917 ,
         0.126871 ]])

In [74]:
# Inspect the training targets (total counts of the training rows).
y_train

array([4094, 1842, 3614, 4274, 7335, 4123, 4576, 1510, 3520, 4151, 6779,
       2424, 4677, 6233, 4105, 4205, 5323, 1746, 3204, 6192, 1969, 3613,
       4390, 7006, 7148, 1985, 3831, 4569, 5585, 4916, 5823, 4189, 1865,
       1107, 7055, 7534, 5499, 7570, 8555, 7498, 2236, 5634, 1650, 7665,
       3005, 4195, 5026, 2210, 1685, 3606, 1834, 1096, 5191, 4845, 6169,
       5532, 5119, 4073, 2710, 8120, 2423, 7444, 6660, 7534, 3846, 5558,
       3422, 5305, 2659, 7338, 6978, 4795, 3855, 4905, 5445, 5698, 5020,
       3624, 7572, 6883, 4075, 4521, 3727, 5936, 3126, 4073, 7509, 4334,
       7461, 4308, 3249,  959, 1685, 4672, 3243, 5986, 5315, 6883, 5531,
       2732, 3922, 1162, 5805, 1098, 1589, 1472, 2056, 4844, 5255, 6778,
       4154, 4648, 6133, 7132, 4708, 7852, 6073, 3659, 4187, 1204, 3423,
       4785, 5464, 2765, 4127, 7421, 3523, 5511, 2935, 1321, 1996,  986,
       4780, 4220, 3786, 3777, 5810, 3285, 8362, 3115, 2914, 3376, 4322,
       2416, 6043, 6544, 3068, 5312, 2744, 3214, 48

In [75]:
# Inspect the test targets (total counts of the held-out rows).
y_test

array([6606, 1550, 3747, 6041, 7538, 7264, 1605, 2209, 7499, 5743, 1796,
       3068, 4891, 5260, 2133, 2471, 2046, 8156, 5362, 2298, 7697, 5463,
       5409, 1872, 1807, 5130, 2121, 7436, 3830, 5557, 2743, 3644, 6196,
       7494, 5918, 3372, 7582, 6053, 2566, 1263, 3944, 3956, 7580, 4906,
       6966,  705, 4458, 5298, 6043, 4996, 3351, 2431, 1011, 4475, 4725,
       4727, 2395, 3351, 4788, 7175, 6153, 7442, 1471, 7865, 6530, 6211,
       7403, 4302, 2077, 7333, 3117, 1635, 3811, 4595, 4363, 2034, 5686,
       4748, 1416, 4401, 2114, 2028, 5668,   22, 3940, 4118, 5423, 1495,
       3620, 3403, 1501, 7040, 5992, 4990, 3095, 2832, 4713, 2368, 3409,
       4991, 7713, 3785, 6664, 2496, 2077, 5267, 1162, 2739, 6861, 1360,
       4602, 7282, 3570, 8167, 6230, 4511, 1461, 3272, 5923, 2177, 6398,
       1817, 3239, 1349, 3867, 5146, 4541, 3368, 2134, 7410, 3331, 3974,
       5495, 5319, 5895, 2227, 4367, 7460, 1977, 4266, 4864, 1913, 2132,
       4258, 2792, 5180, 3958])

## Part 2 - Building and training the model

### Building the model

In [76]:
# `linear_model` is the scikit-learn module that contains `LinearRegression`.
# `LinearRegression` is a class: a pre-coded blueprint (template) from which
# objects are created. Here each object represents a linear regression model.
from sklearn.linear_model import LinearRegression
# Create a model object from the class; it is not trained yet.
model = LinearRegression()

### Training the Model

In [77]:
# Sanity checks before training: first the dimensions, then the element types,
# of the training features and the training targets (one value per line).
print(X_train.shape, y_train.shape, sep="\n")
print(X_train.dtype, y_train.dtype, sep="\n")

# Total number of missing values in the training features.
pd.isnull(X_train).sum()

(584, 12)
(584,)
float64
int64


np.int64(0)

In [78]:
# `fit` is a method of the LinearRegression class (a method is a function that
# belongs to an object); it trains the model on the training features and targets.
model.fit(X_train, y_train)

### Inference

In [79]:
# Predict the total counts for the test features with the trained model.
y_pred = model.predict(X_test)
y_pred

array([6292.02707192, 1648.43175475, 2966.47184684, 4387.17228853,
       6708.33960933, 7263.45368463,  808.66013295, 2101.40628481,
       7116.09239823, 6389.13554358, 2812.17189987, 2078.21493673,
       4078.21609481, 5377.0024401 , 2425.05496112, 1988.17915713,
       2392.87701296, 7491.27496558, 5758.86616162, 2937.57937256,
       7148.96724338, 8151.63699446, 5185.90425251, 1510.7130368 ,
       1343.36152282, 4777.41472045, 1956.22340237, 6679.49114033,
       4164.00275042, 5825.1869181 , 3052.13358886, 4146.62230434,
       5877.11670096, 6393.39149855, 5433.70145512, 4920.95228745,
       6791.57680748, 6072.56287391, 3067.30041909,  979.26461143,
       4052.43705132, 4029.84205317, 7327.3915253 , 3512.54070391,
       7213.8422724 , 2280.27006688, 5152.45206969, 5028.14724477,
       5977.49676982, 4113.28090068, 3420.45045887, 2731.9288198 ,
       1876.83671067, 5743.28234608, 4918.65760704, 4352.15737539,
       4328.81793945, 2552.22527824, 3800.55988457, 6987.51375

#### Making the prediction of a single data point - season, yr, mnth, day, holiday, weekday, workingday, weathersit, temp, atemp, hum, and windspeed.

In [80]:
# Feature values of the 10th row of the dataset (index 9), which has a total
# count of 1321 bikes rented during that day.
# Order: season, yr, mnth, day, holiday, weekday, workingday, weathersit, temp, atemp, hum, windspeed
# predict expects a 2-D input, hence the double brackets (one row, 12 features).
single_day = [[1, 0, 1, 10, 0, 1, 1, 1, 0.150833, 0.150888, 0.482917, 0.223267]]
model.predict(single_day)

array([1333.34757555])

## Part 3 - Evaluating the Model

### R-Squared

In [81]:
from sklearn.metrics import r2_score

# Coefficient of determination (R²) of the predictions on the test set.
r2 = r2_score(y_true=y_test, y_pred=y_pred)
r2

0.8303394873121542

### Adjusted R-Squared

In [82]:
# n = number of test observations (rows), k = number of features (columns).
n, k = X_test.shape
# Adjusted R² = 1 - (1 - R²) * (n - 1) / (n - k - 1)
unexplained = 1 - r2
adj_r2 = 1 - unexplained * (n - 1) / (n - k - 1)
adj_r2

0.815146008563989