## Linear Regression

### Part 1 - Data Preprocessing

### Importing the dataset

In [52]:
import pandas as pd
#variable and this is a function for uploading the dataset
dataset = pd.read_csv('day.csv') 

In [53]:
#Data Cleaning
# Assuming the date column is named 'dteday'
dataset['dteday'] = pd.to_datetime(dataset['dteday'])

# Extract year, month, day (or other desired features)
dataset['year'] = dataset['dteday'].dt.year
dataset['month'] = dataset['dteday'].dt.month
dataset['day'] = dataset['dteday'].dt.day

# Drop the original 'dteday' column
dataset.drop('dteday', axis=1, inplace=True)


In [54]:
#display 10 rows of dataset
dataset.head(10) #change 10
# data points collected from a bike sharing per day
# 18 columns: instant, season, yr, mnth, holiday, weekday, workingday, weathersit, temp, atemp, hum, windspeed, casual, registered, 	cnt, year, month, day
# independent variables: instant, season, yr, mnth, holiday, weekday, workingday, weathersit, temp, atemp, hum, windspeed, year, month, day
# dependent variable: casual, registered, cnt (total count of rented bikes per day)

Unnamed: 0,instant,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt,year,month,day
0,1,1,0,1,0,6,0,2,0.344167,0.363625,0.805833,0.160446,331,654,985,2011,1,1
1,2,1,0,1,0,0,0,2,0.363478,0.353739,0.696087,0.248539,131,670,801,2011,1,2
2,3,1,0,1,0,1,1,1,0.196364,0.189405,0.437273,0.248309,120,1229,1349,2011,1,3
3,4,1,0,1,0,2,1,1,0.2,0.212122,0.590435,0.160296,108,1454,1562,2011,1,4
4,5,1,0,1,0,3,1,1,0.226957,0.22927,0.436957,0.1869,82,1518,1600,2011,1,5
5,6,1,0,1,0,4,1,1,0.204348,0.233209,0.518261,0.089565,88,1518,1606,2011,1,6
6,7,1,0,1,0,5,1,2,0.196522,0.208839,0.498696,0.168726,148,1362,1510,2011,1,7
7,8,1,0,1,0,6,0,2,0.165,0.162254,0.535833,0.266804,68,891,959,2011,1,8
8,9,1,0,1,0,0,0,1,0.138333,0.116175,0.434167,0.36195,54,768,822,2011,1,9
9,10,1,0,1,0,1,1,1,0.150833,0.150888,0.482917,0.223267,41,1280,1321,2011,1,10


### Getting the inputs and output

In [55]:
# [rows,columns]
X = dataset.iloc[:,1:12].values
X

array([[ 1.      ,  0.      ,  1.      , ...,  0.363625,  0.805833,
         0.160446],
       [ 1.      ,  0.      ,  1.      , ...,  0.353739,  0.696087,
         0.248539],
       [ 1.      ,  0.      ,  1.      , ...,  0.189405,  0.437273,
         0.248309],
       ...,
       [ 1.      ,  1.      , 12.      , ...,  0.2424  ,  0.752917,
         0.124383],
       [ 1.      ,  1.      , 12.      , ...,  0.2317  ,  0.483333,
         0.350754],
       [ 1.      ,  1.      , 12.      , ...,  0.223487,  0.5775  ,
         0.154846]])

In [56]:
y = dataset.iloc[:,-4:-1].values
y

array([[ 985, 2011,    1],
       [ 801, 2011,    1],
       [1349, 2011,    1],
       ...,
       [1341, 2012,   12],
       [1796, 2012,   12],
       [2729, 2012,   12]])

### Creating the Training Set and the Test Set

In [57]:
# scikitlearn is a library
# model_selection is a module
# train_test_split is a function
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [58]:
X_train

array([[ 4.       ,  1.       , 11.       , ...,  0.323225 ,  0.662917 ,
         0.342046 ],
       [ 3.       ,  0.       ,  9.       , ...,  0.555361 ,  0.939565 ,
         0.192748 ],
       [ 4.       ,  0.       , 12.       , ...,  0.310604 ,  0.612917 ,
         0.0957833],
       ...,
       [ 4.       ,  0.       ,  9.       , ...,  0.575158 ,  0.84875  ,
         0.148629 ],
       [ 1.       ,  1.       ,  3.       , ...,  0.35967  ,  0.476957 ,
         0.222587 ],
       [ 2.       ,  0.       ,  4.       , ...,  0.417283 ,  0.819167 ,
         0.250617 ]])

In [59]:
X_test

array([[ 4.       ,  1.       , 12.       , ...,  0.469054 ,  0.73375  ,
         0.174129 ],
       [ 1.       ,  0.       ,  2.       , ...,  0.177878 ,  0.437826 ,
         0.277752 ],
       [ 4.       ,  0.       , 10.       , ...,  0.318812 ,  0.585833 ,
         0.229479 ],
       ...,
       [ 4.       ,  0.       , 11.       , ...,  0.380671 ,  0.64375  ,
         0.0988958],
       [ 2.       ,  0.       ,  6.       , ...,  0.587754 ,  0.471667 ,
         0.167912 ],
       [ 2.       ,  0.       ,  5.       , ...,  0.550512 ,  0.787917 ,
         0.126871 ]])

In [60]:
y_train

array([[4094, 2012,   11],
       [1842, 2011,    9],
       [3614, 2011,   12],
       ...,
       [3907, 2011,    9],
       [4911, 2012,    3],
       [2162, 2011,    4]])

In [61]:
y_test

array([[6606, 2012,   12],
       [1550, 2011,    2],
       [3747, 2011,   10],
       [6041, 2012,    4],
       [7538, 2012,    9],
       [7264, 2012,    7],
       [1605, 2011,    2],
       [2209, 2011,   12],
       [7499, 2012,    7],
       [5743, 2012,    5],
       [1796, 2012,   12],
       [3068, 2011,   12],
       [4891, 2011,    6],
       [5260, 2012,   11],
       [2133, 2011,    3],
       [2471, 2011,    3],
       [2046, 2011,    3],
       [8156, 2012,   10],
       [5362, 2011,    7],
       [2298, 2012,    1],
       [7697, 2012,    8],
       [5463, 2012,    6],
       [5409, 2012,    4],
       [1872, 2011,    3],
       [1807, 2011,    2],
       [5130, 2011,    8],
       [2121, 2011,    3],
       [7436, 2012,    9],
       [3830, 2012,    2],
       [5557, 2012,   12],
       [2743, 2011,   12],
       [3644, 2011,   10],
       [6196, 2012,    4],
       [7494, 2012,    6],
       [5918, 2012,    4],
       [3372, 2012,    3],
       [7582, 2012,    8],
 

## Part 2 - Building and training the model

### Building the model

In [62]:
# linear_model is the module
# `LinearRegression is a class` is defining that `LinearRegression` is a class within the `linear_model` module. It indicates that `LinearRegression` is a blueprint or template for creating objects that represent linear regression models.
# Class is a pre-coded blueprint of something we want to build from which objects are created.
from sklearn.linear_model import LinearRegression
model = LinearRegression()

### Training the Model

In [63]:
print(X_train.shape)
print(y_train.shape)

print(X_train.dtype) 
print(y_train.dtype) 

pd.isnull(X_train).sum()

(584, 11)
(584, 3)
float64
int64


np.int64(0)

In [64]:
# fit is a method inside LinearRegression class - they are like functions.
model.fit(X_train, y_train)

### Inference

In [65]:
y_pred = model.predict(X_test)
y_pred

array([[6.22041661e+03, 2.01200000e+03, 1.20000000e+01],
       [1.57160576e+03, 2.01100000e+03, 2.00000000e+00],
       [3.04396703e+03, 2.01100000e+03, 1.00000000e+01],
       [4.30720248e+03, 2.01200000e+03, 4.00000000e+00],
       [6.76565907e+03, 2.01200000e+03, 9.00000000e+00],
       [7.23153611e+03, 2.01200000e+03, 7.00000000e+00],
       [7.74904147e+02, 2.01100000e+03, 2.00000000e+00],
       [2.13960660e+03, 2.01100000e+03, 1.20000000e+01],
       [7.11274561e+03, 2.01200000e+03, 7.00000000e+00],
       [6.45899217e+03, 2.01200000e+03, 5.00000000e+00],
       [2.89952011e+03, 2.01200000e+03, 1.20000000e+01],
       [2.11878224e+03, 2.01100000e+03, 1.20000000e+01],
       [4.07224222e+03, 2.01100000e+03, 6.00000000e+00],
       [5.45121299e+03, 2.01200000e+03, 1.10000000e+01],
       [2.38453839e+03, 2.01100000e+03, 3.00000000e+00],
       [2.01624400e+03, 2.01100000e+03, 3.00000000e+00],
       [2.38371950e+03, 2.01100000e+03, 3.00000000e+00],
       [7.42859110e+03, 2.01200

#### Making the prediction of a single data point - season, yr, mnth, holiday, weekday, workingday, weathersit, temp, atemp, hum, windspeed

In [66]:
model.predict = [1, 0, 1, 0, 5, 1, 2, 0.35, 0.36, 0.8, 0.16]


## Part 3: Evaluating the Model

### R-Squared

In [67]:
from sklearn.metrics import r2_score
r2 = r2_score(y_test, y_pred)
r2

0.942555669678907

### Adjusted R-Squared

In [68]:
k = X_test.shape[1]
n = X_test.shape[0]
adj_r2 = 1-(1-r2)*(n-1)/(n-k-1)
adj_r2

0.9378750205416327