## Linear Regression

### Part 1 - Data Preprocessing

### Importing the dataset

In [259]:
import pandas as pd
#variable and this is a function for uploading the dataset
dataset = pd.read_csv('hour.csv')

In [260]:
#Data Cleaning
dataset['dteday'] = pd.to_datetime(dataset['dteday'])

# Extract year, month, day, hour (or other desired features)
dataset['year'] = dataset['dteday'].dt.year
dataset['month'] = dataset['dteday'].dt.month
dataset['day'] = dataset['dteday'].dt.day

# Add column for hour
dataset['hour'] = dataset['hr']

# Drop the original 'dteday' column (now safe)
dataset.drop('dteday', axis=1, inplace=True)

In [261]:
#display 10 rows of dataset
dataset.head(10) #change 10
# data points collected from a bike sharing per day
# 19 columns: instant, season, yr, mnth, hr, holiday, weekday, workingday, weathersit, temp, atemp, hum, windspeed, casual, registered, 	cnt, year, month, day
# independent variables: instant, season, yr, mnth, hr, holiday, weekday, workingday, weathersit, temp, atemp, hum, windspeed, year, month, day
# dependent variable: casual, registered, cnt (total count of rented bikes per hour)

Unnamed: 0,instant,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt,year,month,day,hour
0,1,1,0,1,0,0,6,0,1,0.24,0.2879,0.81,0.0,3,13,16,2011,1,1,0
1,2,1,0,1,1,0,6,0,1,0.22,0.2727,0.8,0.0,8,32,40,2011,1,1,1
2,3,1,0,1,2,0,6,0,1,0.22,0.2727,0.8,0.0,5,27,32,2011,1,1,2
3,4,1,0,1,3,0,6,0,1,0.24,0.2879,0.75,0.0,3,10,13,2011,1,1,3
4,5,1,0,1,4,0,6,0,1,0.24,0.2879,0.75,0.0,0,1,1,2011,1,1,4
5,6,1,0,1,5,0,6,0,2,0.24,0.2576,0.75,0.0896,0,1,1,2011,1,1,5
6,7,1,0,1,6,0,6,0,1,0.22,0.2727,0.8,0.0,2,0,2,2011,1,1,6
7,8,1,0,1,7,0,6,0,1,0.2,0.2576,0.86,0.0,1,2,3,2011,1,1,7
8,9,1,0,1,8,0,6,0,1,0.24,0.2879,0.75,0.0,1,7,8,2011,1,1,8
9,10,1,0,1,9,0,6,0,1,0.32,0.3485,0.76,0.0,8,6,14,2011,1,1,9


### Getting the inputs and output

In [262]:
# [rows,columns]
X = dataset.iloc[:,1:13].values
X

array([[ 1.    ,  0.    ,  1.    , ...,  0.2879,  0.81  ,  0.    ],
       [ 1.    ,  0.    ,  1.    , ...,  0.2727,  0.8   ,  0.    ],
       [ 1.    ,  0.    ,  1.    , ...,  0.2727,  0.8   ,  0.    ],
       ...,
       [ 1.    ,  1.    , 12.    , ...,  0.2576,  0.6   ,  0.1642],
       [ 1.    ,  1.    , 12.    , ...,  0.2727,  0.56  ,  0.1343],
       [ 1.    ,  1.    , 12.    , ...,  0.2727,  0.65  ,  0.1343]])

In [263]:
y = dataset.iloc[:,-5:-2].values
y

array([[  16, 2011,    1],
       [  40, 2011,    1],
       [  32, 2011,    1],
       ...,
       [  90, 2012,   12],
       [  61, 2012,   12],
       [  49, 2012,   12]])

### Creating the Training Set and the Test Set

In [264]:
# scikitlearn is a library
# model_selection is a module
# train_test_split is a function
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [265]:
X_train

array([[ 1.    ,  0.    ,  1.    , ...,  0.197 ,  0.55  ,  0.2239],
       [ 4.    ,  0.    , 10.    , ...,  0.5   ,  0.42  ,  0.1045],
       [ 4.    ,  0.    , 12.    , ...,  0.4545,  1.    ,  0.2239],
       ...,
       [ 3.    ,  0.    ,  8.    , ...,  0.7273,  0.43  ,  0.2836],
       [ 1.    ,  0.    ,  2.    , ...,  0.197 ,  0.65  ,  0.4179],
       [ 4.    ,  1.    , 10.    , ...,  0.5   ,  0.83  ,  0.1642]])

In [266]:
X_test

array([[ 3.    ,  1.    ,  6.    , ...,  0.697 ,  0.27  ,  0.194 ],
       [ 1.    ,  1.    ,  1.    , ...,  0.2273,  0.41  ,  0.2239],
       [ 4.    ,  0.    , 10.    , ...,  0.303 ,  0.66  ,  0.2836],
       ...,
       [ 1.    ,  0.    ,  1.    , ...,  0.1818,  0.59  ,  0.3582],
       [ 4.    ,  1.    , 11.    , ...,  0.4697,  0.77  ,  0.1642],
       [ 4.    ,  0.    , 12.    , ...,  0.3182,  0.49  ,  0.1642]])

In [267]:
y_train

array([[  72, 2011,    1],
       [ 518, 2011,   10],
       [   3, 2011,   12],
       ...,
       [ 189, 2011,    8],
       [ 100, 2011,    2],
       [ 779, 2012,   10]])

In [268]:
y_test

array([[ 425, 2012,    6],
       [  88, 2012,    1],
       [   4, 2011,   10],
       ...,
       [  98, 2011,    1],
       [ 266, 2012,   11],
       [ 267, 2011,   12]])

## Part 2 - Building and training the model

### Building the model

In [269]:
# linear_model is the module
# LinearRegression is a class` is defining that `LinearRegression` is a class within the `linear_model` module. It indicates that `LinearRegression` is a blueprint or template for creating objects that represent linear regression models.
# Class is a pre-coded blueprint of something we want to build from which objects are created.
from sklearn.linear_model import LinearRegression
model = LinearRegression()

### Training the Model

In [270]:
print(X_train.shape)
print(y_train.shape)

print(X_train.dtype) 
print(y_train.dtype) 

pd.isnull(X_train).sum()

(13903, 12)
(13903, 3)
float64
int64


np.int64(0)

In [271]:
# fit is a method inside LinearRegression class - they are like functions.
model.fit(X_train, y_train)

### Inference

In [272]:
y_pred = model.predict(X_test)
y_pred

array([[4.50296525e+02, 2.01200000e+03, 6.00000000e+00],
       [2.04674145e+02, 2.01200000e+03, 1.00000000e+00],
       [5.72978909e+01, 2.01100000e+03, 1.00000000e+01],
       ...,
       [4.55412627e+01, 2.01100000e+03, 1.00000000e+00],
       [1.90389370e+02, 2.01200000e+03, 1.10000000e+01],
       [2.15373094e+02, 2.01100000e+03, 1.20000000e+01]])

#### Making the prediction of a single data point - season, yr, hr, mnth, holiday, weekday, workingday, weathersit, temp, atemp, hum, windspeed

In [273]:
model.predict = [1, 0, 1, 8, 0, 5, 0, 2, 0.35, 0.36, 0.8, 0.16]


## Part 3: Evaluating the Model

### R-Squared

In [274]:
from sklearn.metrics import r2_score
r2 = r2_score(y_test, y_pred)
r2

0.795993719437971

### Adjusted R-Squared

In [275]:
k = X_test.shape[1]
n = X_test.shape[0]
adj_r2 = 1-(1-r2)*(n-1)/(n-k-1)
adj_r2

0.7952867961440802