# Linear Regression

Import required libraries pandas - to work with data from csv file numpy - to work with array matplotlib - visualization of the data

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
#Reading the CSV file as dataframe
df = pd.read_csv("nyc-taxi-sample-data.csv")
df.head() #display the first 5 rows of the dataframe

Unnamed: 0,vendorID,passengerCount,tripDistance,hour_of_day,day_of_week,day_of_month,month_num,normalizeHolidayName,isPaidTimeOff,snowDepth,precipTime,precipDepth,temperature,totalAmount
0,1,1,9.4,15,2,27,1,,False,29.058824,24.0,3.0,6.185714,44.3
1,2,5,14.75,13,4,15,1,,False,0.0,6.0,0.0,4.57193,44.8
2,2,1,3.35,23,4,8,1,,False,0.0,1.0,0.0,4.384091,18.96
3,2,1,3.33,18,2,27,1,,False,29.058824,24.0,3.0,6.185714,16.3
4,2,1,0.47,17,6,3,1,,False,0.0,1.0,0.0,3.846429,5.3


In [3]:
#Check the shape (number of rows and columns) of the dataset
df.shape

(11734, 14)

In [4]:
#Check the data type of all the variables in the dataset
df.dtypes

vendorID                  int64
passengerCount            int64
tripDistance            float64
hour_of_day               int64
day_of_week               int64
day_of_month              int64
month_num                 int64
normalizeHolidayName     object
isPaidTimeOff              bool
snowDepth               float64
precipTime              float64
precipDepth             float64
temperature             float64
totalAmount             float64
dtype: object

Categorical variables are stored as object and Continuous variables are stored as int or float 

In [5]:
#Check all the unique values of Categorical Variables
df['normalizeHolidayName'].unique()

array(['None', 'Martin Luther King, Jr. Day', "New Year's Day",
       "Washington's Birthday", 'Memorial Day'], dtype=object)

In [6]:
#Since the holiday names aren't of importance we can drop that column
#axis=0 indicates row and we have to specify the index
#axis=1 indicates column and we have to specify the column names
df = df.drop(['normalizeHolidayName'],axis=1)

In [7]:
#description of continuous variables
df.describe()

Unnamed: 0,vendorID,passengerCount,tripDistance,hour_of_day,day_of_week,day_of_month,month_num,snowDepth,precipTime,precipDepth,temperature,totalAmount
count,11734.0,11734.0,11734.0,11734.0,11734.0,11734.0,11734.0,11734.0,11734.0,11734.0,11734.0,11734.0
mean,1.790608,1.34856,2.866139,13.633884,3.223879,15.000256,3.502898,1.609015,12.028379,190.782342,10.314244,14.733528
std,0.406892,1.016123,2.90581,6.67053,1.961855,8.467892,1.707729,7.146771,10.158597,1211.087724,8.5006,10.983099
min,1.0,1.0,0.01,0.0,0.0,1.0,1.0,0.0,1.0,0.0,-13.379464,3.3
25%,2.0,1.0,1.06,9.0,2.0,8.0,2.0,0.0,1.0,0.0,3.566372,8.15
50%,2.0,1.0,1.9,15.0,3.0,15.0,4.0,0.0,6.0,3.0,10.318229,11.3
75%,2.0,1.0,3.62,19.0,5.0,22.0,5.0,0.0,24.0,41.0,17.239744,17.8
max,2.0,6.0,62.55,23.0,6.0,30.0,6.0,67.090909,24.0,9999.0,26.524107,339.38


This shows all the statistical values (count, mean, standard deviation etc.) related to the continuous variables present in the dataframe

CHECK FOR MISSING VALUES - since the count of all the variables is the same, we can say that there are no missing values in the continuous variables

In [8]:
#Map the boolean value into 0 and 1
df['isPaidTimeOff'].map({False:0, True:1})

0        0
1        0
2        0
3        0
4        0
        ..
11729    0
11730    0
11731    0
11732    0
11733    0
Name: isPaidTimeOff, Length: 11734, dtype: int64

In [9]:
#Check the number of variables in each category
df['isPaidTimeOff'].value_counts()

False    11484
True       250
Name: isPaidTimeOff, dtype: int64

250 objects have True for the variable isPaidTimeOff and others have False

NOW THE DATA HAS NO MISSING / UNWANTED VALUES AND ALL THE VALUES ARE NUMERICAL

In [10]:
#Separate the independent and dependent variables
X = df.drop(['totalAmount'],axis=1)
Y = df[['totalAmount']]

In [11]:
#Split the dataset into train and test dataset
#We have a built-in funcion to split the dataset randomly
from sklearn.model_selection import train_test_split
xtrain, xtest, ytrain, ytest = train_test_split(X,Y,test_size=0.4) #40% of dataset will be test data

In [12]:
#Check the shape of the train and test data
print("Shape of xtrain: ",xtrain.shape)
print("Shape of xtest: ",xtest.shape)
print("Shape of ytrain: ",ytrain.shape)
print("Shape of ytest: ",ytest.shape)

Shape of xtrain:  (7040, 12)
Shape of xtest:  (4694, 12)
Shape of ytrain:  (7040, 1)
Shape of ytest:  (4694, 1)


NOW THE DATASET IS DIVIDED INTO TRAIN AND TEST DATA

# Build a simple Linear Model

In [13]:
#build the model
from sklearn.linear_model import LinearRegression #import library
lmodel1 = LinearRegression() #create an object

In [14]:
#train the model
lmodel1.fit(xtrain,ytrain)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [15]:
#to print the coefficients
print(lmodel1.coef_) #coefficient/slope
print(lmodel1.intercept_) #intercept

[[ 3.23984201e-02  5.69472739e-02  3.45817482e+00  3.59659144e-02
  -7.38333947e-02 -1.16652193e-02 -6.86295560e-02 -7.42303317e-01
   1.96102374e-03  4.54809313e-03 -8.37213127e-05  1.70056947e-02]]
[4.61661393]


In [18]:
#make predictions
Yp = lmodel1.predict(xtest)

In [19]:
#to calculate the mean absolute error
from sklearn.metrics import mean_absolute_error
mean1 = mean_absolute_error(ytest,Yp)
print(mean1)

2.1845280573686954


In [20]:
#calculate the r-squared metric
from sklearn.metrics import r2_score
r2score = r2_score(ytest,Yp)
print(r2score)

0.8545994831880224


Since the r-squared value is near to 1 we can say that the model is able to capture all variance in the dependent variable.