## In this notebook I will explore different predictive models

In [1]:
!ls -Rl

.:
total 21180
-rwxrwxrwx 1 angelrps angelrps 7384896 Jul  1 17:19  Data_Analysis.ipynb
-rw-rw-rw- 1 angelrps angelrps 2300551 Jul 10 12:07  Data_Analysis_Taxis.ipynb
-rwxrwxrwx 1 angelrps angelrps  371494 Jun 18 11:33  Data_Analysis_Weather.ipynb
-rw-rw-rw- 1 angelrps angelrps  164169 Jul 10 13:04  Data_Join_Taxis_Weather.ipynb
-rw-rw-rw- 1 angelrps angelrps  682153 Jul 21 12:29  FrontEnd_01.ipynb
-rw-rw-rw- 1 angelrps angelrps  114829 Jul 28 11:07  Modelling_01.ipynb
d--------- 1 angelrps angelrps     512 Jul 22 15:13 'PARA SEBASTIEN'
-rw-rw-rw- 1 angelrps angelrps  130423 Jun 23 13:20  Regression_Advanced_Housing_Price-blank-April_27.ipynb
-rw-rw-rw- 1 angelrps angelrps  141159 Jul 28 11:45  Streamlit.ipynb
-rw-rw-rw- 1 angelrps angelrps    1034 Jul 23 15:24  Streamlit_JustRun.ipynb
-rw-rw-rw- 1 angelrps angelrps  614335 Jul 24 16:08 'TSA - Sesión.ipynb'
-rw-rw-rw- 1 angelrps angelrps   11748 Jul 24 15:42  Web_Scrapping_Weather.ipynb
-rwxrwxrwx 1 angelrps angelrps 9036

In [2]:
cd ../data

/home/angelrps/git/MasterDataScience_FinalProject/data


## 1. Import libraries and dataset

In [2]:
# IMPORT LIBRARIES
import pandas as pd
import numpy as np
pd.options.display.max_columns = None
pd.options.display.max_rows = None
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import train_test_split

In [3]:
# Import dataset
df = pd.read_csv('../data/Data_Cleaned_2017_To_Model.csv', sep=',',
                 dtype = {"LocationID" : "object"},
                 parse_dates=['datetime'])

df.info() # check dtypes
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 586920 entries, 0 to 586919
Data columns (total 11 columns):
datetime         586920 non-null datetime64[ns]
month            586920 non-null int64
day              586920 non-null int64
hour             586920 non-null int64
LocationID       586920 non-null object
NoOfPickups      586920 non-null float64
year             586920 non-null int64
week             586920 non-null int64
dayofweek        586920 non-null int64
isweekend        586920 non-null int64
precipitation    586920 non-null float64
dtypes: datetime64[ns](1), float64(2), int64(7), object(1)
memory usage: 49.3+ MB


Unnamed: 0,datetime,month,day,hour,LocationID,NoOfPickups,year,week,dayofweek,isweekend,precipitation
0,2017-01-01,1,1,0,4,136.0,2017,52,6,1,0.0
1,2017-01-01,1,1,0,12,3.0,2017,52,6,1,0.0
2,2017-01-01,1,1,0,13,103.0,2017,52,6,1,0.0
3,2017-01-01,1,1,0,24,94.0,2017,52,6,1,0.0
4,2017-01-01,1,1,0,41,136.0,2017,52,6,1,0.0


## 2. Separate variables
INPUT variables: (must be Pandas DataFrame)
    - month
    - day
    - hour
    - week
    - dayofweek
    - isweekend
    - LocationID
    - precipitation
OUTPUT: (must Pandas Series)
    - NoOfPickups

In [7]:
# IMPORT LIBRARIES
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# 1. Import dataset

df = pd.read_csv('../data/Data_Cleaned_2017_To_Model.csv', sep=',',
                 dtype = {"LocationID" : "object"},
                 parse_dates=['datetime'])

# 2. Separate variables
X = df[['month','day','hour','week','dayofweek','isweekend','LocationID','precipitation']]
y = df['NoOfPickups']
print("FEATURES 'X': ", type(X)) # 'X' must be DataFrame and 'y' must be a Pandas Series
display(X.head(),y.head())
print("TARGET VARIABLE 'y': ",type(y))
display(y.head())


FEATURES 'X':  <class 'pandas.core.frame.DataFrame'>


Unnamed: 0,month,day,hour,week,dayofweek,isweekend,LocationID,precipitation
0,1,1,0,52,6,1,4,0.0
1,1,1,0,52,6,1,12,0.0
2,1,1,0,52,6,1,13,0.0
3,1,1,0,52,6,1,24,0.0
4,1,1,0,52,6,1,41,0.0


0    136.0
1      3.0
2    103.0
3     94.0
4    136.0
Name: NoOfPickups, dtype: float64

TARGET VARIABLE 'y':  <class 'pandas.core.series.Series'>


0    136.0
1      3.0
2    103.0
3     94.0
4    136.0
Name: NoOfPickups, dtype: float64

## 3. Split data set

I will use **train_test_split** from sklearn twice to split the data in:
- Train: 60%
- Validation: 20% (to validate training)
- Test: 20% (to validate de model)

In [8]:
# 2. Split data set
# trainSize = 0.6
valSize = 0.25 #0.8 * 0.2 = 0.2
testSize = 0.2


# Split TRAIN-TEST
X_train, X_test, y_train, y_test \
    = train_test_split(X,y,test_size=testSize,random_state=1)

# Split TRAIN-VALIDATION
X_train, X_val, y_train, y_val \
    = train_test_split(X_train, y_train, test_size=valSize, random_state=1)

In [11]:
# IMPORT LIBRARIES
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# 1. Import dataset
df = pd.read_csv('../data/Data_Cleaned_2017_To_Model.csv', sep=',',
                 dtype = {"LocationID" : "object"},
                 parse_dates=['datetime'])

# 2. Separate variables
X = df[['month','day','hour','week','dayofweek','isweekend','LocationID','precipitation']]
y = df['NoOfPickups']
type(X), type(y) # 'X' must be DataFrame and 'y' must be a Pandas Series

# 2. Split data set
# trainSize = 0.6
valSize = 0.25 #0.8 * 0.2 = 0.2
testSize = 0.2


# 3.1 Split TRAIN-TEST
X_train, X_test, y_train, y_test \
    = train_test_split(X,y,test_size=testSize,random_state=1)

# 3.2 Split TRAIN-VALIDATION
X_train, X_val, y_train, y_val \
    = train_test_split(X_train, y_train, test_size=valSize, random_state=1) 

## 4. Models

As the output variable is a number we need to solve a **regression** problem.

I will start with the simplest regression model: **LINEAR REGRESSION**

date time object into unix-tipe format

In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 586920 entries, 0 to 586919
Data columns (total 11 columns):
datetime         586920 non-null datetime64[ns]
month            586920 non-null int64
day              586920 non-null int64
hour             586920 non-null int64
LocationID       586920 non-null object
NoOfPickups      586920 non-null float64
year             586920 non-null int64
week             586920 non-null int64
dayofweek        586920 non-null int64
isweekend        586920 non-null int64
precipitation    586920 non-null float64
dtypes: datetime64[ns](1), float64(2), int64(7), object(1)
memory usage: 49.3+ MB


### Linear Regression

score() Returns the coefficient of determination R^2 of the prediction.<br>
Values from 0 to 1. Best is 1.<br>
My model has a very poor score....

In [13]:
from sklearn.linear_model import LinearRegression

# 4.1 Create an instance of the model and train it
reg = LinearRegression().fit(X_train, y_train)

# Do predictions
reg_y_pred = reg.predict(X_val)
print(reg.score(X_train, y_train))
print(reg.score(X_val, y_val))
print(reg.score(X_test, y_test))

0.105383898423693
0.1026728998998584
0.10280433594331106


In [14]:
X_val.head()

Unnamed: 0,month,day,hour,week,dayofweek,isweekend,LocationID,precipitation
397589,9,5,6,36,1,0,74,0.0
351117,8,7,8,32,0,0,158,0.41
202628,5,7,0,18,6,1,113,0.0
381299,8,26,3,34,5,1,13,0.0
305923,7,10,6,28,0,0,12,0.0


### K Nearest Neighbour Regressor (KNN)
Parameters:

   **k**: number of neighbors <br>
   **weight**: way to give more weight to points which are nearby and less weight to the points which are farther away.<br>
- 'uniform': all the same weight.<br>
- 'distance': weighted average per distance.<br>
- 'Custom': weighted average provided by user<br>


In [15]:
from sklearn.neighbors import KNeighborsRegressor

# Create an instance.
# Define number of neighbors.
# weights possible values: 'uniform', 'distance', [callable] user defined function
regk = KNeighborsRegressor(n_neighbors=2, weights = 'uniform')

# Train the data
regk.fit(X_train, y_train)

KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
                    metric_params=None, n_jobs=None, n_neighbors=2, p=2,
                    weights='uniform')

## MAE

Measures average magnitud of the errors without considering their direction (all errors in absolute value). It is intuitive to calculate, but you lose information related to the magnitud of the error.<br>
Units are the same as the target variable.<br>
Value range from 0 to infinite. Lower values are better.

In [16]:
from sklearn.metrics import mean_absolute_error

# Use against predictions
MAE = mean_absolute_error(reg.predict(X_val), y_val)
MAE

144.94276638135628

## MAPE

Similar to MAE but it measures the error in percentage.<br>
Lower values are better.<br>
MAPE is not in sklearn so I calculate it MANUALLY with pandas:

In [17]:
MAPE = np.mean(np.abs(reg.predict(X_val) - y_val)/y_val)
MAPE

inf

## RMSE

Measures average magnitud of errors.<br>
Units are the same as the target variable.<br>
Value range from 0 to infinite. Lower values are better.

In [18]:
from sklearn.metrics import mean_absolute_error

# Use against predictions (we have to calculate the squared root of MSE)
RMSE = np.sqrt(mean_absolute_error(reg.predict(X_val), y_val))
RMSE

12.039217847574497

In [19]:
print('MEAN: ', y_val.mean())
print('MEDIAN: ', y_val.median())
print('MAE: ', MAE)
print('MAPE: ', MAPE)
print('RMSE: ', RMSE)

MEAN:  174.44625332242896
MEDIAN:  94.0
MAE:  144.94276638135628
MAPE:  inf
RMSE:  12.039217847574497


## Correlation

**There should be a strong correlation between predictions and real values.<br>
However, I get a very week correlation value of 0.31**

In [20]:
np.corrcoef(reg.predict(X_val), y_val)[0][1]

0.32048086476925547

## Bias
It is the average of errors (prediction values minus real values).<br>
Negative errors will compensate positive ones.

In [21]:
bias = np.mean(reg.predict(X_val) - y_val)
bias

0.4365067108034941

## Variance
Is the average of errors in predictions between two different data sets.

In [22]:
# create table from X_train with same number of rows as X_val
# so it can be used to calculate the variance
X_train_for_variance = X_train.head(X_val.shape[0])

variance = np.mean(reg.predict(X_train_for_variance) - reg.predict(X_val))
variance

0.08122291046361776

In [23]:
# Merge X_train with predictions
combine = X_train.copy()
combine['NoOfPickups'] = y_train
combine['pickups Prediction'] = reg.predict(X_train)

combine.head(100)

Unnamed: 0,month,day,hour,week,dayofweek,isweekend,LocationID,precipitation,NoOfPickups,pickups Prediction
233539,5,26,5,21,4,0,186,0.05,166.0,155.683761
567551,12,19,22,51,1,0,244,0.0,28.0,292.270009
20598,1,13,19,2,4,0,141,0.0,488.0,253.520493
241011,5,30,21,22,1,0,75,0.01,69.0,207.391581
247179,6,3,17,22,5,1,90,0.0,407.0,186.63762
246256,6,3,3,22,5,1,143,0.0,28.0,101.824769
130727,3,23,7,12,3,0,68,0.0,289.0,109.222586
532594,11,28,5,48,1,0,74,0.0,35.0,70.093928
56410,2,5,1,5,6,1,249,0.0,994.0,150.637183
70791,2,14,0,7,1,0,162,0.0,251.0,93.356283


In [24]:
# Merge X_test with predictions
combine2 = X_test.copy()
combine2['NoOfPickups'] = y_test
combine2['pickups Prediction'] = reg.predict(X_test)

combine2.head(100)

Unnamed: 0,month,day,hour,week,dayofweek,isweekend,LocationID,precipitation,NoOfPickups,pickups Prediction
190258,4,29,7,17,5,1,194,0.0,1.0,154.239011
88815,2,25,5,8,5,1,163,0.0,49.0,127.019647
376609,8,23,5,34,2,0,13,0.0,19.0,51.71263
520565,11,20,17,47,0,0,166,0.0,147.0,208.646256
329609,7,24,23,30,0,0,153,0.0,0.0,255.143502
344968,8,3,12,31,3,0,231,0.0,300.0,229.305366
434099,9,27,23,39,2,0,43,0.0,83.0,207.18217
350313,8,6,20,31,6,1,158,0.0,160.0,246.369295
218510,5,16,21,20,1,0,120,0.0,0.0,234.219014
561757,12,16,8,50,5,1,141,0.0,286.0,126.816376


In [35]:
predictions = reg.predict(X_test)
X_test_g = X_test.groupby(['month', 'day','hour','LocationID']).sum()
X_test_g.head(100)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,HourlyPrecipitation
month,day,hour,LocationID,Unnamed: 4_level_1
1,1,0,125,0.0
1,1,0,128,0.0
1,1,0,141,0.0
1,1,0,202,0.0
1,1,0,231,0.0
1,1,0,232,0.0
1,1,0,236,0.0
1,1,0,24,0.0
1,1,0,244,0.0
1,1,0,263,0.0


## Pack model with Pickle

In [28]:
import pickle
pickle.dump(reg,open('./model_reg_01.pickle','wb'))