## In this notebook I will explore different predictive models

In [3]:
# List every file under the current directory, recursively (-R) and in long format (-l).
!ls -Rl

.:
total 145968
-rw-rw-rw- 1 angelrps angelrps  550601 Jun 23 07:30 05-visualization_introduction_inclass.ipynb
-rw-rw-rw- 1 angelrps angelrps  189652 Jun 19 09:19 Amadeus_Challenge_Class.ipynb
-rwxrwxrwx 1 angelrps angelrps 7369019 Apr  7 19:34 Data_Analysis.ipynb
-rw-rw-rw- 1 angelrps angelrps  752508 Jun 22 10:02 Data_Analysis_Taxis.ipynb
-rwxrwxrwx 1 angelrps angelrps  371494 Jun 18 11:33 Data_Analysis_Weather.ipynb
-rw-rw-rw- 1 angelrps angelrps 2387352 Jun 23 09:30 Data_Join_Taxis_Weather.ipynb
-rwxrwxrwx 1 angelrps angelrps   10144 Apr  8 17:19 HowToTackleDataScienceChallenge.ipynb
-rw-rw-rw- 1 angelrps angelrps    1429 Jun 23 13:06 Modelling_01.ipynb
-rwxrwxrwx 1 angelrps angelrps 3527394 Jun 12 11:59 TSA.ipynb


In [4]:
cd ../data

/home/angelrps/git/MasterDataScience_FinalProject/data


## 1. Import libraries and dataset

In [2]:
# IMPORT LIBRARIES
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# 1. Import dataset
# Read the cleaned 2017 CSV: PULocationID is forced to the object dtype and
# the 'datetime' column is parsed as dates while loading.
df = pd.read_csv(
    '../data/Data_Cleaned_2017_To_Model.csv',
    sep=',',
    parse_dates=['datetime'],
    dtype={"PULocationID": "object"},
)

# Summarise the frame (columns, non-null counts, dtypes, memory usage).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1036854 entries, 0 to 1036853
Data columns (total 13 columns):
datetime               1036854 non-null datetime64[ns]
PULocationID           1036854 non-null object
NoOfPickups            1036854 non-null int64
year                   1036854 non-null int64
month                  1036854 non-null int64
day                    1036854 non-null int64
hour                   1036854 non-null int64
week                   1036854 non-null int64
dayofweek              1036854 non-null int64
isweekend              1036854 non-null int64
IsHoliday              1036854 non-null int64
hourlyperiods          1036854 non-null int64
HourlyPrecipitation    1036854 non-null float64
dtypes: datetime64[ns](1), float64(1), int64(10), object(1)
memory usage: 102.8+ MB


## 2. Separate variables
INPUT variables (must be a Pandas DataFrame):

- month
- day
- hour
- is weekend
- PULocationID
- HourlyPrecipitation

OUTPUT variable (must be a Pandas Series):

- NoOfPickups

In [5]:
# IMPORT LIBRARIES
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# 1. Import dataset
# Same CSV as before: PULocationID stays an object column, 'datetime' is parsed as dates.
df = pd.read_csv(
    '../data/Data_Cleaned_2017_To_Model.csv',
    sep=',',
    parse_dates=['datetime'],
    dtype={"PULocationID": "object"},
)

# 2. Separate variables
# Predictors are selected as a DataFrame, the target as a Series.
# NOTE(review): the section text above also lists "is weekend" as an input, but
# the 'isweekend' column is not selected here - confirm which one is intended.
X = df[[
    'month',
    'day',
    'hour',
    'PULocationID',
    'HourlyPrecipitation',
]]
y = df['NoOfPickups']

# Last expression of the cell, so both types are shown as its output:
# 'X' must be a DataFrame and 'y' must be a Pandas Series.
(type(X), type(y))

(pandas.core.frame.DataFrame, pandas.core.series.Series)

## 3. Split data set

I will use **train_test_split** from sklearn twice to split the data in:
- Train: 60%
- Validation: 20% (to validate training)
- Test: 20% (to validate the model)

In [6]:
# 3. Split data set (train 60% / validation 20% / test 20%)
testSize = 0.2   # share of the full data set held out for testing
valSize = 0.25   # share of the remaining 80%: 0.8 * 0.25 = 0.2 of the full data set

# 3.1 First split: hold out the test set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=testSize, random_state=1
)

# 3.2 Second split: take the validation set out of what is left for training.
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=valSize, random_state=1
)

In [7]:
# IMPORT LIBRARIES
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# This cell repeats the load, variable-selection and split steps of the cells
# above in a single place.

# 1. Import dataset
# PULocationID is read as an object column and 'datetime' is parsed as dates.
df = pd.read_csv('../data/Data_Cleaned_2017_To_Model.csv', sep=',',
                 dtype={"PULocationID": "object"},
                 parse_dates=['datetime'])

# 2. Separate variables
# NOTE(review): the "Separate variables" section text also lists "is weekend"
# as an input, but the 'isweekend' column is not selected here - confirm which
# one is intended.
X = df[['month', 'day', 'hour', 'PULocationID', 'HourlyPrecipitation']]
y = df['NoOfPickups']

# 'X' must be a DataFrame and 'y' must be a Pandas Series.
# A bare `type(X), type(y)` in the middle of a cell is evaluated and thrown
# away (only a cell's last expression is displayed), so the requirement is
# enforced with assertions instead.
assert isinstance(X, pd.DataFrame), "X must be a pandas DataFrame"
assert isinstance(y, pd.Series), "y must be a pandas Series"

# 3. Split data set (train 60% / validation 20% / test 20%)
valSize = 0.25   # share of the remaining 80%: 0.8 * 0.25 = 0.2 of the full data set
testSize = 0.2   # share of the full data set held out for testing

# 3.1 Split TRAIN-TEST
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=testSize, random_state=1
)

# 3.2 Split TRAIN-VALIDATION (taken out of the training part of the first split)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=valSize, random_state=1
)

## 4. Models

As the output variable is a number we need to solve a **regression** problem.

I will start with the simplest regression model: **LINEAR REGRESSION**

Convert the datetime object into Unix-type format.

In [8]:
# Show the frame summary again (columns, non-null counts, dtypes, memory usage).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1036854 entries, 0 to 1036853
Data columns (total 13 columns):
datetime               1036854 non-null datetime64[ns]
PULocationID           1036854 non-null object
NoOfPickups            1036854 non-null int64
year                   1036854 non-null int64
month                  1036854 non-null int64
day                    1036854 non-null int64
hour                   1036854 non-null int64
week                   1036854 non-null int64
dayofweek              1036854 non-null int64
isweekend              1036854 non-null int64
IsHoliday              1036854 non-null int64
hourlyperiods          1036854 non-null int64
HourlyPrecipitation    1036854 non-null float64
dtypes: datetime64[ns](1), float64(1), int64(10), object(1)
memory usage: 102.8+ MB


In [9]:
from sklearn.linear_model import LinearRegression

# 4.1 / 4.2 Create the linear regressor and train it on the training split.
# fit() hands back the fitted estimator itself, so creation and training are chained.
# NOTE(review): X_train still contains PULocationID, which was loaded with the
# object dtype - confirm that a linear model should treat the zone id as a number.
reg = LinearRegression().fit(X_train, y_train)

# Do predictions (left disabled)
# NOTE(review): each example row below holds a single value, while X was built
# from five columns - the example would need updating before it is enabled.
#reg.predict([[2540],[3500],[4000]])

# Display the fitted estimator as the cell output.
reg

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)