In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

## Copy code over from other notebooks to import and clean three month data -

In [2]:
jan_20 = pd.read_feather("../preprocessed_data/clean_jan.feather").dropna().drop('index', axis=1)
feb_20 = pd.read_feather("../preprocessed_data/clean_feb.feather").dropna().drop('index', axis=1)
mar_20 = pd.read_feather("../preprocessed_data/clean_mar.feather").dropna().drop('index', axis=1)

In [3]:
# Function to preprocess & filter data read in from a csv file

def preprocess(df):
    full_drop_labels = ['VendorID', 'store_and_fwd_flag', 'extra', 'mta_tax', 'tolls_amount', 'improvement_surcharge', 'congestion_surcharge']
    df.drop(full_drop_labels, axis = 1, inplace = True)
    df = df.loc[df['payment_type'] != 5]
    df = df.loc[df['RatecodeID'] != 99]
    df['tpep_pickup_datetime'] = pd.to_datetime(df['tpep_pickup_datetime'])
    df['tpep_dropoff_datetime'] = pd.to_datetime(df['tpep_dropoff_datetime'])
    df.dropna(subset = ['RatecodeID', 'payment_type'], inplace = True)
    df = df.loc[(df['total_amount'] > 0) & (df['trip_distance'] > 0) & (df['total_amount'] < 500) & (df['passenger_count'] > 0)]
    
    return df

Since 2016 weather was the only dataset available, we'll use that

In [4]:
weather = pd.read_csv('../raw_data/weather_data_nyc_centralpark_2016.csv', encoding='utf8', engine='python')

In [5]:
weather.tail()

Unnamed: 0,date,maximum temperature,minimum temperature,average temperature,precipitation,snow fall,snow depth
361,27-12-2016,60,40,50.0,0.0,0,0
362,28-12-2016,40,34,37.0,0.0,0,0
363,29-12-2016,46,33,39.5,0.39,0,0
364,30-12-2016,40,33,36.5,0.01,T,0
365,31-12-2016,44,31,37.5,0.0,0,0


In [6]:
columns = ['maximum temperature', 'minimum temperature', 'snow depth']
weather.drop(columns = columns, inplace = True)

In [7]:
weather.shape

(366, 4)

In [8]:
placeholder = jan_20.append(feb_20)
three_month_20 = placeholder.append(mar_20)

In [9]:
three_month_20.head()

Unnamed: 0,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,PULocationID,DOLocationID,payment_type,fare_amount,tip_amount,total_amount
0,2020-01-01 00:28:15,2020-01-01 00:33:03,1.0,1.2,1.0,238,239,1.0,6.0,1.47,11.27
1,2020-01-01 00:35:39,2020-01-01 00:43:04,1.0,1.2,1.0,239,238,1.0,7.0,1.5,12.3
2,2020-01-01 00:47:41,2020-01-01 00:53:52,1.0,0.6,1.0,238,238,1.0,6.0,1.0,10.8
3,2020-01-01 00:55:23,2020-01-01 01:00:14,1.0,0.8,1.0,238,151,1.0,5.5,1.36,8.16
4,2020-01-01 00:01:58,2020-01-01 00:04:16,1.0,0.0,1.0,193,193,2.0,3.5,0.0,4.8


In [11]:
three_month_20['date'] = three_month_20['tpep_pickup_datetime'].dt.date

In [12]:
three_month_20.head()

Unnamed: 0,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,PULocationID,DOLocationID,payment_type,fare_amount,tip_amount,total_amount,date
0,2020-01-01 00:28:15,2020-01-01 00:33:03,1.0,1.2,1.0,238,239,1.0,6.0,1.47,11.27,2020-01-01
1,2020-01-01 00:35:39,2020-01-01 00:43:04,1.0,1.2,1.0,239,238,1.0,7.0,1.5,12.3,2020-01-01
2,2020-01-01 00:47:41,2020-01-01 00:53:52,1.0,0.6,1.0,238,238,1.0,6.0,1.0,10.8,2020-01-01
3,2020-01-01 00:55:23,2020-01-01 01:00:14,1.0,0.8,1.0,238,151,1.0,5.5,1.36,8.16,2020-01-01
4,2020-01-01 00:01:58,2020-01-01 00:04:16,1.0,0.0,1.0,193,193,2.0,3.5,0.0,4.8,2020-01-01


In [13]:
temp = three_month_20.groupby('date').mean()
temp.head()

Unnamed: 0_level_0,passenger_count,trip_distance,RatecodeID,PULocationID,DOLocationID,payment_type,fare_amount,tip_amount,total_amount
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2003-01-01,1.0,0.0,1.0,193.0,193.0,2.0,0.0,0.0,0.0
2008-12-31,1.571429,5.545714,1.333333,175.666667,162.952381,1.666667,21.47619,1.322381,26.58619
2009-01-01,1.375,3.693393,1.303571,153.607143,159.875,1.892857,18.409286,0.391429,22.254643
2019-12-18,2.5,0.0,3.0,193.0,193.0,1.0,1.255,0.0,4.555
2019-12-31,2.224806,3.209845,1.085271,153.131783,150.356589,1.364341,12.682171,1.85,18.008372


In [14]:
y = temp['total_amount']

In [None]:
# T values represent nulls. Replace with 0

In [16]:
weather.replace(to_replace = 'T', value = 0.0, inplace = True)
X = weather.drop(columns = 'date')
X

Unnamed: 0,average temperature,precipitation,snow fall
0,38.0,0.00,0.0
1,36.0,0.00,0.0
2,40.0,0.00,0.0
3,25.0,0.00,0.0
4,20.0,0.00,0.0
...,...,...,...
361,50.0,0,0
362,37.0,0,0
363,39.5,0.39,0
364,36.5,0.01,0


In [17]:
# Slice to match up available data

X = X.iloc[0:124]
X

Unnamed: 0,average temperature,precipitation,snow fall
0,38.0,0.00,0.0
1,36.0,0.00,0.0
2,40.0,0.00,0.0
3,25.0,0.00,0.0
4,20.0,0.00,0.0
...,...,...,...
119,51.5,0.05,0.0
120,55.5,0.00,0.0
121,48.0,0.16,0.0
122,52.5,0.04,0.0


In [18]:
# Begin model fitting -

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state = 637)

In [19]:
lm = linear_model.LinearRegression()
lm.fit(X_train, y_train)

LinearRegression()

In [20]:
y_pred = lm.predict(X_test)

In [21]:
print('Beta Coefficients:', lm.coef_)
print('\nMSE:', mean_squared_error(y_test, y_pred))

Beta Coefficients: [-0.04830602  0.87116368 -0.10222317]

MSE: 11.936228373792565
