In [None]:
# Imports
# Packages for numerics + dataframes
import pandas as pd
import numpy as np

# Packages for visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Packages for date conversions for calculating trip durations
from datetime import datetime
from datetime import date
from datetime import timedelta

# Packages for OLS, MLR, confusion matrix
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import sklearn.metrics as metrics # For confusion matrix
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error,r2_score,mean_squared_error

In [None]:
import os

import requests

# Read the Socrata app token from the environment so a real credential never
# has to be saved in the notebook; the original placeholder stays as fallback.
app_token = os.environ.get('NYC_OPEN_DATA_APP_TOKEN', 'YOUR TOKEN')
url = f'https://data.cityofnewyork.us/resource/biws-g3hs.json?$$app_token={app_token}'
# A timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get(url, timeout=60)
# Stop here with a clear HTTP error instead of parsing an error payload
# and building a DataFrame from it in the next cell.
response.raise_for_status()
data = response.json()


In [None]:
# df0 keeps the records exactly as downloaded; df is the working copy that the
# following cells modify.
df0= pd.DataFrame(data)
df=df0.copy()

In [None]:
# (rows, columns) of the working frame.
df.shape

In [None]:
# Shape after removing fully duplicated rows (df itself is not modified);
# compare with df.shape to see whether any duplicates exist.
df.drop_duplicates().shape

In [None]:
# Number of missing values per column.
df.isnull().sum()

In [None]:
# Preview the first rows.
df.head()

In [None]:
# Look at a single raw dropoff value (index label 3) before any conversion.
df['tpep_dropoff_datetime'][3]

In [None]:
# dtype of the dropoff column before it is converted with pd.to_datetime.
df['tpep_dropoff_datetime'].dtype

In [None]:
# Convert both trip timestamp columns to pandas datetimes.
for timestamp_col in ('tpep_dropoff_datetime', 'tpep_pickup_datetime'):
    df[timestamp_col] = pd.to_datetime(df[timestamp_col])

In [None]:
# Trip duration in minutes: dropoff minus pickup, divided by a one-minute unit.
trip_timedelta = df['tpep_dropoff_datetime'] - df['tpep_pickup_datetime']
df['duration'] = trip_timedelta / np.timedelta64(1, 'm')


In [None]:
# Column dtypes and non-null counts before the numeric casts below.
df.info()


In [None]:
# Cast the identifier columns to int and the measurement columns to float.
target_dtypes = {
    'vendorid': int,
    'pulocationid': int,
    'dolocationid': int,
    'trip_distance': float,
    'fare_amount': float,
}
for column_name, target_dtype in target_dtypes.items():
    df[column_name] = df[column_name].astype(target_dtype)




In [None]:
# Re-check the dtypes after casting.
df.info()


In [None]:
# One boxplot per numeric column of interest, side by side.
fig, axes = plt.subplots(1, 3, figsize=(15, 2))
fig.suptitle('Boxplots for outlier detection')
for axis, column in zip(axes, ['trip_distance', 'fare_amount', 'duration']):
    sns.boxplot(ax=axis, x=df[column])
plt.show();

Fare amount outliers

In [None]:
def outlier_imputer(col, iqr_factor, data=None):
    """Clamp the values of one column in place and print its summary.

    Negative values in ``col`` are set to 0, and values above
    ``Q3 + iqr_factor * IQR`` are set to that upper threshold. Only ``col``
    is modified; every other column of the affected rows is left untouched.

    Args:
        col: Name of the numeric column to clamp.
        iqr_factor: Multiplier applied to the interquartile range when
            computing the upper threshold.
        data: DataFrame to modify in place. Defaults to the notebook-level
            ``df`` when omitted.
    """
    if data is None:
        data = df
    # Restrict the assignment to `col`; assigning to the row mask alone would
    # overwrite every column of the matching rows with 0.
    data.loc[data[col] < 0, col] = 0
    q1 = data[col].quantile(0.25)
    q3 = data[col].quantile(0.75)
    iqr = q3 - q1
    upper_threshold = q3 + (iqr_factor * iqr)
    data.loc[data[col] > upper_threshold, col] = upper_threshold
    print(data[col].describe())
# Cap fare_amount at Q3 + 6 * IQR.
outlier_imputer('fare_amount',6)

Duration outliers


In [None]:
# Cap duration at Q3 + 6 * IQR.
outlier_imputer('duration',6)

In [None]:
# Route key: pickup and dropoff location ids joined by a single space.
pickup_ids = df['pulocationid'].astype(str)
dropoff_ids = df['dolocationid'].astype(str)
df['pickup_dropoff'] = pickup_ids + ' ' + dropoff_ids
df['pickup_dropoff'].head()

Mean distance

In [None]:
# Mean trip distance per pickup/dropoff pair. Selecting the column before
# aggregating keeps .mean() away from the non-numeric columns, which raise a
# TypeError under pandas >= 2.0 when the whole frame is averaged.
mean_distance_by_route = df.groupby('pickup_dropoff')['trip_distance'].mean()

# Discard rows whose vendorid is 0 (done after the means are computed, as before).
df = df[df['vendorid'] != 0].copy()

# Attach each route's mean distance to its rows.
df['mean_distance'] = df['pickup_dropoff'].map(mean_distance_by_route)

# Confirm that it worked
df[(df['pulocationid']==100) & (df['dolocationid']==148)][['mean_distance']]

Mean Duration

In [None]:
# Mean trip duration per pickup/dropoff pair. Selecting the column before
# aggregating keeps .mean() away from the non-numeric columns, which raise a
# TypeError under pandas >= 2.0 when the whole frame is averaged.
mean_duration_by_route = df.groupby('pickup_dropoff')['duration'].mean()

# Attach each route's mean duration to its rows.
df['mean_duration'] = df['pickup_dropoff'].map(mean_duration_by_route)

# Confirm that it worked
df[(df['pulocationid']==100) & (df['dolocationid']==148)][['mean_duration']]

Create day and month columns

In [None]:
# Lower-cased weekday name and abbreviated month name of each pickup.
pickup_times = df['tpep_pickup_datetime']
df['day'] = pickup_times.dt.day_name().str.lower()
df['month'] = pickup_times.dt.strftime('%b').str.lower()
df.head()

In [None]:
# How often each exact pickup timestamp occurs.
df['tpep_pickup_datetime'].value_counts()

Create rush_hour column

In [None]:
# Start from the pickup hour; the next cell turns it into a 0/1 flag.
df['rush_hour'] = df['tpep_pickup_datetime'].dt.hour
# Weekend trips are never rush hour. The 'day' column stores lower-cased
# names, so compare case-insensitively; capitalized labels would match nothing.
is_weekend = df['day'].str.lower().isin(['saturday', 'sunday'])
df.loc[is_weekend, 'rush_hour'] = 0

In [None]:
def rush_hourizer(row):
    """Return 1 when the row's 'rush_hour' value is in [6, 10) or [16, 20), else 0."""
    hour = row['rush_hour']
    in_morning_window = 6 <= hour < 10
    in_evening_window = 16 <= hour < 20
    return 1 if (in_morning_window or in_evening_window) else 0
# Replace the hour stored in 'rush_hour' with the 0/1 flag, row by row.
# NOTE: not idempotent -- running this cell a second time feeds the 0/1 flags
# back into rush_hourizer, which maps both to 0.
df['rush_hour'] = df.apply(rush_hourizer,axis=1)
df.head()

In [None]:
# Number of trips per weekday name.
df['day'].value_counts()

Scatter plot

In [None]:
# Scatter of fare_amount against mean_duration with a fitted regression line.
sns.set(style='whitegrid')
f = plt.figure()
f.set_size_inches(5, 5)
point_style = {'alpha': 0.5, 's': 5}
line_style = {'color': 'red'}
sns.regplot(
    x=df['mean_duration'],
    y=df['fare_amount'],
    scatter_kws=point_style,
    line_kws=line_style,
)
# Same 0-70 window on both axes.
plt.xlim(0, 70)
plt.ylim(0, 70)
plt.title('Mean duration x fare amount')
plt.show()

The mean_duration variable correlates with the target variable.
But what is the horizontal line at a fare amount of around 52 dollars?
Check the rides that make up this horizontal line in the scatter plot.

In [None]:
# First 32 rows whose fare_amount is exactly 52.
df[df['fare_amount']==52].head(32)

It seems that almost all of the trips in the first 30 rows where the fare amount was $52 either begin or end at location 132, and all of them have a RatecodeID of 2.

There is no readily apparent reason why PULocation 132 should have so many fares of 52 dollars. They seem to occur on all different days, at different times, with both vendors, in all months. However, there are many toll amounts of 5.54 dollars.
This would seem to indicate that location 132 is in an area that frequently requires tolls to get to and from. It's likely this is an airport.

The data dictionary says that a RatecodeID of 2 indicates trips to or from JFK, which is John F. Kennedy International Airport. A quick Google search for "new york city taxi flat rate $52" indicates that in 2017 (the year that this data was collected) there was indeed a flat fare for taxi trips between JFK airport (in Queens) and Manhattan.

In [None]:
# Columns that are not carried into the modelling frame.
unused_columns = [
    'tpep_dropoff_datetime', 'tpep_pickup_datetime', 'trip_distance',
    'ratecodeid', 'store_and_fwd_flag', 'pulocationid', 'dolocationid',
    'payment_type', 'extra', 'mta_tax', 'tip_amount', 'tolls_amount',
    'improvement_surcharge', 'total_amount', 'duration', 'pickup_dropoff',
    'day', 'month', 'rush_hour',
]
df2 = df.copy().drop(columns=unused_columns)
df2.info()

In [None]:
# Store passenger_count as an integer column.
df2 = df2.astype({'passenger_count': int})

In [None]:
# Confirm the dtypes of the modelling frame.
df2.info()


In [None]:
# Pairwise relationships between the target and the two engineered means.
# Marker size is set with matplotlib's 's' keyword; seaborn's 'size' is a
# semantic grouping variable, not a marker size.
sns.pairplot(
    df2[['fare_amount', 'mean_duration', 'mean_distance']],
    plot_kws={'alpha': 0.4, 's': 5},
);

In [None]:
# Pairwise Pearson correlations between the columns of df2.
df2.corr(method='pearson')

In [None]:
# Annotated heatmap of the Pearson correlation matrix.
pearson_corr = df2.corr(method='pearson')
plt.figure(figsize=(6, 4))
sns.heatmap(pearson_corr, annot=True, cmap='Reds')
plt.title('Correlation heatmap', fontsize=18)
plt.show()

mean_duration and mean_distance are both highly correlated with the target variable of fare_amount. They're also both correlated with each other, with a Pearson correlation of 0.87.

Recall that highly correlated predictor variables can be bad for linear regression models when you want to be able to draw statistical inferences about the data from the model. However, correlated predictor variables can still be used to create an accurate predictor if the prediction itself is more important than using the model as a tool to learn about your data.

This model will predict fare_amount, which will be used as a predictor variable in machine learning models. Therefore, try modeling with both variables even though they are correlated.

**Split data into outcome variable and features**




In [None]:

# Target is kept as a one-column frame; every other column becomes a feature.
target_column = 'fare_amount'
y = df2[[target_column]]
X = df2.drop(columns=target_column)

X.head()

In [None]:
# Treat vendorid as categorical: cast it to string so get_dummies encodes it,
# dropping the first level.
X['vendorid'] = X['vendorid'].map(str)
X = pd.get_dummies(data=X, drop_first=True)
X.head()

**Normalize the data**


In [None]:
# Standardize every feature column (fit, then transform the same data).
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

**Split data into training and test sets**

In [None]:
# Split the unscaled features first so the held-out rows cannot influence the
# preprocessing statistics.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Fit the scaler on the training rows only, then apply the same transform to
# both splits.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

lr = LinearRegression()
lr.fit(X_train,y_train)

**Train data**


In [None]:

# Fit quality on the training split.
y_pred_train = lr.predict(X_train)
r_sq_train = lr.score(X_train, y_train)
train_mse = mean_squared_error(y_train, y_pred_train)

print('Coefficient of determination:', r_sq_train)
print('R^2:', r2_score(y_train, y_pred_train))
print('MAE:', mean_absolute_error(y_train, y_pred_train))
print('MSE:', train_mse)
print('RMSE:', np.sqrt(train_mse))

**Test data**

In [None]:
# Same metrics on the held-out split.
y_pred_test = lr.predict(X_test)
r_sq_test = lr.score(X_test, y_test)
test_mse = mean_squared_error(y_test, y_pred_test)

print('Coefficient of determination:', r_sq_test)
print('R^2:', r2_score(y_test, y_pred_test))
print('MAE:', mean_absolute_error(y_test, y_pred_test))
print('MSE:', test_mse)
print('RMSE:', np.sqrt(test_mse))

**Results**

In [None]:
# Actual vs. predicted fares on the test split, plus residual = actual - predicted.
results = pd.DataFrame(
    {
        'actual': y_test['fare_amount'],
        'predicted': y_pred_test.ravel(),
    }
)
results['residual'] = results['actual'].sub(results['predicted'])
results.head()

In [None]:
# Predicted against actual fares, with the y = x reference line in red.
fig, ax = plt.subplots(figsize=(6, 6))
sns.set(style='whitegrid')
sns.scatterplot(data=results, x='actual', y='predicted', s=20, alpha=0.5, ax=ax)
ax.plot([0, 60], [0, 60], c='red', linewidth=2)
ax.set_title('Actual vs. predicted');

In [None]:
# Histogram of residuals in 0.5-wide bins from -15 to 15.
residual_bins = np.arange(-15, 15.5, 0.5)
residual_ax = sns.histplot(results['residual'], bins=residual_bins)
residual_ax.set_title('Distribution of the residuals')
residual_ax.set_xlabel('residual value')
residual_ax.set_ylabel('count');

In [None]:
# Residuals against predictions, with a red zero line for reference.
scatter_ax = sns.scatterplot(data=results, x='predicted', y='residual')
scatter_ax.axhline(0, c='red')
scatter_ax.set_title('Scatterplot of residuals over predicted values')
scatter_ax.set_xlabel('predicted value')
scatter_ax.set_ylabel('residual value')
plt.show()

In [None]:
# Fitted coefficients labelled with the feature names. The model was trained on
# standardized features, so each value is per standard deviation of its feature.
coefficients = pd.DataFrame(lr.coef_, columns=X.columns)
coefficients

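The coefficients reveal that mean_distance was the feature with the greatest weight in the model's final prediction. Because the features were standardized before fitting, this coefficient is expressed per standard deviation rather than per mile: for every one-standard-deviation increase in mean_distance, the fare amount increases by a mean of $6.8, holding the other features constant. Note, however, that because some highly correlated features were not removed, the confidence interval of this assessment is wider.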