Predict the price of an Uber ride from a given pickup point to the agreed drop-off
location using a linear regression model.
A) Pre-process the dataset.
B) Check the correlation.
C) Evaluate the models and compare their respective scores like R2, RMSE, etc.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Load the Uber ride dataset into a DataFrame.
# The CSV location can be overridden through the UBER_CSV_PATH environment variable so
# the notebook also runs on machines without this drive layout; when the variable is
# not set, the original hardcoded path is used unchanged.
import os

df = pd.read_csv(os.environ.get("UBER_CSV_PATH", 'E:\\dataset\\uber.csv'))

In [None]:
# Display the loaded DataFrame as the cell output (long frames are shown abbreviated).
df

Unnamed: 0,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,distance
0,8.5,2014-01-01,-73.995383,40.720680,-73.995978,40.736200,2,0.015531
1,26.0,2014-01-01,-73.976075,40.759432,-74.007680,40.740772,6,0.036702
2,22.5,2014-01-01,-73.982268,40.745457,-74.004782,40.706950,1,0.044606
3,5.5,2014-01-01,-73.970427,40.752365,-73.981125,40.747937,2,0.011578
4,17.0,2014-01-01,-73.945873,40.801373,-73.973143,40.743863,1,0.063648
...,...,...,...,...,...,...,...,...
2125,8.5,2014-01-31,-73.967885,40.763388,-73.955567,40.783161,1,0.023296
2126,14.0,2014-01-31,-73.962542,40.772987,-73.977230,40.758917,6,0.020340
2127,8.0,2014-01-31,-73.982534,40.756929,-73.987059,40.760832,1,0.005976
2128,6.0,2014-01-31,-73.982652,40.745070,-73.973608,40.751503,1,0.011099


In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,distance
count,2130.0,2130.0,2130.0,2130.0,2130.0,2130.0,2130.0
mean,10.468826,-73.979421,40.751971,-73.977264,40.751333,1.712207,0.027154
std,5.379518,0.022166,0.023335,0.023889,0.027058,1.366681,0.019805
min,4.5,-74.017175,40.643685,-74.017943,40.635132,1.0,0.004845
25%,6.5,-73.992403,40.736719,-73.991666,40.735144,1.0,0.013071
50%,9.0,-73.982194,40.753216,-73.980723,40.752414,1.0,0.020994
75%,12.5,-73.9691,40.767158,-73.966678,40.767998,2.0,0.03475
max,33.83,-73.776688,40.849642,-73.736412,40.866837,6.0,0.11494


In [None]:
# Pairwise correlation matrix of the numeric columns.
# `pickup_datetime` is read from the CSV as text; recent pandas versions raise on
# non-numeric columns in DataFrame.corr() instead of silently dropping them, so the
# frame is restricted to numeric dtypes explicitly. This works on old and new pandas alike.
corr = df.select_dtypes(include="number").corr()
# Shade each cell by magnitude so strong correlations stand out.
corr.style.background_gradient(cmap = "BuGn")

Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,distance
fare_amount,1.0,0.138584,-0.079318,0.241897,-0.118203,0.002507,0.873414
pickup_longitude,0.138584,1.0,0.383908,0.510784,0.220253,0.010946,0.177867
pickup_latitude,-0.079318,0.383908,1.0,0.156126,0.532864,0.023809,-0.075776
dropoff_longitude,0.241897,0.510784,0.156126,1.0,0.404568,0.012326,0.331917
dropoff_latitude,-0.118203,0.220253,0.532864,0.404568,1.0,0.028554,-0.069195
passenger_count,0.002507,0.010946,0.023809,0.012326,0.028554,1.0,-0.011621
distance,0.873414,0.177867,-0.075776,0.331917,-0.069195,-0.011621,1.0


In [None]:
# Feature matrix and target as 2-D column arrays of shape (n_samples, 1).
# Selecting with a one-element column list yields the column-vector shape directly.
x = df[["distance"]].to_numpy()
y = df[["fare_amount"]].to_numpy()

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardize the feature and the target to zero mean / unit variance.
# Each array gets its own scaler: refitting one shared instance on `y` would overwrite
# the statistics learned from `x`, making it impossible to transform new distances or
# to map predictions back to fare units later.
x_scaler = StandardScaler()
y_scaler = StandardScaler()
x_std = x_scaler.fit_transform(x)
y_std = y_scaler.fit_transform(y)

# Backward-compatible alias: `std` previously ended up fitted on `y`.
std = y_scaler

# NOTE(review): the scalers are fitted on the full dataset before the train/test split,
# so test-set statistics influence the scaling. Fitting on the training split only
# would avoid that, but requires reordering this cell and the split cell.

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the standardized samples for evaluation; the fixed seed keeps the
# partition reproducible between runs.
TEST_FRACTION = 0.2
SPLIT_SEED = 0
x_train, x_test, y_train, y_test = train_test_split(
    x_std,
    y_std,
    test_size=TEST_FRACTION,
    random_state=SPLIT_SEED,
)

In [None]:
from sklearn.linear_model import LinearRegression

# Fit ordinary least squares on the training split (fit() returns the estimator itself).
l_reg = LinearRegression().fit(x_train, y_train)

# score() reports the coefficient of determination (R^2) on the given data.
train_score = l_reg.score(x_train, y_train)
test_score = l_reg.score(x_test, y_test)

for label, value in (
    ("Training set score: ", train_score),
    ("Testing set score: ", test_score),
):
    print(label, value)

Training set score:  0.7601509717811463
Testing set score:  0.7742213292101866


In [None]:
# Predict on the held-out features and pair the predictions with the true targets.
y_pred = l_reg.predict(x_test)
# Flatten both arrays to 1-D so that every table cell is a plain scalar instead of a
# 1-element array; implicit conversion of such arrays to scalars is deprecated in
# recent NumPy releases.
# NOTE: this rebinds `df` from the loaded DataFrame to a dict. The name is kept because
# the later tabulate cell reads it; re-run the loading cell before re-running any
# earlier cell that expects the DataFrame.
df = {"Actual": y_test.ravel(), "Predicted": y_pred.ravel()}

In [None]:
from sklearn import metrics

# Evaluate the predictions against the held-out targets. The values are in the same
# (standardized) units as y_test / y_pred, not in the original fare units.
r2 = metrics.r2_score(y_test, y_pred)
mse = metrics.mean_squared_error(y_test, y_pred)
mae = metrics.mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mse)  # reuse the MSE computed above

print("r2 score: ", r2)
print("Mean sqared error: ", mse)
print("Mean absolute error: ", mae)
print("Root mean squared error: ", rmse)

r2 score:  0.7742213292101866
Mean sqared error:  0.21328011425528853
Mean absolute error:  0.33428235687706626
Root mean squared error:  0.4618226004163163


In [None]:
from tabulate import tabulate

# Print the actual-vs-predicted mapping as a table, using the dict keys as headers.
# 'psql' is the intended format name; the previous value 'pspl' is not a registered
# tabulate format, so the call silently fell back to the default "simple" layout.
print(tabulate(df, headers = "keys", tablefmt = 'psql'))

     Actual     Predicted
-----------  ------------
 1.02843     -0.184749
-0.366072    -0.379188
-0.830906    -0.469004
 0.470631     0.45027
-0.923873    -0.867893
-0.552005    -0.608699
 0.377664     0.317061
-1.10981     -0.8609
-1.10981     -0.766916
-0.923873    -0.856356
-0.180138    -0.736829
-0.552005    -0.0609121
 0.470631     0.801232
-0.273105     0.131945
 1.02843      0.0469107
-0.366072    -0.293227
-1.01684     -0.863122
-0.366072    -0.16557
-0.366072    -0.0529477
-0.366072    -0.27436
 3.16667      3.29898
-1.01684     -0.737457
-0.830906    -0.65613
-1.10981     -0.780289
 1.02843      1.04228
-0.0871707   -0.0643691
-0.0871707   -0.275547
 0.470631     0.75573
 0.842499     1.11876
 0.842499    -0.101376
 1.4003       1.07681
-0.830906    -0.592355
-1.01684     -0.624765
-0.459039    -0.260432
 0.935466     1.04136
 0.0987632   -0.269977
-0.366072    -0.561207
-0.273105     0.0562603
-0.923873    -0.701588
 0.470631     1.24389
-0.0871707    0.322372
 0.656565    

In [None]:
# Show the fitted line's parameters: intercept first, then the slope coefficient(s),
# each on its own line.
print(l_reg.intercept_, l_reg.coef_, sep="\n")

[0.0048171]
[[0.87448463]]
