1) Predict the price of the Uber ride from a given pickup point to the agreed drop-off location. 
Perform the following tasks: 
1. Pre-process the dataset. 
2. Identify outliers. 
3. Check the correlation. 
4. Implement linear regression and random forest regression models. 
5. Evaluate the models and compare their scores (R², RMSE, etc.).
Dataset link: https://www.kaggle.com/datasets/yasserh/uber-fares-dataset

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# Load the Uber fares CSV from the notebook's working directory.
df = pd.read_csv(filepath_or_buffer="uber.csv")

In [3]:
# Preview the first five rows of the loaded frame.
df.head()

Unnamed: 0.1,Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,24238194,2015-05-07 19:52:06.0000003,7.5,2015-05-07 19:52:06 UTC,-73.999817,40.738354,-73.999512,40.723217,1
1,27835199,2009-07-17 20:04:56.0000002,7.7,2009-07-17 20:04:56 UTC,-73.994355,40.728225,-73.99471,40.750325,1
2,44984355,2009-08-24 21:45:00.00000061,12.9,2009-08-24 21:45:00 UTC,-74.005043,40.74077,-73.962565,40.772647,1
3,25894730,2009-06-26 08:22:21.0000001,5.3,2009-06-26 08:22:21 UTC,-73.976124,40.790844,-73.965316,40.803349,3
4,17610152,2014-08-28 17:47:00.000000188,16.0,2014-08-28 17:47:00 UTC,-73.925023,40.744085,-73.973082,40.761247,5


In [4]:
# Print each column's dtype and non-null count (reveals the missing drop-off coordinates).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200000 entries, 0 to 199999
Data columns (total 9 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   Unnamed: 0         200000 non-null  int64  
 1   key                200000 non-null  object 
 2   fare_amount        200000 non-null  float64
 3   pickup_datetime    200000 non-null  object 
 4   pickup_longitude   200000 non-null  float64
 5   pickup_latitude    200000 non-null  float64
 6   dropoff_longitude  199999 non-null  float64
 7   dropoff_latitude   199999 non-null  float64
 8   passenger_count    200000 non-null  int64  
dtypes: float64(5), int64(2), object(2)
memory usage: 13.7+ MB


In [5]:
# List the column labels of the loaded frame.
df.columns

Index(['Unnamed: 0', 'key', 'fare_amount', 'pickup_datetime',
       'pickup_longitude', 'pickup_latitude', 'dropoff_longitude',
       'dropoff_latitude', 'passenger_count'],
      dtype='object')

In [11]:
# Drop the CSV's leftover index column and the per-ride key string.
# errors='ignore' keeps this cell re-runnable: on a second run the columns are
# already gone, and the call is a no-op instead of raising KeyError.
df = df.drop(columns=['Unnamed: 0', 'key'], errors='ignore')

In [12]:
# Confirm the (rows, columns) shape of the frame.
df.shape

(200000, 7)

In [13]:
# Show per-column dtypes; pickup_datetime is still an object (string) column.
df.dtypes

fare_amount          float64
pickup_datetime       object
pickup_longitude     float64
pickup_latitude      float64
dropoff_longitude    float64
dropoff_latitude     float64
passenger_count        int64
dtype: object

In [14]:
# Summary statistics for the numeric columns.
# NOTE(review): the output shows a negative minimum fare, coordinates far outside
# valid latitude/longitude ranges, and a passenger_count of 208. No visible cell
# filters these rows before modelling.
df.describe()

Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
count,200000.0,200000.0,200000.0,199999.0,199999.0,200000.0
mean,11.359955,-72.527638,39.935885,-72.525292,39.92389,1.684535
std,9.901776,11.437787,7.720539,13.117408,6.794829,1.385997
min,-52.0,-1340.64841,-74.015515,-3356.6663,-881.985513,0.0
25%,6.0,-73.992065,40.734796,-73.991407,40.733823,1.0
50%,8.5,-73.981823,40.752592,-73.980093,40.753042,1.0
75%,12.5,-73.967154,40.767158,-73.963658,40.768001,2.0
max,499.0,57.418457,1644.421482,1153.572603,872.697628,208.0


In [15]:
# Count missing values per column.
df.isnull().sum()

fare_amount          0
pickup_datetime      0
pickup_longitude     0
pickup_latitude      0
dropoff_longitude    1
dropoff_latitude     1
passenger_count      0
dtype: int64

In [16]:
# Impute the one missing value in each drop-off coordinate column with that
# column's mean.
# The result is assigned back to the column rather than calling
# Series.fillna(..., inplace=True) on a column selection: that chained in-place
# form triggers a FutureWarning in pandas 2.x, and under Copy-on-Write it updates
# a temporary object and leaves df unchanged.
df['dropoff_latitude'] = df['dropoff_latitude'].fillna(df['dropoff_latitude'].mean())
df['dropoff_longitude'] = df['dropoff_longitude'].fillna(df['dropoff_longitude'].mean())

In [18]:
# Re-count missing values per column to verify the imputation.
df.isnull().sum()

fare_amount          0
pickup_datetime      0
pickup_longitude     0
pickup_latitude      0
dropoff_longitude    0
dropoff_latitude     0
passenger_count      0
dtype: int64

In [21]:
# Pairwise Pearson correlation of the numeric columns.
# pickup_datetime is still an object (string) column, and pandas >= 2.0 no longer
# drops non-numeric columns silently in DataFrame.corr() (it raises instead), so
# restrict the frame to numeric dtypes explicitly.
corr = df.select_dtypes(include='number').corr()

In [22]:
# Display the correlation matrix.
corr

Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
fare_amount,1.0,0.010457,-0.008481,0.008986,-0.011014,0.01015
pickup_longitude,0.010457,1.0,-0.816461,0.833026,-0.846324,-0.000414
pickup_latitude,-0.008481,-0.816461,1.0,-0.774787,0.702367,-0.00156
dropoff_longitude,0.008986,0.833026,-0.774787,1.0,-0.91701,3.3e-05
dropoff_latitude,-0.011014,-0.846324,0.702367,-0.91701,1.0,-0.000659
passenger_count,0.01015,-0.000414,-0.00156,3.3e-05,-0.000659,1.0


In [23]:
# Feature matrix: raw pickup/drop-off coordinates plus passenger count.
feature_cols = [
    'pickup_longitude',
    'pickup_latitude',
    'dropoff_longitude',
    'dropoff_latitude',
    'passenger_count',
]
x = df.loc[:, feature_cols]

# Regression target: the fare.
y = df.loc[:, 'fare_amount']

In [24]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the rows for evaluation.
# random_state pins the shuffle so that Restart & Run All reproduces the same
# split, and therefore the same scores, on every run.
X_train, X_test, Y_train, Y_test = train_test_split(
    x, y, test_size=0.33, random_state=42
)


In [25]:
from sklearn.linear_model import LinearRegression

# Fit ordinary least squares on the training split (fit() returns the estimator
# itself), then predict fares for the held-out rows.
regression = LinearRegression().fit(X_train, Y_train)
prediction = regression.predict(X_test)
print(prediction)

[11.36824412 11.28971947 11.29160638 ... 11.28819072 11.28860364
 11.36950219]


In [26]:
# Show the held-out target values for comparison with the model's predictions.
Y_test

5896      40.33
37927      7.50
94586      8.00
182570     5.70
32936     20.00
          ...  
171651    18.00
104236    10.50
26293     14.90
179713     4.90
173930     6.50
Name: fare_amount, Length: 66000, dtype: float64

In [30]:
from sklearn.metrics import r2_score, mean_squared_error

# Linear-regression scores on the test split, printed one per line:
# R², mean squared error, root mean squared error.
MSE = mean_squared_error(Y_test, prediction)
print(r2_score(Y_test, prediction), MSE, np.sqrt(MSE), sep="\n")

9.131343657953206e-05
95.63473042116023
9.779301121305153


In [32]:
from sklearn.ensemble import RandomForestRegressor

# Fit a 100-tree random forest on the training split.
# random_state fixes the bootstrap sampling and feature selection so the fitted
# forest, and the scores derived from it, are reproducible across runs.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X_train, Y_train)

# Predict fares for the held-out rows.
y_pred = rf.predict(X_test)
print(y_pred)

[32.5246  6.635   7.448  ... 14.18    5.637   8.191 ]


In [34]:
# Random-forest scores on the test split, printed one per line:
# R², mean squared error, root mean squared error.
mser = mean_squared_error(Y_test, y_pred)
print(r2_score(Y_test, y_pred), mser, np.sqrt(mser), sep="\n")

0.765710321419871
22.408276428199397
4.733738103042816
