# Uber Fare / Cab Fare Prediction

In [39]:
## Import the required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


In [40]:
## Load the raw CSV, then work on a copy so the original frame (`dataset_uber`) stays untouched
dataset_uber = pd.read_csv("uber.csv")
df = dataset_uber.copy()
df.head()

Unnamed: 0.1,Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,24238194.0,52:06.0,7.5,2015-05-07 19:52:06 UTC,-73.999817,40.738354,-73.999512,40.723217,1.0
1,27835199.0,04:56.0,7.7,2009-07-17 20:04:56 UTC,-73.994355,40.728225,-73.99471,40.750325,1.0
2,44984355.0,45:00.0,12.9,2009-08-24 21:45:00 UTC,-74.005043,40.74077,-73.962565,40.772647,1.0
3,25894730.0,22:21.0,5.3,2009-06-26 08:22:21 UTC,-73.976124,40.790844,-73.965316,40.803349,3.0
4,17610152.0,47:00.0,16.0,2014-08-28 17:47:00 UTC,-73.925023,40.744085,-73.973082,40.761247,5.0


In [41]:
# Number of columns in the raw frame
len(df.columns)

9

In [42]:
# List the column names
df.columns

Index(['Unnamed: 0', 'key', 'fare_amount', 'pickup_datetime',
       'pickup_longitude', 'pickup_latitude', 'dropoff_longitude',
       'dropoff_latitude', 'passenger_count'],
      dtype='object')

In [43]:
# Inspect the dtype and non-null count of every column
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10257 entries, 0 to 10256
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Unnamed: 0         10002 non-null  float64
 1   key                10002 non-null  object 
 2   fare_amount        10002 non-null  float64
 3   pickup_datetime    10002 non-null  object 
 4   pickup_longitude   10002 non-null  float64
 5   pickup_latitude    10002 non-null  float64
 6   dropoff_longitude  10002 non-null  float64
 7   dropoff_latitude   10002 non-null  float64
 8   passenger_count    10002 non-null  float64
dtypes: float64(7), object(2)
memory usage: 721.3+ KB


In [44]:
## The columns Unnamed: 0 and key were dropped because they are index/identifier fields and have 
## no statistical or predictive relationship with the target variable

In [45]:
# Drop the index/identifier columns. Reassigning the result (instead of inplace=True) together
# with errors="ignore" keeps this cell idempotent: re-running it once the columns are already
# gone no longer raises KeyError.
df = df.drop(columns=['Unnamed: 0', 'key'], errors="ignore")
df.head()

Unnamed: 0,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,7.5,2015-05-07 19:52:06 UTC,-73.999817,40.738354,-73.999512,40.723217,1.0
1,7.7,2009-07-17 20:04:56 UTC,-73.994355,40.728225,-73.99471,40.750325,1.0
2,12.9,2009-08-24 21:45:00 UTC,-74.005043,40.74077,-73.962565,40.772647,1.0
3,5.3,2009-06-26 08:22:21 UTC,-73.976124,40.790844,-73.965316,40.803349,3.0
4,16.0,2014-08-28 17:47:00 UTC,-73.925023,40.744085,-73.973082,40.761247,5.0


In [46]:
# Count the missing values in each column
df.isna().sum()

fare_amount          255
pickup_datetime      255
pickup_longitude     255
pickup_latitude      255
dropoff_longitude    255
dropoff_latitude     255
passenger_count      255
dtype: int64

In [47]:
# Show every row that has at least one missing value
df.loc[df.isna().any(axis="columns")]

Unnamed: 0,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
10002,,,,,,,
10003,,,,,,,
10004,,,,,,,
10005,,,,,,,
10006,,,,,,,
...,...,...,...,...,...,...,...
10252,,,,,,,
10253,,,,,,,
10254,,,,,,,
10255,,,,,,,


In [48]:
## The rows with missing values are empty in every feature, so drop those rows entirely
## and renumber the index from 0
df = df.dropna(axis="index", how="any").reset_index(drop=True)
df.isna().sum()

fare_amount          0
pickup_datetime      0
pickup_longitude     0
pickup_latitude      0
dropoff_longitude    0
dropoff_latitude     0
passenger_count      0
dtype: int64

In [49]:
df.duplicated().sum() ## count fully duplicated rows; the result is 0, so the dataset has no duplicates

np.int64(0)

In [50]:
# Re-check row count, dtypes and non-null counts after removing the empty rows
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10002 entries, 0 to 10001
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   fare_amount        10002 non-null  float64
 1   pickup_datetime    10002 non-null  object 
 2   pickup_longitude   10002 non-null  float64
 3   pickup_latitude    10002 non-null  float64
 4   dropoff_longitude  10002 non-null  float64
 5   dropoff_latitude   10002 non-null  float64
 6   passenger_count    10002 non-null  float64
dtypes: float64(6), object(1)
memory usage: 547.1+ KB


In [51]:
## Convert the pickup_datetime feature from object (string) to datetime so that its
## components (hour, day, month, ...) can be extracted
## NOTE(review): the strings carry a "UTC" suffix and parse to timezone-aware UTC values, so any
## hour/day features derived from them are in UTC rather than local time — confirm this is intended.
df['pickup_datetime'] = pd.to_datetime(df['pickup_datetime'])

In [52]:
# Confirm that pickup_datetime now has a datetime dtype
df.dtypes

fare_amount                      float64
pickup_datetime      datetime64[ns, UTC]
pickup_longitude                 float64
pickup_latitude                  float64
dropoff_longitude                float64
dropoff_latitude                 float64
passenger_count                  float64
dtype: object

In [53]:
# Preview the frame with the parsed timestamps
df.head()

Unnamed: 0,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,7.5,2015-05-07 19:52:06+00:00,-73.999817,40.738354,-73.999512,40.723217,1.0
1,7.7,2009-07-17 20:04:56+00:00,-73.994355,40.728225,-73.99471,40.750325,1.0
2,12.9,2009-08-24 21:45:00+00:00,-74.005043,40.74077,-73.962565,40.772647,1.0
3,5.3,2009-06-26 08:22:21+00:00,-73.976124,40.790844,-73.965316,40.803349,3.0
4,16.0,2014-08-28 17:47:00+00:00,-73.925023,40.744085,-73.973082,40.761247,5.0


In [54]:
## Split the pickup timestamp into separate calendar columns:
## hour, day, month, year and day of week
df = df.assign(**{
    part: getattr(df["pickup_datetime"].dt, part)
    for part in ("hour", "day", "month", "year", "dayofweek")
})

In [55]:
# Preview the new hour/day/month/year/dayofweek columns
df.head()

Unnamed: 0,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,hour,day,month,year,dayofweek
0,7.5,2015-05-07 19:52:06+00:00,-73.999817,40.738354,-73.999512,40.723217,1.0,19,7,5,2015,3
1,7.7,2009-07-17 20:04:56+00:00,-73.994355,40.728225,-73.99471,40.750325,1.0,20,17,7,2009,4
2,12.9,2009-08-24 21:45:00+00:00,-74.005043,40.74077,-73.962565,40.772647,1.0,21,24,8,2009,0
3,5.3,2009-06-26 08:22:21+00:00,-73.976124,40.790844,-73.965316,40.803349,3.0,8,26,6,2009,4
4,16.0,2014-08-28 17:47:00+00:00,-73.925023,40.744085,-73.973082,40.761247,5.0,17,28,8,2014,3


In [56]:
## The timestamp has been split into separate features, so the raw pickup_datetime column
## is no longer needed. Reassigning with errors="ignore" (rather than inplace=True) keeps
## the cell idempotent: re-running it does not raise KeyError.
df = df.drop(columns=["pickup_datetime"], errors="ignore")

In [57]:
# Confirm that pickup_datetime has been removed
df.head()

Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,hour,day,month,year,dayofweek
0,7.5,-73.999817,40.738354,-73.999512,40.723217,1.0,19,7,5,2015,3
1,7.7,-73.994355,40.728225,-73.99471,40.750325,1.0,20,17,7,2009,4
2,12.9,-74.005043,40.74077,-73.962565,40.772647,1.0,21,24,8,2009,0
3,5.3,-73.976124,40.790844,-73.965316,40.803349,3.0,8,26,6,2009,4
4,16.0,-73.925023,40.744085,-73.973082,40.761247,5.0,17,28,8,2014,3


In [58]:
## One of the major tasks is converting the pickup/drop-off longitude and latitude into a distance; for that we are going to use the Haversine formula

In [59]:
from math import *

In [60]:
# Haversine Formula
def haversine_distance(lat1, lon1, lat2, lon2, radius_km=6371):
    """Great-circle distance between two points, computed with the haversine formula.

    Parameters
    ----------
    lat1, lon1 : float or array-like
        Latitude and longitude of the first point, in decimal degrees.
    lat2, lon2 : float or array-like
        Latitude and longitude of the second point, in decimal degrees.
    radius_km : float, default 6371
        Radius of the sphere. The default is the Earth radius in km; the
        result is expressed in the same unit as this value.

    Returns
    -------
    float or array-like
        Distance for each pair of points, computed element-wise (a scalar for
        scalar inputs, an array/Series for array/Series inputs).

    Notes
    -----
    Coordinates are not validated. NaN inputs produce NaN outputs.
    """
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])

    dlat = lat2 - lat1
    dlon = lon2 - lon1

    a = np.sin(dlat / 2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2)**2
    # Floating-point rounding can push sqrt(a) marginally above 1 for (near-)antipodal
    # points, and arcsin would then return NaN; cap the value at 1.
    c = 2 * np.arcsin(np.minimum(1.0, np.sqrt(a)))

    return radius_km * c

# Trip distance between the pickup and drop-off coordinates of every ride
df['distance_km'] = haversine_distance(
    lat1=df['pickup_latitude'],
    lon1=df['pickup_longitude'],
    lat2=df['dropoff_latitude'],
    lon2=df['dropoff_longitude'],
)


In [61]:
# Preview the new distance_km column
df.head()

Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,hour,day,month,year,dayofweek,distance_km
0,7.5,-73.999817,40.738354,-73.999512,40.723217,1.0,19,7,5,2015,3,1.683323
1,7.7,-73.994355,40.728225,-73.99471,40.750325,1.0,20,17,7,2009,4,2.45759
2,12.9,-74.005043,40.74077,-73.962565,40.772647,1.0,21,24,8,2009,0,5.036377
3,5.3,-73.976124,40.790844,-73.965316,40.803349,3.0,8,26,6,2009,4,1.661683
4,16.0,-73.925023,40.744085,-73.973082,40.761247,5.0,17,28,8,2014,3,4.47545


In [63]:
## The pickup/drop-off distance has been calculated, so drop the raw latitude and longitude columns.
## Reassigning with errors="ignore" (rather than inplace=True) keeps the cell idempotent:
## re-running it does not raise KeyError.
df = df.drop(
    columns=['pickup_longitude', 'pickup_latitude',
             'dropoff_longitude', 'dropoff_latitude'],
    errors="ignore",
)

In [64]:
# Preview the frame after removing the coordinate columns
df.head() 

Unnamed: 0,fare_amount,passenger_count,hour,day,month,year,dayofweek,distance_km
0,7.5,1.0,19,7,5,2015,3,1.683323
1,7.7,1.0,20,17,7,2009,4,2.45759
2,12.9,1.0,21,24,8,2009,0,5.036377
3,5.3,3.0,8,26,6,2009,4,1.661683
4,16.0,5.0,17,28,8,2014,3,4.47545


## Exploratory Data Analysis (EDA) 

In [73]:
## Summary statistics for every column — a first look at the value ranges before
## examining the relationship between distance and fare price
## NOTE(review): the extremes (distance_km max of roughly 8667 km, passenger_count min of 0,
## fare_amount max of 350) look like outliers or invalid records — verify before modelling.
df.describe()

Unnamed: 0,fare_amount,passenger_count,hour,day,month,year,dayofweek,distance_km
count,10002.0,10002.0,10002.0,10002.0,10002.0,10002.0,10002.0,10002.0
mean,11.479931,1.666767,13.477604,15.608878,6.231854,2011.743151,3.046491,18.650844
std,10.417909,1.290297,6.53947,8.672893,3.430708,1.863429,1.934829,349.948253
min,2.5,0.0,0.0,1.0,1.0,2009.0,0.0,0.0
25%,6.0,1.0,9.0,8.0,3.0,2010.0,1.0,1.23054
50%,8.5,1.0,14.0,16.0,6.0,2012.0,3.0,2.13721
75%,12.5,2.0,19.0,23.0,9.0,2013.0,5.0,3.92666
max,350.0,6.0,23.0,31.0,12.0,2015.0,6.0,8666.772408
