# UBER FARE ANALYSIS

In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from warnings import filterwarnings
filterwarnings('ignore')

In [3]:
uber = pd.read_csv('uber_rides_data.csv')

In [4]:
uber.head()

Unnamed: 0,ride_id,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,24238194,7.5,2015-05-07 19:52:06 UTC,-73.999817,40.738354,-73.999512,40.723217,1
1,27835199,7.7,2009-07-17 20:04:56 UTC,-73.994355,40.728225,-73.99471,40.750325,1
2,44984355,12.9,2009-08-24 21:45:00 UTC,-74.005043,40.74077,-73.962565,40.772647,1
3,25894730,5.3,2009-06-26 08:22:21 UTC,-73.976124,40.790844,-73.965316,40.803349,3
4,17610152,16.0,2014-08-28 17:47:00 UTC,-73.925023,40.744085,-73.973082,40.761247,5


In [9]:
uber.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200000 entries, 0 to 199999
Data columns (total 8 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   ride_id            200000 non-null  int64  
 1   fare_amount        200000 non-null  float64
 2   pickup_datetime    200000 non-null  object 
 3   pickup_longitude   200000 non-null  float64
 4   pickup_latitude    200000 non-null  float64
 5   dropoff_longitude  199999 non-null  float64
 6   dropoff_latitude   199999 non-null  float64
 7   passenger_count    200000 non-null  int64  
dtypes: float64(5), int64(2), object(1)
memory usage: 12.2+ MB


In [10]:
uber.describe()

Unnamed: 0,ride_id,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
count,200000.0,200000.0,200000.0,200000.0,199999.0,199999.0,200000.0
mean,27712500.0,11.359955,-72.527638,39.935885,-72.525292,39.92389,1.684535
std,16013820.0,9.901776,11.437787,7.720539,13.117408,6.794829,1.385997
min,1.0,-52.0,-1340.64841,-74.015515,-3356.6663,-881.985513,0.0
25%,13825350.0,6.0,-73.992065,40.734796,-73.991407,40.733823,1.0
50%,27745500.0,8.5,-73.981823,40.752592,-73.980093,40.753042,1.0
75%,41555300.0,12.5,-73.967153,40.767158,-73.963659,40.768001,2.0
max,55423570.0,499.0,57.418457,1644.421482,1153.572603,872.697628,208.0


#### What is the shape of given dataset?


In [6]:
uber.shape

(200000, 8)

#### How many integer columns(by default) are given in the dataset?

In [11]:
uber.dtypes

ride_id                int64
fare_amount          float64
pickup_datetime       object
pickup_longitude     float64
pickup_latitude      float64
dropoff_longitude    float64
dropoff_latitude     float64
passenger_count        int64
dtype: object

In [14]:
integer_columns = uber.select_dtypes(include=['int']).shape[1]
print(f'The number of integer columns in the dataset is: {integer_columns}')

The number of integer columns in the dataset is: 2


#### How many missing values exists in 'dropoff_longitude' column?


In [15]:
uber.isnull().sum()

ride_id              0
fare_amount          0
pickup_datetime      0
pickup_longitude     0
pickup_latitude      0
dropoff_longitude    1
dropoff_latitude     1
passenger_count      0
dtype: int64

In [16]:
# Check for missing values in the 'dropoff_longitude' column
missing_values = uber['dropoff_longitude'].isna().sum()

print(f"Number of missing values in 'dropoff_longitude' column: {missing_values}")

Number of missing values in 'dropoff_longitude' column: 1


In [17]:
uber.dropoff_longitude.isnull().sum()

1

#### What is the data type of ' pickup_datetime' feature in your data?

In [21]:
pickup_datetime_dtype = uber['pickup_datetime'].dtype

print(f"Data type of 'pickup_datetime' column: {pickup_datetime_dtype}")

Data type of 'pickup_datetime' column: object


#### Which of the following is the correct syntax to convert 'pickup_datetime' to datetime datatype?

In [22]:
uber['pickup_datetime'] = pd.to_datetime(uber['pickup_datetime'])

#### Which function can be used to remove null values from the dataframe?


In [25]:
remove = uber.dropna()

In [26]:
uber.isnull().sum()

ride_id              0
fare_amount          0
pickup_datetime      0
pickup_longitude     0
pickup_latitude      0
dropoff_longitude    1
dropoff_latitude     1
passenger_count      0
dtype: int64

#### What is the average fare amount?
Remove the null values from the dataframe to answer the following question.

In [28]:
# Calculate the average fare amount
average_fare = remove['fare_amount'].mean()

print(f'The average fare amount is: {average_fare}')

The average fare amount is: 11.359891549458371


#### Calculate distance between each pickup and dropoff points using Haversine formula. 
What is the median haversine distance between pickup and dropoff location according to the given dataset?


In [29]:
# Remove rows with null values
data_cleaned = uber.dropna(subset=['dropoff_latitude', 'dropoff_longitude'])

# Define a function to calculate Haversine distance
def haversine_distance(lat1, lon1, lat2, lon2):
    # Convert latitude and longitude from degrees to radians
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])

    # Haversine formula
    dlat = lat2 - lat1
    dlon = lon2 - lon1

    a = np.sin(dlat / 2.0)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2.0)**2
    c = 2 * np.arcsin(np.sqrt(a))

    # Radius of Earth in kilometers (use 6371 for km or 3956 for miles)
    r = 6371  
    return c * r

# Calculate the Haversine distance for each row
data_cleaned['haversine_distance'] = haversine_distance(
    data_cleaned['pickup_latitude'], data_cleaned['pickup_longitude'],
    data_cleaned['dropoff_latitude'], data_cleaned['dropoff_longitude']
)

# Calculate the median Haversine distance
median_distance = data_cleaned['haversine_distance'].median()

print(f"Median Haversine Distance: {median_distance:.2f} km")


Median Haversine Distance: 2.12 km


### What is the maximum haversine distance between pickup and dropoff location according to the given dataset?

In [30]:
# Calculate the maximum Haversine distance
max_distance = data_cleaned['haversine_distance'].max()

print(f"Maximum Haversine Distance: {max_distance:.2f} km")

Maximum Haversine Distance: 16409.24 km


### How many rides have 0.0 haversine distance between pickup and dropoff location according to the given dataset?

In [32]:
# Count the number of rides with a Haversine distance of 0.0
zero_distance_count = (data_cleaned['haversine_distance'] == 0.0).sum()

print(f'The number of rides with a Haversine distance of 0.0 is: {zero_distance_count}')

The number of rides with a Haversine distance of 0.0 is: 5632


### What is the mean 'fare_amount' for rides with 0 haversine distance?
Do you sense something fishy? Try to analyze, and give your expert opinion in Jupyter Notebook.

In [34]:
# Filter rides with 0 Haversine distance
zero_distance_rides = data_cleaned[data_cleaned['haversine_distance'] == 0.0]

# Calculate the mean fare amount for rides with 0 Haversine distance
mean_fare_zero_distance = zero_distance_rides['fare_amount'].mean()

print(f'The mean fare amount for rides with 0 Haversine distance is: {mean_fare_zero_distance}')

The mean fare amount for rides with 0 Haversine distance is: 11.585317826704578


# Insights:

If the calculated mean_fare for rides with 0.0 Haversine distance is greater than 0, this could indicate potential issues or inconsistencies such as:

GPS Errors: The coordinates for pickup and dropoff may have been inaccurately recorded as the same location due to GPS errors.

Incorrect Data Entry: Fare amounts may have been recorded even though no actual trip occurred.

Customer Cancellations or Technical Failures: Some rides might have been charged even if no real trip took place (e.g., customer cancellation after pickup).

Analysis:

If the mean fare for rides with 0 distance is non-zero, it suggests something unusual, like passengers being charged for rides that were not actually completed. This could require deeper investigation or even corrective actions from the company to maintain transparency.

### What is the maximum 'fare_amount' for a ride?


In [36]:
# Find the maximum fare amount
max_fare = data_cleaned['fare_amount'].max()

print(f"Maximum fare amount for a ride: {max_fare:.2f}")

Maximum fare amount for a ride: 499.00


### What is the haversine distance between pickup and dropoff location for the costliest ride?

Do you sense something fishy? Try to analyze, and give your expert opinion in Jupyter Notebook.

In [40]:
# Find the row with the maximum fare
costliest_ride = data_cleaned.loc[data_cleaned['fare_amount'].idxmax()]

# Calculate the Haversine distance for the costliest ride
costliest_ride_distance = haversine_distance(
    costliest_ride['pickup_latitude'], costliest_ride['pickup_longitude'],
    costliest_ride['dropoff_latitude'], costliest_ride['dropoff_longitude']
)

# Print the maximum fare and the corresponding Haversine distance
print(f"Maximum fare amount: {costliest_ride['fare_amount']:}")
print(f"Haversine distance for the costliest ride: {costliest_ride_distance: } km")

Maximum fare amount: 499.0
Haversine distance for the costliest ride:  0.0007899213191009994 km


# Analysis:
If the Haversine distance for the costliest ride is unusually low (e.g., close to 0 km) while the fare is extremely high, this could suggest something fishy, such as:

Data Errors: There could be issues with data recording, like incorrect GPS coordinates.

Fare Inconsistency: A high fare with a low or zero distance could indicate a bug or an anomaly in how fares are calculated.

Cancellation or Surge Pricing: The ride might have been charged due to special conditions like cancellations or surge pricing, but this would need further verification.

### How many rides were recorded in the year 2014?


In [42]:
# Convert the 'pickup_datetime' column to datetime format
uber['pickup_datetime'] = pd.to_datetime(uber['pickup_datetime'], errors='coerce')

# Filter the rides that occurred in 2014
rides_2014 = uber[uber['pickup_datetime'].dt.year == 2014]

# Count the number of rides in 2014
rides_2014_count = rides_2014.shape[0]

print(f"Number of rides recorded in 2014: {rides_2014_count}")

Number of rides recorded in 2014: 29968


### How many rides were recorded in the first quarter of 2014?

In [43]:
# Filter for rides that occurred in the first quarter (Q1) of 2014
rides_q1_2014 = uber[(uber['pickup_datetime'].dt.year == 2014) & 
                     (uber['pickup_datetime'].dt.month.isin([1, 2, 3]))]

# Count the number of rides in the first quarter of 2014
rides_q1_2014_count = rides_q1_2014.shape[0]

print(f"Number of rides recorded in the first quarter of 2014: {rides_q1_2014_count}")

Number of rides recorded in the first quarter of 2014: 7687


### On which day of the week in September 2010, maximum rides were recorded ?

In [45]:
# Filter the data for rides in September 2010
rides_september_2010 = uber[(uber['pickup_datetime'].dt.year == 2010) & 
                            (uber['pickup_datetime'].dt.month == 9)]

# Extract the day of the week (Monday=0, Sunday=6)
rides_september_2010['day_of_week'] = rides_september_2010['pickup_datetime'].dt.day_name()

# Count the number of rides for each day of the week
rides_per_day = rides_september_2010['day_of_week'].value_counts()

# Find the day with the maximum number of rides
max_rides_day = rides_per_day.idxmax()
max_rides_count = rides_per_day.max()

print(f"The day of the week with the maximum rides in September 2010: {max_rides_day} with {max_rides_count} rides.")

The day of the week with the maximum rides in September 2010: Thursday with 457 rides.


### Apply a Machine Learning Algorithm to predict the fare amount given following input features:
passenger_count, distance and ride_week_day.

Perform a 70-30 split of data.

Which algorithm gives the least adjusted R square value?

In [46]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score

In [47]:
# Extract the day of the week from the 'pickup_datetime' column
data_cleaned['pickup_datetime'] = pd.to_datetime(data_cleaned['pickup_datetime'], errors='coerce')
data_cleaned['ride_week_day'] = data_cleaned['pickup_datetime'].dt.day_name()

In [48]:
# Convert 'ride_week_day' into categorical feature
data_cleaned = pd.get_dummies(data_cleaned, columns=['ride_week_day'], drop_first=True)

In [50]:
# Features and target
X = data_cleaned[['passenger_count'] + [col for col in data_cleaned if col.startswith('ride_week_day')]]
y = data_cleaned['fare_amount']

In [51]:
# Split the data into training (70%) and testing (30%) sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

In [52]:
# Initialize models
models = {
    "Linear Regression": LinearRegression(),
    "Decision Tree": DecisionTreeRegressor(random_state=42),
    "Random Forest": RandomForestRegressor(random_state=42)
}

In [53]:
# Function to calculate adjusted R-squared
def adjusted_r2(r2, n, p):
    return 1 - (1 - r2) * (n - 1) / (n - p - 1)

In [56]:
# Train and evaluate models
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    # Calculate R-squared and adjusted R-squared
    r2 = r2_score(y_test, y_pred)
    adj_r2 = adjusted_r2(r2, X_test.shape[0], X_test.shape[1])
    
    print(f"{name} - R2: {r2:.4f}, Adjusted R2: {adj_r2:.4f}")

Linear Regression - R2: 0.0005, Adjusted R2: 0.0004
Decision Tree - R2: 0.0006, Adjusted R2: 0.0005
Random Forest - R2: 0.0006, Adjusted R2: 0.0005


# Linear Regression has the lowest adjusted R², indicating that it explains the least variance in the target variable (fare_amount) given the features provided.