# Importing All libraries

In [73]:
import pandas as pd
import numpy as np
from math import radians, sin, cos, sqrt, atan2
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import r2_score
import statsmodels.api as sm


#read xlsx file

In [74]:
df = pd.read_excel('/content/uber_rides_data.xlsx')
df

Unnamed: 0,ride_id,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,24238194,7.5,2015-05-07 19:52:06 UTC,-73.999817,40.738354,-73.999512,40.723217,1
1,27835199,7.7,2009-07-17 20:04:56 UTC,-73.994355,40.728225,-73.994710,40.750325,1
2,44984355,12.9,2009-08-24 21:45:00 UTC,-74.005043,40.740770,-73.962565,40.772647,1
3,25894730,5.3,2009-06-26 08:22:21 UTC,-73.976124,40.790844,-73.965316,40.803349,3
4,17610152,16.0,2014-08-28 17:47:00 UTC,-73.925023,40.744085,-73.973082,40.761247,5
...,...,...,...,...,...,...,...,...
199995,42598914,3.0,2012-10-28 10:49:00 UTC,-73.987042,40.739367,-73.986525,40.740297,1
199996,16382965,7.5,2014-03-14 01:09:00 UTC,-73.984722,40.736837,-74.006672,40.739620,1
199997,27804658,30.9,2009-06-29 00:42:00 UTC,-73.986017,40.756487,-73.858957,40.692588,2
199998,20259894,14.5,2015-05-20 14:56:25 UTC,-73.997124,40.725452,-73.983215,40.695415,1


# shape of dataset

In [75]:
df.shape

(200000, 8)

# datatype of each column

In [76]:
df.dtypes

ride_id                int64
fare_amount          float64
pickup_datetime       object
pickup_longitude     float64
pickup_latitude      float64
dropoff_longitude    float64
dropoff_latitude     float64
passenger_count        int64
dtype: object

# check null values of each column

In [77]:
df.isnull().sum()

ride_id              0
fare_amount          0
pickup_datetime      0
pickup_longitude     0
pickup_latitude      0
dropoff_longitude    1
dropoff_latitude     1
passenger_count      0
dtype: int64

# convert object to datetime

In [78]:
df['pickup_datetime'] = pd.to_datetime(df['pickup_datetime'])


#after converting check datatypes

In [79]:
df.dtypes


ride_id                            int64
fare_amount                      float64
pickup_datetime      datetime64[ns, UTC]
pickup_longitude                 float64
pickup_latitude                  float64
dropoff_longitude                float64
dropoff_latitude                 float64
passenger_count                    int64
dtype: object

In [72]:
#df['pickup_datetime'].astype('datetime64[ns]')


#drop null values

In [80]:
df.dropna(inplace=True)

#checking null values

In [81]:
df.isnull().sum()

ride_id              0
fare_amount          0
pickup_datetime      0
pickup_longitude     0
pickup_latitude      0
dropoff_longitude    0
dropoff_latitude     0
passenger_count      0
dtype: int64

#  calculate average fare amount

In [82]:
avg_fare = df['fare_amount'].mean()
print("average fare amount:", avg_fare)


average fare amount: 11.359891549457748


# Calculate distance between each pickup and dropoff points using Haversine formula.
#the median haversine distance between pickup and dropoff location according to the given dataset

In [83]:
def haversine(lat1, lon1, lat2, lon2):
    #conerting latitude and longitude degrees to radians
    lat1, lon1, lat2, lon2 = map(radians, [lat1, lon1, lat2, lon2])
    # haversine formula
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = sin(dlat / 2) ** 2 + cos(lat1) * cos(lat2) * sin(dlon / 2) ** 2
    c = 2 * atan2(sqrt(a), sqrt(1 - a))
    #radius of earth in km
    r = 6371.0
    distance = r * c
    return distance


In [84]:
df['haversine_distance'] = df.apply(lambda row: haversine(row['pickup_latitude'], row['pickup_longitude'], row['dropoff_latitude'], row['dropoff_longitude']), axis=1)


In [85]:
median_haversine_distance = df['haversine_distance'].median()
print("median haversine distance:", median_haversine_distance, "kilometers")


median haversine distance: 2.120992396182902 kilometers


# the maximum haversine distance between pickup and dropoff location according to the given dataset




In [86]:
max_haversine_distance = df['haversine_distance'].max()
print("maximum haversine distance:", max_haversine_distance, "kilometers")


maximum haversine distance: 16409.23913531317 kilometers


# How many rides have 0.0 haversine distance between pickup and dropoff location according to the given dataset

In [87]:
zero_distance_rides = (df['haversine_distance'] == 0.0).sum()
print("number of rides with 0.0 Haversine distance:", zero_distance_rides)


number of rides with 0.0 Haversine distance: 5632


# mean 'fare_amount' for rides with 0 haversine distance


In [89]:
mean_fare_amount_zero_distance = df[df['haversine_distance'] == 0.0]['fare_amount'].mean()
print("mean 'fare_amount' for rides with 0.0 Haversine distance:", mean_fare_amount_zero_distance)


mean 'fare_amount' for rides with 0.0 Haversine distance: 11.585317826704546


# maximum 'fare_amount' for a ride


In [90]:
max_fare_amount = df['fare_amount'].max()
print("maximum 'fare_amount' for a ride:", max_fare_amount)


maximum 'fare_amount' for a ride: 499.0


# haversine distance between pickup and dropoff location for the costliest ride

In [93]:
# the highest 'fare_amount
costliest_ride = df[df['fare_amount'] == df['fare_amount'].max()]

# haversine distance for the costliest ride
costliest_ride_distance = costliest_ride['haversine_distance'].values[0]

print("haversine distance for the costliest ride:", costliest_ride_distance, "kilometers")


haversine distance for the costliest ride: 0.0007899213191009993 kilometers


# How many rides were recorded in the year 2014


In [95]:
# convert the 'pickup_datetime' to a datetime data type
df['pickup_datetime'] = pd.to_datetime(df['pickup_datetime'])

# extract the year from the 'pickup_datetime'
df['pickup_year'] = df['pickup_datetime'].dt.year

# number of rides in the year 2014
rides_in_2014 = len(df[df['pickup_year'] == 2014])

print("number of rides recorded in the year 2014:", rides_in_2014)


number of rides recorded in the year 2014: 29968


# How many rides were recorded in the first quarter of 2014

In [96]:
# extract the year and quarter from the 'pickup_datetime'
df['pickup_year'] = df['pickup_datetime'].dt.year
df['pickup_quarter'] = df['pickup_datetime'].dt.quarter
# count the number of rides in the first quarter of 2014
rides_in_Q1_2014 = len(df[(df['pickup_year'] == 2014) & (df['pickup_quarter'] == 1)])

print("number of rides recorded in the first quarter of 2014:", rides_in_Q1_2014)


number of rides recorded in the first quarter of 2014: 7687


# On which day of the week in September 2010, maximum rides were recorded

In [97]:
# extract the day of the week from the 'pickup_datetime'
df['day_of_week'] = df['pickup_datetime'].dt.day_name()

# for rides in september 2010
september_2010_rides = df[(df['pickup_datetime'].dt.year == 2010) & (df['pickup_datetime'].dt.month == 9)]

# day of the week with the maximum rides in September 2010
max_rides_day = september_2010_rides['day_of_week'].mode()[0]

print("day of the week with the maximum rides in september 2010:", max_rides_day)


day of the week with the maximum rides in september 2010: Thursday


In [98]:
#relevant features from 'pickup_datetime'
df['pickup_year'] = df['pickup_datetime'].dt.year
df['pickup_month'] = df['pickup_datetime'].dt.month
df['pickup_day'] = df['pickup_datetime'].dt.day
df['pickup_hour'] = df['pickup_datetime'].dt.hour
df['pickup_minute'] = df['pickup_datetime'].dt.minute
df['pickup_second'] = df['pickup_datetime'].dt.second



In [102]:
# one-hot encoding for 'ride_week_day'
df = pd.get_dummies(df, columns=['day_of_week'], prefix='day')

# define X and y
X = df[['passenger_count', 'haversine_distance', 'day_Monday', 'day_Tuesday', 'day_Wednesday', 'day_Thursday', 'day_Friday']]
y = df['fare_amount']

In [103]:
df.columns

Index(['ride_id', 'fare_amount', 'pickup_datetime', 'pickup_longitude',
       'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude',
       'passenger_count', 'haversine_distance', 'pickup_year',
       'pickup_quarter', 'pickup_month', 'pickup_day', 'pickup_hour',
       'pickup_minute', 'pickup_second', 'day_Friday', 'day_Monday',
       'day_Saturday', 'day_Sunday', 'day_Thursday', 'day_Tuesday',
       'day_Wednesday'],
      dtype='object')

# Apply a Machine Learning Algorithm to predict the fare amount given following input features:
# passenger_count, distance and ride_week_day.

# Perform a 70-30 split of data.

# Which algorithm gives the least adjusted R square value?

In [104]:

# split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# initialize and train regression models
models = {
    'Linear Regression': LinearRegression(),
    'Decision Tree Regression': DecisionTreeRegressor(),
    'Random Forest Regression': RandomForestRegressor(),
    'KNN Regression': KNeighborsRegressor()
}

adjusted_r2_scores = {}

for model_name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # calculate R-squared
    r2 = r2_score(y_test, y_pred)

    # calculate adjusted R-squared
    n = len(X_test)
    p = X_test.shape[1]
    adjusted_r2 = 1 - ((1 - r2) * (n - 1) / (n - p - 1))

    adjusted_r2_scores[model_name] = adjusted_r2

# the model with the least adjusted R-squared
least_adjusted_r2_model = min(adjusted_r2_scores, key=adjusted_r2_scores.get)

print("model with the least adjusted R-squared:", least_adjusted_r2_model)


Model with the least adjusted R-squared: Linear Regression


In [105]:
# split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# initialize and train regression models
models = {
    'Linear Regression': LinearRegression(),
    'Decision Tree Regression': DecisionTreeRegressor(),
    'Random Forest Regression': RandomForestRegressor(),
    'KNN Regression': KNeighborsRegressor()
}

adjusted_r2_scores = {}

for model_name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # calculate R-squared
    r2 = r2_score(y_test, y_pred)

    # calculate adjusted R-squared
    n = len(X_test)
    p = X_test.shape[1]
    adjusted_r2 = 1 - ((1 - r2) * (n - 1) / (n - p - 1))

    adjusted_r2_scores[model_name] = adjusted_r2

    print(f"{model_name}:")
    print(f"R-squared: {r2:.2f}")
    print(f"Adjusted R-squared: {adjusted_r2:.2f}")
    print()


Linear Regression:
R-squared: 0.00
Adjusted R-squared: 0.00

Decision Tree Regression:
R-squared: 0.48
Adjusted R-squared: 0.48

Random Forest Regression:
R-squared: 0.63
Adjusted R-squared: 0.63

KNN Regression:
R-squared: 0.63
Adjusted R-squared: 0.63



In [69]:
0.00<0.48<0.62<0.63

True