# Importing All libraries

In [1]:
import pandas as pd
import numpy as np
from math import radians, sin, cos, sqrt, atan2
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import r2_score
import statsmodels.api as sm


#read xlsx file

In [9]:
df = pd.read_excel('/content/uber_rides_data.xlsx')
df

Unnamed: 0,ride_id,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,24238194,7.5,2015-05-07 19:52:06 UTC,-73.999817,40.738354,-73.999512,40.723217,1
1,27835199,7.7,2009-07-17 20:04:56 UTC,-73.994355,40.728225,-73.994710,40.750325,1
2,44984355,12.9,2009-08-24 21:45:00 UTC,-74.005043,40.740770,-73.962565,40.772647,1
3,25894730,5.3,2009-06-26 08:22:21 UTC,-73.976124,40.790844,-73.965316,40.803349,3
4,17610152,16.0,2014-08-28 17:47:00 UTC,-73.925023,40.744085,-73.973082,40.761247,5
...,...,...,...,...,...,...,...,...
199995,42598914,3.0,2012-10-28 10:49:00 UTC,-73.987042,40.739367,-73.986525,40.740297,1
199996,16382965,7.5,2014-03-14 01:09:00 UTC,-73.984722,40.736837,-74.006672,40.739620,1
199997,27804658,30.9,2009-06-29 00:42:00 UTC,-73.986017,40.756487,-73.858957,40.692588,2
199998,20259894,14.5,2015-05-20 14:56:25 UTC,-73.997124,40.725452,-73.983215,40.695415,1


# shape of dataset

In [10]:
df.shape

(200000, 8)

# datatype of each column

In [11]:
df.dtypes

ride_id                int64
fare_amount          float64
pickup_datetime       object
pickup_longitude     float64
pickup_latitude      float64
dropoff_longitude    float64
dropoff_latitude     float64
passenger_count        int64
dtype: object

# check null values of each column

In [12]:
df.isnull().sum()

ride_id              0
fare_amount          0
pickup_datetime      0
pickup_longitude     0
pickup_latitude      0
dropoff_longitude    1
dropoff_latitude     1
passenger_count      0
dtype: int64

# convert object to datetime

In [13]:
df['pickup_datetime'] = pd.to_datetime(df['pickup_datetime'])


#after converting check datatypes

In [16]:
df.dtypes


ride_id                            int64
fare_amount                      float64
pickup_datetime      datetime64[ns, UTC]
pickup_longitude                 float64
pickup_latitude                  float64
dropoff_longitude                float64
dropoff_latitude                 float64
passenger_count                    int64
dtype: object

In [15]:
df['pickup_datetime'].astype('datetime64[ns]')


  df['pickup_datetime'].astype('datetime64[ns]')


0        2015-05-07 19:52:06
1        2009-07-17 20:04:56
2        2009-08-24 21:45:00
3        2009-06-26 08:22:21
4        2014-08-28 17:47:00
                 ...        
199995   2012-10-28 10:49:00
199996   2014-03-14 01:09:00
199997   2009-06-29 00:42:00
199998   2015-05-20 14:56:25
199999   2010-05-15 04:08:00
Name: pickup_datetime, Length: 200000, dtype: datetime64[ns]

#drop null values

In [17]:
df.dropna(inplace=True)

#checking null values

In [18]:
df.isnull().sum()

ride_id              0
fare_amount          0
pickup_datetime      0
pickup_longitude     0
pickup_latitude      0
dropoff_longitude    0
dropoff_latitude     0
passenger_count      0
dtype: int64

#  calculate average fare amount

In [19]:
avg_fare = df['fare_amount'].mean()
print("average fare amount:", avg_fare)


average fare amount: 11.359891549457748


# Calculate distance between each pickup and dropoff points using Haversine formula.
#the median haversine distance between pickup and dropoff location according to the given dataset

In [20]:
def haversine(lat1, lon1, lat2, lon2):
    #conerting latitude and longitude degrees to radians
    lat1, lon1, lat2, lon2 = map(radians, [lat1, lon1, lat2, lon2])
    # haversine formula
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = sin(dlat / 2) ** 2 + cos(lat1) * cos(lat2) * sin(dlon / 2) ** 2
    c = 2 * atan2(sqrt(a), sqrt(1 - a))
    #radius of earth in km
    r = 6371.0
    distance = r * c
    return distance


In [21]:
df['haversine_distance'] = df.apply(lambda row: haversine(row['pickup_latitude'], row['pickup_longitude'], row['dropoff_latitude'], row['dropoff_longitude']), axis=1)


In [22]:
median_haversine_distance = df['haversine_distance'].median()
print("median haversine distance:", median_haversine_distance, "kilometers")


median haversine distance: 2.120992396182902 kilometers


# the maximum haversine distance between pickup and dropoff location according to the given dataset




In [23]:
max_haversine_distance = df['haversine_distance'].max()
print("maximum haversine distance:", max_haversine_distance, "kilometers")


maximum haversine distance: 16409.23913531317 kilometers


# How many rides have 0.0 haversine distance between pickup and dropoff location according to the given dataset

In [24]:
zero_distance_rides = (df['haversine_distance'] == 0.0).sum()
print("number of rides with 0.0 Haversine distance:", zero_distance_rides)


number of rides with 0.0 Haversine distance: 5632


# mean 'fare_amount' for rides with 0 haversine distance


In [25]:
mean_fare_amount_zero_distance = df[df['haversine_distance'] == 0.0]['fare_amount'].mean()
print("mean 'fare_amount' for rides with 0.0 Haversine distance:", mean_fare_amount_zero_distance)


mean 'fare_amount' for rides with 0.0 Haversine distance: 11.585317826704546


# maximum 'fare_amount' for a ride


In [26]:
max_fare_amount = df['fare_amount'].max()
print("maximum 'fare_amount' for a ride:", max_fare_amount)


maximum 'fare_amount' for a ride: 499.0


# haversine distance between pickup and dropoff location for the costliest ride

In [27]:
# the highest 'fare_amount
costliest_ride = df[df['fare_amount'] == df['fare_amount'].max()]

# haversine distance for the costliest ride
costliest_ride_distance = costliest_ride['haversine_distance'].values[0]

print("haversine distance for the costliest ride:", costliest_ride_distance, "kilometers")


haversine distance for the costliest ride: 0.0007899213191009993 kilometers


# How many rides were recorded in the year 2014


In [28]:
# convert the 'pickup_datetime' to a datetime data type
df['pickup_datetime'] = pd.to_datetime(df['pickup_datetime'])

# extract the year from the 'pickup_datetime'
df['pickup_year'] = df['pickup_datetime'].dt.year

# number of rides in the year 2014
rides_in_2014 = len(df[df['pickup_year'] == 2014])

print("number of rides recorded in the year 2014:", rides_in_2014)


number of rides recorded in the year 2014: 29968


# How many rides were recorded in the first quarter of 2014

In [29]:
# extract the year and quarter from the 'pickup_datetime'
df['pickup_year'] = df['pickup_datetime'].dt.year
df['pickup_quarter'] = df['pickup_datetime'].dt.quarter
# count the number of rides in the first quarter of 2014
rides_in_Q1_2014 = len(df[(df['pickup_year'] == 2014) & (df['pickup_quarter'] == 1)])

print("number of rides recorded in the first quarter of 2014:", rides_in_Q1_2014)


number of rides recorded in the first quarter of 2014: 7687


# On which day of the week in September 2010, maximum rides were recorded

In [30]:
# extract the day of the week from the 'pickup_datetime'
df['day_of_week'] = df['pickup_datetime'].dt.day_name()

# for rides in september 2010
september_2010_rides = df[(df['pickup_datetime'].dt.year == 2010) & (df['pickup_datetime'].dt.month == 9)]

# day of the week with the maximum rides in September 2010
max_rides_day = september_2010_rides['day_of_week'].mode()[0]

print("day of the week with the maximum rides in september 2010:", max_rides_day)


day of the week with the maximum rides in september 2010: Thursday


In [31]:
#relevant features from 'pickup_datetime'
df['pickup_year'] = df['pickup_datetime'].dt.year
df['pickup_month'] = df['pickup_datetime'].dt.month
df['pickup_day'] = df['pickup_datetime'].dt.day
df['pickup_hour'] = df['pickup_datetime'].dt.hour
df['pickup_minute'] = df['pickup_datetime'].dt.minute
df['pickup_second'] = df['pickup_datetime'].dt.second



In [34]:
y = df['fare_amount']

X = df[['passenger_count', 'haversine_distance', 'day_of_week']]

In [35]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state=100)

In [36]:
X_train_categorical = X_train.select_dtypes(include=['object'])
X_train_categorical.head()

Unnamed: 0,day_of_week
195896,Saturday
47114,Tuesday
144876,Thursday
184791,Sunday
123270,Tuesday


In [37]:
X_train_numerical = X_train.select_dtypes(include=['float64','int64'])
X_train_numerical.head()


Unnamed: 0,passenger_count,haversine_distance
195896,3,2.034021
47114,2,3.987687
144876,2,1.31358
184791,2,0.780877
123270,1,0.805125


In [38]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train_numerical_rescaled = pd.DataFrame(scaler.fit_transform(X_train_numerical),
                                         columns= X_train_numerical.columns,
                                       index = X_train_numerical.index)
X_train_numerical_rescaled.head()

Unnamed: 0,passenger_count,haversine_distance
195896,1.008439,-0.049696
47114,0.242972,-0.044699
144876,0.242972,-0.051539
184791,0.242972,-0.052902
123270,-0.522495,-0.05284


In [39]:
X_train_categorical_LabelEnc = pd.DataFrame(index=X_train_categorical.index)

X_train_categorical_LabelEnc.head()

195896
47114
144876
184791
123270


In [41]:
X_train_categorical.day_of_week.unique()

array(['Saturday', 'Tuesday', 'Thursday', 'Sunday', 'Monday', 'Friday',
       'Wednesday'], dtype=object)

In [44]:
day_encoder = {'Monday':1, 'Tuesday':2, 'Wednesday':3, 'Thursday':4, 'Friday':5, 'Saturday':6, 'Sunday':7 }

X_train_categorical_LabelEnc['day_of_week'] = X_train_categorical['day_of_week'].apply(lambda x: day_encoder[x])

X_train_categorical_LabelEnc.head()

Unnamed: 0,day_of_week
195896,6
47114,2
144876,4
184791,7
123270,2


In [45]:
# Concatinating the Encoded Categorical Features and Rescaled Numerical Features
X_train_transformed = pd.concat([X_train_numerical_rescaled, X_train_categorical_LabelEnc], axis=1)

X_train_transformed.head()

Unnamed: 0,passenger_count,haversine_distance,day_of_week
195896,1.008439,-0.049696,6
47114,0.242972,-0.044699,2
144876,0.242972,-0.051539,4
184791,0.242972,-0.052902,7
123270,-0.522495,-0.05284,2


In [46]:
X_test_categorical = X_test.select_dtypes(include=['object'])

X_test_categorical.head()

Unnamed: 0,day_of_week
180481,Sunday
8971,Sunday
77027,Saturday
21656,Thursday
58303,Tuesday


In [47]:
X_test_numerical = X_test.select_dtypes(include=['int64', 'float64'])

X_test_numerical.head()

Unnamed: 0,passenger_count,haversine_distance
180481,1,0.996834
8971,1,2.537148
77027,5,3.716854
21656,3,4.264505
58303,1,3.235923


In [48]:
X_test_numerical_rescaled = pd.DataFrame(scaler.transform(X_test_numerical),
                                        columns= X_test_numerical.columns,
                                        index = X_test_numerical.index)

X_test_numerical_rescaled.head()

Unnamed: 0,passenger_count,haversine_distance
180481,-0.522495,-0.052349
8971,-0.522495,-0.04841
77027,2.539373,-0.045392
21656,1.008439,-0.043991
58303,-0.522495,-0.046622


In [49]:
X_test_categorical_LabelEnc= pd.DataFrame(index = X_test_categorical.index)

X_test_categorical_LabelEnc.head()

180481
8971
77027
21656
58303


In [51]:
X_test_categorical_LabelEnc['day_of_week'] = X_test_categorical['day_of_week'].apply(lambda x : day_encoder[x])
X_test_categorical_LabelEnc.head()

Unnamed: 0,day_of_week
180481,7
8971,7
77027,6
21656,4
58303,2


In [52]:
X_test_transformed = pd.concat([X_test_numerical_rescaled, X_test_categorical_LabelEnc], axis=1)

X_test_transformed.head()

Unnamed: 0,passenger_count,haversine_distance,day_of_week
180481,-0.522495,-0.052349,7
8971,-0.522495,-0.04841,7
77027,2.539373,-0.045392,6
21656,1.008439,-0.043991,4
58303,-0.522495,-0.046622,2


In [53]:
from sklearn.linear_model import LinearRegression
lr_reg = LinearRegression()

lr_reg.fit(X_train_transformed,y_train)

In [54]:
y_test_predicted = lr_reg.predict(X_test_transformed)

In [55]:
temp_df = pd.DataFrame({'Actual': y_test, 'Predicted': y_test_predicted})

temp_df.head()

Unnamed: 0,Actual,Predicted
180481,4.5,11.346313
8971,15.0,11.347314
77027,12.1,11.658653
21656,14.1,11.443661
58303,10.9,11.227645


In [56]:
from sklearn import metrics
r2 = metrics.r2_score(y_test, y_test_predicted)
n = len(y_test)
k = X_test.shape[1]

r2_adj = 1 - (1-r2)*(n-1)/(n-k-1)

print(r2_adj)

0.0005416156910350578


In [57]:
from sklearn.tree import DecisionTreeRegressor
dt_reg = DecisionTreeRegressor()

dt_reg.fit(X_train_transformed,y_train)

In [58]:
y_test_predicted = dt_reg.predict(X_test_transformed)

In [59]:
temp_df = pd.DataFrame({'Actual': y_test, 'Predicted': y_test_predicted})

temp_df.head()

Unnamed: 0,Actual,Predicted
180481,4.5,4.5
8971,15.0,8.5
77027,12.1,9.3
21656,14.1,11.4
58303,10.9,7.7


In [60]:
from sklearn import metrics
r2 = metrics.r2_score(y_test, y_test_predicted)
n = len(y_test)
k = X_test.shape[1]

r2_adj = 1 - (1-r2)*(n-1)/(n-k-1)

print(r2_adj)

0.5024340651891441


In [61]:
from sklearn.ensemble import RandomForestRegressor


In [63]:

rf_reg = RandomForestRegressor()

rf_reg.fit(X_train_transformed,y_train)

In [64]:
y_test_predicted = rf_reg.predict(X_test_transformed)

In [65]:
temp_df = pd.DataFrame({'Actual': y_test, 'Predicted': y_test_predicted})

temp_df.head()

Unnamed: 0,Actual,Predicted
180481,4.5,4.749
8971,15.0,8.435
77027,12.1,11.274
21656,14.1,16.686
58303,10.9,12.227


In [66]:
from sklearn import metrics
r2 = metrics.r2_score(y_test, y_test_predicted)
n = len(y_test)
k = X_test.shape[1]

r2_adj = 1 - (1-r2)*(n-1)/(n-k-1)

print(r2_adj)

0.6431047293757356


In [69]:
import plotly.graph_objs as go
from plotly.subplots import make_subplots
adjusted_r2_rf = 0.6431047293757356
adjusted_r2_lr = 0.0005416156910350578
models = ['Random Forest', 'Linear Regression']
adjusted_r2_values = [adjusted_r2_rf, adjusted_r2_lr]

traces = [
    go.Bar(
        x=models,
        y=adjusted_r2_values,
        marker=dict(color=['blue', 'green'])
    )
]
fig = make_subplots(rows=1, cols=1)

for trace in traces:
    fig.add_trace(trace)
fig.update_layout(
    title='Adjusted R-squared Values for Different Models',
    xaxis=dict(title='Model'),
    yaxis=dict(title='Adjusted R-squared'),
    showlegend=False
)
fig.show()


#without using label encoding and StandardScaler and consider week days as monday to friday

In [None]:
# one-hot encoding for 'ride_week_day'
df = pd.get_dummies(df, columns=['day_of_week'], prefix='day')

# define X and y
X = df[['passenger_count', 'haversine_distance', 'day_Monday', 'day_Tuesday', 'day_Wednesday', 'day_Thursday', 'day_Friday']]
y = df['fare_amount']

In [None]:
df.columns

Index(['ride_id', 'fare_amount', 'pickup_datetime', 'pickup_longitude',
       'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude',
       'passenger_count', 'haversine_distance', 'pickup_year',
       'pickup_quarter', 'pickup_month', 'pickup_day', 'pickup_hour',
       'pickup_minute', 'pickup_second', 'day_Friday', 'day_Monday',
       'day_Saturday', 'day_Sunday', 'day_Thursday', 'day_Tuesday',
       'day_Wednesday'],
      dtype='object')

# Apply a Machine Learning Algorithm to predict the fare amount given following input features:
# passenger_count, distance and ride_week_day.

# Perform a 70-30 split of data.

# Which algorithm gives the least adjusted R square value?

In [None]:

# split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# initialize and train regression models
models = {
    'Linear Regression': LinearRegression(),
    'Decision Tree Regression': DecisionTreeRegressor(),
    'Random Forest Regression': RandomForestRegressor(),
    'KNN Regression': KNeighborsRegressor()
}

adjusted_r2_scores = {}

for model_name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # calculate R-squared
    r2 = r2_score(y_test, y_pred)

    # calculate adjusted R-squared
    n = len(X_test)
    p = X_test.shape[1]
    adjusted_r2 = 1 - ((1 - r2) * (n - 1) / (n - p - 1))

    adjusted_r2_scores[model_name] = adjusted_r2

# the model with the least adjusted R-squared
least_adjusted_r2_model = min(adjusted_r2_scores, key=adjusted_r2_scores.get)

print("model with the least adjusted R-squared:", least_adjusted_r2_model)


Model with the least adjusted R-squared: Linear Regression


In [None]:
# split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# initialize and train regression models
models = {
    'Linear Regression': LinearRegression(),
    'Decision Tree Regression': DecisionTreeRegressor(),
    'Random Forest Regression': RandomForestRegressor(),
    'KNN Regression': KNeighborsRegressor()
}

adjusted_r2_scores = {}

for model_name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # calculate R-squared
    r2 = r2_score(y_test, y_pred)

    # calculate adjusted R-squared
    n = len(X_test)
    p = X_test.shape[1]
    adjusted_r2 = 1 - ((1 - r2) * (n - 1) / (n - p - 1))

    adjusted_r2_scores[model_name] = adjusted_r2

    print(f"{model_name}:")
    print(f"R-squared: {r2:.2f}")
    print(f"Adjusted R-squared: {adjusted_r2:.2f}")
    print()


Linear Regression:
R-squared: 0.00
Adjusted R-squared: 0.00

Decision Tree Regression:
R-squared: 0.48
Adjusted R-squared: 0.48

Random Forest Regression:
R-squared: 0.63
Adjusted R-squared: 0.63

KNN Regression:
R-squared: 0.63
Adjusted R-squared: 0.63



In [None]:
0.00<0.48<0.62<0.63

True