In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error

In [2]:
# Load the dataset
df = pd.read_csv('uber.csv')

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200000 entries, 0 to 199999
Data columns (total 9 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   Unnamed: 0         200000 non-null  int64  
 1   key                200000 non-null  object 
 2   fare_amount        200000 non-null  float64
 3   pickup_datetime    200000 non-null  object 
 4   pickup_longitude   200000 non-null  float64
 5   pickup_latitude    200000 non-null  float64
 6   dropoff_longitude  199999 non-null  float64
 7   dropoff_latitude   199999 non-null  float64
 8   passenger_count    200000 non-null  int64  
dtypes: float64(5), int64(2), object(2)
memory usage: 13.7+ MB


In [4]:
df.describe()

Unnamed: 0.1,Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
count,200000.0,200000.0,200000.0,200000.0,199999.0,199999.0,200000.0
mean,27712500.0,11.359955,-72.527638,39.935885,-72.525292,39.92389,1.684535
std,16013820.0,9.901776,11.437787,7.720539,13.117408,6.794829,1.385997
min,1.0,-52.0,-1340.64841,-74.015515,-3356.6663,-881.985513,0.0
25%,13825350.0,6.0,-73.992065,40.734796,-73.991407,40.733823,1.0
50%,27745500.0,8.5,-73.981823,40.752592,-73.980093,40.753042,1.0
75%,41555300.0,12.5,-73.967154,40.767158,-73.963658,40.768001,2.0
max,55423570.0,499.0,57.418457,1644.421482,1153.572603,872.697628,208.0


In [5]:
df.head()

Unnamed: 0.1,Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,24238194,2015-05-07 19:52:06.0000003,7.5,2015-05-07 19:52:06 UTC,-73.999817,40.738354,-73.999512,40.723217,1
1,27835199,2009-07-17 20:04:56.0000002,7.7,2009-07-17 20:04:56 UTC,-73.994355,40.728225,-73.99471,40.750325,1
2,44984355,2009-08-24 21:45:00.00000061,12.9,2009-08-24 21:45:00 UTC,-74.005043,40.74077,-73.962565,40.772647,1
3,25894730,2009-06-26 08:22:21.0000001,5.3,2009-06-26 08:22:21 UTC,-73.976124,40.790844,-73.965316,40.803349,3
4,17610152,2014-08-28 17:47:00.000000188,16.0,2014-08-28 17:47:00 UTC,-73.925023,40.744085,-73.973082,40.761247,5


In [6]:
# Preprocess the dataset
# Remove any rows with missing values
df.dropna(inplace=True)

In [7]:
# Identify and remove outliers
# For example, we can use the following code to remove any rides with a fare amount greater than $100
df = df[df['fare_amount'] < 100]

In [8]:
# Check the correlation between the features
# For example, we can use the following code to generate a correlation matrix
corr_matrix = df.corr()
print(corr_matrix)

                   Unnamed: 0  fare_amount  pickup_longitude  pickup_latitude  \
Unnamed: 0           1.000000     0.000045          0.000289        -0.000387   
fare_amount          0.000045     1.000000          0.007301        -0.005944   
pickup_longitude     0.000289     0.007301          1.000000        -0.816269   
pickup_latitude     -0.000387    -0.005944         -0.816269         1.000000   
dropoff_longitude    0.000322     0.006246          0.832830        -0.774562   
dropoff_latitude     0.000219    -0.008269         -0.846126         0.702054   
passenger_count      0.002252     0.012056         -0.000432        -0.001546   

                   dropoff_longitude  dropoff_latitude  passenger_count  
Unnamed: 0                  0.000322          0.000219         0.002252  
fare_amount                 0.006246         -0.008269         0.012056  
pickup_longitude            0.832830         -0.846126        -0.000432  
pickup_latitude            -0.774562          0.702054 

  corr_matrix = df.corr()


In [9]:
# Select the most important features
# For example, we can select the features that have a correlation of greater than 0.5 with the target variable
features = ['pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude', 'passenger_count']

In [10]:
# Split the dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(df[features], df['fare_amount'], test_size=0.25, random_state=42)

In [11]:
# Train the linear regression model
linear_regression_model = LinearRegression()
linear_regression_model.fit(X_train, y_train)

In [12]:
# Train the random forest regression model
random_forest_regression_model = RandomForestRegressor()
random_forest_regression_model.fit(X_train, y_train)

In [13]:
# Make predictions on the test set
linear_regression_predictions = linear_regression_model.predict(X_test)
random_forest_regression_predictions = random_forest_regression_model.predict(X_test)

In [14]:
# Evaluate the models
linear_regression_r2 = r2_score(y_test, linear_regression_predictions)
linear_regression_rmse = mean_squared_error(y_test, linear_regression_predictions)**0.5

In [15]:
random_forest_regression_r2 = r2_score(y_test, random_forest_regression_predictions)
random_forest_regression_rmse = mean_squared_error(y_test, random_forest_regression_predictions)**0.5

In [16]:
# Print the results
print('Linear regression:')
print('R2:', linear_regression_r2)
print('RMSE:', linear_regression_rmse)

Linear regression:
R2: -6.819297748106301e-05
RMSE: 9.343839813756933


In [17]:
print('Random forest regression:')
print('R2:', random_forest_regression_r2)
print('RMSE:', random_forest_regression_rmse)

Random forest regression:
R2: 0.8072786162466434
RMSE: 4.101809907281002
