In [None]:
#import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings 
# Silence every warning for the rest of the session so cell output stays uncluttered.
# NOTE(review): this also hides deprecation and other library warnings — confirm that is intended.
warnings.filterwarnings("ignore") 

In [None]:
# Load the raw ride records from the CSV file that sits next to the notebook.
data = pd.read_csv(filepath_or_buffer="uber.csv")

In [None]:
# Work on an independent (deep) copy so the freshly loaded frame stays untouched.
df = data.copy(deep=True)

In [None]:
# Preview the first five rows.
df.head(n=5)

In [None]:
# Summarise the frame: column dtypes, non-null counts and memory usage.
df.info()

In [None]:
# pickup_datetime is not in the required format yet; parse it into pandas datetimes.
pickup_ts = pd.to_datetime(df["pickup_datetime"])
df["pickup_datetime"] = pickup_ts

In [None]:
# Re-check the column dtypes after the pickup_datetime conversion.
df.info()

In [None]:
# Descriptive statistics of the data.
df.describe()

In [None]:
# Count the missing values in each column.
df.isna().sum()

In [None]:
# Pairwise correlation, computed over the numeric columns only.
numeric_cols = df.select_dtypes(include="number")
numeric_cols.corr()

In [None]:
# List the column labels of the frame.
print(df.columns)

In [None]:
# Discard every row that has at least one missing value.
df = df.dropna()

In [None]:
# Box plot of the fares, drawn on an explicit Axes so the figure carries a
# title and axis label and can be read on its own when the notebook is skimmed.
fig, ax = plt.subplots()
ax.boxplot(df['fare_amount'])
ax.set_title("Distribution of fare_amount")
ax.set_ylabel("fare_amount")
# Render the figure explicitly instead of echoing the dict of artists
# that boxplot() returns as the cell output.
plt.show()

In [None]:
# Trim extreme fares: keep only rows strictly between the 1st and 99th percentiles.
fare = df["fare_amount"]
q_low, q_hi = fare.quantile(0.01), fare.quantile(0.99)

df = df[fare.gt(q_low) & fare.lt(q_hi)]

In [None]:
# Confirm that no missing values remain after the cleaning steps.
df.isna().sum()

In [None]:
#Time to apply learning models
from sklearn.model_selection import train_test_split

In [None]:
# Target variable: the fare to be predicted.
y = df["fare_amount"]
# Predictor variables: every remaining column.
x = df.drop(columns="fare_amount")


In [None]:
# Prepare the predictors so the estimators can consume them.
# 1. Turn the pickup timestamp into a plain number so it can act as a numeric feature.
x['pickup_datetime'] = pd.to_numeric(pd.to_datetime(x['pickup_datetime']))
# 2. Drop the columns whose name starts with "Unnamed" (typically a saved index).
#    The mask has to be negated: without `~` ONLY the Unnamed columns were kept,
#    so every real predictor was thrown away before training.
x = x.loc[:, ~x.columns.str.contains('^Unnamed')]
# 3. Keep numeric columns only, because the estimators cannot fit on text columns.
#    NOTE(review): assumes any remaining non-numeric columns are identifiers rather
#    than features — confirm against the CSV schema.
x = x.select_dtypes(include=[np.number])

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=1,
)

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
# Build and fit the linear model in one step (fit() returns the estimator itself),
# then echo the fitted estimator as the cell output.
lrmodel = LinearRegression().fit(x_train, y_train)
lrmodel

In [None]:
# Predict fares for the held-out test split with the fitted linear model.
predict = lrmodel.predict(x_test)

In [None]:
# Evaluate the linear model on the held-out split: RMSE and R².

from sklearn.metrics import mean_squared_error, r2_score

lr_mse = mean_squared_error(y_test, predict)
lr_rmse = np.sqrt(lr_mse)
lr_r2 = r2_score(y_test, predict)

print("Linear Regression → RMSE:", lr_rmse, "R²:", lr_r2)

In [None]:
# Second model: a random forest of 100 trees, seeded so results are repeatable.
from sklearn.ensemble import RandomForestRegressor
rfrmodel = RandomForestRegressor(
    random_state=101,
    n_estimators=100,
)

In [None]:
# Train the forest on the training split, then predict fares for the test split.
rfrmodel_pred = rfrmodel.fit(x_train, y_train).predict(x_test)

In [None]:
# Evaluate the forest on the held-out split: RMSE and R².
rfr_mse = mean_squared_error(y_test, rfrmodel_pred)
rfr_rmse = np.sqrt(rfr_mse)
rfr_r2 = r2_score(y_test, rfrmodel_pred)

print("Random Forest → RMSE:", rfr_rmse, "R²:", rfr_r2)