In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

### Importing Data

In [None]:
# Load the ride records; column 0 of the CSV is used as the DataFrame index
DATA_PATH = 'uber.csv'
df = pd.read_csv(DATA_PATH, index_col=0)

# Preview the first rows to confirm the load worked
df.head()

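The unnamed first column was already consumed as the index when loading (`index_col=0`); here we drop rows containing null values.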

In [None]:
# Drop Null Values
# Report the per-column null counts explicitly: only the last expression of a
# cell is displayed, so a bare `df.isnull().sum()` here would be discarded.
print(df.isnull().sum())

# Remove every row that has at least one missing value.
df.dropna(inplace=True)

In [None]:
# Number of rows that exactly repeat an earlier row (all columns equal)
n_duplicate_rows = df.duplicated().sum()
n_duplicate_rows

### Parsing Date and Time

In [None]:
# Convert the pickup timestamp column to pandas datetimes so the `.dt`
# accessor can be used on it afterwards
df = df.assign(pickup_datetime=pd.to_datetime(df['pickup_datetime']))

In [None]:
# `pickup_datetime` is already converted to datetimes by the cell directly
# above, so the duplicated pd.to_datetime call is not repeated here.

# Extract features: hour, day_of_week, month
pickup = df['pickup_datetime'].dt
df['hour'] = pickup.hour
df['day_of_week'] = pickup.dayofweek  # pandas convention: Monday=0 ... Sunday=6
df['month'] = pickup.month

### Checking for invalid fare prices

In [None]:
# Scatter every fare against its row index to eyeball outliers and
# non-positive fares
fig, ax = plt.subplots(figsize=(5, 4))
ax.scatter(df.index, df['fare_amount'], alpha=0.5)
ax.set_xlabel('Index')
ax.set_ylabel('Fare Amount')
ax.set_title('Detecting Outliers in Fare Amount')
ax.tick_params(axis='x', labelrotation=25)  # tilt x tick labels
plt.show()

In [None]:
# Keep only trips with a strictly positive fare, then renumber the rows from 0
has_positive_fare = df['fare_amount'] > 0
df = df.loc[has_positive_fare].reset_index(drop=True)
df.head()

### Checking for invalid latitude and longitude

In [None]:
def _count_outside(values, limit):
    """Count entries strictly below -limit or strictly above +limit."""
    return (values < -limit).sum() + (values > limit).sum()


# Longitudes must lie within [-180, 180] and latitudes within [-90, 90]
invalid_pickup_longitude = _count_outside(df['pickup_longitude'], 180)
invalid_pickup_latitude = _count_outside(df['pickup_latitude'], 90)
invalid_dropoff_longitude = _count_outside(df['dropoff_longitude'], 180)
invalid_dropoff_latitude = _count_outside(df['dropoff_latitude'], 90)

print(f"Invalid Pickup Longitude: {invalid_pickup_longitude}")
print(f"Invalid Pickup Latitude: {invalid_pickup_latitude}")
print(f"Invalid Dropoff Longitude: {invalid_dropoff_longitude}")
print(f"Invalid Dropoff Latitude: {invalid_dropoff_latitude}")

In [None]:
# Keep only rows whose pickup and drop-off coordinates are valid:
# longitude in [-180, 180] and latitude in [-90, 90]. The bounds are inclusive
# so that exactly the rows counted as invalid by the check above are removed.
coordinate_limits = {
    'pickup_longitude': 180,
    'pickup_latitude': 90,
    'dropoff_longitude': 180,
    'dropoff_latitude': 90,
}

# Build one combined mask instead of re-slicing the frame eight times.
valid_coordinates = pd.Series(True, index=df.index)
for column, limit in coordinate_limits.items():
    # Series.between is inclusive on both ends by default; NaN compares False.
    valid_coordinates &= df[column].between(-limit, limit)

# reset_index returns a fresh frame (as done after the fare filter), so adding
# new columns later does not operate on a boolean-indexed slice.
df = df[valid_coordinates].reset_index(drop=True)

### Calculating Distance using the Haversine Formula

In [None]:
# Great-circle distance using the haversine formula
def coordinate_2_distance(lon1, lat1, lon2, lat2, earth_radius_km=6371.0):
    """Return the great-circle distance between two points on a sphere.

    Parameters
    ----------
    lon1, lat1 : float or array-like
        Longitude and latitude of the first point(s), in decimal degrees.
    lon2, lat2 : float or array-like
        Longitude and latitude of the second point(s), in decimal degrees.
    earth_radius_km : float, default 6371.0
        Radius of the sphere. The result is expressed in the same unit as
        this radius (kilometres for the default).

    Returns
    -------
    float or array-like
        Distance(s) along the sphere's surface; element-wise when the inputs
        are arrays or pandas Series.
    """
    lon1, lat1, lon2, lat2 = (np.radians(v) for v in (lon1, lat1, lon2, lat2))
    dlon = lon2 - lon1
    dlat = lat2 - lat1

    # Haversine of the central angle between the two points.
    a = np.sin(dlat / 2) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2) ** 2

    # Mathematically a <= 1, but floating-point rounding can push it a hair
    # above 1 for (near-)antipodal points, which would make arcsin return NaN.
    # np.minimum caps the value and still propagates NaN inputs.
    a = np.minimum(a, 1.0)

    c = 2 * np.arcsin(np.sqrt(a))
    return earth_radius_km * c

In [None]:
# Haversine distance between each trip's pickup and drop-off coordinates
df['distance_in_km'] = coordinate_2_distance(
    lon1=df['pickup_longitude'],
    lat1=df['pickup_latitude'],
    lon2=df['dropoff_longitude'],
    lat2=df['dropoff_latitude'],
)

In [None]:
# Flag weekend trips: `day_of_week` comes from `.dt.dayofweek` (Monday=0), so
# values 5 and 6 are Saturday and Sunday. A vectorised comparison replaces the
# row-by-row `.apply(lambda ...)`; the result is the same 0/1 int64 column.
df['is_weekend'] = (df['day_of_week'] >= 5).astype('int64')

### Creating Train and Test Split

In [None]:
from sklearn.model_selection import train_test_split
# Model inputs (engineered trip features) and the regression target (fare)
features = [
    'distance_in_km',
    'hour',
    'day_of_week',
    'month',
    'is_weekend',
]
y = df['fare_amount']
X = df[features]

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

### Training and Testing Model

In [None]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Fit a 200-tree random forest on the training split.
# The seed is fixed (matching the train/test split's random_state=42) so the
# bootstrap samples and feature sub-sampling, and therefore the reported
# metrics, are reproducible on every run.
model = RandomForestRegressor(n_estimators=200, random_state=42)
model.fit(X_train, y_train)

# Predict fares for the held-out test rows.
y_pred_rf = model.predict(X_test)

In [None]:
from sklearn.metrics import mean_squared_error, r2_score
# Score the hold-out predictions.
# RMSE is the square root of the mean squared error, so it is in fare units.
mse_rf = mean_squared_error(y_test, y_pred_rf)
rmse_rf = np.sqrt(mse_rf)

# R-squared: share of the fare variance explained by the model
r2_rf = r2_score(y_test, y_pred_rf)

# Report both metrics
print(f"Random Forest RMSE: {rmse_rf}")
print(f"Random Forest R2: {r2_rf}")