In [None]:
import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this also hides pandas / scikit-learn warnings that can point at
# real problems (deprecations, shape conversions); consider a narrower filter.
warnings.filterwarnings('ignore')

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the dataset from a path relative to the notebook's working directory
# and preview the first rows.
df = pd.read_csv('./datasets/uber.csv')
df.head()

In [None]:
# (rows, columns) of the freshly loaded frame.
df.shape

**Preprocessing the dataset**

In [None]:
# Column dtypes and non-null counts before any cleaning.
df.info()

In [None]:
# Number of distinct values in 'key'; a missing value, if present, counts as one
# distinct value.
df['key'].nunique(dropna=False)

In [None]:
# 'Unnamed: 0' and 'key' are not referenced anywhere later in the notebook,
# so remove them from the working frame.
df = df.drop(columns=['Unnamed: 0', 'key'])

In [None]:
# Preview the frame after the two columns were dropped.
df.head()

In [None]:
# Missing values per column.
df.isnull().sum()

In [None]:
# Remaining column names.
df.columns

In [None]:
# Drop every row that has at least one missing value (mutates df in place),
# then re-check that no nulls remain.
df.dropna(inplace=True)
df.isnull().sum()

In [None]:
# Turn the pickup timestamp into a numeric feature: the parsed datetime's integer
# representation divided by 10**9 (epoch seconds when the values are stored as
# nanoseconds).
# 'int64' is requested explicitly because plain `int` can resolve to a 32-bit
# integer on some platforms (e.g. Windows with NumPy < 2), where the conversion
# from datetime fails.
df['pickup_datetime'] = pd.to_datetime(df['pickup_datetime']).astype('int64') / 10**9
df.head(10)

In [None]:
# Confirm the dtypes after the datetime conversion.
df.info()

**Handling Outliers**

In [None]:
# Draw one standalone 5x5 box plot per column to eyeball outliers.
for col in df.columns:
    fig, ax = plt.subplots(figsize=(5, 5))
    ax.boxplot(x=df[col])
    ax.set_title(col)

In [None]:
# Handling outliers in longitudes and latitudes

# Keep only rows whose pickup and dropoff coordinates are inside the valid ranges:
# latitude in [-90, 90] and longitude in [-180, 180] (bounds inclusive).
valid_lat = df['pickup_latitude'].between(-90, 90) & df['dropoff_latitude'].between(-90, 90)
valid_lon = df['pickup_longitude'].between(-180, 180) & df['dropoff_longitude'].between(-180, 180)
df = df[valid_lat & valid_lon]

df.shape

**Correlation**

In [None]:
# Annotated pairwise correlation matrix of the columns in df.
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(df.corr(), annot=True, cmap='coolwarm', ax=ax)

**Calculating distance parameter**

In [None]:
def calc_dist(lat_1, lat_2, lon_1, lon_2, radius_km=6371):
    """Great-circle (haversine) distance between two points given in degrees.

    Works element-wise, so scalars, NumPy arrays and pandas Series are all
    accepted (array-like arguments must be broadcastable against each other).

    Parameters
    ----------
    lat_1, lat_2 : latitudes of the first and second point, in degrees.
    lon_1, lon_2 : longitudes of the first and second point, in degrees.
    radius_km : sphere radius; the default 6371 is the Earth's mean radius in
        kilometres, so the result is in kilometres.

    Returns
    -------
    Distance along the sphere's surface, in the same unit as ``radius_km``.
    """
    lat_1, lat_2, lon_1, lon_2 = map(np.radians, [lat_1, lat_2, lon_1, lon_2])
    diff_lat = lat_2 - lat_1
    diff_lon = lon_2 - lon_1

    # Haversine term; mathematically in [0, 1].
    hav = np.sin(diff_lat / 2) ** 2 + np.cos(lat_1) * np.cos(lat_2) * np.sin(diff_lon / 2) ** 2
    # Floating-point rounding can push the term marginally outside [0, 1]
    # (e.g. for near-antipodal points), which would make sqrt/arcsin return NaN.
    hav = np.minimum(1.0, np.maximum(0.0, hav))

    return 2 * radius_km * np.arcsin(np.sqrt(hav))

In [None]:
# Haversine distance between pickup and dropoff for every trip.
# calc_dist is built from NumPy ufuncs, so whole columns can be passed at once;
# this replaces a per-row Python loop with chained label lookups and keeps the
# result aligned to df's index.
df['Distance'] = calc_dist(
    df['dropoff_latitude'],
    df['pickup_latitude'],
    df['dropoff_longitude'],
    df['pickup_longitude'],
)
df.head(10)

In [None]:
# Distribution of the computed distances, to spot outliers.
sns.boxplot(x=df['Distance'])

In [None]:
# Removing distance outliers

# Fences at 1.5 * IQR below Q1 and above Q3; only rows strictly between the
# two fences are kept.
q1, q3 = np.percentile(df['Distance'], [25, 75])
iqr = q3 - q1
lower_limit, upper_limit = q1 - 1.5 * iqr, q3 + 1.5 * iqr

inside_fences = (df['Distance'] > lower_limit) & (df['Distance'] < upper_limit)
df = df[inside_fences]

df.shape

**Regression Models**

In [None]:
# Features: numeric pickup time, passenger count and the computed distance.
X = df[['pickup_datetime','passenger_count','Distance']]
# Select the target by name rather than by position (previously df.iloc[:,0]),
# so it does not silently change if the column order changes.
# NOTE(review): assumes column 0 was 'fare_amount' - confirm against df.head().
y = df['fare_amount']

In [None]:
# Inspect the feature matrix.
X

In [None]:
# Inspect the target series.
y

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=2,
)

In [None]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training split only and reuse its statistics for the
# test split, so no test information is used during fitting.
# NOTE: from here on X_train / X_test are rebound to the scaler's output
# (NumPy arrays with StandardScaler's default settings), not DataFrames.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
X_train

In [None]:
# Shapes of the four splits, shown in the order X_train, X_test, y_train, y_test.
display(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

In [None]:
from sklearn.linear_model import LinearRegression

# Ordinary least-squares baseline with default settings.
lr = LinearRegression()

In [None]:
# Convert the targets to flat 1-D NumPy arrays.
# scikit-learn estimators expect y of shape (n_samples,); a column vector of
# shape (n_samples, 1) makes RandomForestRegressor emit a DataConversionWarning
# and makes LinearRegression return 2-D predictions.
# np.asarray also keeps this cell safe to re-run (an ndarray has no `.values`).
y_train = np.asarray(y_train).ravel()
y_test = np.asarray(y_test).ravel()

In [None]:
# Fit the linear model on the scaled training features.
lr.fit(X_train, y_train)

In [None]:
# Predict the target for the held-out split.
y_pred = lr.predict(X_test)
y_pred

In [None]:
# Actual target values of the held-out split, for comparison with y_pred.
y_test

In [None]:
from sklearn import metrics

# Test-split error metrics for the linear model; the MSE is computed once and
# reused for the RMSE.
mse = metrics.mean_squared_error(y_test, y_pred)
print("Mean Absolute Error: ", metrics.mean_absolute_error(y_test, y_pred))
print("Mean Squared Error: ", mse)
print("RMSE: ", np.sqrt(mse))
print("R2 Score: ", metrics.r2_score(y_test, y_pred))

**Random Forest Regression**

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Forest of 100 trees; the fixed random_state makes the fit reproducible.
rfr = RandomForestRegressor(n_estimators=100, random_state=2)

In [None]:
# Fit the forest on the scaled training features.
rfr.fit(X_train, y_train)

In [None]:
# Forest predictions for the held-out split.
y_pred_rfr = rfr.predict(X_test)
y_pred_rfr

In [None]:
from sklearn import metrics

# Test-split error metrics for the random forest; the MSE is computed once and
# reused for the RMSE.
mse_rfr = metrics.mean_squared_error(y_test, y_pred_rfr)
print("Mean Absolute Error: ", metrics.mean_absolute_error(y_test, y_pred_rfr))
print("Mean Squared Error: ", mse_rfr)
print("RMSE: ", np.sqrt(mse_rfr))
print("R2 Score: ", metrics.r2_score(y_test, y_pred_rfr))

In [None]:
# Hyperparameter tuning: R2 on the test split for forests of 1..30 trees.
#
# Loop-local names are used so the 100-tree `rfr` and its `y_pred_rfr` fitted
# earlier are not overwritten by the last (30-tree) candidate.
# NOTE(review): scoring candidates on the test split leaks test information into
# model selection; prefer a validation split or cross-validation on the
# training data.

r2Scores = []
for n_trees in range(1, 31):
    candidate = RandomForestRegressor(n_estimators=n_trees, random_state=2)
    # ravel: the forest expects a 1-D target.
    candidate.fit(X_train, np.ravel(y_train))
    candidate_pred = candidate.predict(X_test)
    r2Scores.append(metrics.r2_score(y_test, candidate_pred))

In [None]:
# R2 on the test split as a function of the number of trees.
plt.figure(figsize=(10,8))
# Derive the x values from the score list so the two can never get out of sync.
plt.plot(range(1, len(r2Scores) + 1), r2Scores)
plt.xlabel("Decision Trees")
# The plotted metric is R2 (a regression score), not classification accuracy.
plt.ylabel("R2 Score")
plt.title("Elbow Plot - Random Forest Regression");

In [None]:
# Scaled training features (output of the scaler).
X_train

In [None]:
# Fare vs. scaled distance for the training split, with the forest's fitted
# values overlaid.
# X_train is a NumPy array after scaling, so positional slicing is used instead
# of .iloc; the last column is Distance. Only that column is scattered against
# y so that x and y have the same number of points.
distance_scaled = np.asarray(X_train)[:, -1]
by_distance = np.argsort(distance_scaled)

plt.scatter(distance_scaled, np.ravel(y_train))
# Sort by distance so the line runs left to right instead of zig-zagging.
plt.plot(distance_scaled[by_distance], rfr.predict(X_train)[by_distance], color='red')
plt.xlabel('Distance')
plt.ylabel('Fare Amount');

**Testing without using distance**

In [None]:
# Re-check which columns are available for the second experiment.
df.head()

In [None]:
# Second feature set: raw coordinates, passenger count and pickup time, with the
# computed 'Distance' column deliberately left out.
# Only columns that exist in df are listed: 'normalized_distance' and
# 'normalized_dates' are never created in this notebook and raised a KeyError.
X2 = df[['pickup_longitude','pickup_latitude','dropoff_longitude','dropoff_latitude','passenger_count','pickup_datetime']]
y2 = df['fare_amount']
X2

In [None]:
# Target for the second experiment.
y2

In [None]:
# Same 80/20 split settings and seed as the first experiment.
# Note: these features are not passed through a scaler in this cell.
X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, test_size=0.2, random_state=2)

In [None]:
# Linear baseline on the second feature set.
lr2 = LinearRegression()
lr2.fit(X2_train, y2_train)

In [None]:
# Linear-model predictions for the second experiment's test split.
y2_pred = lr2.predict(X2_test)
y2_pred

In [None]:
from sklearn import metrics

# Test-split error metrics for the second linear model; the MSE is computed
# once and reused for the RMSE.
mse2 = metrics.mean_squared_error(y2_test, y2_pred)
print("Mean Absolute Error: ", metrics.mean_absolute_error(y2_test, y2_pred))
print("Mean Squared Error: ", mse2)
print("RMSE: ", np.sqrt(mse2))
print("R2 Score: ", metrics.r2_score(y2_test, y2_pred))

In [None]:
# 100-tree forest on the second feature set, same seed as the first forest.
rfr2 = RandomForestRegressor(n_estimators=100,random_state=2)
rfr2.fit(X2_train, y2_train)

In [None]:
# Forest predictions for the second experiment's test split.
y2_pred_rfr = rfr2.predict(X2_test)

In [None]:
from sklearn import metrics

# Test-split error metrics for the second random forest.
# These are computed from the forest's predictions (y2_pred_rfr); the cell
# previously passed y2_pred and therefore re-printed the linear model's numbers.
mse2_rfr = metrics.mean_squared_error(y2_test, y2_pred_rfr)
print("Mean Absolute Error: ", metrics.mean_absolute_error(y2_test, y2_pred_rfr))
print("Mean Squared Error: ", mse2_rfr)
print("RMSE: ", np.sqrt(mse2_rfr))
print("R2 Score: ", metrics.r2_score(y2_test, y2_pred_rfr))