In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from xgboost import XGBRegressor, XGBRFRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LassoCV
from sklearn.preprocessing import MinMaxScaler
import warnings
warnings.filterwarnings('ignore')

In [2]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

In [3]:
train.head()

Unnamed: 0,trip_duration,distance_traveled,num_of_passengers,fare,tip,miscellaneous_fees,total_fare,surge_applied
0,748.0,2.75,1.0,75.0,24,6.3,105.3,0
1,1187.0,3.43,1.0,105.0,24,13.2,142.2,0
2,730.0,3.12,1.0,71.25,0,26.625,97.875,1
3,671.0,5.63,3.0,90.0,0,9.75,99.75,0
4,329.0,2.09,1.0,45.0,12,13.2,70.2,0


In [4]:
train.drop_duplicates(inplace=True)

In [5]:
train.describe()

Unnamed: 0,trip_duration,distance_traveled,num_of_passengers,fare,tip,miscellaneous_fees,total_fare,surge_applied
count,205348.0,205348.0,205348.0,205348.0,205348.0,205348.0,205348.0,205348.0
mean,1189.102226,5.123167,1.298654,100.656568,13.24756,15.294384,129.198512,0.285803
std,4824.30216,126.528223,0.939258,86.142902,20.507879,12.621609,99.283531,0.451797
min,0.0,0.02,0.0,0.0,0.0,-0.5,0.0,0.0
25%,454.0,1.98,1.0,52.5,0.0,6.0,73.125,0.0
50%,716.0,3.25,1.0,78.75,9.0,9.75,103.5,0.0
75%,1110.0,5.81,1.0,116.25,20.0,26.525,153.45,1.0
max,86387.0,57283.91,9.0,4466.25,2500.0,435.0,4472.25,1.0


In [6]:
train.shape

(205348, 8)

In [7]:
train.drop(train[train["trip_duration"]<= 0].index, inplace=True)

In [8]:
train.drop(train[train["distance_traveled"]<= 0].index, inplace=True)

In [9]:
train.drop(train[train["num_of_passengers"]<= 0].index, inplace=True)

In [10]:
train.shape

(204564, 8)

In [11]:
train.describe()

Unnamed: 0,trip_duration,distance_traveled,num_of_passengers,fare,tip,miscellaneous_fees,total_fare,surge_applied
count,204564.0,204564.0,204564.0,204564.0,204564.0,204564.0,204564.0,204564.0
mean,1190.750484,5.127173,1.30345,100.682715,13.244012,15.28883,129.215557,0.285515
std,4833.326135,126.770049,0.937736,86.115812,20.494954,12.616267,99.232152,0.45166
min,1.0,0.02,1.0,0.0,0.0,-0.5,0.0,0.0
25%,454.0,2.0,1.0,52.5,0.0,6.0,73.125,0.0
50%,717.0,3.25,1.0,78.75,9.0,9.75,103.5,0.0
75%,1110.0,5.81,1.0,116.25,20.0,26.525,153.45,1.0
max,86387.0,57283.91,9.0,4466.25,2500.0,435.0,4472.25,1.0


In [12]:
y = train["fare"]
train.drop("fare", axis=1, inplace=True)

In [13]:
X_train, X_valid, y_train, y_valid = train_test_split(train, y, test_size=0.3)

In [14]:
X_train_extra = X_train[["tip", "miscellaneous_fees"]]

In [15]:
X_train = X_train[["trip_duration", "distance_traveled", "num_of_passengers", "surge_applied"]]

In [16]:
X_valid_extra = X_valid[["tip", "miscellaneous_fees"]]

In [17]:
X_valid = X_valid[["trip_duration", "distance_traveled", "num_of_passengers", "surge_applied"]]

In [18]:
from scipy.stats import spearmanr

p_val = []
corr = []

for i in X_train.columns:
    
    correlation, p_value = spearmanr(X_train[i], y_train)
    p_val.append(p_value)
    corr.append(correlation)

In [19]:
p_val, corr

([0.0, 0.0, 8.50940914691462e-16, 0.0],
 [0.9087328384260042,
  0.9185716996791125,
  0.021262149386232554,
  0.29758188780035744])

In [20]:
from scipy.stats import kendalltau

p_val = []
corr = []

for i in X_train.columns:
    
    correlation, p_value = kendalltau(X_train[i], y_train)
    p_val.append(p_value)
    corr.append(correlation)

In [21]:
p_val, corr

([0.0, 0.0, 7.884659457162761e-16, 0.0],
 [0.803499582172533,
  0.8081054044816852,
  0.017236743413046388,
  0.24615776215346732])

In [22]:
from scipy.stats import pearsonr

p_val = []
corr = []

for i in X_train.columns:
    
    correlation, p_value = pearsonr(X_train[i], y_train)
    p_val.append(p_value)
    corr.append(correlation)

In [23]:
p_val, corr

([0.0, 4.088585783852782e-30, 0.00027078989446878383, 0.0],
 [0.14695974527932723,
  0.030124644758085087,
  0.009623688928629984,
  0.14343747111547273])

# Scaling the data

In [24]:
scale = MinMaxScaler()
scale.fit(X_train)
X_train_scale = scale.transform(X_train)

In [25]:
X_valid_scale = scale.transform(X_valid)

In [26]:
out_scale = MinMaxScaler()
out_scale.fit(np.array(y_train).reshape(-1,1))
y_train_scale = out_scale.transform(np.array(y_train).reshape(-1,1))

In [27]:
y_valid_scale = out_scale.transform(np.array(y_valid).reshape(-1,1))

In [28]:
X_train_df = pd.DataFrame(X_train_scale, columns=scale.get_feature_names_out())
X_train_df.head()

Unnamed: 0,trip_duration,distance_traveled,num_of_passengers,surge_applied
0,0.009052,0.000104,0.0,1.0
1,0.025629,8.3e-05,0.125,1.0
2,0.017399,0.00021,0.0,0.0
3,0.006494,5.2e-05,0.0,1.0
4,0.010453,9.7e-05,0.0,0.0


In [29]:
X_valid_df = pd.DataFrame(X_valid_scale, columns=scale.get_feature_names_out())
X_valid_df.head()

Unnamed: 0,trip_duration,distance_traveled,num_of_passengers,surge_applied
0,0.016646,8.4e-05,0.125,0.0
1,0.013324,7.8e-05,0.0,0.0
2,0.00536,0.000106,0.0,0.0
3,0.012004,8.6e-05,0.0,0.0
4,0.006714,3.9e-05,0.0,0.0


In [30]:
y_train_df = pd.DataFrame(y_train_scale, columns=['fare'])
y_train_df.head()

Unnamed: 0,fare
0,0.02183
1,0.039463
2,0.040302
3,0.014274
4,0.02351


In [31]:
y_valid_df = pd.DataFrame(y_valid_scale, columns=['fare'])
y_valid_df.head()

Unnamed: 0,fare
0,0.026868
1,0.02183
2,0.020991
3,0.02351
4,0.013434


# Building a Regression Model

In [39]:
model_1 = XGBRegressor()
model_1.fit(X_train_df[["trip_duration"]], y_train_df["fare"])
model_1_pred = model_1.predict(X_valid_df[["trip_duration"]])
model_1_mse = mean_squared_error(y_valid_scale, model_1_pred)
model_1_mse

0.00021578014688373472

In [40]:
model_2 = XGBRegressor()
model_2.fit(X_train_df[["distance_traveled"]], y_train_df["fare"])
model_2_pred = model_2.predict(X_valid_df[["distance_traveled"]])
model_2_mse = mean_squared_error(y_valid_scale, model_2_pred)
model_2_mse

0.0001282520323012554

In [41]:
model_3 = XGBRegressor()
model_3.fit(X_train_df[["num_of_passengers"]], y_train_df["fare"])
model_3_pred = model_3.predict(X_valid_df[["num_of_passengers"]])
model_3_mse = mean_squared_error(y_valid_scale, model_3_pred)
model_3_mse

0.00038458797772346234

In [42]:
model_4 = XGBRegressor()
model_4.fit(X_train_df[["surge_applied"]], y_train_df["fare"])
model_4_pred = model_4.predict(X_valid_df[["surge_applied"]])
model_4_mse = mean_squared_error(y_valid_scale, model_4_pred)
model_4_mse

0.00037873411413975693

In [43]:
model_5 = XGBRegressor()
model_5.fit(X_train_df[["distance_traveled", "trip_duration"]], y_train_df["fare"])
model_5_pred = model_5.predict(X_valid_df[["distance_traveled", "trip_duration"]])
model_5_mse = mean_squared_error(y_valid_scale, model_5_pred)
model_5_mse

9.860604889873457e-05

In [48]:
test_data = test[["distance_traveled", "trip_duration", "num_of_passengers", "surge_applied"]]

In [49]:
train.describe()

Unnamed: 0,trip_duration,distance_traveled,num_of_passengers,tip,miscellaneous_fees,total_fare,surge_applied
count,204564.0,204564.0,204564.0,204564.0,204564.0,204564.0,204564.0
mean,1190.750484,5.127173,1.30345,13.244012,15.28883,129.215557,0.285515
std,4833.326135,126.770049,0.937736,20.494954,12.616267,99.232152,0.45166
min,1.0,0.02,1.0,0.0,-0.5,0.0,0.0
25%,454.0,2.0,1.0,0.0,6.0,73.125,0.0
50%,717.0,3.25,1.0,9.0,9.75,103.5,0.0
75%,1110.0,5.81,1.0,20.0,26.525,153.45,1.0
max,86387.0,57283.91,9.0,2500.0,435.0,4472.25,1.0


In [50]:
test_data["distance_traveled"] = np.where(test_data["distance_traveled"]<=0, 0.02, test_data["distance_traveled"])
test_data["distance_traveled"] = np.where(test_data["distance_traveled"]>=57283.91, 57283.91, test_data["distance_traveled"])

In [51]:
test_data["trip_duration"] = np.where(test_data["trip_duration"]<=0, 1.0, test_data["trip_duration"])
test_data["trip_duration"] = np.where(test_data["trip_duration"]>=86387, 86387, test_data["trip_duration"])

In [52]:
test_data["num_of_passengers"] = np.where(test_data["num_of_passengers"]<=0, 1.0, test_data["num_of_passengers"])
test_data["num_of_passengers"] = np.where(test_data["num_of_passengers"]>=9, 9, test_data["num_of_passengers"])

In [53]:
test_data.describe()

Unnamed: 0,distance_traveled,trip_duration,num_of_passengers,surge_applied
count,89861.0,89861.0,89861.0,89861.0
mean,6.021861,1148.178164,1.290916,0.28228
std,270.269742,4611.478365,0.917319,0.450112
min,0.02,1.0,1.0,0.0
25%,1.95,446.0,1.0,0.0
50%,3.2,705.0,1.0,0.0
75%,5.73,1094.0,1.0,1.0
max,57283.91,86387.0,9.0,1.0


In [54]:
X_test_scale = scale.transform(test_data)
X_test_scale_df = pd.DataFrame(X_test_scale, columns=scale.get_feature_names_out())
X_test_scale_df.head()

Unnamed: 0,trip_duration,distance_traveled,num_of_passengers,surge_applied
0,3.7e-05,0.018783,0.0,0.0
1,6e-06,0.007489,0.375,0.0
2,3.6e-05,0.014943,0.0,0.0
3,2.6e-05,0.010858,0.0,0.0
4,3.4e-05,0.00885,0.0,0.0


In [56]:
test_pred = model_5.predict(X_test_scale_df[["distance_traveled", "trip_duration"]])

In [57]:
test_pred

array([0.02765028, 0.19965778, 0.02765028, ..., 0.23280618, 0.02765028,
       0.23640063], dtype=float32)

In [60]:
test_pred = out_scale.inverse_transform(np.reshape(test_pred, (-1,1)))
test_pred

array([[ 123.49306],
       [ 891.72156],
       [ 123.49306],
       ...,
       [1039.7706 ],
       [ 123.49306],
       [1055.8243 ]], dtype=float32)

In [61]:
test_pred = np.squeeze(test_pred)

In [70]:
total_fare = np.abs(test_pred) + np.array(test["tip"]) + np.array(test["miscellaneous_fees"])
total_fare

array([ 136.99305725,  905.22155762,  153.49305725, ..., 1066.39562988,
        178.69305725, 1084.99934082])

In [71]:
total_fare_df = pd.DataFrame(total_fare, columns=["total_fare"])

In [73]:
total_fare_df.to_csv("total_fare_1.csv", index=False)