In [1]:
cd Downloads\\cab_fare_prediction

C:\Users\apurb\Downloads\cab_fare_prediction


In [2]:
# Import all the libraries
import numpy as np
import pandas as pd
from sklearn import *
import seaborn as sns
from xgboost import *
from sklearn.model_selection import train_test_split

In [3]:
# Import train and test dataset
train = pd.read_csv("TRAIN.csv")
test = pd.read_csv("TEST.csv")

In [4]:
# Combine both the dataset
df = pd.concat([train,test]).reset_index(drop=True)

In [5]:
df.head()

Unnamed: 0,index,time_stamp,cab_provider,source,destination,distance,surge_multiplier,cab_type,fare
0,0,1543203646318,Lyft,Boston University,Theatre District,3.03,1.0,Lux Black XL,34.0
1,1,1543203646319,Uber,South Station,Theatre District,1.3,1.0,Black,18.5
2,2,1543203646320,Uber,Theatre District,Fenway,2.71,1.0,UberX,19.5
3,3,1543203646320,Lyft,Northeastern University,Beacon Hill,2.43,1.0,Lyft,10.5
4,4,1543203646320,Uber,Theatre District,Fenway,2.71,1.0,UberXL,32.0


In [6]:
# Dropping the index column
df.drop("index", axis=1, inplace=True)

In [7]:
# Checking the number of null values per column
df.isnull().sum()

time_stamp              0
cab_provider            0
source                  0
destination             0
distance                0
surge_multiplier        0
cab_type                0
fare                25000
dtype: int64

In [8]:
# Converting the time_stamp column into timestamp datatype
df['time_stamp'] = pd.to_datetime(df['time_stamp'], unit='ms')

In [9]:
# Checkng the datatime column
df['time_stamp'].describe(datetime_is_numeric=True)

count                           125000
mean     2018-11-27 11:44:37.038046464
min         2018-11-26 03:40:46.318000
25%      2018-11-27 00:36:14.387000064
50%      2018-11-27 11:15:22.329999872
75%      2018-11-27 22:06:23.409999872
max         2018-11-28 14:01:23.687000
Name: time_stamp, dtype: object

In [10]:
# Returns the hour of the current time e.g. for 6:30 it will return 6
def hour_convert(time):
    return time.time().hour

In [11]:
# Converting the time_stamp with above function
df['time_stamp'] = list(map(hour_convert, df['time_stamp']))

In [12]:
# Change the data into 4 intervals
def interval_convert(time):
    if  0 <= time <= 6:
        return 'midnight'
    elif 7 <= time <=12:
        return 'morning'
    elif 13<= time <=19:
        return 'mid-day'
    else:
        return 'Night'

In [13]:
df['time_stamp'] = list(map(interval_convert, df['time_stamp']))

In [14]:
# One-Hot-Encoding the new time_stamp column
df = pd.get_dummies(df, columns=['time_stamp'])

In [15]:
label_encoder = preprocessing.LabelEncoder()

In [16]:
df.cab_provider.unique()

array(['Lyft', 'Uber'], dtype=object)

In [17]:
# Label encoding the cab column
df.cab_provider = df.cab_provider.replace({"Lyft":0, "Uber":1})

In [18]:
df.groupby('cab_provider')['surge_multiplier'].value_counts()

cab_provider  surge_multiplier
0             1.00                56075
              1.25                 2168
              1.50                  974
              1.75                  500
              2.00                  426
              2.50                   31
              3.00                    7
1             1.00                64819
Name: surge_multiplier, dtype: int64

In [19]:
# Aggregating fare and distance per cab_type
df.groupby(['cab_provider','cab_type']).agg({'fare':['sum'], 'distance':['sum']})

Unnamed: 0_level_0,Unnamed: 1_level_0,fare,distance
Unnamed: 0_level_1,Unnamed: 1_level_1,sum,sum
cab_provider,cab_type,Unnamed: 2_level_2,Unnamed: 3_level_2
0,Lux,142537.5,21692.49
0,Lux Black,182295.85,21594.48
0,Lux Black XL,261187.0,21947.28
0,Lyft,76316.85,21916.79
0,Lyft XL,123695.6,22059.06
0,Shared,48162.0,21887.74
1,Black,179532.0,23882.24
1,Black SUV,263908.5,23809.24
1,UberPool,75015.5,23413.93
1,UberX,84022.5,23614.21


In [20]:
# Calculating the cost of Cab_type per KiloMeter
df.groupby(['cab_provider','cab_type']).agg({'fare':['sum']}).values / df.groupby(['cab_provider','cab_type']).agg({'distance':['sum']}).values

array([[ 6.57082244],
       [ 8.44178003],
       [11.90065466],
       [ 3.48211805],
       [ 5.60747375],
       [ 2.20040991],
       [ 7.5173853 ],
       [11.08428912],
       [ 3.20388333],
       [ 3.55813301],
       [ 5.71116595],
       [ 3.55852122]])

In [21]:
# Creating new feature with the above data
df['USD/KM'] = df.cab_type.replace({'Lux':6.6, 'Lux Black':8.4, 'Lux Black XL':11.9, 'Lyft':3.5, 'Lyft XL':5.6, 'Shared':2.2,
                       'Black':7.5, 'Black SUV':11.1, 'UberPool':3.2, 'UberX':3.5, 'UberXL':5.7, 'WAV':3.6})

In [22]:
#Encoding USD/KM column with 3 category
def usd_km_convert(price):
    if 2.2<= price <=4:
        return 'Budget_class'
    elif 4< price <=7.5:
        return 'Mid_class'
    else:
        return 'High_class'

In [23]:
df['USD/KM'] = list(map(usd_km_convert, df['USD/KM']))

In [24]:
# One-Hot-Encoding the new USD/KM column
df = pd.get_dummies(df, columns=['USD/KM'])

In [25]:
# One-Hot-Encoding the new cab_type column
df = pd.get_dummies(df, columns=['cab_type'])

In [26]:
df.source.unique()

array(['Boston University', 'South Station', 'Theatre District',
       'Northeastern University', 'Beacon Hill', 'North Station',
       'West End', 'North End', 'Haymarket Square', 'Financial District',
       'Fenway', 'Back Bay'], dtype=object)

In [27]:
# One-Hot-Encoding the new source column
df = pd.get_dummies(df, columns=['source'])

In [28]:
df.distance

0         3.03
1         1.30
2         2.71
3         2.43
4         2.71
          ... 
124995    3.05
124996    3.05
124997    3.05
124998    2.96
124999    2.96
Name: distance, Length: 125000, dtype: float64

In [29]:
# Dividing the Distance column into 4 intervals
df.distance = pd.cut(df['distance'], 4)

In [30]:
# After dividing, label encode them 
df['distance']= label_encoder.fit_transform(df['distance'])

In [31]:
# One-Hot-Encoding the destination column
df = pd.get_dummies(df, columns=['destination'])

In [32]:
df['surge_multiplier'].value_counts()

1.00    120894
1.25      2168
1.50       974
1.75       500
2.00       426
2.50        31
3.00         7
Name: surge_multiplier, dtype: int64

In [33]:
# One-Hot-Encoding the new surge_multiplier column
df['surge_multiplier'] = label_encoder.fit_transform(df['surge_multiplier'])

In [34]:
# Returning whether multiplier applied or not
def multi(num):
    if num==1.0:
        return 0
    else:
        return 1

In [35]:
# Creating a column of that data
df['multiplier_applied'] = list(map(multi, df['surge_multiplier']))

In [36]:
# Splitting the original data back into its original form
train_df = df[:100000]
test_df = df[100000:].drop('fare', axis=1)

In [37]:
# Splitting the features and target columns
X = train_df.drop('fare', axis=1)
y = train_df['fare']/np.array(train_df['fare'].mean())

In [38]:
# Splitting the train data train and test for model training
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state=42)

In [39]:
# Initiating the XGB Model with hyperparameters
xgb_model = XGBRegressor(base_score=0.4, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.4603, gamma=0.05,
             gpu_id=0, importance_type='gain', interaction_constraints='',
             learning_rate=0.999, max_delta_step=1, max_depth=4,
             min_child_weight=1.7817, monotone_constraints='()',
             n_estimators=220, n_jobs=3, nthread=-1, num_parallel_tree=5,
             random_state=8, reg_alpha=0.364, reg_lambda=0.671,
             scale_pos_weight=1, subsample=0.5213,silent = True,tree_method='exact',
             validate_parameters=1, verbosity=0)

In [40]:
# Fitting the data, predicting on the splitted test data and calculating the mean squared error
xgb_model.fit(X_test.values, y_test.values)
pred = xgb_model.predict(X_test.values)
metrics.mean_squared_error(y_test, pred)

0.010219205956161478

In [41]:
# Predicting on the original test data
Prediction = xgb_model.predict(test_df.values)*(np.array(train_df['fare'].mean()))

In [42]:
# Creating a empty DataFrame
submission = pd.DataFrame()

In [43]:
# Putting the prediction in submission csv file
submission['fare'] = Prediction

In [44]:
submission

Unnamed: 0,fare
0,11.982113
1,26.594723
2,17.443310
3,13.308193
4,10.503002
...,...
24995,9.878730
24996,10.726671
24997,33.247150
24998,7.165320
