In [228]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import SGDRegressor
from sklearn.linear_model import Ridge, RidgeCV
from sklearn.linear_model import Lasso, LassoCV

from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_validate
from sklearn.model_selection import validation_curve
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

from sklearn.metrics import mean_squared_error
from sklearn.decomposition import PCA

In [229]:
try: # For kaggle
  train_pd = pd.read_csv('/kaggle/input/taxi-fare-guru-total-amount-prediction-challenge/train.csv')

  test_pd = pd.read_csv('/kaggle/input/taxi-fare-guru-total-amount-prediction-challenge/test.csv')
  print('running on kaggle')
except: #for local machine 
  print('running on local machine')
  train_pd = pd.read_csv('train.csv')
  test_pd = pd.read_csv('test.csv')
  pass
copy_train_pd = train_pd.copy()
copy_test_pd = test_pd.copy()
copy_train_pd.info()


running on local machine
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 175000 entries, 0 to 174999
Data columns (total 17 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   VendorID               175000 non-null  int64  
 1   tpep_pickup_datetime   175000 non-null  object 
 2   tpep_dropoff_datetime  175000 non-null  object 
 3   passenger_count        168923 non-null  float64
 4   trip_distance          175000 non-null  float64
 5   RatecodeID             168923 non-null  float64
 6   store_and_fwd_flag     168923 non-null  object 
 7   PULocationID           175000 non-null  int64  
 8   DOLocationID           175000 non-null  int64  
 9   payment_type           175000 non-null  object 
 10  extra                  175000 non-null  float64
 11  tip_amount             175000 non-null  float64
 12  tolls_amount           175000 non-null  float64
 13  improvement_surcharge  175000 non-null  float64
 14  total_amoun

In [230]:
def convert_dt_obj_to_datetime(df:pd.DataFrame, col_name:str):
    df[col_name] = pd.to_datetime(df[col_name])
    df[col_name +'_Year'] = df[col_name].apply(lambda time: time.year)
    df[col_name +'_Month'] = df[col_name].apply(lambda time: time.month)
    df[col_name +'_Day'] = df[col_name].apply(lambda time: time.day)
    df[col_name +'_Hour'] = df[col_name].apply(lambda time: time.hour)
    df[col_name +'_Minute'] = df[col_name].apply(lambda time: time.minute) # IGNORING MINS

In [231]:
x_train, x_test = train_test_split(train_pd, test_size= 0.1)
x_train_copy = x_train.copy()
x_test_copy = x_test.copy()

In [232]:
x_train, x_test = x_train_copy.copy(), x_test_copy.copy()

convert_dt_obj_to_datetime(x_train, 'tpep_pickup_datetime')
convert_dt_obj_to_datetime(x_test, 'tpep_pickup_datetime')
x_train = x_train.drop('tpep_pickup_datetime', axis= 1)
x_test = x_test.drop('tpep_pickup_datetime', axis= 1)

convert_dt_obj_to_datetime(x_train, 'tpep_dropoff_datetime')
convert_dt_obj_to_datetime(x_test, 'tpep_dropoff_datetime')

x_train = x_train.drop('tpep_dropoff_datetime', axis= 1)
x_test = x_test.drop('tpep_dropoff_datetime', axis= 1)

x_train_copy, x_test_copy = x_train.copy(), x_test.copy()

In [233]:
x_train.describe()

Unnamed: 0,VendorID,passenger_count,trip_distance,RatecodeID,PULocationID,DOLocationID,extra,tip_amount,tolls_amount,improvement_surcharge,...,tpep_pickup_datetime_Year,tpep_pickup_datetime_Month,tpep_pickup_datetime_Day,tpep_pickup_datetime_Hour,tpep_pickup_datetime_Minute,tpep_dropoff_datetime_Year,tpep_dropoff_datetime_Month,tpep_dropoff_datetime_Day,tpep_dropoff_datetime_Hour,tpep_dropoff_datetime_Minute
count,157500.0,152046.0,157500.0,152046.0,157500.0,157500.0,157500.0,157500.0,157500.0,157500.0,...,157500.0,157500.0,157500.0,157500.0,157500.0,157500.0,157500.0,157500.0,157500.0,157500.0
mean,0.728387,1.358457,5.312028,1.516719,132.746127,132.718044,1.931905,6.125525,0.647401,0.979526,...,2023.0,6.004229,29.062108,15.11659,29.505975,2023.0,6.007613,28.973575,15.104902,29.482006
std,0.445663,0.892179,416.332797,6.503767,76.121124,76.1367,1.952081,4.615609,2.33555,0.19955,...,0.0,0.06489,1.971177,5.768925,17.293754,0.0,0.086918,2.557154,5.946881,17.337268
min,0.0,0.0,0.0,1.0,1.0,1.0,-7.5,7.9e-05,-29.3,-1.0,...,2023.0,6.0,1.0,0.0,0.0,2023.0,6.0,1.0,0.0,0.0
25%,0.0,1.0,1.08,1.0,67.0,67.0,0.0,3.473088,0.0,1.0,...,2023.0,6.0,29.0,11.0,15.0,2023.0,6.0,29.0,11.0,14.0
50%,1.0,1.0,1.83,1.0,133.0,133.0,1.0,5.284609,0.0,1.0,...,2023.0,6.0,29.0,16.0,30.0,2023.0,6.0,29.0,17.0,30.0
75%,1.0,1.0,3.61,1.0,199.0,199.0,2.5,7.497668,0.0,1.0,...,2023.0,6.0,30.0,20.0,44.0,2023.0,6.0,30.0,20.0,45.0
max,2.0,9.0,135182.06,99.0,264.0,264.0,11.75,484.876151,80.0,1.0,...,2023.0,7.0,30.0,23.0,59.0,2023.0,7.0,30.0,23.0,59.0


In [234]:
x_train.isna().sum()

VendorID                           0
passenger_count                 5454
trip_distance                      0
RatecodeID                      5454
store_and_fwd_flag              5454
PULocationID                       0
DOLocationID                       0
payment_type                       0
extra                              0
tip_amount                         0
tolls_amount                       0
improvement_surcharge              0
total_amount                       0
congestion_surcharge            5454
Airport_fee                     5454
tpep_pickup_datetime_Year          0
tpep_pickup_datetime_Month         0
tpep_pickup_datetime_Day           0
tpep_pickup_datetime_Hour          0
tpep_pickup_datetime_Minute        0
tpep_dropoff_datetime_Year         0
tpep_dropoff_datetime_Month        0
tpep_dropoff_datetime_Day          0
tpep_dropoff_datetime_Hour         0
tpep_dropoff_datetime_Minute       0
dtype: int64

In [235]:
def impute_unknown_values(X:pd.DataFrame):
  X['passenger_count'] = X['passenger_count'].replace(np.nan, 1)
  X['RatecodeID'] = X['RatecodeID'].replace(np.nan, 1)
  X['store_and_fwd_flag'] = X['store_and_fwd_flag'].replace(np.nan, 'N')
  X['congestion_surcharge'] = X['congestion_surcharge'].replace(np.nan, 0)
  X['Airport_fee'] = X['Airport_fee'].replace(np.nan, 0)
  print(X.isna().sum())

In [236]:
x_train['passenger_count'] = x_train['passenger_count'].replace(np.nan, 1)
x_train['RatecodeID'] = x_train['RatecodeID'].replace(np.nan, 1)
x_train['store_and_fwd_flag'] = x_train['store_and_fwd_flag'].replace(np.nan, 'N')
x_train['congestion_surcharge'] = x_train['congestion_surcharge'].replace(np.nan, 0)
x_train['Airport_fee'] = x_train['Airport_fee'].replace(np.nan, 0)
x_train.isna().sum()

VendorID                        0
passenger_count                 0
trip_distance                   0
RatecodeID                      0
store_and_fwd_flag              0
PULocationID                    0
DOLocationID                    0
payment_type                    0
extra                           0
tip_amount                      0
tolls_amount                    0
improvement_surcharge           0
total_amount                    0
congestion_surcharge            0
Airport_fee                     0
tpep_pickup_datetime_Year       0
tpep_pickup_datetime_Month      0
tpep_pickup_datetime_Day        0
tpep_pickup_datetime_Hour       0
tpep_pickup_datetime_Minute     0
tpep_dropoff_datetime_Year      0
tpep_dropoff_datetime_Month     0
tpep_dropoff_datetime_Day       0
tpep_dropoff_datetime_Hour      0
tpep_dropoff_datetime_Minute    0
dtype: int64

### OUTLIER removal

In [237]:
x_train.describe()

Unnamed: 0,VendorID,passenger_count,trip_distance,RatecodeID,PULocationID,DOLocationID,extra,tip_amount,tolls_amount,improvement_surcharge,...,tpep_pickup_datetime_Year,tpep_pickup_datetime_Month,tpep_pickup_datetime_Day,tpep_pickup_datetime_Hour,tpep_pickup_datetime_Minute,tpep_dropoff_datetime_Year,tpep_dropoff_datetime_Month,tpep_dropoff_datetime_Day,tpep_dropoff_datetime_Hour,tpep_dropoff_datetime_Minute
count,157500.0,157500.0,157500.0,157500.0,157500.0,157500.0,157500.0,157500.0,157500.0,157500.0,...,157500.0,157500.0,157500.0,157500.0,157500.0,157500.0,157500.0,157500.0,157500.0,157500.0
mean,0.728387,1.346044,5.312028,1.498825,132.746127,132.718044,1.931905,6.125525,0.647401,0.979526,...,2023.0,6.004229,29.062108,15.11659,29.505975,2023.0,6.007613,28.973575,15.104902,29.482006
std,0.445663,0.879042,416.332797,6.390864,76.121124,76.1367,1.952081,4.615609,2.33555,0.19955,...,0.0,0.06489,1.971177,5.768925,17.293754,0.0,0.086918,2.557154,5.946881,17.337268
min,0.0,0.0,0.0,1.0,1.0,1.0,-7.5,7.9e-05,-29.3,-1.0,...,2023.0,6.0,1.0,0.0,0.0,2023.0,6.0,1.0,0.0,0.0
25%,0.0,1.0,1.08,1.0,67.0,67.0,0.0,3.473088,0.0,1.0,...,2023.0,6.0,29.0,11.0,15.0,2023.0,6.0,29.0,11.0,14.0
50%,1.0,1.0,1.83,1.0,133.0,133.0,1.0,5.284609,0.0,1.0,...,2023.0,6.0,29.0,16.0,30.0,2023.0,6.0,29.0,17.0,30.0
75%,1.0,1.0,3.61,1.0,199.0,199.0,2.5,7.497668,0.0,1.0,...,2023.0,6.0,30.0,20.0,44.0,2023.0,6.0,30.0,20.0,45.0
max,2.0,9.0,135182.06,99.0,264.0,264.0,11.75,484.876151,80.0,1.0,...,2023.0,7.0,30.0,23.0,59.0,2023.0,7.0,30.0,23.0,59.0


In [238]:
print('total number of airport fees negative', len(x_train[x_train['Airport_fee'] < 0]))
print('total congestion charge with negative value', len(x_train[x_train['congestion_surcharge'] < 0]))
print('total number of improvement charge in -ve', len(x_train[x_train['improvement_surcharge'] < 0]))

total number of airport fees negative 232
total congestion charge with negative value 1260
total number of improvement charge in -ve 1565


In [239]:
x_train = x_train[x_train['passenger_count'] <= 6] ## Passanger count filtering
# x_train = x_train[x_train['trip_distance'] <= 30] ## trip distance filtering
# x_train = x_train[x_train['tolls_amount'] >=0] # filtering tolls amount
# x_train = x_train[x_train['Airport_fee'] >= 0]
# x_train = x_train[x_train['congestion_surcharge'] >= 0]
x_train = x_train[x_train['improvement_surcharge'] >= 0] # removing improvement surcharge
x_train = x_train[x_train['tip_amount'] <= 100] # removing extra tip amount
x_train = x_train[x_train['total_amount'] <= 150]
##* Without outlier removal i am scoring 0.75


In [240]:
print('number of extra elements with neg elements', len(x_train[x_train.tolls_amount < 0]))
print('improvement surcharge', len(x_train[x_train['improvement_surcharge'] < 0]))

number of extra elements with neg elements 0
improvement surcharge 0


In [241]:
x_train.describe()

Unnamed: 0,VendorID,passenger_count,trip_distance,RatecodeID,PULocationID,DOLocationID,extra,tip_amount,tolls_amount,improvement_surcharge,...,tpep_pickup_datetime_Year,tpep_pickup_datetime_Month,tpep_pickup_datetime_Day,tpep_pickup_datetime_Hour,tpep_pickup_datetime_Minute,tpep_dropoff_datetime_Year,tpep_dropoff_datetime_Month,tpep_dropoff_datetime_Day,tpep_dropoff_datetime_Hour,tpep_dropoff_datetime_Minute
count,155484.0,155484.0,155484.0,155484.0,155484.0,155484.0,155484.0,155484.0,155484.0,155484.0,...,155484.0,155484.0,155484.0,155484.0,155484.0,155484.0,155484.0,155484.0,155484.0,155484.0
mean,0.725477,1.345116,5.26613,1.495356,132.75256,132.702265,1.96551,6.105045,0.62689,0.999391,...,2023.0,6.004226,29.061415,15.119569,29.508509,2023.0,6.007596,28.973168,15.10976,29.486462
std,0.447153,0.879331,419.019857,6.430281,76.112621,76.113609,1.925208,4.214338,2.188729,0.022345,...,0.0,0.064867,1.970575,5.764097,17.291767,0.0,0.086822,2.554556,5.940162,17.336744
min,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.000275,0.0,0.0,...,2023.0,6.0,1.0,0.0,0.0,2023.0,6.0,1.0,0.0,0.0
25%,0.0,1.0,1.09,1.0,67.0,67.0,0.0,3.506856,0.0,1.0,...,2023.0,6.0,29.0,11.0,15.0,2023.0,6.0,29.0,11.0,14.0
50%,1.0,1.0,1.84,1.0,133.0,133.0,1.75,5.321556,0.0,1.0,...,2023.0,6.0,29.0,16.0,30.0,2023.0,6.0,29.0,17.0,30.0
75%,1.0,1.0,3.6,1.0,199.0,199.0,2.5,7.512062,0.0,1.0,...,2023.0,6.0,30.0,20.0,44.0,2023.0,6.0,30.0,20.0,45.0
max,2.0,6.0,135182.06,99.0,264.0,264.0,11.75,95.282068,80.0,1.0,...,2023.0,7.0,30.0,23.0,59.0,2023.0,7.0,30.0,23.0,59.0


In [242]:
print(len(x_train[x_train['trip_distance'] > 20]))
print(len(x_train[x_train['tip_amount'] > 100]))
print('total amount greater than 300', len(x_train[x_train['total_amount'] > 125]))

1870
0
total amount greater than 300 469


In [243]:
x_train['VendorID'].unique()

array([1, 0, 2])

In [244]:
vendor_id_pipe = Pipeline([
  ('one_hot_encoder', OneHotEncoder(handle_unknown='ignore'))
])
passanger_count_pipe = Pipeline([
    ('simple_imputer', SimpleImputer(missing_values=np.nan, strategy='constant', fill_value= 1)),
    ('std_scaler', StandardScaler())
])
rate_code_id_pipe = Pipeline([
       ('simple_imputer', SimpleImputer(missing_values=np.nan, strategy='constant',
                                         fill_value= 1)),
       ('one_hot_encoder', OneHotEncoder(handle_unknown='ignore')) ## TODO: experimental changes
])
store_and_fwd_pipe = Pipeline([
    ('simple_immmputer', SimpleImputer(missing_values= np.nan, strategy='constant', fill_value='N')),
    ('one_hot_encoder', OneHotEncoder(handle_unknown='ignore'))
])

congestion_charger_pipe = Pipeline([
    ('simple_imputer', SimpleImputer(missing_values=np.nan, strategy='constant', fill_value= 0)),
    ('std_scaler', StandardScaler())
])
airport_fee_pipe = Pipeline([
    ('simple_imputer', SimpleImputer(missing_values=np.nan, strategy='constant', fill_value= 0)),
    ('std_scale', StandardScaler())
])
payment_type_pipe = Pipeline([
    ('one_hot_encoder', OneHotEncoder())
])

In [245]:
feature_scaling_and_trans = ColumnTransformer([
  ('vendor_id', vendor_id_pipe, ['VendorID']),
  ('passanger_count', passanger_count_pipe, ['passenger_count']),
  ('rate_code_id', rate_code_id_pipe, ['RatecodeID']), # experimental changes
  ('s_nd_f_flag', store_and_fwd_pipe, ['store_and_fwd_flag']),
  ('pu_loc', StandardScaler(), ['PULocationID']),
  ('du_loc', StandardScaler(), ["DOLocationID"]),
  ('payment_t', payment_type_pipe, ['payment_type']),
  ('extra_t', StandardScaler(), ['extra']),
  ('tip_amount_t', StandardScaler(), ['tip_amount']),
  ('tolls_amount_t', StandardScaler(), ["tolls_amount"]),
  ('improvement_c', StandardScaler(), ['improvement_surcharge']),
  ('cong_charge', congestion_charger_pipe, ['congestion_surcharge']),
  ('Airport_fee_t', airport_fee_pipe, ['Airport_fee']),
  ('std_scaler', StandardScaler(), [i for i in range(14, 22)])
], remainder= 'passthrough')

In [246]:
linear_reg = LinearRegression()


In [247]:
y = x_train['total_amount']
X = x_train.drop('total_amount', axis= 1)

In [248]:
X_t = feature_scaling_and_trans.fit_transform(X=X)

In [249]:
com_train_features, dev_test_features, com_train_labels, dev_test_labels = train_test_split(X_t, y, test_size= 0.2)

In [250]:
linear_reg.fit(com_train_features, com_train_labels)
linear_reg.score(com_train_features, com_train_labels)

0.800246016814998

In [251]:
linear_reg.score(dev_test_features, dev_test_labels)


0.799958440327103

### Recursive features elimination

In [252]:
from sklearn.feature_selection import RFE

selector = RFE(linear_reg, n_features_to_select= 25, step=1, verbose= 1)
selector = selector.fit(com_train_features, com_train_labels)


Fitting estimator with 36 features.
Fitting estimator with 35 features.
Fitting estimator with 34 features.
Fitting estimator with 33 features.
Fitting estimator with 32 features.
Fitting estimator with 31 features.
Fitting estimator with 30 features.
Fitting estimator with 29 features.
Fitting estimator with 28 features.
Fitting estimator with 27 features.
Fitting estimator with 26 features.


In [253]:
selector.support_

array([ True,  True,  True, False,  True,  True,  True,  True,  True,
        True,  True,  True, False, False,  True,  True,  True,  True,
        True,  True,  True,  True, False, False,  True,  True,  True,
        True, False, False, False,  True,  True, False, False, False])

In [254]:
selector.ranking_

array([ 1,  1,  1,  6,  1,  1,  1,  1,  1,  1,  1,  1,  9,  8,  1,  1,  1,
        1,  1,  1,  1,  1,  7,  2,  1,  1,  1,  1,  3,  5, 11,  1,  1, 12,
        4, 10])

In [255]:
selector.get_feature_names_out()

array(['x0', 'x1', 'x2', 'x4', 'x5', 'x6', 'x7', 'x8', 'x9', 'x10', 'x11',
       'x14', 'x15', 'x16', 'x17', 'x18', 'x19', 'x20', 'x21', 'x24',
       'x25', 'x26', 'x27', 'x31', 'x32'], dtype=object)

In [256]:
selector.score(dev_test_features, dev_test_labels)

0.7932979754226075

## checking cross validation

In [257]:
def printTestResult(result, model:str):
  train_error, test_error = -1 * result['train_score'],  -1 * result['test_score']
  print(f'Mean absolute error of {model} model on the train set:\n',
      f"{train_error.mean():.3f} +/- {train_error.std():.3f}")

  print(f'Mean absolute error of linear {model} on the test set:\n',
      f'{test_error.mean():.3f} +/- {test_error.std():.3f}')

In [258]:
cv_5 = ShuffleSplit(n_splits= 5, test_size= 0.2, random_state= 42)
lin_reg_cross_validation = cross_validate(
  linear_reg,
  com_train_features,
  com_train_labels,
  cv=cv_5,
  scoring= 'neg_mean_squared_error',
  return_train_score= True,
  return_estimator= True)
printTestResult(lin_reg_cross_validation, 'Linear Regression')
lin_reg_cross_validation['estimator'][0].score(dev_test_features, dev_test_labels)

Mean absolute error of Linear Regression model on the train set:
 100.053 +/- 0.191
Mean absolute error of linear Linear Regression on the test set:
 146.087 +/- 56.976


0.7999364329647363

### checking grid search cv

In [259]:
params = {
  'ridge__alpha': [0.1, 0.01, 0.001, 0.5, 0.05, 0.005]
}

In [260]:
lasso_regression = Lasso()
lasso_regression.fit(com_train_features, com_train_labels)

In [261]:
print(lasso_regression.score(com_train_features, com_train_labels))
print(lasso_regression.score(dev_test_features, dev_test_labels))


0.7422686602348579
0.7382760365903587


In [262]:
ridge_regression = Ridge()
ridge_regression.fit(com_train_features, com_train_labels)

In [263]:
print(ridge_regression.score(com_train_features, com_train_labels))
print(ridge_regression.score(dev_test_features, dev_test_labels))

0.8002217145075132
0.7999051657061976


In [264]:
impute_unknown_values(test_pd)
convert_dt_obj_to_datetime(test_pd, 'tpep_pickup_datetime')
test_pd = test_pd.drop('tpep_pickup_datetime', axis= 1)

convert_dt_obj_to_datetime(test_pd, 'tpep_dropoff_datetime')
test_pd = test_pd.drop('tpep_dropoff_datetime', axis= 1)


VendorID                 0
tpep_pickup_datetime     0
tpep_dropoff_datetime    0
passenger_count          0
trip_distance            0
RatecodeID               0
store_and_fwd_flag       0
PULocationID             0
DOLocationID             0
payment_type             0
extra                    0
tip_amount               0
tolls_amount             0
improvement_surcharge    0
congestion_surcharge     0
Airport_fee              0
dtype: int64


In [265]:
test_pd_t = feature_scaling_and_trans.transform(test_pd)

In [266]:
prediction = lin_reg_cross_validation['estimator'][0].predict(test_pd_t)

In [267]:
submission = pd.DataFrame(columns= ['ID', "total_amount"])
submission['ID'] = [i for i in range(1, 50001)]
submission['total_amount'] = prediction
submission.to_csv('submission.csv', index=False)

In [268]:
submission.describe()

Unnamed: 0,ID,total_amount
count,50000.0,50000.0
mean,25000.5,29.578676
std,14433.901067,20.529511
min,1.0,-30.466988
25%,12500.75,19.550927
50%,25000.5,23.331674
75%,37500.25,28.561047
max,50000.0,269.327316


In [269]:
submission[submission['total_amount'] < 0]

Unnamed: 0,ID,total_amount
2789,2790,-11.067869
3553,3554,-15.571978
5360,5361,-24.987785
6854,6855,-30.466988
7394,7395,-1.814964
8193,8194,-5.257015
10145,10146,-6.352467
10731,10732,-14.251629
10905,10906,-28.875216
11055,11056,-4.530935
