In [212]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import SGDRegressor
from sklearn.linear_model import Ridge, RidgeCV
from sklearn.linear_model import Lasso, LassoCV

from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_validate
from sklearn.model_selection import validation_curve
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

from sklearn.metrics import mean_squared_error
from sklearn.decomposition import PCA

In [213]:
# Load the competition data. The Kaggle input directory is tried first; if the
# files are not there, fall back to CSVs in the current working directory.
try:  # For Kaggle
  train_pd = pd.read_csv('/kaggle/input/taxi-fare-guru-total-amount-prediction-challenge/train.csv')

  test_pd = pd.read_csv('/kaggle/input/taxi-fare-guru-total-amount-prediction-challenge/test.csv')
  print('running on kaggle')
except FileNotFoundError:  # For local machine
  # Only a missing file means "not on Kaggle". Any other failure (malformed CSV,
  # interrupt, ...) is allowed to propagate instead of being silently masked.
  print('running on local machine')
  train_pd = pd.read_csv('train.csv')
  test_pd = pd.read_csv('test.csv')
# Keep untouched copies of the raw frames before any preprocessing mutates them.
copy_train_pd = train_pd.copy()
copy_test_pd = test_pd.copy()
copy_train_pd.info()


running on local machine
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 175000 entries, 0 to 174999
Data columns (total 17 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   VendorID               175000 non-null  int64  
 1   tpep_pickup_datetime   175000 non-null  object 
 2   tpep_dropoff_datetime  175000 non-null  object 
 3   passenger_count        168923 non-null  float64
 4   trip_distance          175000 non-null  float64
 5   RatecodeID             168923 non-null  float64
 6   store_and_fwd_flag     168923 non-null  object 
 7   PULocationID           175000 non-null  int64  
 8   DOLocationID           175000 non-null  int64  
 9   payment_type           175000 non-null  object 
 10  extra                  175000 non-null  float64
 11  tip_amount             175000 non-null  float64
 12  tolls_amount           175000 non-null  float64
 13  improvement_surcharge  175000 non-null  float64
 14  total_amoun

In [214]:
def convert_dt_obj_to_datetime(df:pd.DataFrame, col_name:str):
    """Parse ``df[col_name]`` as datetimes and add its calendar parts as columns.

    The frame is modified in place: ``col_name`` is overwritten with the parsed
    datetime values, and five new columns are added, named ``col_name`` plus
    ``_Year``, ``_Month``, ``_Day``, ``_Hour`` and ``_Minute``.

    Args:
        df: Frame that contains ``col_name``; it is mutated.
        col_name: Name of the column holding datetime values (strings or
            datetimes accepted by ``pd.to_datetime``).

    Returns:
        None.
    """
    df[col_name] = pd.to_datetime(df[col_name])
    # Vectorised accessor instead of a Python-level lambda per row.
    parts = df[col_name].dt
    df[col_name + '_Year'] = parts.year
    df[col_name + '_Month'] = parts.month
    df[col_name + '_Day'] = parts.day
    df[col_name + '_Hour'] = parts.hour
    df[col_name + '_Minute'] = parts.minute

In [215]:
# Hold out 10% of the labelled rows. The seed is fixed so that the split, and
# therefore every score reported further down, is reproducible across runs.
x_train, x_test = train_test_split(train_pd, test_size=0.1, random_state=42)
# Snapshots of the split so later cells can reset x_train / x_test from them.
x_train_copy = x_train.copy()
x_test_copy = x_test.copy()

In [216]:
# Start again from the saved snapshots of the split.
x_train, x_test = x_train_copy.copy(), x_test_copy.copy()

# For each raw timestamp column: add its Year/Month/Day/Hour/Minute columns to
# both frames, then drop the raw column itself.
for dt_col in ('tpep_pickup_datetime', 'tpep_dropoff_datetime'):
    convert_dt_obj_to_datetime(x_train, dt_col)
    convert_dt_obj_to_datetime(x_test, dt_col)
    x_train = x_train.drop(columns=dt_col)
    x_test = x_test.drop(columns=dt_col)

# Overwrite the snapshots with the expanded frames.
# NOTE: after this line the snapshots no longer contain the raw timestamp
# columns, so running this cell a second time raises a KeyError.
x_train_copy, x_test_copy = x_train.copy(), x_test.copy()

In [217]:
x_train.describe()

Unnamed: 0,VendorID,passenger_count,trip_distance,RatecodeID,PULocationID,DOLocationID,extra,tip_amount,tolls_amount,improvement_surcharge,...,tpep_pickup_datetime_Year,tpep_pickup_datetime_Month,tpep_pickup_datetime_Day,tpep_pickup_datetime_Hour,tpep_pickup_datetime_Minute,tpep_dropoff_datetime_Year,tpep_dropoff_datetime_Month,tpep_dropoff_datetime_Day,tpep_dropoff_datetime_Hour,tpep_dropoff_datetime_Minute
count,157500.0,152037.0,157500.0,152037.0,157500.0,157500.0,157500.0,157500.0,157500.0,157500.0,...,157500.0,157500.0,157500.0,157500.0,157500.0,157500.0,157500.0,157500.0,157500.0,157500.0
mean,0.728825,1.357788,5.30021,1.509356,132.665746,132.645219,1.931742,6.127298,0.646439,0.979567,...,2023.0,6.004254,29.060533,15.126273,29.510444,2023.0,6.007702,28.970476,15.104375,29.484629
std,0.445366,0.891288,416.332511,6.450823,76.138128,76.200752,1.94726,4.632049,2.33034,0.199404,...,0.0,0.065084,1.976292,5.774278,17.30174,0.0,0.08742,2.570718,5.957759,17.339536
min,0.0,0.0,0.0,1.0,1.0,1.0,-7.5,7.9e-05,-29.3,-1.0,...,2023.0,6.0,1.0,0.0,0.0,2023.0,6.0,1.0,0.0,0.0
25%,0.0,1.0,1.08,1.0,67.0,67.0,0.0,3.472838,0.0,1.0,...,2023.0,6.0,29.0,11.0,15.0,2023.0,6.0,29.0,11.0,14.0
50%,1.0,1.0,1.83,1.0,133.0,133.0,1.0,5.286994,0.0,1.0,...,2023.0,6.0,29.0,16.0,30.0,2023.0,6.0,29.0,17.0,30.0
75%,1.0,1.0,3.6,1.0,199.0,199.0,2.5,7.49966,0.0,1.0,...,2023.0,6.0,30.0,20.0,44.0,2023.0,6.0,30.0,20.0,45.0
max,2.0,9.0,135182.06,99.0,264.0,264.0,11.75,484.876151,80.0,1.0,...,2023.0,7.0,30.0,23.0,59.0,2023.0,7.0,30.0,23.0,59.0


In [218]:
x_train.isna().sum()

VendorID                           0
passenger_count                 5463
trip_distance                      0
RatecodeID                      5463
store_and_fwd_flag              5463
PULocationID                       0
DOLocationID                       0
payment_type                       0
extra                              0
tip_amount                         0
tolls_amount                       0
improvement_surcharge              0
total_amount                       0
congestion_surcharge            5463
Airport_fee                     5463
tpep_pickup_datetime_Year          0
tpep_pickup_datetime_Month         0
tpep_pickup_datetime_Day           0
tpep_pickup_datetime_Hour          0
tpep_pickup_datetime_Minute        0
tpep_dropoff_datetime_Year         0
tpep_dropoff_datetime_Month        0
tpep_dropoff_datetime_Day          0
tpep_dropoff_datetime_Hour         0
tpep_dropoff_datetime_Minute       0
dtype: int64

In [219]:
def impute_unknown_values(X:pd.DataFrame):
  """Fill missing values in five known columns with fixed defaults, in place.

  Defaults: ``passenger_count`` -> 1, ``RatecodeID`` -> 1,
  ``store_and_fwd_flag`` -> 'N', ``congestion_surcharge`` -> 0,
  ``Airport_fee`` -> 0. After filling, the per-column count of remaining
  missing values is printed.

  Args:
    X: Frame containing the five columns above; it is mutated.

  Returns:
    None.
  """
  defaults = {
      'passenger_count': 1,
      'RatecodeID': 1,
      'store_and_fwd_flag': 'N',
      'congestion_surcharge': 0,
      'Airport_fee': 0,
  }
  for column, default in defaults.items():
    X[column] = X[column].replace(np.nan, default)
  remaining_nulls = X.isna().sum()
  print(remaining_nulls)

In [188]:
# Fill the missing values in x_train with the fixed defaults. This uses the
# shared helper rather than repeating its five replace() lines, so the fill
# rules live in one place. The helper mutates x_train in place and prints the
# per-column count of remaining missing values.
impute_unknown_values(x_train)

VendorID                        0
passenger_count                 0
trip_distance                   0
RatecodeID                      0
store_and_fwd_flag              0
PULocationID                    0
DOLocationID                    0
payment_type                    0
extra                           0
tip_amount                      0
tolls_amount                    0
improvement_surcharge           0
total_amount                    0
congestion_surcharge            0
Airport_fee                     0
tpep_pickup_datetime_Year       0
tpep_pickup_datetime_Month      0
tpep_pickup_datetime_Day        0
tpep_pickup_datetime_Hour       0
tpep_pickup_datetime_Minute     0
tpep_dropoff_datetime_Year      0
tpep_dropoff_datetime_Month     0
tpep_dropoff_datetime_Day       0
tpep_dropoff_datetime_Hour      0
tpep_dropoff_datetime_Minute    0
dtype: int64

### OUTLIER removal

In [220]:
x_train.describe()

Unnamed: 0,VendorID,passenger_count,trip_distance,RatecodeID,PULocationID,DOLocationID,extra,tip_amount,tolls_amount,improvement_surcharge,...,tpep_pickup_datetime_Year,tpep_pickup_datetime_Month,tpep_pickup_datetime_Day,tpep_pickup_datetime_Hour,tpep_pickup_datetime_Minute,tpep_dropoff_datetime_Year,tpep_dropoff_datetime_Month,tpep_dropoff_datetime_Day,tpep_dropoff_datetime_Hour,tpep_dropoff_datetime_Minute
count,157500.0,152037.0,157500.0,152037.0,157500.0,157500.0,157500.0,157500.0,157500.0,157500.0,...,157500.0,157500.0,157500.0,157500.0,157500.0,157500.0,157500.0,157500.0,157500.0,157500.0
mean,0.728825,1.357788,5.30021,1.509356,132.665746,132.645219,1.931742,6.127298,0.646439,0.979567,...,2023.0,6.004254,29.060533,15.126273,29.510444,2023.0,6.007702,28.970476,15.104375,29.484629
std,0.445366,0.891288,416.332511,6.450823,76.138128,76.200752,1.94726,4.632049,2.33034,0.199404,...,0.0,0.065084,1.976292,5.774278,17.30174,0.0,0.08742,2.570718,5.957759,17.339536
min,0.0,0.0,0.0,1.0,1.0,1.0,-7.5,7.9e-05,-29.3,-1.0,...,2023.0,6.0,1.0,0.0,0.0,2023.0,6.0,1.0,0.0,0.0
25%,0.0,1.0,1.08,1.0,67.0,67.0,0.0,3.472838,0.0,1.0,...,2023.0,6.0,29.0,11.0,15.0,2023.0,6.0,29.0,11.0,14.0
50%,1.0,1.0,1.83,1.0,133.0,133.0,1.0,5.286994,0.0,1.0,...,2023.0,6.0,29.0,16.0,30.0,2023.0,6.0,29.0,17.0,30.0
75%,1.0,1.0,3.6,1.0,199.0,199.0,2.5,7.49966,0.0,1.0,...,2023.0,6.0,30.0,20.0,44.0,2023.0,6.0,30.0,20.0,45.0
max,2.0,9.0,135182.06,99.0,264.0,264.0,11.75,484.876151,80.0,1.0,...,2023.0,7.0,30.0,23.0,59.0,2023.0,7.0,30.0,23.0,59.0


In [190]:
# Outlier filters. They are currently disabled: every statement in this cell is
# commented out, so x_train is not modified here.
# NOTE(review): the saved describe() output a few cells further down (count
# 155466, trip_distance max 30.0, passenger_count max 6.0) looks like it was
# produced by an earlier run with these filters active -- confirm before
# relying on it.
# x_train = x_train[x_train['passenger_count'] <= 6] ## Passenger count filtering
# x_train = x_train[x_train['trip_distance'] <= 30] ## Trip distance filtering
# x_train = x_train[x_train['tolls_amount'] >=0] # Drop rows with a negative tolls amount
# x_train = x_train[x_train['improvement_surcharge'] >= 0] # Drop rows with a negative improvement surcharge
# x_train = x_train[x_train['tip_amount'] <= 100] # Drop rows with an extreme tip amount
# x_train = x_train[x_train['total_amount'] <= 150]
##* Without outlier removal I am scoring 0.75


In [191]:
# Count the rows whose tolls_amount / improvement_surcharge are negative.
# The first label previously mentioned "extra" although the condition tests
# tolls_amount; it now names the column that is actually checked.
print('number of rows with negative tolls_amount', len(x_train[x_train.tolls_amount < 0]))
print('improvement surcharge', len(x_train[x_train['improvement_surcharge'] < 0]))

number of extra elements with neg elements 0
improvement surcharge 0


In [192]:
x_train.describe()

Unnamed: 0,VendorID,passenger_count,trip_distance,RatecodeID,PULocationID,DOLocationID,extra,tip_amount,tolls_amount,improvement_surcharge,...,tpep_pickup_datetime_Year,tpep_pickup_datetime_Month,tpep_pickup_datetime_Day,tpep_pickup_datetime_Hour,tpep_pickup_datetime_Minute,tpep_dropoff_datetime_Year,tpep_dropoff_datetime_Month,tpep_dropoff_datetime_Day,tpep_dropoff_datetime_Hour,tpep_dropoff_datetime_Minute
count,155466.0,155466.0,155466.0,155466.0,155466.0,155466.0,155466.0,155466.0,155466.0,155466.0,...,155466.0,155466.0,155466.0,155466.0,155466.0,155466.0,155466.0,155466.0,155466.0,155466.0
mean,0.725709,1.343799,3.604328,1.495452,132.759722,132.684413,1.965033,6.109746,0.627226,0.999394,...,2023.0,6.004271,29.059917,15.122606,29.499022,2023.0,6.007744,28.968598,15.109741,29.529717
std,0.446965,0.876687,4.590174,6.425981,76.148262,76.182353,1.920135,4.221987,2.182744,0.022425,...,0.0,0.065214,1.979704,5.765894,17.306853,0.0,0.087661,2.577248,5.94381,17.33029
min,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.000275,0.0,0.0,...,2023.0,6.0,1.0,0.0,0.0,2023.0,6.0,1.0,0.0,0.0
25%,0.0,1.0,1.09,1.0,67.0,67.0,0.0,3.507082,0.0,1.0,...,2023.0,6.0,29.0,11.0,15.0,2023.0,6.0,29.0,11.0,14.0
50%,1.0,1.0,1.83,1.0,133.0,133.0,2.5,5.322255,0.0,1.0,...,2023.0,6.0,29.0,16.0,30.0,2023.0,6.0,29.0,17.0,30.0
75%,1.0,1.0,3.6,1.0,199.0,198.0,2.5,7.518317,0.0,1.0,...,2023.0,6.0,30.0,20.0,44.0,2023.0,6.0,30.0,20.0,45.0
max,2.0,6.0,30.0,99.0,264.0,264.0,11.75,95.282068,80.0,1.0,...,2023.0,7.0,30.0,23.0,59.0,2023.0,7.0,30.0,23.0,59.0


In [193]:
# Number of rows with trip_distance above 20.
print(len(x_train[x_train['trip_distance'] > 20]))
# Number of rows with tip_amount above 100.
print(len(x_train[x_train['tip_amount'] > 100]))
# The label now states the threshold the condition really uses (125, not 300).
print('total amount greater than 125', len(x_train[x_train['total_amount'] > 125]))

1837
0
total amount greater than 300 461


In [221]:
x_train['VendorID'].unique()

array([1, 0, 2])

In [222]:
# Per-column preprocessing pipelines, wired into the ColumnTransformer later.
# Step names are kept exactly as they were because they form the parameter keys
# of each pipeline.

# VendorID: one-hot encode; categories not seen during fit are ignored.
vendor_id_pipe = Pipeline([
  ('one_hot_encoder', OneHotEncoder(handle_unknown='ignore'))
])
# passenger_count: fill missing values with 1, then standardise.
passanger_count_pipe = Pipeline([
    ('simple_imputer', SimpleImputer(missing_values=np.nan, strategy='constant', fill_value= 1)),
    ('std_scaler', StandardScaler())
])
# RatecodeID: fill missing values with 1, then one-hot encode.
rate_code_id_pipe = Pipeline([
       ('simple_imputer', SimpleImputer(missing_values=np.nan, strategy='constant',
                                         fill_value= 1)),
       ('one_hot_encoder', OneHotEncoder(handle_unknown='ignore')) ## TODO: experimental changes
])
# store_and_fwd_flag: fill missing values with 'N', then one-hot encode.
store_and_fwd_pipe = Pipeline([
    ('simple_immmputer', SimpleImputer(missing_values= np.nan, strategy='constant', fill_value='N')),
    ('one_hot_encoder', OneHotEncoder(handle_unknown='ignore'))
])

# congestion_surcharge: fill missing values with 0, then standardise.
congestion_charger_pipe = Pipeline([
    ('simple_imputer', SimpleImputer(missing_values=np.nan, strategy='constant', fill_value= 0)),
    ('std_scaler', StandardScaler())
])
# Airport_fee: fill missing values with 0, then standardise.
airport_fee_pipe = Pipeline([
    ('simple_imputer', SimpleImputer(missing_values=np.nan, strategy='constant', fill_value= 0)),
    ('std_scale', StandardScaler())
])
# payment_type: one-hot encode. handle_unknown='ignore' matches the other
# encoders above; with the default ('error') a payment type that appears only
# in the data passed to transform() would raise instead of being encoded.
payment_type_pipe = Pipeline([
    ('one_hot_encoder', OneHotEncoder(handle_unknown='ignore'))
])

In [223]:
# Names of the calendar columns produced by convert_dt_obj_to_datetime for the
# pickup and dropoff timestamps. Selecting them by name (instead of by integer
# position) keeps the transformer correct if the column order changes, and it
# covers all ten columns: the former positional range(14, 22) stopped before the
# dropoff Hour and Minute columns, which were therefore passed through unscaled.
datetime_feature_cols = [
  f'{prefix}_{part}'
  for prefix in ('tpep_pickup_datetime', 'tpep_dropoff_datetime')
  for part in ('Year', 'Month', 'Day', 'Hour', 'Minute')
]

# Column-wise preprocessing. Columns not listed in any entry (trip_distance, for
# example) are passed through unchanged because of remainder='passthrough'.
feature_scaling_and_trans = ColumnTransformer([
  ('vendor_id', vendor_id_pipe, ['VendorID']),
  ('passanger_count', passanger_count_pipe, ['passenger_count']),
  ('rate_code_id', rate_code_id_pipe, ['RatecodeID']), # experimental changes
  ('s_nd_f_flag', store_and_fwd_pipe, ['store_and_fwd_flag']),
  ('pu_loc', StandardScaler(), ['PULocationID']),
  ('du_loc', StandardScaler(), ["DOLocationID"]),
  ('payment_t', payment_type_pipe, ['payment_type']),
  ('extra_t', StandardScaler(), ['extra']),
  ('tip_amount_t', StandardScaler(), ['tip_amount']),
  ('tolls_amount_t', StandardScaler(), ["tolls_amount"]),
  ('improvement_c', StandardScaler(), ['improvement_surcharge']),
  ('cong_charge', congestion_charger_pipe, ['congestion_surcharge']),
  ('Airport_fee_t', airport_fee_pipe, ['Airport_fee']),
  ('std_scaler', StandardScaler(), datetime_feature_cols)
], remainder= 'passthrough')

In [224]:
# Linear regression model with scikit-learn's default settings.
linear_reg = LinearRegression()


In [225]:
# Separate the regression target (total_amount) from the feature columns.
X = x_train.drop(columns=['total_amount'])
y = x_train['total_amount']

In [226]:
# Fit the preprocessing transformer on every row of X and transform X in one step.
# NOTE(review): this happens before the train/dev split further down, so the
# fitted imputation/scaling/encoding state also reflects rows that later end up
# in the dev portion.
X_t = feature_scaling_and_trans.fit_transform(X=X)

In [227]:
# Split the transformed features into 80% training / 20% dev rows. The seed is
# fixed so that the train and dev scores below are reproducible across runs.
com_train_features, dev_test_features, com_train_labels, dev_test_labels = train_test_split(
    X_t, y, test_size=0.2, random_state=42)

In [228]:
# Fit the model on the training portion, then display its score on those same
# rows (for scikit-learn regressors, score() is the R^2 coefficient).
linear_reg.fit(com_train_features, com_train_labels)
linear_reg.score(com_train_features, com_train_labels)

0.7779933815179066

In [229]:
# Score of the fitted model on the dev portion, which was not used in fit().
linear_reg.score(dev_test_features, dev_test_labels)


0.7695233494359852

In [230]:
# Prepare the competition test set the same way as the training frame: fill the
# missing values, then replace each raw timestamp column with its calendar parts.
# NOTE: test_pd is rebound without the raw timestamp columns, so running this
# cell a second time raises a KeyError.
impute_unknown_values(test_pd)
for dt_col in ('tpep_pickup_datetime', 'tpep_dropoff_datetime'):
    convert_dt_obj_to_datetime(test_pd, dt_col)
    test_pd = test_pd.drop(columns=dt_col)


VendorID                 0
tpep_pickup_datetime     0
tpep_dropoff_datetime    0
passenger_count          0
trip_distance            0
RatecodeID               0
store_and_fwd_flag       0
PULocationID             0
DOLocationID             0
payment_type             0
extra                    0
tip_amount               0
tolls_amount             0
improvement_surcharge    0
congestion_surcharge     0
Airport_fee              0
dtype: int64


In [231]:
# Apply the already-fitted transformer to the test set (transform only; it is
# not refitted here).
test_pd_t = feature_scaling_and_trans.transform(test_pd)

In [232]:
# Predict total_amount for every transformed test row.
prediction = linear_reg.predict(test_pd_t)

In [233]:
# Build the submission frame: a 1-based ID per prediction plus the predicted
# total_amount. The IDs are derived from the number of predictions rather than a
# hard-coded 50000, so a test set of a different size no longer fails with a
# length mismatch.
submission = pd.DataFrame({
    'ID': range(1, len(prediction) + 1),
    'total_amount': prediction,
})
# Written without the index so that the file contains only the two columns.
submission.to_csv('submission.csv', index=False)

In [234]:
submission.describe()

Unnamed: 0,ID,total_amount
count,50000.0,50000.0
mean,25000.5,29.399835
std,14433.901067,21.973214
min,1.0,-89.800735
25%,12500.75,19.618898
50%,25000.5,23.22443
75%,37500.25,28.213083
max,50000.0,280.238701
