### Texi guru challange appling random forest

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# preprocessing 
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

#pipeline
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

# Recursive feature elemination
from sklearn.feature_selection import RFE

# Model
# from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import RandomForestRegressor

# split of data features
from sklearn.model_selection import train_test_split
from sklearn.model_selection import ShuffleSplit

# scoring of features
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

In [3]:
try: # For kaggle
  train_pd = pd.read_csv('/kaggle/input/taxi-fare-guru-total-amount-prediction-challenge/train.csv')

  test_pd = pd.read_csv('/kaggle/input/taxi-fare-guru-total-amount-prediction-challenge/test.csv')
  print('running on kaggle')
except: #for local machine 
  print('running on local machine')
  train_pd = pd.read_csv('train.csv')
  test_pd = pd.read_csv('test.csv')
  pass
copy_train_pd = train_pd.copy()
copy_test_pd = test_pd.copy()
train_pd.info()

running on local machine
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 175000 entries, 0 to 174999
Data columns (total 17 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   VendorID               175000 non-null  int64  
 1   tpep_pickup_datetime   175000 non-null  object 
 2   tpep_dropoff_datetime  175000 non-null  object 
 3   passenger_count        168923 non-null  float64
 4   trip_distance          175000 non-null  float64
 5   RatecodeID             168923 non-null  float64
 6   store_and_fwd_flag     168923 non-null  object 
 7   PULocationID           175000 non-null  int64  
 8   DOLocationID           175000 non-null  int64  
 9   payment_type           175000 non-null  object 
 10  extra                  175000 non-null  float64
 11  tip_amount             175000 non-null  float64
 12  tolls_amount           175000 non-null  float64
 13  improvement_surcharge  175000 non-null  float64
 14  total_amoun

In [4]:
print('length of data point which has negative or zero price', len(copy_train_pd[copy_train_pd ['total_amount'] <= 0]))

length of data point which has negative or zero price 1747


In [5]:
train_pd = copy_train_pd.copy()
x_train, x_test = train_test_split(train_pd, test_size= 0.2)
x_train_copy = x_train.copy()
x_test_copy = x_test.copy()

In [6]:
print(f'''
unique values of passanger_count is: {x_train['passenger_count'].unique()}\n
unique values of RateCodeId is: {x_train['RatecodeID'].unique()}\n
unique values of store_and_fwd_flag is: {x_train['store_and_fwd_flag'].unique()}\n
unique values of congestion_surcharge is: {x_train['congestion_surcharge'].unique()}\n
unqiue values of airport_fees is: {x_train['Airport_fee'].unique()}\n
''')



unique values of passanger_count is: [ 1. nan  2.  0.  4.  3.  5.  6.  8.]

unique values of RateCodeId is: [ 1. nan  2.  5.  4. 99.  3.]

unique values of store_and_fwd_flag is: ['N' nan 'Y']

unique values of congestion_surcharge is: [ 2.5  nan  0.  -2.5]

unqiue values of airport_fees is: [ 0.     nan  1.75 -1.75]




In [7]:
x_train.describe()

Unnamed: 0,VendorID,passenger_count,trip_distance,RatecodeID,PULocationID,DOLocationID,extra,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,Airport_fee
count,140000.0,135175.0,140000.0,135175.0,140000.0,140000.0,140000.0,140000.0,140000.0,140000.0,140000.0,135175.0,135175.0
mean,0.728729,1.359993,4.852415,1.518898,132.423621,132.650064,1.930083,6.122707,0.645142,0.97965,29.620171,2.247475,0.157581
std,0.445452,0.893943,366.279649,6.517097,76.11613,76.157903,1.944759,4.657349,2.321825,0.198932,25.535256,0.819104,0.510123
min,0.0,0.0,0.0,1.0,1.0,1.0,-7.5,7.9e-05,-29.3,-1.0,-576.75,-2.5,-1.75
25%,0.0,1.0,1.08,1.0,67.0,67.0,0.0,3.473769,0.0,1.0,16.3,2.5,0.0
50%,1.0,1.0,1.83,1.0,133.0,133.0,1.0,5.276449,0.0,1.0,21.4,2.5,0.0
75%,1.0,1.0,3.6,1.0,198.0,199.0,2.5,7.494876,0.0,1.0,31.75,2.5,0.0
max,2.0,8.0,135182.06,99.0,264.0,264.0,11.75,484.876151,76.0,1.0,587.25,2.5,1.75


In [8]:
def convert_dt_str_to_datetime(df:pd.DataFrame, col_name:str):
    df[col_name] = pd.to_datetime(df[col_name])


def convert_dt_obj_to_datetime(df:pd.DataFrame, col_name:str):
    # df[col_name] = pd.to_datetime(df[col_name])
    df[col_name +'_Year'] = df[col_name].apply(lambda time: time.year)
    df[col_name +'_Month'] = df[col_name].apply(lambda time: time.month)
    df[col_name +'_Day'] = df[col_name].apply(lambda time: time.day)
    df[col_name +'_Hour'] = df[col_name].apply(lambda time: time.hour)
#     df[col_name +'_Minute'] = df[col_name].apply(lambda time: time.minute) # IGNORING MINS

def revert_date_time_if_reversed(df:pd.DataFrame, col1:str, col2:str):
    idx = (df[col2] < df[col1])
    df.loc[idx, [col1, col2]] = df.loc[idx, [col2, col1]]


In [9]:
x_train, x_test = x_train_copy.copy(), x_test_copy.copy()
# df[col_name] = pd.to_datetime(df[col_name])
convert_dt_str_to_datetime(x_train, 'tpep_pickup_datetime')
convert_dt_obj_to_datetime(x_train, 'tpep_pickup_datetime')
convert_dt_str_to_datetime(x_test, 'tpep_pickup_datetime')
convert_dt_obj_to_datetime(x_test, 'tpep_pickup_datetime')


convert_dt_str_to_datetime(x_train, 'tpep_dropoff_datetime')
convert_dt_obj_to_datetime(x_train, 'tpep_dropoff_datetime')
convert_dt_str_to_datetime(x_test, 'tpep_dropoff_datetime')
convert_dt_obj_to_datetime(x_test, 'tpep_dropoff_datetime')



### swaping the values of drop off and pickup time if they are in reverse order
# revert_date_time_if_reversed(x_train, 'tpep_pickup_datetime', 'tpep_dropoff_datetime')

x_train = x_train.drop('tpep_pickup_datetime', axis= 1)
x_test = x_test.drop('tpep_pickup_datetime', axis= 1)
x_train = x_train.drop('tpep_dropoff_datetime', axis= 1)
x_test = x_test.drop('tpep_dropoff_datetime', axis= 1)

x_train.info()
x_train_copy, x_test_copy = x_train.copy(), x_test.copy()

<class 'pandas.core.frame.DataFrame'>
Index: 140000 entries, 71759 to 136675
Data columns (total 23 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   VendorID                     140000 non-null  int64  
 1   passenger_count              135175 non-null  float64
 2   trip_distance                140000 non-null  float64
 3   RatecodeID                   135175 non-null  float64
 4   store_and_fwd_flag           135175 non-null  object 
 5   PULocationID                 140000 non-null  int64  
 6   DOLocationID                 140000 non-null  int64  
 7   payment_type                 140000 non-null  object 
 8   extra                        140000 non-null  float64
 9   tip_amount                   140000 non-null  float64
 10  tolls_amount                 140000 non-null  float64
 11  improvement_surcharge        140000 non-null  float64
 12  total_amount                 140000 non-null  float64
 13  

In [10]:
cv_shuffle_20 = ShuffleSplit(n_splits= 20, test_size= 0.1)
cv_shuffle_5 = ShuffleSplit(n_splits= 5, test_size= 0.1)

In [11]:
x_train = x_train_copy.copy()
y_train = x_train['total_amount']
x_train = x_train.drop('total_amount', axis= 1)

x_test = x_test_copy.copy()
y_test = x_test['total_amount']
x_test = x_test.drop('total_amount', axis= 1)

In [12]:
com_train_features, dev_train_features, com_train_label, dev_train_label = train_test_split(x_train, y_train, test_size= 0.2)

In [13]:
vendor_id_pipe = Pipeline([
  ('one_hot_encoder', OneHotEncoder(handle_unknown= 'ignore'))
])
passanger_count_pipe = Pipeline([
    ('simple_imputer', SimpleImputer(missing_values=np.nan, strategy='constant', fill_value= 1)),
    ('std_scaler', StandardScaler())
])

rate_code_id_pipe = Pipeline([
       ('simple_imputer', SimpleImputer(missing_values=np.nan, strategy='constant',
                                         fill_value= 1)),
       ('one_hot_encoder', OneHotEncoder(handle_unknown= 'ignore')) ## TODO: experimental changes
])

store_and_fwd_pipe = Pipeline([
    ('simple_immmputer', SimpleImputer(missing_values= np.nan, strategy='constant', fill_value='N')),
    ('one_hot_encoder', OneHotEncoder(handle_unknown= 'ignore'))
])
congestion_charger_pipe = Pipeline([
    ('simple_imputer', SimpleImputer(missing_values=np.nan, strategy='constant', fill_value= 0)),
    ('std_scaler', StandardScaler())
])

airport_fee_pipe = Pipeline([
    ('simple_imputer', SimpleImputer(missing_values=np.nan, strategy='constant', fill_value= 0)),
    ('std_scale', StandardScaler())
])

payment_type_pipe = Pipeline([
    ('one_hot_encoder', OneHotEncoder(handle_unknown= 'ignore'))
])

In [14]:
feature_scaling_pipeline = ColumnTransformer([
  ('vendor_id', vendor_id_pipe, ['VendorID']),
  ('passanger_count', passanger_count_pipe, ['passenger_count']),
  ('rate_code_id', rate_code_id_pipe, ['RatecodeID']), # experimental changes
  ('s_nd_f_flag', store_and_fwd_pipe, ['store_and_fwd_flag']),
  ('pu_loc', StandardScaler(), ['PULocationID']),
  ('du_loc', StandardScaler(), ["DOLocationID"]),
  ('payment_t', payment_type_pipe, ['payment_type']),
  ('extra_t', StandardScaler(), ['extra']),
  ('tip_amount_t', StandardScaler(), ['tip_amount']),
  ('tolls_amount_t', StandardScaler(), ["tolls_amount"]),
  ('improvement_c', StandardScaler(), ['improvement_surcharge']),
  ('cong_charge', congestion_charger_pipe, ['congestion_surcharge']),
  ('Airport_fee_t', airport_fee_pipe, ['Airport_fee']),
  ('std_scaler', StandardScaler(), [i for i in range(14, 22)])
], remainder='passthrough')
feature_scaling_pipeline

In [15]:
model = Pipeline(steps= [
    ('feature_scaling', feature_scaling_pipeline),
    ('dt_reg', RandomForestRegressor(n_jobs= -1))
])
model

In [16]:
# model.fit(com_train_features, com_train_label)
from sklearn.model_selection import cross_validate 
cross_val_result = cross_validate(model, com_train_features, com_train_label,
                                  cv= cv_shuffle_5,
                                  return_estimator= True,
                                  return_train_score= True, 
                                  verbose= 3,
                                  )
cross_val_result


[CV] END ..................., score=(train=0.991, test=0.914) total time=  54.9s
[CV] END ..................., score=(train=0.992, test=0.909) total time=  26.6s
[CV] END ..................., score=(train=0.991, test=0.920) total time=  26.3s
[CV] END ..................., score=(train=0.991, test=0.886) total time=  27.1s
[CV] END ..................., score=(train=0.991, test=0.955) total time=  30.2s


{'fit_time': array([53.86429667, 26.35188031, 26.07330704, 26.89994955, 29.9424789 ]),
 'score_time': array([1.05475163, 0.28245234, 0.24672937, 0.22000885, 0.22660923]),
 'estimator': [Pipeline(steps=[('feature_scaling',
                   ColumnTransformer(remainder='passthrough',
                                     transformers=[('vendor_id',
                                                    Pipeline(steps=[('one_hot_encoder',
                                                                     OneHotEncoder(handle_unknown='ignore'))]),
                                                    ['VendorID']),
                                                   ('passanger_count',
                                                    Pipeline(steps=[('simple_imputer',
                                                                     SimpleImputer(fill_value=1,
                                                                                   strategy='constant')),
                       

In [25]:
model = cross_val_result['estimator'][-1]
model

In [26]:
from sklearn import tree
plt.figure(figsize= (28, 8), facecolor= 'w')
# tree_vz = tree.plot_tree(dt_pipeline[-1],
#                          feature_names= dt_pipeline[0].get_feature_names_out(),
#                          rounded= True,
#                          filled= True,
#                          fontsize= 12)

<Figure size 2800x800 with 0 Axes>

<Figure size 2800x800 with 0 Axes>

In [27]:
# from sklearn.tree import export_text 
# tree_rules = export_text(dt_pipeline[-1])
# print(tree_rules)


In [28]:
y_hat = model.predict(com_train_features)
r2_val = r2_score(com_train_label, y_hat)
print('r2 score of the dt is:', r2_val)

r2 score of the dt is: 0.987421302288892


In [29]:
dev_y_hat = model.predict(dev_train_features)
r2_dev_score = r2_score(dev_train_label, dev_y_hat)
print('train score is:', r2_dev_score)

train score is: 0.9376533434990352


In [30]:
main_y_test = model.predict(x_test)
r2_main_test = r2_score(y_test, main_y_test)
print('main test score', r2_main_test)

main test score 0.9518362503917654


In [31]:
len(main_y_test[main_y_test < 0])

343

## submission code

In [32]:
# impute_unknown_values(test_pd)
convert_dt_str_to_datetime(test_pd, 'tpep_pickup_datetime')
convert_dt_obj_to_datetime(test_pd, 'tpep_pickup_datetime')

convert_dt_str_to_datetime(test_pd, 'tpep_dropoff_datetime')
convert_dt_obj_to_datetime(test_pd, 'tpep_dropoff_datetime')

revert_date_time_if_reversed(test_pd, 'tpep_pickup_datetime', 'tpep_dropoff_datetime')

test_pd = test_pd.drop('tpep_pickup_datetime', axis= 1)
test_pd = test_pd.drop('tpep_dropoff_datetime', axis= 1)


In [33]:
test_pd_prediction = model.predict(test_pd)

In [34]:
submission = pd.DataFrame(columns= ['ID', "total_amount"])
submission['ID'] = [i for i in range(1, 50001)]
submission['total_amount'] = test_pd_prediction
submission.to_csv('submission.csv', index=False)

In [35]:
submission.describe()

Unnamed: 0,ID,total_amount
count,50000.0,50000.0
mean,25000.5,29.624698
std,14433.901067,24.594664
min,1.0,-165.2858
25%,12500.75,16.7258
50%,25000.5,21.56955
75%,37500.25,31.6033
max,50000.0,459.2502


In [36]:
submission[submission['total_amount'] < 0]

Unnamed: 0,ID,total_amount
12,13,-5.2820
99,100,-42.3745
133,134,-14.1140
199,200,-13.8830
419,420,-23.3600
...,...,...
49639,49640,-49.0710
49714,49715,-11.1600
49741,49742,-12.5060
49858,49859,-7.0980
