### Texi guru challange appling random forest

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# preprocessing 
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

#pipeline
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

# Recursive feature elemination
from sklearn.feature_selection import RFE

# Model
from sklearn.tree import DecisionTreeRegressor

# split of data features
from sklearn.model_selection import train_test_split
from sklearn.model_selection import ShuffleSplit

# scoring of features
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

In [2]:
try: # For kaggle
  train_pd = pd.read_csv('/kaggle/input/taxi-fare-guru-total-amount-prediction-challenge/train.csv')

  test_pd = pd.read_csv('/kaggle/input/taxi-fare-guru-total-amount-prediction-challenge/test.csv')
  print('running on kaggle')
except: #for local machine 
  print('running on local machine')
  train_pd = pd.read_csv('train.csv')
  test_pd = pd.read_csv('test.csv')
  pass
copy_train_pd = train_pd.copy()
copy_test_pd = test_pd.copy()
train_pd.info()

running on local machine
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 175000 entries, 0 to 174999
Data columns (total 17 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   VendorID               175000 non-null  int64  
 1   tpep_pickup_datetime   175000 non-null  object 
 2   tpep_dropoff_datetime  175000 non-null  object 
 3   passenger_count        168923 non-null  float64
 4   trip_distance          175000 non-null  float64
 5   RatecodeID             168923 non-null  float64
 6   store_and_fwd_flag     168923 non-null  object 
 7   PULocationID           175000 non-null  int64  
 8   DOLocationID           175000 non-null  int64  
 9   payment_type           175000 non-null  object 
 10  extra                  175000 non-null  float64
 11  tip_amount             175000 non-null  float64
 12  tolls_amount           175000 non-null  float64
 13  improvement_surcharge  175000 non-null  float64
 14  total_amoun

In [3]:
train_pd = copy_train_pd.copy()
x_train, x_test = train_test_split(train_pd, test_size= 0.2)
x_train_copy = x_train.copy()
x_test_copy = x_test.copy()

In [4]:
print(f'''
unique values of passanger_count is: {x_train['passenger_count'].unique()}\n
unique values of RateCodeId is: {x_train['RatecodeID'].unique()}\n
unique values of store_and_fwd_flag is: {x_train['store_and_fwd_flag'].unique()}\n
unique values of congestion_surcharge is: {x_train['congestion_surcharge'].unique()}\n
unqiue values of airport_fees is: {x_train['Airport_fee'].unique()}\n
''')



unique values of passanger_count is: [ 1.  4.  2.  5.  0.  3. nan  6.  8.  9.]

unique values of RateCodeId is: [ 5.  1.  2. nan  4.  3. 99.]

unique values of store_and_fwd_flag is: ['N' 'Y' nan]

unique values of congestion_surcharge is: [ 2.5 -2.5  0.   nan]

unqiue values of airport_fees is: [ 0.    1.75   nan -1.75]




In [5]:
x_train.describe()

Unnamed: 0,VendorID,passenger_count,trip_distance,RatecodeID,PULocationID,DOLocationID,extra,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,Airport_fee
count,140000.0,135139.0,140000.0,135139.0,140000.0,140000.0,140000.0,140000.0,140000.0,140000.0,140000.0,135139.0,135139.0
mean,0.728893,1.357506,4.853969,1.514633,132.666821,132.75585,1.932156,6.126628,0.644207,0.980041,29.635399,2.248074,0.159203
std,0.445352,0.893212,366.279282,6.490824,76.069569,76.206053,1.946526,4.653141,2.311896,0.196988,25.328137,0.816864,0.512133
min,0.0,0.0,0.0,1.0,1.0,1.0,-7.5,7.9e-05,-26.55,-1.0,-576.75,-2.5,-1.75
25%,0.0,1.0,1.08,1.0,67.0,67.0,0.0,3.472312,0.0,1.0,16.3,2.5,0.0
50%,1.0,1.0,1.84,1.0,133.0,133.0,1.0,5.288491,0.0,1.0,21.45,2.5,0.0
75%,1.0,1.0,3.61,1.0,198.0,199.0,2.5,7.500487,0.0,1.0,31.85,2.5,0.0
max,2.0,9.0,135182.06,99.0,264.0,264.0,11.75,484.876151,80.0,1.0,587.25,2.5,1.75


In [6]:
def convert_dt_obj_to_datetime(df:pd.DataFrame, col_name:str):
    df[col_name] = pd.to_datetime(df[col_name])
    df[col_name +'_Year'] = df[col_name].apply(lambda time: time.year)
    df[col_name +'_Month'] = df[col_name].apply(lambda time: time.month)
    df[col_name +'_Day'] = df[col_name].apply(lambda time: time.day)
    df[col_name +'_Hour'] = df[col_name].apply(lambda time: time.hour)
#     df[col_name +'_Minute'] = df[col_name].apply(lambda time: time.minute) # IGNORING MINS


In [7]:
x_train, x_test = x_train_copy.copy(), x_test_copy.copy()

convert_dt_obj_to_datetime(x_train, 'tpep_pickup_datetime')
convert_dt_obj_to_datetime(x_test, 'tpep_pickup_datetime')
x_train = x_train.drop('tpep_pickup_datetime', axis= 1)
x_test = x_test.drop('tpep_pickup_datetime', axis= 1)

convert_dt_obj_to_datetime(x_train, 'tpep_dropoff_datetime')
convert_dt_obj_to_datetime(x_test, 'tpep_dropoff_datetime')

x_train = x_train.drop('tpep_dropoff_datetime', axis= 1)
x_test = x_test.drop('tpep_dropoff_datetime', axis= 1)

x_train.info()
x_train_copy, x_test_copy = x_train.copy(), x_test.copy()

<class 'pandas.core.frame.DataFrame'>
Index: 140000 entries, 126012 to 110456
Data columns (total 23 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   VendorID                     140000 non-null  int64  
 1   passenger_count              135139 non-null  float64
 2   trip_distance                140000 non-null  float64
 3   RatecodeID                   135139 non-null  float64
 4   store_and_fwd_flag           135139 non-null  object 
 5   PULocationID                 140000 non-null  int64  
 6   DOLocationID                 140000 non-null  int64  
 7   payment_type                 140000 non-null  object 
 8   extra                        140000 non-null  float64
 9   tip_amount                   140000 non-null  float64
 10  tolls_amount                 140000 non-null  float64
 11  improvement_surcharge        140000 non-null  float64
 12  total_amount                 140000 non-null  float64
 13 

In [8]:
cv_shuffle_20 = ShuffleSplit(n_splits= 20, test_size= 0.1)
cv_shuffle_5 = ShuffleSplit(n_splits= 5, test_size= 0.1)

In [9]:
x_train = x_train_copy.copy()
y_train = x_train['total_amount']
x_train = x_train.drop('total_amount', axis= 1)

x_test = x_test_copy.copy()
y_test = x_test['total_amount']
x_test = x_test.drop('total_amount', axis= 1)

In [10]:
com_train_features, dev_train_features, com_train_label, dev_train_label = train_test_split(x_train, y_train, test_size= 0.2)

In [11]:
vendor_id_pipe = Pipeline([
  ('one_hot_encoder', OneHotEncoder(handle_unknown= 'ignore'))
])
passanger_count_pipe = Pipeline([
    ('simple_imputer', SimpleImputer(missing_values=np.nan, strategy='constant', fill_value= 1)),
    ('std_scaler', StandardScaler())
])

rate_code_id_pipe = Pipeline([
       ('simple_imputer', SimpleImputer(missing_values=np.nan, strategy='constant',
                                         fill_value= 1)),
       ('one_hot_encoder', OneHotEncoder(handle_unknown= 'ignore')) ## TODO: experimental changes
])

store_and_fwd_pipe = Pipeline([
    ('simple_immmputer', SimpleImputer(missing_values= np.nan, strategy='constant', fill_value='N')),
    ('one_hot_encoder', OneHotEncoder(handle_unknown= 'ignore'))
])
congestion_charger_pipe = Pipeline([
    ('simple_imputer', SimpleImputer(missing_values=np.nan, strategy='constant', fill_value= 0)),
    ('std_scaler', StandardScaler())
])

airport_fee_pipe = Pipeline([
    ('simple_imputer', SimpleImputer(missing_values=np.nan, strategy='constant', fill_value= 0)),
    ('std_scale', StandardScaler())
])

payment_type_pipe = Pipeline([
    ('one_hot_encoder', OneHotEncoder(handle_unknown= 'ignore'))
])

In [12]:
feature_scaling_pipeline = ColumnTransformer([
  ('vendor_id', vendor_id_pipe, ['VendorID']),
  ('passanger_count', passanger_count_pipe, ['passenger_count']),
  ('rate_code_id', rate_code_id_pipe, ['RatecodeID']), # experimental changes
  ('s_nd_f_flag', store_and_fwd_pipe, ['store_and_fwd_flag']),
  ('pu_loc', StandardScaler(), ['PULocationID']),
  ('du_loc', StandardScaler(), ["DOLocationID"]),
  ('payment_t', payment_type_pipe, ['payment_type']),
  ('extra_t', StandardScaler(), ['extra']),
  ('tip_amount_t', StandardScaler(), ['tip_amount']),
  ('tolls_amount_t', StandardScaler(), ["tolls_amount"]),
  ('improvement_c', StandardScaler(), ['improvement_surcharge']),
  ('cong_charge', congestion_charger_pipe, ['congestion_surcharge']),
  ('Airport_fee_t', airport_fee_pipe, ['Airport_fee']),
  ('std_scaler', StandardScaler(), [i for i in range(14, 22)])
], remainder='passthrough')
feature_scaling_pipeline

In [13]:
dt_pipeline = Pipeline(steps= [
    ('feature_scaling', feature_scaling_pipeline),
    ('dt_reg', DecisionTreeRegressor())
])
dt_pipeline

In [14]:
dt_pipeline.fit(com_train_features, com_train_label)

In [15]:
from sklearn import tree
plt.figure(figsize= (28, 8), facecolor= 'w')
# tree_vz = tree.plot_tree(dt_pipeline[-1],
#                          feature_names= dt_pipeline[0].get_feature_names_out(),
#                          rounded= True,
#                          filled= True,
#                          fontsize= 12)

<Figure size 2800x800 with 0 Axes>

<Figure size 2800x800 with 0 Axes>

In [18]:
from sklearn.tree import export_text 
tree_rules = export_text(dt_pipeline[-1])
print(tree_rules)


|--- feature_33 <= 7.88
|   |--- feature_33 <= 2.65
|   |   |--- feature_4 <= 0.50
|   |   |   |--- feature_22 <= -7.49
|   |   |   |   |--- feature_14 <= 0.50
|   |   |   |   |   |--- feature_13 <= -0.23
|   |   |   |   |   |   |--- feature_13 <= -0.93
|   |   |   |   |   |   |   |--- feature_8 <= 0.50
|   |   |   |   |   |   |   |   |--- feature_19 <= -2.28
|   |   |   |   |   |   |   |   |   |--- feature_13 <= -1.39
|   |   |   |   |   |   |   |   |   |   |--- value: [-78.25]
|   |   |   |   |   |   |   |   |   |--- feature_13 >  -1.39
|   |   |   |   |   |   |   |   |   |   |--- value: [-79.00]
|   |   |   |   |   |   |   |   |--- feature_19 >  -2.28
|   |   |   |   |   |   |   |   |   |--- value: [-74.00]
|   |   |   |   |   |   |   |--- feature_8 >  0.50
|   |   |   |   |   |   |   |   |--- feature_13 <= -1.14
|   |   |   |   |   |   |   |   |   |--- feature_28 <= 0.76
|   |   |   |   |   |   |   |   |   |   |--- value: [-54.30]
|   |   |   |   |   |   |   |   |   |--- feature_28

In [19]:
y_hat = dt_pipeline.predict(com_train_features)
r2_val = r2_score(com_train_label, y_hat)
print('r2 score of the dt is:', r2_val)

r2 score of the dt is: 1.0


In [20]:
dev_y_hat = dt_pipeline.predict(dev_train_features)
r2_dev_score = r2_score(dev_train_label, dev_y_hat)
print('train score is:', r2_dev_score)

train score is: 0.9001717631616253


In [21]:
main_y_test = dt_pipeline.predict(x_test)
r2_main_test = r2_score(y_test, main_y_test)
print('main test score', r2_main_test)

main test score 0.910050108154818


## submission code

In [22]:
# impute_unknown_values(test_pd)
convert_dt_obj_to_datetime(test_pd, 'tpep_pickup_datetime')
test_pd = test_pd.drop('tpep_pickup_datetime', axis= 1)

convert_dt_obj_to_datetime(test_pd, 'tpep_dropoff_datetime')
test_pd = test_pd.drop('tpep_dropoff_datetime', axis= 1)


In [24]:
test_pd_prediction = dt_pipeline.predict(test_pd)

In [25]:
submission = pd.DataFrame(columns= ['ID', "total_amount"])
submission['ID'] = [i for i in range(1, 50001)]
submission['total_amount'] = test_pd_prediction
submission.to_csv('submission.csv', index=False)

In [26]:
submission.describe()

Unnamed: 0,ID,total_amount
count,50000.0,50000.0
mean,25000.5,29.670415
std,14433.901067,25.335577
min,1.0,-576.75
25%,12500.75,16.32
50%,25000.5,21.6
75%,37500.25,32.1
max,50000.0,485.1


In [27]:
submission[submission['total_amount'] < 0]

Unnamed: 0,ID,total_amount
12,13,-4.50
99,100,-42.00
133,134,-14.70
199,200,-14.30
419,420,-4.00
...,...,...
49639,49640,-55.00
49714,49715,-19.25
49741,49742,-14.00
49858,49859,-7.00
