# Texi Guru Kaggle Challange
* Step 1: Data importing

In [18]:
import numpy as np
import pandas as pd
import seaborn as sns


from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split


##### Reading test data and train data

In [4]:
try: # For kaggle
  train_pd = pd.read_csv('/kaggle/input/taxi-fare-guru-total-amount-prediction-challenge/train.csv')

  test_pd = pd.read_csv('/kaggle/input/taxi-fare-guru-total-amount-prediction-challenge/test.csv')
  print('running on kaggle')
except: #for local machine 
  print('running on local machine')
  train_pd = pd.read_csv('train.csv')
  test_pd = pd.read_csv('test.csv')
  pass
copy_train_pd = train_pd.copy()
copy_test_pd = test_pd.copy()
train_pd.info()


running on local machine
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 175000 entries, 0 to 174999
Data columns (total 17 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   VendorID               175000 non-null  int64  
 1   tpep_pickup_datetime   175000 non-null  object 
 2   tpep_dropoff_datetime  175000 non-null  object 
 3   passenger_count        168923 non-null  float64
 4   trip_distance          175000 non-null  float64
 5   RatecodeID             168923 non-null  float64
 6   store_and_fwd_flag     168923 non-null  object 
 7   PULocationID           175000 non-null  int64  
 8   DOLocationID           175000 non-null  int64  
 9   payment_type           175000 non-null  object 
 10  extra                  175000 non-null  float64
 11  tip_amount             175000 non-null  float64
 12  tolls_amount           175000 non-null  float64
 13  improvement_surcharge  175000 non-null  float64
 14  total_amoun

##### Computing the null values and missing values from the tables

In [5]:
train_pd.head()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,extra,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,Airport_fee
0,1,2023-06-28 17:20:21,2023-06-28 16:34:45,1.0,2.14,1.0,N,120,9,Credit Card,2.5,7.165589,0.0,1.0,20.64,2.5,0.0
1,0,2023-06-29 23:05:01,2023-06-29 22:01:35,1.0,2.7,1.0,N,15,215,Credit Card,3.5,6.067401,0.0,1.0,25.55,2.5,0.0
2,1,2023-06-30 10:19:31,2023-06-30 11:13:10,1.0,1.15,1.0,N,167,223,Credit Card,0.0,4.111547,0.0,1.0,17.64,2.5,0.0
3,0,2023-06-29 13:23:09,2023-06-29 14:20:01,1.0,0.4,1.0,N,128,239,Credit Card,2.5,6.411079,0.0,1.0,12.8,2.5,0.0
4,1,2023-06-29 22:03:32,2023-06-29 22:22:22,3.0,1.1,1.0,N,203,52,Credit Card,1.0,4.769377,0.0,1.0,18.0,2.5,0.0


In [6]:
train_pd.isna().sum()

VendorID                    0
tpep_pickup_datetime        0
tpep_dropoff_datetime       0
passenger_count          6077
trip_distance               0
RatecodeID               6077
store_and_fwd_flag       6077
PULocationID                0
DOLocationID                0
payment_type                0
extra                       0
tip_amount                  0
tolls_amount                0
improvement_surcharge       0
total_amount                0
congestion_surcharge     6077
Airport_fee              6077
dtype: int64

In [7]:
train_pd = copy_train_pd.copy()

y = train_pd['total_amount']
X = train_pd.drop('total_amount', axis=1) # seprating prediction element from training set
list_of_features = list(train_pd)


In [8]:
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
x_train_copy = x_train.copy()
x_test_copy = x_test.copy()


## Preprocessing section
* selecting the features for imputation
* transform each feature into same scale
* plotting each feature with respect to each one
* finding the corelation between features

In [9]:
print('Null values in the given training data set is:')
x_train.isna().sum()

Null values in the given training data set is:


VendorID                    0
tpep_pickup_datetime        0
tpep_dropoff_datetime       0
passenger_count          4890
trip_distance               0
RatecodeID               4890
store_and_fwd_flag       4890
PULocationID                0
DOLocationID                0
payment_type                0
extra                       0
tip_amount                  0
tolls_amount                0
improvement_surcharge       0
congestion_surcharge     4890
Airport_fee              4890
dtype: int64

#### Checking unique values of nan elements

In [10]:
print(f'''
unique values of passanger_count is: {x_train['passenger_count'].unique()}\n
unique values of RateCodeId is: {x_train['RatecodeID'].unique()}\n
unique values of store_and_fwd_flag is: {x_train['store_and_fwd_flag'].unique()}\n
unique values of congestion_surcharge is: {x_train['congestion_surcharge'].unique()}\n
unqiue values of airport_fees is: {x_train['Airport_fee'].unique()}\n
''')




unique values of passanger_count is: [ 1.  2.  6. nan  4.  3.  5.  0.  8.  9.]

unique values of RateCodeId is: [ 1. nan 99.  2.  5.  3.  4.]

unique values of store_and_fwd_flag is: ['N' nan 'Y']

unique values of congestion_surcharge is: [ 2.5  nan  0.  -2.5]

unqiue values of airport_fees is: [ 0.     nan  1.75 -1.75]




#### Seprating number values and categorical variables

In [11]:
x_train, x_test = x_train_copy.copy(), x_test_copy.copy()
x_train_cat, x_test_cat = x_train.select_dtypes(exclude='number'), x_test.select_dtypes(exclude='number')
cat_col_names = ['tpep_pickup_datetime', 'tpep_dropoff_datetime', 'store_and_fwd_flag', 'payment_type']
x_train_num = x_train.drop(cat_col_names, axis= 1)
x_test_num = x_train.drop(cat_col_names, axis= 1)
print(f'''Info of cat variables is:\n''')
x_train_cat.info()
print(f'''Info of numerical variables is:\n''')
x_test_num.info()


Info of cat variables is:

<class 'pandas.core.frame.DataFrame'>
Index: 140000 entries, 149626 to 24740
Data columns (total 4 columns):
 #   Column                 Non-Null Count   Dtype 
---  ------                 --------------   ----- 
 0   tpep_pickup_datetime   140000 non-null  object
 1   tpep_dropoff_datetime  140000 non-null  object
 2   store_and_fwd_flag     135110 non-null  object
 3   payment_type           140000 non-null  object
dtypes: object(4)
memory usage: 5.3+ MB
Info of numerical variables is:

<class 'pandas.core.frame.DataFrame'>
Index: 140000 entries, 149626 to 24740
Data columns (total 12 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   VendorID               140000 non-null  int64  
 1   passenger_count        135110 non-null  float64
 2   trip_distance          140000 non-null  float64
 3   RatecodeID             135110 non-null  float64
 4   PULocationID           140000 non-null  int64 

### creating one copy for each x_train_cat and x_train_num

In [12]:
x_train_cat_copy, x_train_num_copy = x_train_cat.copy(), x_train_num.copy()
x_test_cat_copy, x_test_num_copy = x_test_cat.copy(), x_test_num.copy()

### categorical values preprocessing

In [23]:
# for reference stroing here will be removed in later stages
'''
 #   Column                 Non-Null Count   Dtype 
---  ------                 --------------   ----- 
 0   tpep_pickup_datetime   140000 non-null  object  ***
 1   tpep_dropoff_datetime  140000 non-null  object      Both contain object need to convert in datetime obj ***
 2   store_and_fwd_flag     135118 non-null  object  ***contains null values***
 3   payment_type           140000 non-null  object
'''
# restroing default values for continuous re iteration
x_train_cat, x_test_cat = x_train_cat_copy.copy(), x_test_cat_copy.copy()

# pickup date time
x_train_cat['tpep_pickup_datetime'] = pd.to_datetime(x_train_cat['tpep_pickup_datetime'])
x_test_cat['tpep_pickup_datetime'] = pd.to_datetime(x_test_cat['tpep_pickup_datetime'])

x_train_cat['tpep_dropoff_datetime'] = pd.to_datetime(x_train_cat['tpep_dropoff_datetime'])
x_test_cat['tpep_dropoff_datetime'] = pd.to_datetime(x_test_cat['tpep_dropoff_datetime'])

x_train_cat.info()
x_train_cat = x_train_cat.copy()
x_test_cat = x_test_cat.copy
#


<class 'pandas.core.frame.DataFrame'>
Index: 140000 entries, 149626 to 24740
Data columns (total 4 columns):
 #   Column                 Non-Null Count   Dtype         
---  ------                 --------------   -----         
 0   tpep_pickup_datetime   140000 non-null  datetime64[ns]
 1   tpep_dropoff_datetime  140000 non-null  datetime64[ns]
 2   store_and_fwd_flag     135110 non-null  object        
 3   payment_type           140000 non-null  object        
dtypes: datetime64[ns](2), object(2)
memory usage: 5.3+ MB


#### Converting payment model and flag to string

In [16]:
type(x_train_cat['payment_type'].iloc[1])

str

#### Applying standard scaler in the numberical values

In [37]:
'''
unique values of passanger_count is: [ 1.  2.  6. nan  4.  3.  5.  0.  8.  9.]

unique values of RateCodeId is: [ 1. nan 99.  2.  5.  3.  4.]

unique values of congestion_surcharge is: [ 2.5  nan  0.  -2.5]

unqiue values of airport_fees is: [ 0. nan  1.75 -1.75]
'''
x_train_num = x_train_num_copy.copy()
x_train_num['passenger_count'] = x_train_num['passenger_count'].fillna(0)
##* Reason for filling the -1 is: because we don't know how many passangers are travelling if explicitly mentioned even we are not sure whether anyone travelling or not

x_train_num['RatecodeID'] =  x_train_num['RatecodeID'].fillna(-1)
#* Similarly in case of Rate code it we don't know which rate code is applied during this process

x_train_num['congestion_surcharge'] = x_train_num['congestion_surcharge'].fillna(0)
#* In terms of congestion surcharge we can assure that there is only two possible conditions 
#* either we can have sur charge or we don't have

x_train_num['Airport_fee'] = x_train_num['Airport_fee'].fillna(-10)
#* In rate code id we don't know either airport is applied either that is positive or negative

x_train_num.isna().sum()



VendorID                 0
passenger_count          0
trip_distance            0
RatecodeID               0
PULocationID             0
DOLocationID             0
extra                    0
tip_amount               0
tolls_amount             0
improvement_surcharge    0
congestion_surcharge     0
Airport_fee              0
dtype: int64

#### Appling standard Scaler on numerical data

In [38]:
scaler_pipeline = Pipeline([
('numeric_std_scaler', StandardScaler())
])

x_train_num_scaled = scaler_pipeline.fit_transform(x_train_num)
x_train_num_scaled = pd.DataFrame(x_train_num_scaled)
x_train_num_scaled

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,0.610944,-0.342328,-0.013139,-0.066841,0.082010,-1.490686,-0.991676,0.088690,-0.27825,0.102351,0.367199,0.101378
1,0.610944,-0.342328,-0.009437,-0.066841,-1.007694,0.896688,-0.991676,-0.538880,-0.27825,0.102351,0.367199,0.101378
2,0.610944,0.755481,-0.010343,-0.066841,-0.049280,-0.742992,-0.991676,-0.472352,-0.27825,0.102351,0.367199,0.101378
3,0.610944,-0.342328,-0.011052,-0.066841,-1.138983,1.591913,0.292319,-0.191289,-0.27825,0.102351,0.367199,0.101378
4,0.610944,5.146716,-0.009004,-0.066841,0.370847,-0.926636,0.292319,-0.008828,-0.27825,0.102351,0.367199,0.101378
...,...,...,...,...,...,...,...,...,...,...,...,...
139995,0.610944,-0.342328,-0.010186,-0.066841,0.055752,-0.074002,-0.478078,0.306699,-0.27825,0.102351,0.367199,0.101378
139996,-1.630583,1.853290,-0.015581,-0.066841,-0.928920,0.306403,0.805917,-0.563679,-0.27825,0.102351,0.367199,0.101378
139997,0.610944,-0.342328,-0.013533,-0.066841,-1.651013,-1.320160,0.292319,-0.201955,-0.27825,0.102351,0.367199,0.101378
139998,-1.630583,-0.342328,-0.011643,-0.066841,-0.535051,-1.267690,0.292319,0.047637,-0.27825,0.102351,0.367199,0.101378


In [39]:
# just testing something
l_r = LinearRegression()
l_r.fit(X=x_train_num_scaled, y=y_train)
val = l_r.predict(x_train_num_scaled)
l_r.score(X=x_train_num_scaled, y = y_train)

0.6893928845351508

#### Dropping seprati

### This cell is depricated and will be removed in the future versions

In [10]:
def apply_transformations(data:pd.DataFrame) -> pd.DataFrame:
  pipe_for_fwd_imputr = Pipeline([
    ('simple_imputation', SimpleImputer(strategy='constant', fill_value='null', missing_values=np.nan)),
    ('fwd_one_hot_encoder', OneHotEncoder())
    ])
  pipe_for_payment_methods = Pipeline(
    [
      ('payment_one_hot_encoder', OneHotEncoder())
    ]
  )
  column_transformers = ColumnTransformer([
    ('pass0', 'passthrough', [0]),
    # ('pass1', 'passthrough', [1]),
    # ('pass2', 'passthrough', [2]),
    ('passanger_count_imputer', SimpleImputer(strategy='constant', fill_value=1, missing_values=np.nan), [3]),
    ('pass4', 'passthrough', [4]),
    ('rate_code_id_imputer', SimpleImputer(strategy='constant', fill_value=1.0, missing_values=np.nan), [5]),
    # ('store_and_fwd_flag_imputer',pipe_for_fwd_imputr, [6]),
    ('pass_', 'passthrough', [6]),
    ('pass6', 'passthrough', [7]),
    ('pass8', 'passthrough', [8]),
    ('payment_type_one_hot_encoding', pipe_for_payment_methods, [9]),
    ('pass10', 'passthrough', [10]),
    ('pass11', 'passthrough', [11]),
    ('pass12', 'passthrough', [12]),
    ('pass13', 'passthrough', [13]),
    ('congestion_surcharge_imputer', SimpleImputer(strategy='constant', fill_value=0, missing_values=np.nan), [14]),
    ('airport_fee_imputer', SimpleImputer(strategy='constant', fill_value=-5, missing_values=np.nan), [15])
  ])
  # column_pass_through = ColumnTransformer([
    
    
  
  # ])

  feature_pipeline = Pipeline([
    ('column_transformers', column_transformers)
    # ('columns_pass_by', column_pass_through)
  ])
  val = feature_pipeline.fit_transform(data)
  return pd.DataFrame(data=val)
  # return pd.DataFrame()

In [11]:
x_train.head()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,extra,tip_amount,tolls_amount,improvement_surcharge,congestion_surcharge,Airport_fee
105012,1,2023-06-29 18:48:52,2023-06-29 19:40:18,1.0,40.84,1.0,N,238,115,Credit Card,7.5,32.619712,0.0,1.0,0.0,1.75
30266,1,2023-06-29 08:47:58,2023-06-29 09:51:29,1.0,9.75,1.0,N,80,134,Credit Card,5.0,12.643525,6.55,1.0,2.5,1.75
43284,1,2023-06-30 17:30:57,2023-06-30 16:49:53,1.0,8.55,1.0,N,77,144,Credit Card,2.5,12.560751,0.0,1.0,2.5,0.0
17479,0,2023-06-29 15:52:07,2023-06-29 16:27:39,2.0,19.1,1.0,Y,241,34,Credit Card,0.0,17.932841,0.0,1.0,0.0,0.0
118173,1,2023-06-29 12:09:23,2023-06-29 12:54:51,1.0,2.19,1.0,N,92,181,Credit Card,0.0,8.051202,0.0,1.0,2.5,0.0


In [12]:

# # df = apply_transformations(train_pd)
# x_train['tpep_pickup_datetime'] = pd.to_datetime(x_train['tpep_pickup_datetime'])
# x_train['tpep_dropoff_datetime'] = pd.to_datetime(x_train['tpep_dropoff_datetime'])

# x_test['tpep_pickup_datetime'] = pd.to_datetime(x_test['tpep_pickup_datetime'])
# x_test['tpep_dropoff_datetime'] = pd.to_datetime(x_test['tpep_dropoff_datetime'])
# x_train = apply_transformations(x_train)
# x_test = apply_transformations(x_train)
 #! generating errors need to check 


In [13]:
x_train_non_number_f = x_train.select_dtypes(exclude="number")
x_train_non_number_f.head()
x_train_copy = x_train.copy()

In [14]:
# x_train = x_train_copy.copy()
# x_train = x_train.drop(['tpep_dropoff_datetime', 'tpep_pickup_datetime'], axis=1)
# x_train = x_train.drop(['store_and_fwd_flag', 'payment_type'], axis=1)
# x_train.info()


### Checking null values

In [15]:
x_train.isna().sum()

VendorID                    0
tpep_pickup_datetime        0
tpep_dropoff_datetime       0
passenger_count          4830
trip_distance               0
RatecodeID               4830
store_and_fwd_flag       4830
PULocationID                0
DOLocationID                0
payment_type                0
extra                       0
tip_amount                  0
tolls_amount                0
improvement_surcharge       0
congestion_surcharge     4830
Airport_fee              4830
dtype: int64

### dummy model for milestone 1

In [16]:
from sklearn.dummy import DummyRegressor

In [17]:
dummy_regressor = DummyRegressor(strategy= 'mean')

In [18]:
dummy_regressor.fit(x_train, y_train)

In [19]:
dummy_regressor.score(x_train, y_train)

0.0

In [20]:
dummy_regressor.score(x_test, y_test)

-3.4199641225196586e-05

In [23]:
dummy_output = dummy_regressor.predict(x_test)
dummy_output = pd.DataFrame(dummy_output)

In [24]:
# dummy_output.to_csv('submission.csv')

In [31]:
submission = pd.DataFrame(columns= ['ID', "total_amount"])
submission['ID'] = [i for i in range(1, 50001)]
submission['total_amount'] = [i for i in range(50000)]
submission.to_csv('submission.csv', index=False)