# Texi Guru Kaggle Challange
* Step 1: Data importing

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.compose import ColumnTransformer


##### Reading test data and train data

In [2]:
train_pd = pd.read_csv('train.csv')
copy_train_pd = train_pd.copy()
test_pd = pd.read_csv('test.csv')
copy_test_pd = test_pd.copy()
train_pd.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 175000 entries, 0 to 174999
Data columns (total 17 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   VendorID               175000 non-null  int64  
 1   tpep_pickup_datetime   175000 non-null  object 
 2   tpep_dropoff_datetime  175000 non-null  object 
 3   passenger_count        168923 non-null  float64
 4   trip_distance          175000 non-null  float64
 5   RatecodeID             168923 non-null  float64
 6   store_and_fwd_flag     168923 non-null  object 
 7   PULocationID           175000 non-null  int64  
 8   DOLocationID           175000 non-null  int64  
 9   payment_type           175000 non-null  object 
 10  extra                  175000 non-null  float64
 11  tip_amount             175000 non-null  float64
 12  tolls_amount           175000 non-null  float64
 13  improvement_surcharge  175000 non-null  float64
 14  total_amount           175000 non-nu

##### Computing the null values and missing values from the tables

In [3]:
train_pd

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,extra,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,Airport_fee
0,1,2023-06-28 17:20:21,2023-06-28 16:34:45,1.0,2.14,1.0,N,120,9,Credit Card,2.5,7.165589,0.0,1.0,20.64,2.5,0.00
1,0,2023-06-29 23:05:01,2023-06-29 22:01:35,1.0,2.70,1.0,N,15,215,Credit Card,3.5,6.067401,0.0,1.0,25.55,2.5,0.00
2,1,2023-06-30 10:19:31,2023-06-30 11:13:10,1.0,1.15,1.0,N,167,223,Credit Card,0.0,4.111547,0.0,1.0,17.64,2.5,0.00
3,0,2023-06-29 13:23:09,2023-06-29 14:20:01,1.0,0.40,1.0,N,128,239,Credit Card,2.5,6.411079,0.0,1.0,12.80,2.5,0.00
4,1,2023-06-29 22:03:32,2023-06-29 22:22:22,3.0,1.10,1.0,N,203,52,Credit Card,1.0,4.769377,0.0,1.0,18.00,2.5,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
174995,1,2023-06-30 22:50:57,2023-06-30 22:22:22,3.0,3.45,1.0,N,147,167,Credit Card,1.0,8.732495,0.0,1.0,28.08,2.5,0.00
174996,1,2023-06-30 13:03:33,2023-06-30 14:04:57,1.0,9.44,1.0,N,154,191,Cash,5.0,0.283275,0.0,1.0,59.95,2.5,1.75
174997,0,2023-06-29 11:03:32,2023-06-29 12:13:34,1.0,2.40,1.0,N,168,106,Credit Card,2.5,4.245354,0.0,1.0,33.50,2.5,0.00
174998,1,2023-06-29 19:47:17,2023-06-29 19:08:55,1.0,4.71,1.0,N,240,100,Credit Card,2.5,10.479776,0.0,1.0,40.80,2.5,0.00


In [4]:
train_pd.isna().sum()

VendorID                    0
tpep_pickup_datetime        0
tpep_dropoff_datetime       0
passenger_count          6077
trip_distance               0
RatecodeID               6077
store_and_fwd_flag       6077
PULocationID                0
DOLocationID                0
payment_type                0
extra                       0
tip_amount                  0
tolls_amount                0
improvement_surcharge       0
total_amount                0
congestion_surcharge     6077
Airport_fee              6077
dtype: int64

In [21]:
train_pd = copy_train_pd.copy()

y_train = train_pd['total_amount']
x_train = train_pd.drop('total_amount', axis=1) # seprating prediction element from training set
list_of_features = list(train_pd)
y_train

0         20.64
1         25.55
2         17.64
3         12.80
4         18.00
          ...  
174995    28.08
174996    59.95
174997    33.50
174998    40.80
174999    16.32
Name: total_amount, Length: 175000, dtype: float64

## Preprocessing section
* selecting the features for imputation
* transform each feature into same scale
* plotting each feature with respect to each one
* finding the corelation between features

In [8]:
print('Null values in the given training data set is:')
train_pd.isna().sum()

Null values in the given training data set is:


VendorID                    0
tpep_pickup_datetime        0
tpep_dropoff_datetime       0
passenger_count          6077
trip_distance               0
RatecodeID               6077
store_and_fwd_flag       6077
PULocationID                0
DOLocationID                0
payment_type                0
extra                       0
tip_amount                  0
tolls_amount                0
improvement_surcharge       0
total_amount                0
congestion_surcharge     6077
Airport_fee              6077
dtype: int64

In [16]:
train_pd['RatecodeID'].median()
train_pd['Airport_fee'].unique()

array([ 0.  ,  1.75,   nan, -1.75])

So, in current transformer we are assuing if there is no fwd flag available then we data is not forwarded
Similary in case of congestion surcharge and Airport fees we are replacing ever NAN value with 0

In [23]:
column_transformers = ColumnTransformer([
  ('pass0', 'passthrough', [0]),
  ('pass1', 'passthrough', [1]),
  ('pass2', 'passthrough', [2]),
  ('passanger_count_imputer', SimpleImputer(strategy='constant', fill_value=1, missing_values=np.nan), [3]),
  ('pass4', 'passthrough', [4]),
  ('rate_code_id_imputer', SimpleImputer(strategy='constant', fill_value=1.0, missing_values=np.nan), [5]),
  ('store_and_fwd_flag_imputer', SimpleImputer(strategy='constant', fill_value='N', missing_values=np.nan), [6]),
   ('pass6', 'passthrough', [7]),
  ('pass8', 'passthrough', [8]),
  ('pass9', 'passthrough', [9]),
  ('pass10', 'passthrough', [10]),
  ('pass11', 'passthrough', [11]),
  ('pass12', 'passthrough', [12]),
  ('pass13', 'passthrough', [13]),
  ('pass14', 'passthrough', [14]),
  ('congestion_surcharge_imputer', SimpleImputer(strategy='constant', fill_value=0, missing_values=np.nan), [15]),
  ('airport_fee_imputer', SimpleImputer(strategy='constant', fill_value=0, missing_values=np.nan), [16])
])
# column_pass_through = ColumnTransformer([
  
  
 
# ])

feature_pipeline = Pipeline([
  ('column_transformers', column_transformers)
  # ('columns_pass_by', column_pass_through)
])
val = feature_pipeline.fit_transform(train_pd)


ValueError: all features must be in [0, 4] or [-5, 0]

In [22]:
transformed_train_pd = pd.DataFrame(data=val, columns=list_of_features)

ValueError: Shape of passed values is (175000, 5), indices imply (175000, 17)