# Texi Guru Kaggle Challange
* Step 1: Data importing

In [107]:
import numpy as np
import pandas as pd
import seaborn as sns


from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split

from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LinearRegression

from sklearn.metrics import mean_squared_error

##### Reading test data and train data

In [3]:
try: # For kaggle
  train_pd = pd.read_csv('/kaggle/input/taxi-fare-guru-total-amount-prediction-challenge/train.csv')

  test_pd = pd.read_csv('/kaggle/input/taxi-fare-guru-total-amount-prediction-challenge/test.csv')
  print('running on kaggle')
except: #for local machine 
  print('running on local machine')
  train_pd = pd.read_csv('train.csv')
  test_pd = pd.read_csv('test.csv')
  pass
copy_train_pd = train_pd.copy()
copy_test_pd = test_pd.copy()
train_pd.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 175000 entries, 0 to 174999
Data columns (total 17 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   VendorID               175000 non-null  int64  
 1   tpep_pickup_datetime   175000 non-null  object 
 2   tpep_dropoff_datetime  175000 non-null  object 
 3   passenger_count        168923 non-null  float64
 4   trip_distance          175000 non-null  float64
 5   RatecodeID             168923 non-null  float64
 6   store_and_fwd_flag     168923 non-null  object 
 7   PULocationID           175000 non-null  int64  
 8   DOLocationID           175000 non-null  int64  
 9   payment_type           175000 non-null  object 
 10  extra                  175000 non-null  float64
 11  tip_amount             175000 non-null  float64
 12  tolls_amount           175000 non-null  float64
 13  improvement_surcharge  175000 non-null  float64
 14  total_amount           175000 non-nu

##### Computing the null values and missing values from the tables

In [3]:
train_pd.head()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,extra,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,Airport_fee
0,1,2023-06-28 17:20:21,2023-06-28 16:34:45,1.0,2.14,1.0,N,120,9,Credit Card,2.5,7.165589,0.0,1.0,20.64,2.5,0.0
1,0,2023-06-29 23:05:01,2023-06-29 22:01:35,1.0,2.7,1.0,N,15,215,Credit Card,3.5,6.067401,0.0,1.0,25.55,2.5,0.0
2,1,2023-06-30 10:19:31,2023-06-30 11:13:10,1.0,1.15,1.0,N,167,223,Credit Card,0.0,4.111547,0.0,1.0,17.64,2.5,0.0
3,0,2023-06-29 13:23:09,2023-06-29 14:20:01,1.0,0.4,1.0,N,128,239,Credit Card,2.5,6.411079,0.0,1.0,12.8,2.5,0.0
4,1,2023-06-29 22:03:32,2023-06-29 22:22:22,3.0,1.1,1.0,N,203,52,Credit Card,1.0,4.769377,0.0,1.0,18.0,2.5,0.0


In [4]:
train_pd.isna().sum()

VendorID                    0
tpep_pickup_datetime        0
tpep_dropoff_datetime       0
passenger_count          6077
trip_distance               0
RatecodeID               6077
store_and_fwd_flag       6077
PULocationID                0
DOLocationID                0
payment_type                0
extra                       0
tip_amount                  0
tolls_amount                0
improvement_surcharge       0
total_amount                0
congestion_surcharge     6077
Airport_fee              6077
dtype: int64

In [5]:
train_pd = copy_train_pd.copy()

y = train_pd['total_amount']
X = train_pd.drop('total_amount', axis=1) # seprating prediction element from training set
list_of_features = list(train_pd)


In [6]:
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
x_train_copy = x_train.copy()
x_test_copy = x_test.copy()


## Preprocessing section
* selecting the features for imputation
* transform each feature into same scale
* plotting each feature with respect to each one
* finding the corelation between features

In [7]:
print('Null values in the given training data set is:')
x_train.isna().sum()

Null values in the given training data set is:


VendorID                    0
tpep_pickup_datetime        0
tpep_dropoff_datetime       0
passenger_count          4847
trip_distance               0
RatecodeID               4847
store_and_fwd_flag       4847
PULocationID                0
DOLocationID                0
payment_type                0
extra                       0
tip_amount                  0
tolls_amount                0
improvement_surcharge       0
congestion_surcharge     4847
Airport_fee              4847
dtype: int64

#### Checking unique values of nan elements

In [34]:
x_train_copy.head()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,extra,tip_amount,tolls_amount,improvement_surcharge,congestion_surcharge,Airport_fee
12908,0,2023-06-29 09:49:27,2023-06-29 10:22:45,1.0,4.6,1.0,N,114,199,Credit Card,2.5,6.485206,0.0,1.0,2.5,0.0
158656,1,2023-06-30 18:26:45,2023-06-30 18:38:32,1.0,9.5,1.0,N,106,69,Credit Card,7.5,7.146033,6.55,1.0,2.5,1.75
151212,1,2023-06-29 19:01:43,2023-06-29 17:37:25,4.0,4.13,1.0,N,152,155,Credit Card,2.5,15.676634,0.0,1.0,2.5,0.0
48474,1,2023-06-28 23:25:44,2023-06-28 22:59:34,2.0,1.15,1.0,N,91,128,Cash,1.0,4.002889,0.0,1.0,2.5,0.0
164015,0,2023-06-28 18:38:47,2023-06-28 18:49:41,1.0,1.7,1.0,N,6,259,Credit Card,5.0,6.990341,0.0,1.0,2.5,0.0


In [8]:
print(f'''
unique values of passanger_count is: {x_train['passenger_count'].unique()}\n
unique values of RateCodeId is: {x_train['RatecodeID'].unique()}\n
unique values of store_and_fwd_flag is: {x_train['store_and_fwd_flag'].unique()}\n
unique values of congestion_surcharge is: {x_train['congestion_surcharge'].unique()}\n
unqiue values of airport_fees is: {x_train['Airport_fee'].unique()}\n
''')




unique values of passanger_count is: [ 1.  4.  2.  5.  3. nan  0.  6.  8.  9.]

unique values of RateCodeId is: [ 1.  2. nan  4.  5. 99.  3.]

unique values of store_and_fwd_flag is: ['N' nan 'Y']

unique values of congestion_surcharge is: [ 2.5  0.   nan -2.5]

unqiue values of airport_fees is: [ 0.    1.75   nan -1.75]




#### Seprating number values and categorical variables

### Creating pipelines for data processing

##### converting datetime into date time object

In [35]:
def convert_dt_obj_to_datetime(df:pd.DataFrame, col_name:str):
    df[col_name] = pd.to_datetime(df[col_name])
    df[col_name +'_Year'] = df[col_name].apply(lambda time: time.year)
    df[col_name +'_Month'] = df[col_name].apply(lambda time: time.month)
    df[col_name +'_Day'] = df[col_name].apply(lambda time: time.day)
    df[col_name +'_Hour'] = df[col_name].apply(lambda time: time.hour)
#     df[col_name +'_Minute'] = df[col_name].apply(lambda time: time.minute) # IGNORING MINS

In [36]:
x_train, x_test = x_train_copy.copy(), x_test_copy.copy()
  
convert_dt_obj_to_datetime(x_train, 'tpep_pickup_datetime')
convert_dt_obj_to_datetime(x_test, 'tpep_pickup_datetime')
x_train = x_train.drop('tpep_pickup_datetime', axis= 1)
x_test = x_test.drop('tpep_pickup_datetime', axis= 1)

convert_dt_obj_to_datetime(x_train, 'tpep_dropoff_datetime')
convert_dt_obj_to_datetime(x_test, 'tpep_dropoff_datetime')

x_train = x_train.drop('tpep_dropoff_datetime', axis= 1)
x_test = x_test.drop('tpep_dropoff_datetime', axis= 1)

x_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 140000 entries, 12908 to 59718
Data columns (total 22 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   VendorID                     140000 non-null  int64  
 1   passenger_count              135153 non-null  float64
 2   trip_distance                140000 non-null  float64
 3   RatecodeID                   135153 non-null  float64
 4   store_and_fwd_flag           135153 non-null  object 
 5   PULocationID                 140000 non-null  int64  
 6   DOLocationID                 140000 non-null  int64  
 7   payment_type                 140000 non-null  object 
 8   extra                        140000 non-null  float64
 9   tip_amount                   140000 non-null  float64
 10  tolls_amount                 140000 non-null  float64
 11  improvement_surcharge        140000 non-null  float64
 12  congestion_surcharge         135153 non-null  float64
 13  A


Creating pipeline for remaining variable

In [101]:
# tip_amount = Pipeline([
#     ('standard_scaler', StandardScaler())
# ])

congestion_charger_pipe = Pipeline([
    ('simple_imputer', SimpleImputer(missing_values=np.nan, strategy='constant', fill_value= 0)),
#     ('standard_scaler', StandardScaler())
])

airport_fee_pipe = Pipeline([
    ('simple_imputer', SimpleImputer(missing_values=np.nan, strategy='constant', fill_value= 0)),
#     ('standard_scaler', StandardScaler())
])

# extra_pipeline = Pipeline([
#     ('standard_scaler', StandardScaler())
# ])

# pu_location_pipe = Pipeline([
#     ('standard_scaler', StandardScaler())
# ])
# do_location_pipe = Pipeline([
#     ('standard_scaler', StandardScaler())
# ])


rate_code_id_pipe = Pipeline([
     ('simple_imputer', SimpleImputer(missing_values=np.nan, strategy='constant', fill_value= 1)),
#     ('standard_scaler', StandardScaler())
])
store_and_fwd_pipe = Pipeline([
    ('simple_immmputer', SimpleImputer(missing_values= np.nan, strategy='constant', fill_value='N')),
    ('one_hot_encoder', OneHotEncoder())
])
payment_type_pipe = Pipeline([
    ('one_hot_encoder', OneHotEncoder())
])
passanger_count_pipe = Pipeline([
    ('simple_imputer', SimpleImputer(missing_values=np.nan, strategy='constant', fill_value= 0)),
#     ('standard_scaler', StandardScaler())
])



creating column transformers

In [102]:
'''
0    VendorID                     140000 non-null  int64  
 1   passenger_count              135153 non-null  float64
 2   trip_distance                140000 non-null  float64
 3   RatecodeID                   135153 non-null  float64
 4   store_and_fwd_flag           135153 non-null  object 
 5   PULocationID                 140000 non-null  int64  
 6   DOLocationID                 140000 non-null  int64  
 7   payment_type                 140000 non-null  object 
 8   extra                        140000 non-null  float64
 9   tip_amount                   140000 non-null  float64
 10  tolls_amount                 140000 non-null  float64
 11  improvement_surcharge        140000 non-null  float64
 12  congestion_surcharge         135153 non-null  float64
 13  Airport_fee                  135153 non-null  float64
 14  tpep_pickup_datetime_Year    140000 non-null  int64  
 15  tpep_pickup_datetime_Month   140000 non-null  int64  
 16  tpep_pickup_datetime_Day     140000 non-null  int64  
 17  tpep_pickup_datetime_Hour    140000 non-null  int64  
 18  tpep_dropoff_datetime_Year   140000 non-null  int64  
 19  tpep_dropoff_datetime_Month  140000 non-null  int64  
 20  tpep_dropoff_datetime_Day    140000 non-null  int64  
 21  tpep_dropoff_datetime_Hour   140000 non-null  int64
'''


main_pipeline = ColumnTransformer([
#     ('vendor_id',  StandardScaler(), "VendorID"),
    ("passanger_count_t", passanger_count_pipe, ["passenger_count"]),
    ('trip_distance_t', StandardScaler(), ['trip_distance']),
    ('rate_code_id', rate_code_id_pipe, ['RatecodeID'] ),
    ('s_nd_f_flag', store_and_fwd_pipe, ['store_and_fwd_flag']),
    ('pu_loc', StandardScaler(), ['PULocationID']),
    ('du_loc', StandardScaler(), ["DOLocationID"]),
    ('payment_t', payment_type_pipe, ["payment_type"]),
    ('extra_t', StandardScaler(), ['extra']),
    ('tip_amoun_t', StandardScaler(), ['tip_amount']),
    ('tolls_amount_t', StandardScaler(), ["tolls_amount"]),
    ('improvement_c', StandardScaler(), ['improvement_surcharge']),
    ('cong_charge', congestion_charger_pipe, ['congestion_surcharge']),
    ('Airport_fee_t', airport_fee_pipe, ['Airport_fee']),
    ('std_scaler', StandardScaler(), [i for i in range(14, 22)])
], remainder= 'passthrough' )
main_pipeline

In [103]:
main_pipeline.fit(x_train)
x_train_t = main_pipeline.transform(x_train)
x_train_t = pd.DataFrame(x_train_t)
print(x_train_t.isna().sum())

x_train_t.head()
# x_train.info()


0     0
1     0
2     0
3     0
4     0
5     0
6     0
7     0
8     0
9     0
10    0
11    0
12    0
13    0
14    0
15    0
16    0
17    0
18    0
19    0
20    0
21    0
22    0
23    0
24    0
25    0
26    0
dtype: int64


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,17,18,19,20,21,22,23,24,25,26
0,1.0,-0.002073,1.0,1.0,0.0,-0.246042,0.869033,0.0,1.0,0.0,...,0.0,0.0,-0.065935,-0.029531,-1.059013,0.0,-0.087925,0.011092,-0.856753,0.0
1,1.0,0.009023,1.0,1.0,0.0,-0.351104,-0.834936,0.0,1.0,0.0,...,1.75,0.0,-0.065935,0.472696,0.499585,0.0,-0.087925,0.400777,0.48703,1.0
2,4.0,-0.003138,1.0,1.0,0.0,0.253002,0.292305,0.0,1.0,0.0,...,0.0,0.0,-0.065935,-0.029531,0.672762,0.0,-0.087925,0.011092,0.319057,1.0
3,2.0,-0.009886,1.0,1.0,0.0,-0.548095,-0.061597,1.0,0.0,0.0,...,0.0,0.0,-0.065935,-0.531758,1.365473,0.0,-0.087925,-0.378593,1.158922,1.0
4,1.0,-0.008641,1.0,1.0,0.0,-1.664378,1.65548,0.0,1.0,0.0,...,0.0,0.0,-0.065935,-0.531758,0.499585,0.0,-0.087925,-0.378593,0.48703,0.0


# checking data using leanear regression

In [110]:
lr = LinearRegression()
lr.fit(X=x_train_t, y= y_train)
print('mse after basic transformation is:', mean_squared_error(y_train, lr.predict(x_train_t)))
print('train score is:', lr.score(x_train_t, y_train))

mse after basic transformation is: 173.83896319881947
train score is: 0.7311730825173418


test score

In [114]:
x_test_t = main_pipeline.fit_transform(x_test)
x_test_t = pd.DataFrame(x_test_t)
x_test_t.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35000 entries, 0 to 34999
Data columns (total 27 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   0       35000 non-null  float64
 1   1       35000 non-null  float64
 2   2       35000 non-null  float64
 3   3       35000 non-null  float64
 4   4       35000 non-null  float64
 5   5       35000 non-null  float64
 6   6       35000 non-null  float64
 7   7       35000 non-null  float64
 8   8       35000 non-null  float64
 9   9       35000 non-null  float64
 10  10      35000 non-null  float64
 11  11      35000 non-null  float64
 12  12      35000 non-null  float64
 13  13      35000 non-null  float64
 14  14      35000 non-null  float64
 15  15      35000 non-null  float64
 16  16      35000 non-null  float64
 17  17      35000 non-null  float64
 18  18      35000 non-null  float64
 19  19      35000 non-null  float64
 20  20      35000 non-null  float64
 21  21      35000 non-null  float64
 22

In [115]:
print('test mean squared error is:', mean_squared_error(y_test, lr.predict(x_test_t)))
print('test score is:', lr.score(x_test_t, y_test))

test mean squared error is: 184.34077825723705
test score is: 0.7144462494721564


## BELOW THING POINT EVERYTHING WILL BE REMOVED

In [10]:
x_train, x_test = x_train_copy.copy(), x_test_copy.copy()
x_train_cat, x_test_cat = x_train.select_dtypes(exclude='number'), x_test.select_dtypes(exclude='number')
cat_col_names = ['tpep_pickup_datetime', 'tpep_dropoff_datetime', 'store_and_fwd_flag', 'payment_type']
x_train_num = x_train.drop(cat_col_names, axis= 1)
x_test_num = x_train.drop(cat_col_names, axis= 1)
print(f'''Info of cat variables is:\n''')
x_train_cat.info()
print(f'''Info of numerical variables is:\n''')
x_test_num.info()


Info of cat variables is:

<class 'pandas.core.frame.DataFrame'>
Index: 140000 entries, 12908 to 59718
Data columns (total 4 columns):
 #   Column                 Non-Null Count   Dtype 
---  ------                 --------------   ----- 
 0   tpep_pickup_datetime   140000 non-null  object
 1   tpep_dropoff_datetime  140000 non-null  object
 2   store_and_fwd_flag     135153 non-null  object
 3   payment_type           140000 non-null  object
dtypes: object(4)
memory usage: 5.3+ MB
Info of numerical variables is:

<class 'pandas.core.frame.DataFrame'>
Index: 140000 entries, 12908 to 59718
Data columns (total 12 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   VendorID               140000 non-null  int64  
 1   passenger_count        135153 non-null  float64
 2   trip_distance          140000 non-null  float64
 3   RatecodeID             135153 non-null  float64
 4   PULocationID           140000 non-null  int64  


### creating one copy for each x_train_cat and x_train_num

!!!! Depricated columns

In [11]:
x_train_cat_copy, x_train_num_copy = x_train_cat.copy(), x_train_num.copy()
x_test_cat_copy, x_test_num_copy = x_test_cat.copy(), x_test_num.copy()

### Creating pipeline for the categorical varaibles

In [12]:
date_time_df = pd.DataFrame()


### categorical values preprocessing

In [16]:
# for reference stroing here will be removed in later stages
'''
 #   Column                 Non-Null Count   Dtype 
---  ------                 --------------   ----- 
 0   tpep_pickup_datetime   140000 non-null  object  ***
 1   tpep_dropoff_datetime  140000 non-null  object      Both contain object need to convert in datetime obj ***
 2   store_and_fwd_flag     135118 non-null  object  ***contains null values***
 3   payment_type           140000 non-null  object
'''
# restroing default values for continuous re iteration
x_train_cat, x_test_cat = x_train_cat_copy.copy(), x_test_cat_copy.copy()

# pickup date time
x_train_cat['tpep_pickup_datetime'] = pd.to_datetime(x_train_cat['tpep_pickup_datetime'])
x_test_cat['tpep_pickup_datetime'] = pd.to_datetime(x_test_cat['tpep_pickup_datetime'])

x_train_cat.info()
#


<class 'pandas.core.frame.DataFrame'>
Index: 140000 entries, 12908 to 59718
Data columns (total 4 columns):
 #   Column                 Non-Null Count   Dtype         
---  ------                 --------------   -----         
 0   tpep_pickup_datetime   140000 non-null  datetime64[ns]
 1   tpep_dropoff_datetime  140000 non-null  object        
 2   store_and_fwd_flag     135153 non-null  object        
 3   payment_type           140000 non-null  object        
dtypes: datetime64[ns](1), object(3)
memory usage: 5.3+ MB


#### Dropping seprati

### This cell is depricated and will be removed in the future versions

In [10]:
def apply_transformations(data:pd.DataFrame) -> pd.DataFrame:
  pipe_for_fwd_imputr = Pipeline([
    ('simple_imputation', SimpleImputer(strategy='constant', fill_value='null', missing_values=np.nan)),
    ('fwd_one_hot_encoder', OneHotEncoder())
    ])
  pipe_for_payment_methods = Pipeline(
    [
      ('payment_one_hot_encoder', OneHotEncoder())
    ]
  )
  column_transformers = ColumnTransformer([
    ('pass0', 'passthrough', [0]),
    # ('pass1', 'passthrough', [1]),
    # ('pass2', 'passthrough', [2]),
    ('passanger_count_imputer', SimpleImputer(strategy='constant', fill_value=1, missing_values=np.nan), [3]),
    ('pass4', 'passthrough', [4]),
    ('rate_code_id_imputer', SimpleImputer(strategy='constant', fill_value=1.0, missing_values=np.nan), [5]),
    # ('store_and_fwd_flag_imputer',pipe_for_fwd_imputr, [6]),
    ('pass_', 'passthrough', [6]),
    ('pass6', 'passthrough', [7]),
    ('pass8', 'passthrough', [8]),
    ('payment_type_one_hot_encoding', pipe_for_payment_methods, [9]),
    ('pass10', 'passthrough', [10]),
    ('pass11', 'passthrough', [11]),
    ('pass12', 'passthrough', [12]),
    ('pass13', 'passthrough', [13]),
    ('congestion_surcharge_imputer', SimpleImputer(strategy='constant', fill_value=0, missing_values=np.nan), [14]),
    ('airport_fee_imputer', SimpleImputer(strategy='constant', fill_value=-5, missing_values=np.nan), [15])
  ])
  # column_pass_through = ColumnTransformer([
    
    
  
  # ])

  feature_pipeline = Pipeline([
    ('column_transformers', column_transformers)
    # ('columns_pass_by', column_pass_through)
  ])
  val = feature_pipeline.fit_transform(data)
  return pd.DataFrame(data=val)
  # return pd.DataFrame()

In [11]:
x_train.head()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,extra,tip_amount,tolls_amount,improvement_surcharge,congestion_surcharge,Airport_fee
105012,1,2023-06-29 18:48:52,2023-06-29 19:40:18,1.0,40.84,1.0,N,238,115,Credit Card,7.5,32.619712,0.0,1.0,0.0,1.75
30266,1,2023-06-29 08:47:58,2023-06-29 09:51:29,1.0,9.75,1.0,N,80,134,Credit Card,5.0,12.643525,6.55,1.0,2.5,1.75
43284,1,2023-06-30 17:30:57,2023-06-30 16:49:53,1.0,8.55,1.0,N,77,144,Credit Card,2.5,12.560751,0.0,1.0,2.5,0.0
17479,0,2023-06-29 15:52:07,2023-06-29 16:27:39,2.0,19.1,1.0,Y,241,34,Credit Card,0.0,17.932841,0.0,1.0,0.0,0.0
118173,1,2023-06-29 12:09:23,2023-06-29 12:54:51,1.0,2.19,1.0,N,92,181,Credit Card,0.0,8.051202,0.0,1.0,2.5,0.0


In [12]:

# # df = apply_transformations(train_pd)
# x_train['tpep_pickup_datetime'] = pd.to_datetime(x_train['tpep_pickup_datetime'])
# x_train['tpep_dropoff_datetime'] = pd.to_datetime(x_train['tpep_dropoff_datetime'])

# x_test['tpep_pickup_datetime'] = pd.to_datetime(x_test['tpep_pickup_datetime'])
# x_test['tpep_dropoff_datetime'] = pd.to_datetime(x_test['tpep_dropoff_datetime'])
# x_train = apply_transformations(x_train)
# x_test = apply_transformations(x_train)
 #! generating errors need to check 


In [13]:
x_train_non_number_f = x_train.select_dtypes(exclude="number")
x_train_non_number_f.head()
x_train_copy = x_train.copy()

In [14]:
# x_train = x_train_copy.copy()
# x_train = x_train.drop(['tpep_dropoff_datetime', 'tpep_pickup_datetime'], axis=1)
# x_train = x_train.drop(['store_and_fwd_flag', 'payment_type'], axis=1)
# x_train.info()


### Checking null values

In [15]:
x_train.isna().sum()

VendorID                    0
tpep_pickup_datetime        0
tpep_dropoff_datetime       0
passenger_count          4830
trip_distance               0
RatecodeID               4830
store_and_fwd_flag       4830
PULocationID                0
DOLocationID                0
payment_type                0
extra                       0
tip_amount                  0
tolls_amount                0
improvement_surcharge       0
congestion_surcharge     4830
Airport_fee              4830
dtype: int64

### dummy model for milestone 1

In [16]:
from sklearn.dummy import DummyRegressor

In [17]:
dummy_regressor = DummyRegressor(strategy= 'mean')

In [18]:
dummy_regressor.fit(x_train, y_train)

In [19]:
dummy_regressor.score(x_train, y_train)

0.0

In [20]:
dummy_regressor.score(x_test, y_test)

-3.4199641225196586e-05

In [23]:
dummy_output = dummy_regressor.predict(x_test)
dummy_output = pd.DataFrame(dummy_output)

In [24]:
# dummy_output.to_csv('submission.csv')

In [31]:
submission = pd.DataFrame(columns= ['ID', "total_amount"])
submission['ID'] = [i for i in range(1, 50001)]
submission['total_amount'] = [i for i in range(50000)]
submission.to_csv('submission.csv', index=False)