In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.impute import SimpleImputer
from sklearn.impute import KNNImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import SGDRegressor
from sklearn.linear_model import Ridge, RidgeCV
from sklearn.linear_model import Lasso, LassoCV

from sklearn.svm import SVR

from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_validate
from sklearn.model_selection import validation_curve
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

from sklearn.metrics import mean_squared_error
from sklearn.decomposition import PCA



In [2]:
try: # For kaggle
  train_pd = pd.read_csv('/kaggle/input/taxi-fare-guru-total-amount-prediction-challenge/train.csv')

  test_pd = pd.read_csv('/kaggle/input/taxi-fare-guru-total-amount-prediction-challenge/test.csv')
  print('running on kaggle')
except: #for local machine 
  print('running on local machine')
  train_pd = pd.read_csv('train.csv')
  test_pd = pd.read_csv('test.csv')
  pass
copy_train_pd = train_pd.copy()
copy_test_pd = test_pd.copy()
copy_train_pd.info()


running on local machine
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 175000 entries, 0 to 174999
Data columns (total 17 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   VendorID               175000 non-null  int64  
 1   tpep_pickup_datetime   175000 non-null  object 
 2   tpep_dropoff_datetime  175000 non-null  object 
 3   passenger_count        168923 non-null  float64
 4   trip_distance          175000 non-null  float64
 5   RatecodeID             168923 non-null  float64
 6   store_and_fwd_flag     168923 non-null  object 
 7   PULocationID           175000 non-null  int64  
 8   DOLocationID           175000 non-null  int64  
 9   payment_type           175000 non-null  object 
 10  extra                  175000 non-null  float64
 11  tip_amount             175000 non-null  float64
 12  tolls_amount           175000 non-null  float64
 13  improvement_surcharge  175000 non-null  float64
 14  total_amoun

In [3]:
def convert_dt_obj_to_datetime(df:pd.DataFrame, col_name:str):
    df[col_name] = pd.to_datetime(df[col_name])
    df[col_name +'_Year'] = df[col_name].apply(lambda time: time.year)
    df[col_name +'_Month'] = df[col_name].apply(lambda time: time.month)
    df[col_name +'_Day'] = df[col_name].apply(lambda time: time.day)
    df[col_name +'_Hour'] = df[col_name].apply(lambda time: time.hour)
    df[col_name +'_Minute'] = df[col_name].apply(lambda time: time.minute) # IGNORING MINS

In [4]:
x_train, x_test = train_test_split(train_pd, test_size= 0.1)
x_train_copy = x_train.copy()
x_test_copy = x_test.copy()

In [5]:
x_train, x_test = x_train_copy.copy(), x_test_copy.copy()

convert_dt_obj_to_datetime(x_train, 'tpep_pickup_datetime')
convert_dt_obj_to_datetime(x_test, 'tpep_pickup_datetime')
x_train = x_train.drop('tpep_pickup_datetime', axis= 1)
x_test = x_test.drop('tpep_pickup_datetime', axis= 1)

convert_dt_obj_to_datetime(x_train, 'tpep_dropoff_datetime')
convert_dt_obj_to_datetime(x_test, 'tpep_dropoff_datetime')

x_train = x_train.drop('tpep_dropoff_datetime', axis= 1)
x_test = x_test.drop('tpep_dropoff_datetime', axis= 1)

x_train_copy, x_test_copy = x_train.copy(), x_test.copy()

In [6]:
x_train.describe()

Unnamed: 0,VendorID,passenger_count,trip_distance,RatecodeID,PULocationID,DOLocationID,extra,tip_amount,tolls_amount,improvement_surcharge,...,tpep_pickup_datetime_Year,tpep_pickup_datetime_Month,tpep_pickup_datetime_Day,tpep_pickup_datetime_Hour,tpep_pickup_datetime_Minute,tpep_dropoff_datetime_Year,tpep_dropoff_datetime_Month,tpep_dropoff_datetime_Day,tpep_dropoff_datetime_Hour,tpep_dropoff_datetime_Minute
count,157500.0,152019.0,157500.0,152019.0,157500.0,157500.0,157500.0,157500.0,157500.0,157500.0,...,157500.0,157500.0,157500.0,157500.0,157500.0,157500.0,157500.0,157500.0,157500.0,157500.0
mean,0.728832,1.356567,5.307728,1.523014,132.742813,132.754248,1.932537,6.127801,0.645602,0.979611,...,2023.0,6.004317,29.059149,15.121917,29.506603,2023.0,6.007911,28.964546,15.103054,29.49659
std,0.445406,0.888557,416.33265,6.55224,76.158686,76.240392,1.947389,4.635571,2.32113,0.199129,...,0.0,0.065566,1.989044,5.772616,17.309795,0.0,0.088592,2.6025,5.952948,17.33057
min,0.0,0.0,0.0,1.0,1.0,1.0,-7.5,0.000129,-26.55,-1.0,...,2023.0,6.0,1.0,0.0,0.0,2023.0,6.0,1.0,0.0,0.0
25%,0.0,1.0,1.08,1.0,67.0,67.0,0.0,3.472856,0.0,1.0,...,2023.0,6.0,29.0,11.0,15.0,2023.0,6.0,29.0,11.0,14.0
50%,1.0,1.0,1.84,1.0,133.0,133.0,1.0,5.289345,0.0,1.0,...,2023.0,6.0,29.0,16.0,30.0,2023.0,6.0,29.0,17.0,30.0
75%,1.0,1.0,3.62,1.0,199.0,199.0,2.5,7.499043,0.0,1.0,...,2023.0,6.0,30.0,20.0,45.0,2023.0,6.0,30.0,20.0,45.0
max,2.0,9.0,135182.06,99.0,264.0,264.0,11.75,484.876151,80.0,1.0,...,2023.0,7.0,30.0,23.0,59.0,2023.0,7.0,30.0,23.0,59.0


In [7]:
x_train.isna().sum()

VendorID                           0
passenger_count                 5481
trip_distance                      0
RatecodeID                      5481
store_and_fwd_flag              5481
PULocationID                       0
DOLocationID                       0
payment_type                       0
extra                              0
tip_amount                         0
tolls_amount                       0
improvement_surcharge              0
total_amount                       0
congestion_surcharge            5481
Airport_fee                     5481
tpep_pickup_datetime_Year          0
tpep_pickup_datetime_Month         0
tpep_pickup_datetime_Day           0
tpep_pickup_datetime_Hour          0
tpep_pickup_datetime_Minute        0
tpep_dropoff_datetime_Year         0
tpep_dropoff_datetime_Month        0
tpep_dropoff_datetime_Day          0
tpep_dropoff_datetime_Hour         0
tpep_dropoff_datetime_Minute       0
dtype: int64

In [8]:
def impute_unknown_values(X:pd.DataFrame):
  X['passenger_count'] = X['passenger_count'].replace(np.nan, 1)
  X['RatecodeID'] = X['RatecodeID'].replace(np.nan, 1)
  X['store_and_fwd_flag'] = X['store_and_fwd_flag'].replace(np.nan, 'N')
  X['congestion_surcharge'] = X['congestion_surcharge'].replace(np.nan, 0)
  X['Airport_fee'] = X['Airport_fee'].replace(np.nan, 0)
  print(X.isna().sum())

In [9]:
# x_train['passenger_count'] = x_train['passenger_count'].replace(np.nan, 1)
# x_train['RatecodeID'] = x_train['RatecodeID'].replace(np.nan, 1)
# x_train['store_and_fwd_flag'] = x_train['store_and_fwd_flag'].replace(np.nan, 'N')
# x_train['congestion_surcharge'] = x_train['congestion_surcharge'].replace(np.nan, 0)
# x_train['Airport_fee'] = x_train['Airport_fee'].replace(np.nan, 0)
x_train.isna().sum()

VendorID                           0
passenger_count                 5481
trip_distance                      0
RatecodeID                      5481
store_and_fwd_flag              5481
PULocationID                       0
DOLocationID                       0
payment_type                       0
extra                              0
tip_amount                         0
tolls_amount                       0
improvement_surcharge              0
total_amount                       0
congestion_surcharge            5481
Airport_fee                     5481
tpep_pickup_datetime_Year          0
tpep_pickup_datetime_Month         0
tpep_pickup_datetime_Day           0
tpep_pickup_datetime_Hour          0
tpep_pickup_datetime_Minute        0
tpep_dropoff_datetime_Year         0
tpep_dropoff_datetime_Month        0
tpep_dropoff_datetime_Day          0
tpep_dropoff_datetime_Hour         0
tpep_dropoff_datetime_Minute       0
dtype: int64

### OUTLIER removal

In [10]:
x_train.describe()

Unnamed: 0,VendorID,passenger_count,trip_distance,RatecodeID,PULocationID,DOLocationID,extra,tip_amount,tolls_amount,improvement_surcharge,...,tpep_pickup_datetime_Year,tpep_pickup_datetime_Month,tpep_pickup_datetime_Day,tpep_pickup_datetime_Hour,tpep_pickup_datetime_Minute,tpep_dropoff_datetime_Year,tpep_dropoff_datetime_Month,tpep_dropoff_datetime_Day,tpep_dropoff_datetime_Hour,tpep_dropoff_datetime_Minute
count,157500.0,152019.0,157500.0,152019.0,157500.0,157500.0,157500.0,157500.0,157500.0,157500.0,...,157500.0,157500.0,157500.0,157500.0,157500.0,157500.0,157500.0,157500.0,157500.0,157500.0
mean,0.728832,1.356567,5.307728,1.523014,132.742813,132.754248,1.932537,6.127801,0.645602,0.979611,...,2023.0,6.004317,29.059149,15.121917,29.506603,2023.0,6.007911,28.964546,15.103054,29.49659
std,0.445406,0.888557,416.33265,6.55224,76.158686,76.240392,1.947389,4.635571,2.32113,0.199129,...,0.0,0.065566,1.989044,5.772616,17.309795,0.0,0.088592,2.6025,5.952948,17.33057
min,0.0,0.0,0.0,1.0,1.0,1.0,-7.5,0.000129,-26.55,-1.0,...,2023.0,6.0,1.0,0.0,0.0,2023.0,6.0,1.0,0.0,0.0
25%,0.0,1.0,1.08,1.0,67.0,67.0,0.0,3.472856,0.0,1.0,...,2023.0,6.0,29.0,11.0,15.0,2023.0,6.0,29.0,11.0,14.0
50%,1.0,1.0,1.84,1.0,133.0,133.0,1.0,5.289345,0.0,1.0,...,2023.0,6.0,29.0,16.0,30.0,2023.0,6.0,29.0,17.0,30.0
75%,1.0,1.0,3.62,1.0,199.0,199.0,2.5,7.499043,0.0,1.0,...,2023.0,6.0,30.0,20.0,45.0,2023.0,6.0,30.0,20.0,45.0
max,2.0,9.0,135182.06,99.0,264.0,264.0,11.75,484.876151,80.0,1.0,...,2023.0,7.0,30.0,23.0,59.0,2023.0,7.0,30.0,23.0,59.0


In [11]:
print('total number of airport fees negative', len(x_train[x_train['Airport_fee'] < 0]))
print('total congestion charge with negative value', len(x_train[x_train['congestion_surcharge'] < 0]))
print('total number of improvement charge in -ve', len(x_train[x_train['improvement_surcharge'] < 0]))

total number of airport fees negative 235
total congestion charge with negative value 1249
total number of improvement charge in -ve 1558


In [12]:
# x_train = x_train[x_train['passenger_count'] <= 6] ## Passanger count filtering
# x_train = x_train[x_train['trip_distance'] <= 30] ## trip distance filtering highly affecting
# x_train = x_train[x_train['tolls_amount'] >=0] # filtering tolls amount
# x_train = x_train[x_train['Airport_fee'] >= 0]
# x_train = x_train[x_train['congestion_surcharge'] >= 0]
# x_train = x_train[x_train['improvement_surcharge'] >= 0] # removing improvement surcharge
# x_train = x_train[x_train['tip_amount'] <= 100] # removing extra tip amount
# x_train = x_train[x_train['total_amount'] <= 150]
##* Without outlier removal i am scoring 0.75


In [13]:
print('number of extra elements with neg elements', len(x_train[x_train.tolls_amount < 0]))
print('improvement surcharge', len(x_train[x_train['improvement_surcharge'] < 0]))

number of extra elements with neg elements 113
improvement surcharge 1558


In [14]:
x_train.describe()

Unnamed: 0,VendorID,passenger_count,trip_distance,RatecodeID,PULocationID,DOLocationID,extra,tip_amount,tolls_amount,improvement_surcharge,...,tpep_pickup_datetime_Year,tpep_pickup_datetime_Month,tpep_pickup_datetime_Day,tpep_pickup_datetime_Hour,tpep_pickup_datetime_Minute,tpep_dropoff_datetime_Year,tpep_dropoff_datetime_Month,tpep_dropoff_datetime_Day,tpep_dropoff_datetime_Hour,tpep_dropoff_datetime_Minute
count,157500.0,152019.0,157500.0,152019.0,157500.0,157500.0,157500.0,157500.0,157500.0,157500.0,...,157500.0,157500.0,157500.0,157500.0,157500.0,157500.0,157500.0,157500.0,157500.0,157500.0
mean,0.728832,1.356567,5.307728,1.523014,132.742813,132.754248,1.932537,6.127801,0.645602,0.979611,...,2023.0,6.004317,29.059149,15.121917,29.506603,2023.0,6.007911,28.964546,15.103054,29.49659
std,0.445406,0.888557,416.33265,6.55224,76.158686,76.240392,1.947389,4.635571,2.32113,0.199129,...,0.0,0.065566,1.989044,5.772616,17.309795,0.0,0.088592,2.6025,5.952948,17.33057
min,0.0,0.0,0.0,1.0,1.0,1.0,-7.5,0.000129,-26.55,-1.0,...,2023.0,6.0,1.0,0.0,0.0,2023.0,6.0,1.0,0.0,0.0
25%,0.0,1.0,1.08,1.0,67.0,67.0,0.0,3.472856,0.0,1.0,...,2023.0,6.0,29.0,11.0,15.0,2023.0,6.0,29.0,11.0,14.0
50%,1.0,1.0,1.84,1.0,133.0,133.0,1.0,5.289345,0.0,1.0,...,2023.0,6.0,29.0,16.0,30.0,2023.0,6.0,29.0,17.0,30.0
75%,1.0,1.0,3.62,1.0,199.0,199.0,2.5,7.499043,0.0,1.0,...,2023.0,6.0,30.0,20.0,45.0,2023.0,6.0,30.0,20.0,45.0
max,2.0,9.0,135182.06,99.0,264.0,264.0,11.75,484.876151,80.0,1.0,...,2023.0,7.0,30.0,23.0,59.0,2023.0,7.0,30.0,23.0,59.0


In [15]:
print(len(x_train[x_train['trip_distance'] > 20]))
print(len(x_train[x_train['tip_amount'] > 100]))
print('total amount greater than 300', len(x_train[x_train['total_amount'] > 125]))

2203
7
total amount greater than 300 912


In [16]:
x_train['VendorID'].unique()

array([0, 1, 2])

In [17]:
vendor_id_pipe = Pipeline([
  ('one_hot_encoder', OneHotEncoder(handle_unknown='ignore'))
])
passanger_count_pipe = Pipeline([
    # ('simple_imputer', SimpleImputer(missing_values=np.nan, strategy='constant', fill_value= 1)),
    ('knn_imputer', KNNImputer(missing_values= np.nan, n_neighbors= 5)),
    ('std_scaler', StandardScaler())
])
rate_code_id_pipe = Pipeline([
    #    ('simple_imputer', SimpleImputer(missing_values=np.nan, strategy='constant',
    #                                      fill_value= 1)),
        ('knn_imputer', KNNImputer(missing_values= np.nan, n_neighbors= 5)),
       ('one_hot_encoder', OneHotEncoder(handle_unknown='ignore')) ## TODO: experimental changes
])
store_and_fwd_pipe = Pipeline([
    ('simple_immmputer', SimpleImputer(missing_values= np.nan, strategy='constant', fill_value='N')),
    # ('knn_imputer', KNNImputer(missing_values= np.nan, n_neighbors= 5)),
    ('one_hot_encoder', OneHotEncoder(handle_unknown='ignore'))
])

congestion_charger_pipe = Pipeline([
    # ('simple_imputer', SimpleImputer(missing_values=np.nan, strategy='constant', fill_value= 0)),
    ('knn_imputer', KNNImputer(missing_values= np.nan, n_neighbors= 5)),
    ('std_scaler', StandardScaler())
])
airport_fee_pipe = Pipeline([
    # ('simple_imputer', SimpleImputer(missing_values=np.nan, strategy='constant', fill_value= 0)),
    ('knn_imputer', KNNImputer(missing_values= np.nan, n_neighbors= 5)),
    ('std_scale', StandardScaler())
])
payment_type_pipe = Pipeline([
    ('one_hot_encoder', OneHotEncoder(handle_unknown='ignore'))
])

In [18]:
feature_scaling_and_trans = ColumnTransformer([
  ('vendor_id', vendor_id_pipe, ['VendorID']),
  ('passanger_count', passanger_count_pipe, ['passenger_count']),
  ('rate_code_id', rate_code_id_pipe, ['RatecodeID']), # experimental changes
  ('s_nd_f_flag', store_and_fwd_pipe, ['store_and_fwd_flag']),
  ('pu_loc', StandardScaler(), ['PULocationID']),
  ('du_loc', StandardScaler(), ["DOLocationID"]),
  ('payment_t', payment_type_pipe, ['payment_type']),
  ('extra_t', StandardScaler(), ['extra']),
  ('tip_amount_t', StandardScaler(), ['tip_amount']),
  ('tolls_amount_t', StandardScaler(), ["tolls_amount"]),
  ('improvement_c', StandardScaler(), ['improvement_surcharge']),
  ('cong_charge', congestion_charger_pipe, ['congestion_surcharge']),
  ('Airport_fee_t', airport_fee_pipe, ['Airport_fee']),
  ('std_scaler', StandardScaler(), [i for i in range(14, 22)])
], remainder= 'passthrough')

In [19]:
linear_reg = LinearRegression()


In [20]:
y = x_train['total_amount']
X = x_train.drop('total_amount', axis= 1)

In [21]:
X_t = feature_scaling_and_trans.fit_transform(X=X)

In [22]:
com_train_features, dev_test_features, com_train_labels, dev_test_labels = train_test_split(X_t, y, test_size= 0.2)

In [23]:
linear_reg.fit(com_train_features, com_train_labels)
linear_reg.score(com_train_features, com_train_labels)

0.7751432029403215

In [24]:
linear_reg.score(dev_test_features, dev_test_labels)


0.7773705788803201

In [25]:
x_test = x_test_copy.copy()
y_test = x_test['total_amount']
x_test = x_test.drop('total_amount', axis = 1)
x_test_t = feature_scaling_and_trans.transform(x_test)
linear_reg.score(x_test_t, y_test)

0.7723084890759733

### Recursive features elimination

In [26]:
from sklearn.feature_selection import RFE

selector = RFE(linear_reg, n_features_to_select= 25, step=1, verbose= 1)
selector = selector.fit(com_train_features, com_train_labels)


Fitting estimator with 37 features.
Fitting estimator with 36 features.
Fitting estimator with 35 features.
Fitting estimator with 34 features.
Fitting estimator with 33 features.
Fitting estimator with 32 features.
Fitting estimator with 31 features.
Fitting estimator with 30 features.
Fitting estimator with 29 features.
Fitting estimator with 28 features.
Fitting estimator with 27 features.
Fitting estimator with 26 features.


In [27]:
selector.support_

array([ True,  True,  True, False,  True,  True,  True,  True,  True,
        True,  True,  True,  True, False, False,  True,  True,  True,
        True,  True, False,  True,  True,  True, False,  True, False,
        True,  True, False, False, False,  True,  True, False, False,
       False])

In [28]:
selector.ranking_

array([ 1,  1,  1,  8,  1,  1,  1,  1,  1,  1,  1,  1,  1, 10, 11,  1,  1,
        1,  1,  1,  3,  1,  1,  1,  2,  1,  4,  1,  1,  5,  7, 13,  1,  1,
       12,  6,  9])

In [29]:
selector.get_feature_names_out()

array(['x0', 'x1', 'x2', 'x4', 'x5', 'x6', 'x7', 'x8', 'x9', 'x10', 'x11',
       'x12', 'x15', 'x16', 'x17', 'x18', 'x19', 'x21', 'x22', 'x23',
       'x25', 'x27', 'x28', 'x32', 'x33'], dtype=object)

In [30]:
selector.score(dev_test_features, dev_test_labels)

0.7713482505266431

In [31]:
selector.score(x_test_t, y_test)

0.7659677218948853

## checking cross validation

In [32]:
def printTestResult(result, model:str):
  train_error, test_error = -1 * result['train_score'],  -1 * result['test_score']
  print(f'Mean absolute error of {model} model on the train set:\n',
      f"{train_error.mean():.3f} +/- {train_error.std():.3f}")

  print(f'Mean absolute error of linear {model} on the test set:\n',
      f'{test_error.mean():.3f} +/- {test_error.std():.3f}')

In [33]:
cv_5 = ShuffleSplit(n_splits= 5, test_size= 0.2, random_state= 42)
lin_reg_cross_validation = cross_validate(
  linear_reg,
  com_train_features,
  com_train_labels,
  cv=cv_5,
  scoring= 'neg_mean_squared_error',
  return_train_score= True,
  return_estimator= True)
printTestResult(lin_reg_cross_validation, 'Linear Regression')
lin_reg_cross_validation['estimator'][0].score(dev_test_features, dev_test_labels)

Mean absolute error of Linear Regression model on the train set:
 144.428 +/- 4.005
Mean absolute error of linear Linear Regression on the test set:
 159.404 +/- 19.107


0.7776409260215612

### checking grid search cv

In [34]:
params = {
  'ridge__alpha': [0.1, 0.01, 0.001, 0.5, 0.05, 0.005]
}

In [35]:
lasso_regression = Lasso()
lasso_regression.fit(com_train_features, com_train_labels)

In [36]:
print(lasso_regression.score(com_train_features, com_train_labels))
print(lasso_regression.score(dev_test_features, dev_test_labels))


0.717911183299222
0.7228439752651966


In [37]:
ridge_regression = Ridge()
ridge_regression.fit(com_train_features, com_train_labels)

In [38]:
print(ridge_regression.score(com_train_features, com_train_labels))
print(ridge_regression.score(dev_test_features, dev_test_labels))

0.775126244792969
0.7773602999735575


In [39]:
impute_unknown_values(test_pd)
convert_dt_obj_to_datetime(test_pd, 'tpep_pickup_datetime')
test_pd = test_pd.drop('tpep_pickup_datetime', axis= 1)

convert_dt_obj_to_datetime(test_pd, 'tpep_dropoff_datetime')
test_pd = test_pd.drop('tpep_dropoff_datetime', axis= 1)


VendorID                 0
tpep_pickup_datetime     0
tpep_dropoff_datetime    0
passenger_count          0
trip_distance            0
RatecodeID               0
store_and_fwd_flag       0
PULocationID             0
DOLocationID             0
payment_type             0
extra                    0
tip_amount               0
tolls_amount             0
improvement_surcharge    0
congestion_surcharge     0
Airport_fee              0
dtype: int64


In [40]:
test_pd_t = feature_scaling_and_trans.transform(test_pd)

In [41]:
prediction = lin_reg_cross_validation['estimator'][0].predict(test_pd_t)

In [42]:
submission = pd.DataFrame(columns= ['ID', "total_amount"])
submission['ID'] = [i for i in range(1, 50001)]
submission['total_amount'] = prediction
submission.to_csv('submission.csv', index=False)

### testing SVR algo

In [43]:
svr = SVR(kernel= 'linear')
svr.fit(com_train_features, com_train_labels)

In [None]:
svr.score(com_train_features, com_train_labels)

0.6156343645157708

In [None]:
svr.score(dev_test_features, dev_test_labels)

In [None]:
svr.score(x_test_t, y_test)

In [None]:
submission.describe()

Unnamed: 0,ID,total_amount
count,50000.0,50000.0
mean,25000.5,28.975327
std,14433.901067,21.9813
min,1.0,-93.484337
25%,12500.75,19.378741
50%,25000.5,22.911141
75%,37500.25,27.619915
max,50000.0,280.025106


In [None]:
submission[submission['total_amount'] < 0]

Unnamed: 0,ID,total_amount
12,13,-36.837368
99,100,-22.737280
133,134,-30.433006
199,200,-29.824625
419,420,-27.696411
...,...,...
49639,49640,-51.996603
49714,49715,-52.886292
49741,49742,-28.651833
49858,49859,-23.831409
