## Data Loading and Preprocessing

In [2]:
# only libraries allowed are:
# Scikit-learn, XGBoost, Imblearn, NumPy
# Pandas, SciPy, Pickle, regex
# Seaborn, Matplotlib, Lightgbm

from sklearnex import patch_sklearn
patch_sklearn()

import numpy as np
import pandas as pd

from sklearn.svm import SVR
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.tree import DecisionTreeRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression, Ridge, RidgeCV, Lasso, LassoCV
from sklearn.ensemble import BaggingRegressor, AdaBoostRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
        
XYTrain = pd.read_csv('train.csv')
XTest = pd.read_csv('test.csv') # naturally, there is no Y in the testing set

def preprocess(df):
    df['tpep_pickup_datetime'] = pd.to_datetime(df['tpep_pickup_datetime'])
    df['tpep_dropoff_datetime'] = pd.to_datetime(df['tpep_dropoff_datetime'])
    df['trip_duration'] = (df['tpep_dropoff_datetime'] - df['tpep_pickup_datetime']).abs().dt.total_seconds()
    
    df['passenger_count'].fillna(df['passenger_count'].mode()[0], inplace=True)
    df['payment_type'].replace('unknown', 'Credit Card', inplace = True)
    df['RatecodeID'].fillna(df['RatecodeID'].mode()[0], inplace=True)
    df['congestion_surcharge'].fillna(0, inplace=True)
    df['store_and_fwd_flag'].fillna('N',inplace=True)
    df['Airport_fee'].fillna(0, inplace=True)
    
    df = pd.get_dummies(df, columns=['store_and_fwd_flag'], prefix=['store_and_fwd_flag'])
    df = pd.get_dummies(df, columns=['payment_type'], prefix=['payment_type'])
    
    df['improvement_surcharge'] = df['improvement_surcharge'].abs()
    df['congestion_surcharge'] = df['congestion_surcharge'].abs()
    df['tolls_amount'] = df['tolls_amount'].abs()
    df['Airport_fee'] = df['Airport_fee'].abs()
    df['extra'] = df['extra'].abs()
    
    #df = df.drop(columns=['VendorID', 'RatecodeID', 'PULocationID', 'DOLocationID'])
    
    try:
        df['total_amount'] = df['total_amount'].abs()
        Y = df['total_amount']
        df = df.drop(columns=['total_amount', 'tpep_dropoff_datetime', 'tpep_pickup_datetime'])
        
        numericalCols = [x for x in df.select_dtypes(include = 'number').columns.to_list()] #if 'ID' not in x]
        df[numericalCols] = MinMaxScaler().fit_transform(df[numericalCols])
        
        return train_test_split(df, Y, test_size=0.2, random_state=42)
    except:
        df = df.drop(columns=['tpep_dropoff_datetime', 'tpep_pickup_datetime'])
        
        numericalCols = [x for x in df.select_dtypes(include = 'number').columns.to_list()] #if 'ID' not in x]
        df[numericalCols] = MinMaxScaler().fit_transform(df[numericalCols])
        
        return df

XTest = preprocess(XTest)
XTrain, XVal, YTrain, YVal = preprocess(XYTrain)

best_models_raw = {}
best_models_tuned = {}

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


## Baseline LR Model

In [3]:
model = Pipeline([
    ('poly', PolynomialFeatures()),
    ('regressor', LinearRegression())
])

param_grid = {
    'poly__degree': [2, 3],
}

grid_search = GridSearchCV(model, param_grid, cv=5, scoring='r2', n_jobs=-1)
grid_search.fit(XTrain, YTrain)

best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

from sklearn.metrics import r2_score
YPred = best_model.predict(XVal)
print(f'R2 score: {r2_score(YPred, YVal)}')
print(f'Best params: {best_params}')

R2 score: 0.8772601173373252
Best params: {'poly__degree': 2}


## Untuned Run

In [14]:
regressors = {
    'Linear Regression': (LinearRegression(), {}),
    'K-Nearest Neighbors Regressor': (KNeighborsRegressor(), {}),
    'Support Vector Regressor': (SVR(), {}),
    'Decision Tree Regressor': (DecisionTreeRegressor(), {}),
    'Bagging Regressor': (BaggingRegressor(estimator=DecisionTreeRegressor()), {}),
    'AdaBoost Regressor': (AdaBoostRegressor(estimator=DecisionTreeRegressor()), {}),
    'Multi-Layer Perceptron Regressor': (MLPRegressor(), {})
}

for regressor_name, (regressor, param_grid) in regressors.items():
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('poly', PolynomialFeatures(degree=2)),
        ('regressor', regressor)
    ])
    
    grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='r2', n_jobs=-1)
    grid_search.fit(XTrain, YTrain)
    
    best_models_raw[regressor_name] = {
        'model': grid_search.best_estimator_,
        'params': grid_search.best_params_
    }
    
    YPred = grid_search.best_estimator_.predict(XVal)

r2_table_raw, i = pd.DataFrame(columns=['Model Name', 'R2 Score on Training','R2 Score on Validation', '% Difference in R2']), 0
for regressor_name, results in best_models_raw.items():
    YPredVal = results["model"].predict(XVal)
    YPredTrn = results["model"].predict(XTrain)
    r2_val = r2_score(YPredVal, YVal)
    r2_trn = r2_score(YPredTrn, YTrain)
    r2_table_raw.loc[i] = [regressor_name, r2_trn, r2_val, abs(r2_trn-r2_val)*100/abs(r2_trn)]
    i=i+1

LR	model trained with an R-squared scores of 0.8774 on the validation set and 0.8736 on the training set.
KNN	model trained with an R-squared scores of 0.6824 on the validation set and 0.7921 on the training set.
CART	model trained with an R-squared scores of 0.9111 on the validation set and 1.0000 on the training set.
MLP	model trained with an R-squared scores of 0.8109 on the validation set and 0.4096 on the training set.
Bagging	model trained with an R-squared scores of 0.9466 on the validation set and 0.9877 on the training set.
Boosting	model trained with an R-squared scores of 0.9490 on the validation set and 1.0000 on the training set.
SVM	model trained with an R-squared scores of -0.1158 on the validation set and -0.1123 on the training set.


## Tuned Run

In [6]:
regressors = {
    #'Decision Tree Regressor (Tuned)': (DecisionTreeRegressor(), {'regressor__max_depth': [None, 5, 10, 20], 'regressor__ccp_alpha': [0.01, 0.1, 1]}),
    #'Bagging Regressor (Tuned)': (BaggingRegressor(estimator=DecisionTreeRegressor()), {'regressor__n_estimators': [10, 50, 100]}),
    'AdaBoost Regressor (Tuned)': (AdaBoostRegressor(estimator=DecisionTreeRegressor()), {'regressor__n_estimators': [10, 50, 100], 'regressor__learning_rate': [0.1, 1.0, 2.0], 'regressor__loss': ['linear', 'square', 'exponential']}),
}

for regressor_name, (regressor, param_grid) in regressors.items():
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('poly', PolynomialFeatures(degree=2)),
        ('regressor', regressor)
    ])
    
    grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='r2', n_jobs=-1)
    grid_search.fit(XTrain, YTrain)
    
    best_models_tuned[regressor_name] = {
        'model': grid_search.best_estimator_,
        'params': grid_search.best_params_
    }
    

r2_table_tuned, i = pd.DataFrame(columns=['Model Name', 'R2 Score on Training','R2 Score on Validation', '% Difference in R2']), 0
for regressor_name, results in best_models_tuned.items():
    YPredVal = results["model"].predict(XVal)
    YPredTrn = results["model"].predict(XTrain)
    r2_val = r2_score(YPredVal, YVal)
    r2_trn = r2_score(YPredTrn, YTrain)
    r2_table_tuned.loc[i] = [regressor_name, r2_trn, r2_val, abs(r2_trn-r2_val)*100/abs(r2_trn)]
    i=i+1
    
r2_table_tuned



Unnamed: 0,Model Name,R2 Score on Training,R2 Score on Validation,% Difference in R2
2,Decision Tree Regressor (Tuned),0.952982,0.926811,2.746227
3,Bagging Regressor (Tuned),0.99057,0.951213,3.973134
4,AdaBoost Regressor (Tuned),0.999911,0.951514,4.84012


In [12]:
best_models_tuned['AdaBoost Regressor (Tuned)']

{'model': Pipeline(steps=[('scaler', StandardScaler()), ('poly', PolynomialFeatures()),
                 ('regressor',
                  AdaBoostRegressor(estimator=DecisionTreeRegressor(),
                                    learning_rate=2.0, loss='exponential',
                                    n_estimators=100))]),
 'params': {'regressor__learning_rate': 2.0,
  'regressor__loss': 'exponential',
  'regressor__n_estimators': 100}}

In [None]:
XTrain = PolynomialFeatures(degree=2).fit_transform(XTrain)

final_model = AdaBoostRegressor(estimator=DecisionTreeRegressor(),
                          learning_rate=2.0, 
                          loss='exponential',
                          n_estimators=100)

final_model.fit(XTrain, YTrain)

YPred = final_model.predict(XTest)
OUTdf = pd.DataFrame()
OUTdf['ID'], OUTdf['total_amount'] = [x for x in range(1, XTest.shape[0]+1)], YPred

OUTdf.to_csv('submission.csv', index = False)

In [None]:
regressors = {
    'Decision Tree Regressor (Tuned)': (DecisionTreeRegressor(), {'regressor__max_depth': [10], 'regressor__ccp_alpha': [0.01]}),
    'Bagging Regressor (Tuned)': (BaggingRegressor(estimator=DecisionTreeRegressor()), {'regressor__n_estimators': [100]}),
    'AdaBoost Regressor (Tuned)': (AdaBoostRegressor(estimator=DecisionTreeRegressor()), {'regressor__n_estimators': [100], 'regressor__learning_rate': [2.0], 'regressor__loss': ['exponential']}),
}

for regressor_name, (regressor, param_grid) in regressors.items():
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('poly', PolynomialFeatures(degree=2)),
        ('regressor', regressor)
    ])
    
    grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='r2', n_jobs=-1)
    grid_search.fit(XTrain, YTrain)
    
    best_models_tuned[regressor_name] = {
        'model': grid_search.best_estimator_,
        'params': grid_search.best_params_
    }
    

r2_table_tuned, i = pd.DataFrame(columns=['Model Name', 'R2 Score on Training','R2 Score on Validation', '% Difference in R2']), 2
for regressor_name, results in best_models_tuned.items():
    YPredVal = results["model"].predict(XVal)
    YPredTrn = results["model"].predict(XTrain)
    r2_val = r2_score(YPredVal, YVal)
    r2_trn = r2_score(YPredTrn, YTrain)
    r2_table_tuned.loc[i] = [regressor_name, r2_trn, r2_val, abs(r2_trn-r2_val)*100/abs(r2_trn)]
    i=i+1
    
r2_table_tuned

## Additional Models

In [None]:
regressors = {
    'Decision Tree Regressor (Tuned)': (DecisionTreeRegressor(), {'regressor__max_depth': [None, 5, 10, 20], 'regressor__ccp_alpha': [0.01, 0.1, 1]}),
    'Bagging Regressor (Tuned)': (BaggingRegressor(estimator=DecisionTreeRegressor()), {'regressor__n_estimators': [10, 50, 100]}),
    'AdaBoost Regressor (Tuned)': (AdaBoostRegressor(estimator=DecisionTreeRegressor()), {'regressor__n_estimators': [10, 50, 100], 'regressor__learning_rate': [0.1, 1.0, 2.0], 'regressor__loss': ['linear', 'square', 'exponential']}),
}

for regressor_name, (regressor, param_grid) in regressors.items():
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('poly', PolynomialFeatures(degree=2)),
        ('regressor', regressor)
    ])
    
    grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='r2', n_jobs=-1)
    grid_search.fit(XTrain, YTrain)
    
    best_models_raw[regressor_name] = {
        'model': grid_search.best_estimator_,
        'params': grid_search.best_params_
    }
    

r2_table_tuned, i = pd.DataFrame(columns=['Model Name', 'R2 Score on Training','R2 Score on Validation', '% Difference in R2']), 0
for regressor_name, results in best_models_raw.items():
    YPredVal = results["model"].predict(XVal)
    YPredTrn = results["model"].predict(XTrain)
    r2_val = r2_score(YPredVal, YVal)
    r2_trn = r2_score(YPredTrn, YTrain)
    r2_table_tuned.loc[i] = [regressor_name, r2_trn, r2_val, abs(r2_trn-r2_val)*100/abs(r2_trn)]
    i=i+1
    
r2_table_tuned