In [1]:
import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.compose import ColumnTransformer, TransformedTargetRegressor
from sklearn.impute import SimpleImputer
from sklearn.linear_model import ElasticNet
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, RobustScaler, PowerTransformer, FunctionTransformer


Load and clean data

In [2]:
# Target column, and the identifier / free-text columns that are removed
# from both frames before modelling.
target = "Transport_Cost"
drop_cols = ["Hospital_Id", "Supplier_Name", "Hospital_Location"]

train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

# Keep the test ids aside: they are dropped from the features below but are
# needed again when the submission files are written.
hospital_id_test = test["Hospital_Id"]

# Rows without a target value cannot be used for supervised training.
train = train.drop(columns=drop_cols).dropna(subset=[target])
test = test.drop(columns=drop_cols)

# Discard training rows whose target lies below the 0.0439 quantile.
cutoff = train[target].quantile(0.0439)
train = train.loc[train[target] >= cutoff]
print(f"Remaining samples after outlier removal: {len(train)}")

Remaining samples after outlier removal: 4780


Feature groups

In [3]:
# Numeric columns that are median-imputed and robust-scaled by the preprocessor.
num_median = [
    "Equipment_Height",
    "Equipment_Weight",
    "Supplier_Reliability",
]

# Categorical columns that are imputed (with a missing-indicator) before encoding.
cat_unknown = [
    "Equipment_Type",
    "Transport_Method",
    "Rural_Hospital",
]

# Numeric columns that are scaled without any imputation step.
num_nomiss = [
    "Equipment_Value",
    "Base_Transport_Fee",
]

# Categorical columns that are one-hot encoded without any imputation step.
cat_nomiss = [
    "Fragile_Equipment",
    "Hospital_Info",
    "CrossBorder_Shipping",
    "Urgent_Shipping",
    "Installation_Service",
]

# Raw date columns, in the order expected by compute_date_features:
# order date first, delivery date second.
date_feats = [
    "Order_Placed_Date",
    "Delivery_Date",
]

Date feature extraction

In [4]:
def compute_date_features(df):
    """Derive numeric calendar features from an (order date, delivery date) pair.

    Parameters
    ----------
    df : pandas.DataFrame or 2-D array-like
        Exactly the two date columns, read *positionally*: column 0 is the
        order date, column 1 is the delivery date. Values are parsed with the
        format ``%m/%d/%y``; anything that does not parse becomes ``NaT``.

    Returns
    -------
    pandas.DataFrame
        Same index as the input, with the columns

        * ``delivery_days``       - delivery minus order, in whole days
          (NaN when either date is missing),
        * ``order_dow_sin/cos``   - cyclic encoding of the order weekday
          (period 7),
        * ``order_month_sin/cos`` - cyclic encoding of the order month
          (period 12),
        * ``order_is_weekend`` / ``delivery_is_weekend`` - 1.0 when the
          weekday number is 5 or 6, else 0.0,
        * ``order_month``         - order month number, 0 when missing.

    Notes
    -----
    Only ``delivery_days`` can come back as NaN. For a missing order date the
    weekday is encoded like weekday 0 and the month as 0, so both cyclic pairs
    are (sin=0, cos=1); both weekend flags are 0.0 for a missing date.
    """
    # Accept plain 2-D arrays as well as DataFrames; for a DataFrame this keeps
    # the original index so the result stays aligned with the caller's rows.
    frame = pd.DataFrame(df)
    dates = frame.apply(pd.to_datetime, format="%m/%d/%y", errors="coerce")
    order, delivery = dates.iloc[:, 0], dates.iloc[:, 1]
    delivery_days = (delivery - order).dt.days

    # Sentinels for "date missing": -1 for weekday, 0 for month.
    order_dow = order.dt.dayofweek.fillna(-1)
    delivery_dow = delivery.dt.dayofweek.fillna(-1)
    order_month = order.dt.month.fillna(0)

    # The -1 sentinel is not in {5, 6}, so missing dates are flagged 0.0.
    order_weekend = order_dow.isin([5, 6]).astype(float)
    delivery_weekend = delivery_dow.isin([5, 6]).astype(float)

    def cyc(x, period):
        """Return (sin, cos) of ``x`` mapped onto a circle of length ``period``."""
        # Map the weekday sentinel to 0 so it yields (sin=0, cos=1).
        x = x.replace(-1, 0)
        radians = 2 * np.pi * x / period
        return np.sin(radians), np.cos(radians)

    order_dow_sin, order_dow_cos = cyc(order_dow, 7)
    order_month_sin, order_month_cos = cyc(order_month, 12)

    return pd.DataFrame({
        "delivery_days": delivery_days,
        "order_dow_sin": order_dow_sin,
        "order_dow_cos": order_dow_cos,
        "order_month_sin": order_month_sin,
        "order_month_cos": order_month_cos,
        "order_is_weekend": order_weekend,
        "delivery_is_weekend": delivery_weekend,
        "order_month": order_month
    }, index=frame.index)


Preprocessor

In [5]:
# Numeric columns: median imputation followed by robust scaling.
num_median_steps = Pipeline(steps=[
    ('imp', SimpleImputer(strategy='median')),
    ('sc', RobustScaler()),
])

# Categorical columns: fill with the most frequent level (keeping a
# missing-indicator per column), then one-hot encode everything.
cat_unknown_steps = Pipeline(steps=[
    ('imp', SimpleImputer(strategy='most_frequent', add_indicator=True)),
    ('enc', OneHotEncoder(drop='first', handle_unknown='ignore')),
])

# Date columns: derive calendar features, impute what is still missing
# (again with indicators) and scale without centering.
date_steps = Pipeline(steps=[
    ('feat', FunctionTransformer(compute_date_features, validate=False)),
    ('imp', SimpleImputer(strategy='median', add_indicator=True)),
    ('sc', RobustScaler(with_centering=False)),
])

# Columns not listed in any group are dropped.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_median_steps, num_median),
        ('cat_unknown', cat_unknown_steps, cat_unknown),
        ('date', date_steps, date_feats),
        ('num_nomiss', RobustScaler(), num_nomiss),
        ('cat_nomiss', OneHotEncoder(drop='first', handle_unknown='ignore'), cat_nomiss),
    ],
    remainder='drop',
)

Train-validation split

In [6]:
# Separate predictors from the target. Infinite target values are turned into
# NaN and then, like any other NaN, replaced by 0.
X = train.drop(columns=[target])
y = (
    train[target]
    .replace([np.inf, -np.inf], np.nan)
    .fillna(0)
)

# Fixed-seed 80/20 hold-out split.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

ElasticNet pipeline

In [7]:
# ElasticNet fitted on a Yeo-Johnson-transformed target; the wrapper maps
# predictions back to the original target scale.
enet_on_transformed_target = TransformedTargetRegressor(
    regressor=ElasticNet(max_iter=20000, random_state=42),
    transformer=PowerTransformer(method='yeo-johnson'),
)

pipe = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', enet_on_transformed_target),
])

# Candidate regularisation strengths and L1/L2 mixes (4 x 5 = 20 combinations).
param_grid = dict(
    model__regressor__alpha=[0.0001, 0.0002, 0.0003, 0.0007, 0.0005],
    model__regressor__l1_ratio=[0.2, 0.5, 0.8, 0.1],
)

# 5-fold cross-validated search on the training split, ranked by R².
grid = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    scoring='r2',
    cv=5,
    n_jobs=-1,
    verbose=1,
)
grid.fit(X_train, y_train)

print("Best Params:", grid.best_params_)
best_model = grid.best_estimator_

Fitting 5 folds for each of 20 candidates, totalling 100 fits
Best Params: {'model__regressor__alpha': 0.0001, 'model__regressor__l1_ratio': 0.1}


Validation evaluation

In [8]:
# Score the tuned ElasticNet pipeline on the hold-out split.
y_pred = best_model.predict(X_val)

val_mse = mean_squared_error(y_val, y_pred)
rmse = np.sqrt(val_mse)
r2 = r2_score(y_val, y_pred)

print(f"Validation | R² = {r2:.4f} | RMSE = {rmse:.2f}")

Validation | R² = 0.0143 | RMSE = 297834.28


Train on full data and predict test

In [9]:
# Refit a *clone* of the tuned pipeline on every labelled row.
# Calling best_model.fit(X, y) directly would overwrite grid.best_estimator_
# with a model that has seen X_val, so re-running the validation cell
# afterwards would report an inflated score.
final_model = clone(best_model).fit(X, y)
test_pred = final_model.predict(test)

# One row per test hospital, ids taken from the frame saved before the
# identifier columns were dropped.
submission = pd.DataFrame({
    "Hospital_Id": hospital_id_test,
    "Transport_Cost": test_pred
})
submission.to_csv("submission.csv", index=False)

print("submission.csv created successfully")

submission.csv created successfully


Ridge Regression

In [10]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np
import pandas as pd

# Ridge regression fitted on a Yeo-Johnson-transformed target.
ridge_pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('model', TransformedTargetRegressor(
        regressor=Ridge(random_state=42),
        transformer=PowerTransformer(method='yeo-johnson')
    ))
])

ridge_param_grid = {
    'model__regressor__alpha': [0.001, 0.01, 0.1, 0.3, 1, 3, 10]
}

# 5-fold cross-validated search on the training split, ranked by R².
ridge_grid = GridSearchCV(ridge_pipe, ridge_param_grid, cv=5, scoring='r2', n_jobs=-1, verbose=1)
ridge_grid.fit(X_train, y_train)

print("Best Ridge Params:", ridge_grid.best_params_)

# Hold-out evaluation of the model refit by the search on X_train only.
y_pred = ridge_grid.best_estimator_.predict(X_val)

r2 = r2_score(y_val, y_pred)
rmse = np.sqrt(mean_squared_error(y_val, y_pred))
print(f"Ridge | R² = {r2:.4f} | RMSE = {rmse:,.0f}")

# Refit a clone on all labelled rows for the submission, so that
# ridge_grid.best_estimator_ remains the model that was evaluated above.
ridge_final = clone(ridge_grid.best_estimator_).fit(X, y)
test_pred = ridge_final.predict(test)
pd.DataFrame({"Hospital_Id": hospital_id_test, "Transport_Cost": test_pred}).to_csv("submission_ridge.csv", index=False)
print("submission_ridge.csv created successfully!")


Fitting 5 folds for each of 7 candidates, totalling 35 fits
Best Ridge Params: {'model__regressor__alpha': 0.001}
Ridge | R² = 0.0143 | RMSE = 297,834
submission_ridge.csv created successfully!


Lasso regression

In [11]:
from sklearn.linear_model import Lasso

# Lasso regression fitted on a Yeo-Johnson-transformed target.
lasso_pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('model', TransformedTargetRegressor(
        regressor=Lasso(max_iter=20000, random_state=42),
        transformer=PowerTransformer(method='yeo-johnson')
    ))
])

lasso_param_grid = {
    'model__regressor__alpha': [0.0001, 0.0005, 0.001, 0.005, 0.01]
}

# 5-fold cross-validated search on the training split, ranked by R².
lasso_grid = GridSearchCV(lasso_pipe, lasso_param_grid, cv=5, scoring='r2', n_jobs=-1, verbose=1)
lasso_grid.fit(X_train, y_train)

print("Best Lasso Params:", lasso_grid.best_params_)

# Hold-out evaluation of the model refit by the search on X_train only.
y_pred = lasso_grid.best_estimator_.predict(X_val)

r2 = r2_score(y_val, y_pred)
rmse = np.sqrt(mean_squared_error(y_val, y_pred))
print(f"Lasso | R² = {r2:.4f} | RMSE = {rmse:,.0f}")

# Refit a clone on all labelled rows for the submission, so that
# lasso_grid.best_estimator_ remains the model that was evaluated above.
lasso_final = clone(lasso_grid.best_estimator_).fit(X, y)
test_pred = lasso_final.predict(test)
pd.DataFrame({"Hospital_Id": hospital_id_test, "Transport_Cost": test_pred}).to_csv("submission_lasso.csv", index=False)
print("submission_lasso.csv created successfully!")


Fitting 5 folds for each of 5 candidates, totalling 25 fits
Best Lasso Params: {'model__regressor__alpha': 0.0001}
Lasso | R² = 0.0142 | RMSE = 297,837
submission_lasso.csv created successfully!


Polynomial regression (R²)

In [12]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression

# Ordinary least squares on polynomial expansions of the preprocessed
# features. Unlike the linear models above, the target is left untransformed.
poly_pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('poly', PolynomialFeatures(include_bias=False)),
    ('model', LinearRegression())
])

poly_param_grid = {
    'poly__degree': [2, 3]
}

# 3-fold cross-validated search on the training split, ranked by R².
poly_grid = GridSearchCV(poly_pipe, poly_param_grid, cv=3, scoring='r2', n_jobs=-1, verbose=1)
poly_grid.fit(X_train, y_train)

print("Best Polynomial Params:", poly_grid.best_params_)

# Hold-out evaluation of the model refit by the search on X_train only.
y_pred = poly_grid.best_estimator_.predict(X_val)

r2 = r2_score(y_val, y_pred)
rmse = np.sqrt(mean_squared_error(y_val, y_pred))
print(f"Polynomial | R² = {r2:.4f} | RMSE = {rmse:,.0f}")

# Refit a clone on all labelled rows for the submission, so that
# poly_grid.best_estimator_ remains the model that was evaluated above.
poly_final = clone(poly_grid.best_estimator_).fit(X, y)
test_pred = poly_final.predict(test)
pd.DataFrame({"Hospital_Id": hospital_id_test, "Transport_Cost": test_pred}).to_csv("submission_polynomial.csv", index=False)
print("submission_polynomial.csv created successfully!")


Fitting 3 folds for each of 2 candidates, totalling 6 fits
Best Polynomial Params: {'poly__degree': 2}
Polynomial | R² = 0.0375 | RMSE = 294,309
submission_polynomial.csv created successfully!


Random Forest

In [13]:
from sklearn.ensemble import RandomForestRegressor

# Random forest on the preprocessed features (target left untransformed).
rf_pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('model', RandomForestRegressor(random_state=42, n_jobs=-1))
])

rf_param_grid = {
    'model__n_estimators': [100, 200],
    'model__max_depth': [8, 10, 12],
    'model__min_samples_split': [2, 5],
    'model__min_samples_leaf': [1, 2, 3]
}

# 3-fold cross-validated search on the training split, ranked by R².
rf_grid = GridSearchCV(rf_pipe, rf_param_grid, cv=3, scoring='r2', n_jobs=-1, verbose=1)
rf_grid.fit(X_train, y_train)

print("Best RF Params:", rf_grid.best_params_)

# Hold-out evaluation of the model refit by the search on X_train only.
y_pred = rf_grid.best_estimator_.predict(X_val)

r2 = r2_score(y_val, y_pred)
rmse = np.sqrt(mean_squared_error(y_val, y_pred))
print(f"RandomForest | R² = {r2:.4f} | RMSE = {rmse:,.0f}")

# Refit a clone on all labelled rows for the submission, so that
# rf_grid.best_estimator_ remains the model that was evaluated above.
rf_final = clone(rf_grid.best_estimator_).fit(X, y)
test_pred = rf_final.predict(test)
pd.DataFrame({"Hospital_Id": hospital_id_test, "Transport_Cost": test_pred}).to_csv("submission_randomforest.csv", index=False)
print("submission_randomforest.csv created successfully!")


Fitting 3 folds for each of 36 candidates, totalling 108 fits
Best RF Params: {'model__max_depth': 8, 'model__min_samples_leaf': 1, 'model__min_samples_split': 5, 'model__n_estimators': 100}
RandomForest | R² = 0.0632 | RMSE = 290,355
submission_randomforest.csv created successfully!


AdaBoost

In [14]:
from sklearn.ensemble import AdaBoostRegressor

# AdaBoost on the preprocessed features (target left untransformed).
ada_pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('model', AdaBoostRegressor(random_state=42))
])

ada_param_grid = {
    'model__n_estimators': [100, 200, 300],
    'model__learning_rate': [0.05, 0.1, 0.2, 0.4, 0.6]
}

# 3-fold cross-validated search on the training split, ranked by R².
ada_grid = GridSearchCV(ada_pipe, ada_param_grid, cv=3, scoring='r2', n_jobs=-1, verbose=1)
ada_grid.fit(X_train, y_train)

print("Best AdaBoost Params:", ada_grid.best_params_)

# Hold-out evaluation of the model refit by the search on X_train only.
y_pred = ada_grid.best_estimator_.predict(X_val)

r2 = r2_score(y_val, y_pred)
rmse = np.sqrt(mean_squared_error(y_val, y_pred))
print(f"AdaBoost | R² = {r2:.4f} | RMSE = {rmse:,.0f}")

# Refit a clone on all labelled rows for the submission, so that
# ada_grid.best_estimator_ remains the model that was evaluated above.
ada_final = clone(ada_grid.best_estimator_).fit(X, y)
test_pred = ada_final.predict(test)
pd.DataFrame({"Hospital_Id": hospital_id_test, "Transport_Cost": test_pred}).to_csv("submission_adaboost.csv", index=False)
print("submission_adaboost.csv created successfully!")


Fitting 3 folds for each of 15 candidates, totalling 45 fits
Best AdaBoost Params: {'model__learning_rate': 0.2, 'model__n_estimators': 100}
AdaBoost | R² = 0.1863 | RMSE = 270,603
submission_adaboost.csv created successfully!


Gradient boosting

In [15]:
from sklearn.ensemble import GradientBoostingRegressor

# Gradient boosting on the preprocessed features (target left untransformed).
gb_pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('model', GradientBoostingRegressor(random_state=42))
])

gb_param_grid = {
    'model__n_estimators': [100],
    'model__learning_rate': [0.03, 0.05, 0.1],
    'model__max_depth': [3, 4],
    'model__subsample': [0.8, 0.9, 1.0]
}

# 3-fold cross-validated search on the training split, ranked by R².
gb_grid = GridSearchCV(gb_pipe, gb_param_grid, cv=3, scoring='r2', n_jobs=-1, verbose=1)
gb_grid.fit(X_train, y_train)

print("Best GB Params:", gb_grid.best_params_)

# Hold-out evaluation of the model refit by the search on X_train only.
y_pred = gb_grid.best_estimator_.predict(X_val)

r2 = r2_score(y_val, y_pred)
rmse = np.sqrt(mean_squared_error(y_val, y_pred))
print(f"GradientBoosting | R² = {r2:.4f} | RMSE = {rmse:,.0f}")

# Refit a clone on all labelled rows for the submission, so that
# gb_grid.best_estimator_ remains the model that was evaluated above.
gb_final = clone(gb_grid.best_estimator_).fit(X, y)
test_pred = gb_final.predict(test)
pd.DataFrame({"Hospital_Id": hospital_id_test, "Transport_Cost": test_pred}).to_csv("submission_gradientboosting.csv", index=False)
print("submission_gradientboosting.csv created successfully!")


Fitting 3 folds for each of 18 candidates, totalling 54 fits
Best GB Params: {'model__learning_rate': 0.1, 'model__max_depth': 4, 'model__n_estimators': 100, 'model__subsample': 0.8}
GradientBoosting | R² = 0.1064 | RMSE = 283,574
submission_gradientboosting.csv created successfully!


In [16]:
import xgboost as xgb

# XGBoost on the preprocessed features (target left untransformed).
xgb_pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('model', xgb.XGBRegressor(random_state=42, n_jobs=-1, verbosity=0))
])

# 3 values for each of 5 hyper-parameters -> 243 candidates.
xgb_param_grid = {
    'model__n_estimators': [100, 200, 300],
    'model__learning_rate': [0.03, 0.05, 0.1],
    'model__max_depth': [3, 4, 5],
    'model__subsample': [0.8, 0.9, 1.0],
    'model__colsample_bytree': [0.8, 0.9, 1.0]
}

# 3-fold cross-validated search on the training split, ranked by R².
xgb_grid = GridSearchCV(xgb_pipe, xgb_param_grid, cv=3, scoring='r2', n_jobs=-1, verbose=1)
xgb_grid.fit(X_train, y_train)

print("Best XGB Params:", xgb_grid.best_params_)

# Hold-out evaluation of the model refit by the search on X_train only.
y_pred = xgb_grid.best_estimator_.predict(X_val)

r2 = r2_score(y_val, y_pred)
rmse = np.sqrt(mean_squared_error(y_val, y_pred))
print(f"XGBoost | R² = {r2:.4f} | RMSE = {rmse:,.0f}")

# Refit a clone on all labelled rows for the submission, so that
# xgb_grid.best_estimator_ remains the model that was evaluated above.
xgb_final = clone(xgb_grid.best_estimator_).fit(X, y)
test_pred = xgb_final.predict(test)
pd.DataFrame({"Hospital_Id": hospital_id_test, "Transport_Cost": test_pred}).to_csv("submission_xgboost.csv", index=False)
print("submission_xgboost.csv created successfully!")


Fitting 3 folds for each of 243 candidates, totalling 729 fits
Best XGB Params: {'model__colsample_bytree': 0.8, 'model__learning_rate': 0.03, 'model__max_depth': 3, 'model__n_estimators': 100, 'model__subsample': 1.0}
XGBoost | R² = 0.0552 | RMSE = 291,581
submission_xgboost.csv created successfully!


In [17]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np
import pandas as pd

# Plain linear regression fitted on a Yeo-Johnson-transformed target.
linear_pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('model', TransformedTargetRegressor(
        regressor=LinearRegression(),
        transformer=PowerTransformer(method='yeo-johnson')
    ))
])

# LinearRegression has nothing to tune, so the grid only toggles the intercept.
linear_param_grid = {
    'model__regressor__fit_intercept': [True, False],
    # Single-value entry: adds no candidates. True is already the default for
    # check_inverse, so this only makes the setting show up in best_params_.
    'model__check_inverse': [True]
}

# 5-fold cross-validated search on the training split, ranked by R².
linear_grid = GridSearchCV(
    linear_pipe,
    param_grid=linear_param_grid,
    cv=5,
    scoring='r2',
    n_jobs=-1,
    verbose=1
)

# Fit on training data
linear_grid.fit(X_train, y_train)

# Evaluate on validation (model refit by the search on X_train only)
print("Best Linear Params:", linear_grid.best_params_)
y_pred = linear_grid.best_estimator_.predict(X_val)

r2 = r2_score(y_val, y_pred)
rmse = np.sqrt(mean_squared_error(y_val, y_pred))
print(f"Linear Regression | R² = {r2:.4f} | RMSE = {rmse:,.0f}")

# Refit a clone on all labelled rows for the submission, so that
# linear_grid.best_estimator_ remains the model that was evaluated above.
linear_final = clone(linear_grid.best_estimator_).fit(X, y)
test_pred = linear_final.predict(test)

# Create submission
pd.DataFrame({
    "Hospital_Id": hospital_id_test,
    "Transport_Cost": test_pred
}).to_csv("submission_linear.csv", index=False)

print(" submission_linear.csv created successfully!")


Fitting 5 folds for each of 2 candidates, totalling 10 fits
Best Linear Params: {'model__check_inverse': True, 'model__regressor__fit_intercept': True}
Linear Regression | R² = 0.0143 | RMSE = 297,834
 submission_linear.csv created successfully!
