In [19]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

In [20]:
df = pd.read_csv("clean.csv")

In [21]:
df.head()

Unnamed: 0,Country,Fulfill Via,Vendor INCO Term,Shipment Mode,Sub Classification,Vendor,Pack Price,First Line Designation,Weight (Kilograms),Freight Cost (USD),Year
0,Vietnam,Direct Drop,EXW,Air,Pediatric,Aurobindo Pharma Limited,6.2,Yes,358.0,4521.5,2006
1,Vietnam,Direct Drop,EXW,Air,Adult,SUN PHARMACEUTICAL INDUSTRIES LTD (RANBAXY LAB...,3.99,Yes,1855.0,16007.06,2006
2,Vietnam,Direct Drop,EXW,Air,Adult,Aurobindo Pharma Limited,3.2,Yes,7590.0,28812.57,2006
3,Nigeria,Direct Drop,EXW,Air,Pediatric,Aurobindo Pharma Limited,5.35,Yes,504.0,5920.42,2006
4,Tanzania,Direct Drop,EXW,Air,Adult,Aurobindo Pharma Limited,3.65,Yes,1478.0,6212.41,2006


In [22]:
X = df.drop('Freight Cost (USD)',axis=1)
y=df['Freight Cost (USD)']

In [None]:
# df['log_freight'] = np.log1p(df['Freight Cost (USD)'])  # log(1 + x) handles 0s safely

In [None]:
# y = df['log_freight'] 

In [25]:
# Create Column Transformer with 3 types of transformers
num_features = X.select_dtypes(exclude="object").columns
cat_features = X.select_dtypes(include="object").columns

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

numeric_transformer = StandardScaler()
oh_transformer = OneHotEncoder()

preprocessor = ColumnTransformer(
    [
        ("OneHotEncoder", oh_transformer, cat_features),
         ("StandardScaler", numeric_transformer, num_features),        
    ]
)

In [26]:
X = preprocessor.fit_transform(X)

In [27]:
# separate dataset into train and test
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=42)
X_train.shape, X_test.shape

((5358, 87), (1340, 87))

In [28]:
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression, Ridge,Lasso
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import RandomizedSearchCV
from catboost import CatBoostRegressor
from xgboost import XGBRegressor

In [29]:
from sklearn.model_selection import GridSearchCV

params = {
    'n_estimators': [100, 200],
    'max_depth': [10, 20, None],
    'max_features': ['sqrt', 0.8],
    'min_samples_split': [2, 5, 10]
}

grid = GridSearchCV(RandomForestRegressor(), params, cv=3, scoring='neg_mean_absolute_error')
grid.fit(X_train, y_train)
print("Best Params:", grid.best_params_)


Best Params: {'max_depth': None, 'max_features': 0.8, 'min_samples_split': 2, 'n_estimators': 200}


In [30]:
best_model = grid.best_estimator_

In [31]:
best_model 

0,1,2
,n_estimators,200
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,0.8
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [33]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

# Predict
y_train_pred = best_model.predict(X_train)
y_test_pred = best_model.predict(X_test)

# Reverse log1p transform to get predictions in original USD scale
y_train_pred_actual = np.expm1(y_train_pred)
y_test_pred_actual = np.expm1(y_test_pred)


# Reverse for true values too
y_train_actual = np.expm1(y_train)
y_test_actual = np.expm1(y_test)

# Metrics
def evaluate(y_true, y_pred, dataset_name=""):
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    mape = np.mean(np.abs((y_true - y_pred) / y_true)) * 100

    print(f"{dataset_name} Set Evaluation:")
    print(f"- RMSE: {rmse:.4f}")
    print(f"- MAE: {mae:.4f}")
    print(f"- R²: {r2:.4f}")
    print(f"- MAPE: {mape:.2f}%")
    print("-" * 40)

# Evaluate both sets
evaluate(y_train_actual, y_train_pred_actual, "Training")
evaluate(y_test_actual, y_test_pred_actual, "Test")


Training Set Evaluation:
- RMSE: 2004.1144
- MAE: 1040.0879
- R²: 0.9498
- MAPE: 11.80%
----------------------------------------
Test Set Evaluation:
- RMSE: 3983.0792
- MAE: 2371.3467
- R²: 0.7973
- MAPE: 34.42%
----------------------------------------
