In [216]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.linear_model import Lasso, Ridge, ElasticNet
from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
import os

In [217]:
DIRECTORY = os.getcwd() + '/data-tech-challenge'

In [218]:
plane_data_df = pd.read_csv(DIRECTORY + '/data/plane_data_cleaned.csv')
plane_data_df.rename(columns={
    'Month': 'Date'
}, inplace=True)

plane_data_df['Route'] = plane_data_df['AustralianPort'] + ' - ' + plane_data_df['ForeignPort']

adelaide_to_frankfurt_df = plane_data_df[plane_data_df['Route'] == 'Adelaide - Frankfurt']

display(adelaide_to_frankfurt_df.head())

Unnamed: 0,Date,AustralianPort,ForeignPort,Country,Passengers_In,Freight_In_(tonnes),Mail_In_(tonnes),Passengers_Out,Freight_Out_(tonnes),Mail_Out_(tonnes),Passengers_Total,Freight_Total_(tonnes),Mail_Total_(tonnes),Year,Month_num,Route
3,1985-01-01,Adelaide,Frankfurt,Germany,115,0.009,0.0,171,0.0,0.248,286,0.009,0.248,1985,1,Adelaide - Frankfurt
164,1985-02-01,Adelaide,Frankfurt,Germany,235,0.021,0.0,146,0.0,0.244,381,0.021,0.244,1985,2,Adelaide - Frankfurt
323,1985-03-01,Adelaide,Frankfurt,Germany,133,2.847,0.0,192,0.025,0.325,325,2.872,0.325,1985,3,Adelaide - Frankfurt
480,1985-04-01,Adelaide,Frankfurt,Germany,84,0.222,0.0,113,0.0,0.385,197,0.222,0.385,1985,4,Adelaide - Frankfurt
644,1985-05-01,Adelaide,Frankfurt,Germany,0,0.0,0.0,79,0.0,0.243,79,0.0,0.243,1985,5,Adelaide - Frankfurt


### Linear Regression Test

In [219]:
model_df = adelaide_to_frankfurt_df.copy()

# select features
X = model_df[['Year', 'Month_num']]
y = model_df['Passengers_Total']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# fit model
reg = LinearRegression()
reg.fit(X_train, y_train)

# predict on test
y_pred = reg.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
r2 = reg.score(X_test, y_test)

print(f"mse for linear regression: {mse:.2f}")
print(f"R² score for linear regression: {r2:.3f}")

mse for linear regression: 6018.60
R² score for linear regression: 0.116


Interpretation: 
MSE is very high (probs because the passenger numbers are also very high -- the mae is around 77 which might be better. Still a bit on the higher end, but makes more sense with contextualization of the scale of passengers).

r^2 - model does a pretty bad job of capturing patterns in the data

Not a great model

# Lasso/Ridge/Elastic Net

In [220]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# lasso regression
lasso = Lasso(alpha=1.0)
lasso.fit(X_train_scaled, y_train)
y_pred_lasso = lasso.predict(X_test_scaled)

mse_lasso = mean_squared_error(y_test, y_pred_lasso)
r2_lasso = lasso.score(X_test_scaled, y_test)

print(f"mse for lasso regression: {mse:.2f}")
print(f"R² score for lasso regression: {r2:.3f}")

mse for lasso regression: 6018.60
R² score for lasso regression: 0.116


In [221]:
ridge = Ridge()
ridge = ridge.fit(X_train_scaled,y_train)
y_pred = ridge.predict(X_test_scaled)

mse = mean_squared_error(y_test, y_pred)
r2 = reg.score(X_test_scaled, y_test)

print(f"mse for ridge regression : {mse:.2f}")
print(f"R² score for ridge regression: {r2:.3f}")

mse for ridge regression : 6024.48
R² score for ridge regression: -82247.176




In [222]:
elastic_net = ElasticNet()
elastic_net = elastic_net.fit(X_train_scaled,y_train)
y_pred = elastic_net.predict(X_test_scaled)

mse = mean_squared_error(y_test, y_pred)
r2 = ridge.score(X_test_scaled, y_test)

print(f"mse for elastic net regression : {mse:.2f}")
print(f"R² score for elastic net regression: {r2:.3f}")

mse for elastic net regression : 6161.70
R² score for elastic net regression: 0.115


Overall, Lasso/Ridge/Elastic Net models did on average, a little bit worse than the regular Linear Regression model based on MSE and r^2 metrics

### Polynomial Regression

In [223]:
X_poly_train = PolynomialFeatures(degree=2, include_bias=True).fit_transform(X_train)
X_poly_test = PolynomialFeatures(degree=2, include_bias=True).fit_transform(X_test)

X_train_poly = scaler.fit_transform(X_poly_train)
X_test_poly = scaler.transform(X_poly_test)

reg = LinearRegression()
reg.fit(X_train_poly, y_train)

# predict on test
y_pred = reg.predict(X_test_poly)

mse = mean_squared_error(y_test, y_pred)
r2 = reg.score(X_test_poly, y_test)

print(f"mse for linear regression: {mse:.2f}")
print(f"R² score for linear regression: {r2:.3f}")

mse for linear regression: 2960.96
R² score for linear regression: 0.565


Getting closer! Reduced mse and higher r^2 implies that this model fits slightly better and that the relationship between the features & passenger count may not be linear

### Decision Tree

In [224]:
dt_reg = DecisionTreeRegressor(
    random_state=42, 
    max_depth=10, 
    min_samples_split=10,
    min_samples_leaf=5
)

# Fit the model
dt_reg.fit(X_train, y_train)

# Predict on test set
y_pred_dt = dt_reg.predict(X_test)

# Calculate metrics
mse_dt = mean_squared_error(y_test, y_pred_dt)
r2_dt = dt_reg.score(X_test, y_test)

print(f"mse for decision tree: {mse_dt:.2f}")
print(f"r^2 for decision tree: {r2_dt:.3f}")

mse for decision tree: 3461.13
r^2 for decision tree: 0.491


### Random Forest

In [225]:
rf_reg = RandomForestRegressor(
    n_estimators=100,
    max_depth=10,
    min_samples_split=5,
    min_samples_leaf=2,
    random_state=42,
    n_jobs=-1
)

rf_reg.fit(X_train, y_train)

y_pred_rf = rf_reg.predict(X_test)

mse_rf = mean_squared_error(y_test, y_pred_rf)
r2_rf = rf_reg.score(X_test, y_test)

print(f"mse for rfr: {mse_rf:.2f}")
print(f"r^2 Score for rfr: {r2_rf:.3f}")

mse for rfr: 2676.69
r^2 Score for rfr: 0.607


RFR performed the best so far! With the lowest MSE (which has an MAE of about 50, which would make total sense while understanding the scale of the passengers) and the highest R^2 value (suggesting that the RFR model is very good at capturing patterns/the variance in the total passenger numbers), this model is pretty good.