In [20]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Load the training dataset
train_df = pd.read_csv('cargo-train.csv')

# Drop any null values
train_df = train_df.dropna()

# Split 'Activity Period' into year and month, then get sin and cos of month to capture its cyclical nature
train_df['Year'] = train_df['Activity Period'].apply(lambda x: int(str(x)[:4]))
train_df['Month'] = train_df['Activity Period'].apply(lambda x: int(str(x)[4:]))
train_df['Month_sin'] = np.sin(2 * np.pi * train_df['Month'] / 12)
train_df['Month_cos'] = np.cos(2 * np.pi * train_df['Month'] / 12)
train_df = train_df.drop(['Activity Period'], axis=1)

# Select top features for training
X = train_df[['Year', 'Month_sin', 'Month_cos', 'Published Airline', 'Cargo Type Code', 'Cargo Aircraft Type', 'GEO Region', 'GEO Summary']]
y = train_df['Cargo Metric TONS']

# One-Hot Encoding for categorical variables
X_encoded = pd.get_dummies(X)

# Split the data into training and validation sets
X_train, X_valid, y_train, y_valid = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

# Initialize the Random Forest Regressor
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# Train model
rf.fit(X_train, y_train)

# Validate model and use root mean squared error as metric
y_pred = rf.predict(X_valid)
rmse = np.sqrt(mean_squared_error(y_valid, y_pred))
print(f"RMSE: {rmse}")

RMSE: 204.5730830159587


In [21]:
# Apply to test dataset now
test_df = pd.read_csv('cargo-test.csv')

# Drop any null values
test_df = test_df.dropna()

# Split 'Activity Period' into year and month, then get sin and cos of month to capture its cyclical nature
test_df['Year'] = test_df['Activity Period'].apply(lambda x: int(str(x)[:4]))
test_df['Month'] = test_df['Activity Period'].apply(lambda x: int(str(x)[4:]))
test_df['Month_sin'] = np.sin(2 * np.pi * test_df['Month'] / 12)
test_df['Month_cos'] = np.cos(2 * np.pi * test_df['Month'] / 12)
test_df = test_df.drop(['Activity Period'], axis=1)

# Keep same columns as previous data frame
test_features = test_df[['Year', 'Month_sin', 'Month_cos', 'Published Airline', 'Cargo Type Code', 'Cargo Aircraft Type', 'GEO Region']]

# Select top features for training
test_encoded = pd.get_dummies(test_features)

# Ensure the test set has same columns as training set
test_encoded_final = test_encoded.reindex(columns=X_train.columns, fill_value=0)

# Predict on the test set using trained model
test_predictions = rf.predict(test_encoded_final)

# Output predictions, save in .csv file
test_predictions_df = pd.DataFrame(test_predictions, columns=['Cargo Metric Tons'])
test_predictions_df.to_csv('dcl.csv', index=False)

In [22]:
test_predictions_df

Unnamed: 0,Cargo Metric Tons
0,226.844231
1,274.059292
2,143.915311
3,173.843127
4,270.082454
...,...
6788,8.403793
6789,392.144299
6790,3.472947
6791,744.463333
