<a href="https://colab.research.google.com/github/aet-lang/Contract-Awards-in-Investment-Project-Financing/blob/main/Contract_Awards_in_Investment_Project_Financing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Import all necessary libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, cross_val_score, learning_curve, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from xgboost import XGBRegressor
from tabulate import tabulate

## Load the dataset

In [None]:
!pip install -q gdown

import gdown

file_id = "1N6_PBk1T2PvhaATWsSHb9-vJYEnPcOlM"
url = f"https://drive.google.com/uc?id={file_id}"

gdown.download(url, "contract_data.csv", quiet=False)

df = pd.read_csv("contract_data.csv")
df.head()

## Dataset information

In [None]:
# Print the column names, dtypes, and non-null counts of the loaded frame.
df.info()

## Drop duplicates

In [None]:
# Remove rows that are exact duplicates of an earlier row; df is modified in place.
df.drop_duplicates(inplace=True)

## Fill all missing values

In [None]:
# Impute gaps column by column: text (object) columns get their most frequent
# value, every other column gets its median.
# NOTE(review): the loop covers every column of df, so a target column with
# gaps is imputed as well - confirm that fabricating target values is intended.
for col in df.columns:
    column = df[col]
    if not column.isna().any():
        continue  # no gaps in this column, nothing to compute

    if column.dtype == 'object':
        modes = column.mode()
        if modes.empty:
            # Entirely-missing column: there is no mode, and indexing [0]
            # on the empty result would raise. Leave the column untouched.
            continue
        fill_value = modes.iloc[0]
    else:
        fill_value = column.median()

    df[col] = column.fillna(fill_value)

## Visualize remaining missing data (check after imputation)

In [None]:
# Heatmap of the null mask of df: one row per record, one column per field.
# It is drawn from df as it stands when this cell runs, so it shows whatever
# gaps are still present at this point in the notebook.
fig, ax = plt.subplots(figsize=(12, 6))
null_mask = df.isnull()
sns.heatmap(null_mask, cbar=False, cmap="viridis", ax=ax)
ax.set_title("Missing Values in Dataset")
plt.show()

### Convert date columns

In [None]:
# Parse the two date fields into pandas datetime values.
for date_col in ("As of Date", "Contract Signing Date"):
    df[date_col] = pd.to_datetime(df[date_col])

### Target Distribution

In [None]:
# Name of the regression target; fail fast (listing the available columns)
# if the loaded CSV does not contain it.
target_col = "Supplier Contract Amount (USD)"
if target_col not in df.columns:
    raise ValueError(f"Target column '{target_col}' not found. Available columns: {df.columns.tolist()}")

# Histogram with a KDE overlay of the target, before any outlier filtering.
fig, ax = plt.subplots(figsize=(10, 5))
sns.histplot(df[target_col], bins=50, kde=True, color='blue', ax=ax)
ax.set_title("Distribution of Supplier Contract Amount (Before Outlier Removal)")
ax.set_xlabel("Supplier Contract Amount (USD)")
ax.set_ylabel("Frequency")
plt.show()

## Boxplot of target

In [None]:
# Boxplot of the target column; points beyond the whiskers are drawn individually.
fig, ax = plt.subplots(figsize=(8, 4))
contract_amounts = df["Supplier Contract Amount (USD)"]
sns.boxplot(x=contract_amounts, ax=ax)
ax.set_title("Boxplot of Supplier Contract Amount ")
plt.show()

### Encode Categorical Columns


In [None]:
# Replace every object-dtype column of df with integer codes, keeping the
# fitted encoder for each column in label_encoders (keyed by column name).
label_encoders = {}
categorical_cols = df.select_dtypes(include='object').columns
for col in categorical_cols:
    encoder = LabelEncoder()
    df[col] = encoder.fit_transform(df[col])
    label_encoders[col] = encoder

### Confirm the Target Column


In [None]:
# (Re)declare the target column name and verify it is present in df,
# reporting the columns that do exist when it is not.
target_col = "Supplier Contract Amount (USD)"
available_columns = df.columns.tolist()
if target_col not in available_columns:
    raise ValueError(f"Target column '{target_col}' not found. Available columns: {available_columns}")

### Distribution of Target

In [None]:
# Histogram with a KDE overlay of the target column of df (the unfiltered frame).
fig, ax = plt.subplots(figsize=(10, 5))
target_values = df[target_col]
sns.histplot(target_values, bins=50, kde=True, color='blue', ax=ax)
ax.set(
    title="Distribution of Supplier Contract Amount (Before Outlier Removal)",
    xlabel="Supplier Contract Amount (USD)",
    ylabel="Frequency",
)
plt.show()

### Outlier removal

In [None]:
# IQR rule on the target: keep rows whose target lies within
# [Q1 - 1.5 * IQR, Q3 + 1.5 * IQR] (both ends inclusive).
Q1, Q3 = df[target_col].quantile([0.25, 0.75])
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# .copy() gives filtered_df its own data so later column assignments
# do not touch df.
within_bounds = df[target_col].between(lower_bound, upper_bound)
filtered_df = df[within_bounds].copy()



### Label Encode

In [None]:
# Encode any object-dtype columns still present in filtered_df.
# label_encoders is deliberately NOT reset to {} here: the object columns of
# df were already encoded in the earlier "Encode Categorical Columns" cell and
# filtered_df is a row subset of df, so this loop normally finds nothing to
# do. Rebinding the dict would throw away the fitted encoders, and with them
# the ability to map codes back to the original labels.
for col in filtered_df.select_dtypes(include='object').columns:
    le = LabelEncoder()
    # astype(str) makes mixed-type values comparable for the encoder.
    filtered_df[col] = le.fit_transform(filtered_df[col].astype(str))
    label_encoders[col] = le



### Prepare Features and Labels

In [None]:
# Features are every column except the target and the two date fields;
# the label is the target column itself.
non_feature_cols = [target_col, "As of Date", "Contract Signing Date"]
X = filtered_df.drop(columns=non_feature_cols)
y = filtered_df[target_col]

### Feature Scaling

In [None]:

# Standardize each feature column. The scaler's statistics are learned from
# every row of X, so no rows are held out from them.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

### Train-Test Split

In [None]:
# Split the unscaled features first (80/20, fixed seed), then learn the
# scaling statistics from the training rows only. Splitting already-scaled
# data would let the test rows influence the mean/variance used to
# preprocess the training set (data leakage).
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Separate scaler so the one fitted on the full data (used elsewhere in the
# notebook) is left untouched.
split_scaler = StandardScaler().fit(X_train_raw)
X_train = split_scaler.transform(X_train_raw)
X_test = split_scaler.transform(X_test_raw)

### Helper Function for Model Evaluation

In [None]:
# Accumulates one metrics dict per evaluate_model call, in call order.
results = []

def evaluate_model(name, model, X_train, y_train, X_test, y_test):
    """Fit a model, score it on the test split, and record the metrics.

    Args:
        name: Label used in the printed report and in the ``results`` entry.
        model: Estimator exposing ``fit`` and ``predict``.
        X_train, y_train: Data the model is fitted on.
        X_test, y_test: Data the predictions are scored against.

    Returns:
        The same ``model`` object, after fitting.

    Side effects:
        Appends ``{"name", "mae", "mse", "r2"}`` to the module-level
        ``results`` list and prints the three metrics.
    """
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)

    metrics = {
        "name": name,
        "mae": mean_absolute_error(y_test, predictions),
        "mse": mean_squared_error(y_test, predictions),
        "r2": r2_score(y_test, predictions),
    }
    results.append(metrics)

    print(f"\n🔹 {name} Performance:")
    print(f"MAE: {metrics['mae']:.4f}, MSE: {metrics['mse']:.4f}, R²: {metrics['r2']:.4f}")
    return model

### Train Baseline Models


In [None]:
# Every baseline is fitted and scored on the same train/test split.
split_data = (X_train, y_train, X_test, y_test)

lr_model = evaluate_model("Linear Regression", LinearRegression(), *split_data)
rf_model = evaluate_model(
    "Random Forest",
    RandomForestRegressor(n_estimators=100, random_state=42),
    *split_data,
)
gb_model = evaluate_model(
    "Gradient Boosting",
    GradientBoostingRegressor(random_state=42),
    *split_data,
)
xgb_model = evaluate_model(
    "XGBoost",
    XGBRegressor(random_state=42, objective='reg:squarederror'),
    *split_data,
)


### Cross-validation scores

In [None]:
def cross_validate_model(model, name):
    """Print 5-fold cross-validated R² for ``model``.

    Scores are computed on the module-level ``X_scaled`` and ``y``.

    Args:
        model: Estimator passed to ``cross_val_score``.
        name: Label shown in the printed report.

    Returns:
        None. The per-fold scores, their mean, and their standard deviation
        are printed.
    """
    fold_r2 = cross_val_score(model, X_scaled, y, scoring='r2', cv=5)
    print(f"\n{name} CV R² Scores: {fold_r2}")
    print(f"Mean R²: {fold_r2.mean():.4f}, Std: {fold_r2.std():.4f}")

# Cross-validate each baseline estimator, in the same order they were trained.
for fitted_model, label in (
    (lr_model, "Linear Regression"),
    (rf_model, "Random Forest"),
    (gb_model, "Gradient Boosting"),
    (xgb_model, "XGBoost"),
):
    cross_validate_model(fitted_model, label)

### Hyperparameter Tuning

In [None]:
# Random Forest
# 2 x 3 x 2 = 12 candidate settings, each scored by 3-fold CV on the training split.
rf_param_grid = dict(
    n_estimators=[100, 200],
    max_depth=[None, 10, 20],
    min_samples_split=[2, 5],
)
rf_grid = GridSearchCV(
    estimator=RandomForestRegressor(random_state=42),
    param_grid=rf_param_grid,
    cv=3,
    scoring='r2',
    n_jobs=-1,
)
rf_grid.fit(X_train, y_train)
print("\nBest RF Params:", rf_grid.best_params_)
# Score the winning configuration on the test split and add it to results.
evaluate_model("Random Forest (Tuned)", rf_grid.best_estimator_, X_train, y_train, X_test, y_test)

# Gradient Boosting
# 2 x 3 x 2 = 12 candidate settings, each scored by 3-fold CV on the training split.
gb_param_grid = dict(
    n_estimators=[100, 200],
    learning_rate=[0.01, 0.05, 0.1],
    max_depth=[3, 5],
)
gb_grid = GridSearchCV(
    estimator=GradientBoostingRegressor(random_state=42),
    param_grid=gb_param_grid,
    cv=3,
    scoring='r2',
    n_jobs=-1,
)
gb_grid.fit(X_train, y_train)
print("\nBest GB Params:", gb_grid.best_params_)
# Score the winning configuration on the test split and add it to results.
evaluate_model("Gradient Boosting (Tuned)", gb_grid.best_estimator_, X_train, y_train, X_test, y_test)

# XGBoost
# 2 x 3 x 3 x 2 x 2 = 72 candidate settings, each scored by 3-fold CV on the training split.
xgb_param_grid = dict(
    n_estimators=[100, 200],
    learning_rate=[0.01, 0.05, 0.1],
    max_depth=[3, 5, 7],
    subsample=[0.8, 1],
    colsample_bytree=[0.8, 1],
)
# NOTE(review): both the estimator and the grid search request n_jobs=-1;
# nested parallelism can oversubscribe the CPU - consider parallelizing at
# only one level.
xgb_grid = GridSearchCV(
    estimator=XGBRegressor(random_state=42, objective='reg:squarederror', n_jobs=-1),
    param_grid=xgb_param_grid,
    cv=3,
    scoring='r2',
    n_jobs=-1,
)
xgb_grid.fit(X_train, y_train)
print("\nBest XGB Params:", xgb_grid.best_params_)
# Score the winning configuration on the test split and add it to results.
evaluate_model("XGBoost (Tuned)", xgb_grid.best_estimator_, X_train, y_train, X_test, y_test)


### Final Summary Table

In [None]:
# One table row per recorded evaluation, in the order the models were evaluated.
final_results = [
    [entry['name'], entry['mae'], entry['mse'], entry['r2']]
    for entry in results
]

headers = ["Model", "MAE", "MSE", "R²"]
print("\n Final Model Comparison:")
print(tabulate(final_results, headers=headers, floatfmt=".4f"))