## Step 1: Imports

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import ast

ModuleNotFoundError: No module named 'pandas'

## Step 2: Load & Clean Dataset

In [5]:
# Load the accreditation dataset from the CSV file (path is relative to the working directory).
df = pd.read_csv(filepath_or_buffer="accreditation_data_2.csv")

# --- Convert 'Days_Until_Deadline' ---

def get_days(days):
    """Convert a raw ``Days_Until_Deadline`` value into a signed integer.

    For a string containing ``"Late"`` the second-to-last whitespace-separated
    token is parsed and negated; for any other string the first token is
    parsed. Tokens are split on arbitrary whitespace, so leading, trailing or
    repeated spaces do not break parsing.

    Values that are already integers (Python ``int`` or NumPy integer) are
    returned unchanged, so applying this function again to an already
    converted column is a no-op instead of a ``TypeError``.

    Args:
        days: Raw cell value, either a string or an integer.

    Returns:
        int: Negative when the string contains ``"Late"``, otherwise the
        parsed leading number.

    Raises:
        TypeError: If ``days`` is neither a string nor an integer (e.g. NaN).
        ValueError: If the string is blank or the selected token is not an
            integer literal.
    """
    # Already converted (e.g. the cell was re-run): nothing to parse.
    if isinstance(days, (int, np.integer)):
        return int(days)
    if not isinstance(days, str):
        raise TypeError(
            f"Days_Until_Deadline must be a string or an integer, got {type(days).__name__}"
        )

    tokens = days.split()
    if not tokens:
        raise ValueError("Days_Until_Deadline is blank")

    if "Late" in days:
        return -int(tokens[-2])
    return int(tokens[0])
    
# Replace each raw deadline value with the signed integer returned by get_days.
df["Days_Until_Deadline"] = df["Days_Until_Deadline"].apply(get_days)

# --- Convert embedding from str to NumPy array ---
# Only string cells go through ast.literal_eval; a value that is already a
# sequence/array (for example when this cell is run a second time) is handed
# to np.array directly instead of making literal_eval raise.
df["embedding"] = df["embedding"].apply(
    lambda x: np.array(ast.literal_eval(x) if isinstance(x, str) else x)
)


NameError: name 'pd' is not defined

## Step 3: Feature Selection

In [13]:
# Regression target.
y = df["Area_Rating"]

# Spread each row's embedding across its own columns, labelled emb_0, emb_1, ...
embeddings = pd.DataFrame(df["embedding"].to_list(), index=df.index).add_prefix("emb_")

# Feature matrix: the selected tabular columns followed by the embedding columns.
X = pd.concat(
    [
        df.loc[
            :,
            [
                "file_size",
                "Criteria_Completion_Score",
                "Days_Until_Deadline",
                "isApproved",
                "docType",
                "programID",
            ],
        ],
        embeddings,
    ],
    axis=1,
)


## Step 4: Preprocessing Pipeline

In [14]:
# Columns that get standardised.
num_features = [
    "file_size",
    "Criteria_Completion_Score",
    "Days_Until_Deadline",
]
# Columns that get one-hot encoded.
cat_features = [
    "isApproved",
    "docType",
    "programID",
]

# Column-wise preprocessing: scale the numeric columns, one-hot encode the
# categorical ones (categories unseen during fit are ignored at transform
# time), and forward the embedding columns untouched.
preprocessor = ColumnTransformer(
    [
        ("num", StandardScaler(), num_features),
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_features),
        ("pass", "passthrough", embeddings.columns),
    ]
)

# Baseline model: preprocessing followed by ordinary least-squares regression.
model = Pipeline(
    [
        ("preprocessor", preprocessor),
        ("regressor", LinearRegression()),
    ]
)


## Step 5: Train and Evaluate

In [15]:
# Train/test split: 20% of the rows are held out, with a fixed seed so the
# split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit the baseline pipeline on the training rows, then predict the held-out rows.
y_pred = model.fit(X_train, y_train).predict(X_test)

# Test-set metrics for the baseline.
rmse = np.sqrt(mean_squared_error(y_true=y_test, y_pred=y_pred))
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)

print(f"RMSE: {rmse:.3f}")
print(f"MAE: {mae:.3f}")
print(f"R²: {r2:.3f}")

RMSE: 0.580
MAE: 0.481
R²: 0.582


# RandomForestRegressor

## Step 1: Import Models

In [16]:
from sklearn.ensemble import RandomForestRegressor

## Step 2: Build pipeline

In [17]:
# Random-forest pipeline: same preprocessing as the baseline, followed by a
# 200-tree forest with no depth limit, a fixed seed, and a single worker.
rf_model = Pipeline(
    [
        ("preprocessor", preprocessor),
        (
            "regressor",
            RandomForestRegressor(
                n_estimators=200,
                max_depth=None,
                random_state=42,
                n_jobs=1,
            ),
        ),
    ]
)

## Step 3: Train & Evaluate RandomForest

In [18]:
# Fit the random-forest pipeline on the training rows, then predict the
# held-out rows.
y_pred_rf = rf_model.fit(X_train, y_train).predict(X_test)

# Test-set metrics for the random forest.
rmse_rf = np.sqrt(mean_squared_error(y_true=y_test, y_pred=y_pred_rf))
mae_rf = mean_absolute_error(y_true=y_test, y_pred=y_pred_rf)
r2_rf = r2_score(y_true=y_test, y_pred=y_pred_rf)

print("RandomForest Results:")
print(f"RMSE: {rmse_rf:.3f}")
print(f"MAE: {mae_rf:.3f}")
print(f"R²: {r2_rf:.3f}")

RandomForest Results:
RMSE: 0.289
MAE: 0.230
R²: 0.897


## Step 4: Feature Importance (Extra)

In [22]:
# Fitted one-hot encoder, looked up by its transformer name ("cat").
fitted_encoder = rf_model.named_steps["preprocessor"].named_transformers_["cat"]

# Labels listed in the order the transformers were declared:
# numeric columns, one-hot encoded categories, then the embedding columns.
feature_names = [
    *num_features,
    *fitted_encoder.get_feature_names_out(cat_features),
    *embeddings.columns,
]

# Importances reported by the fitted forest, labelled and sorted largest first.
forest = rf_model.named_steps["regressor"]
importances = forest.feature_importances_
feat_imp = pd.Series(importances, index=feature_names).sort_values(ascending=False)

print(feat_imp.head(15))

isApproved_False             0.341367
isApproved_True              0.226131
Days_Until_Deadline          0.048254
emb_0                        0.042057
file_size                    0.036909
emb_4                        0.034806
emb_5                        0.032129
emb_7                        0.032076
emb_8                        0.030739
emb_9                        0.029386
emb_3                        0.029136
emb_6                        0.028057
emb_1                        0.027791
emb_2                        0.023600
Criteria_Completion_Score    0.019851
dtype: float64


# XGBRegressor Model

## Step 1: Import Model

In [23]:
from xgboost import XGBRegressor

## Step 2: Build Pipeline

In [24]:
# XGBoost pipeline: same preprocessing as the other models, followed by a
# gradient-boosted regressor with row and column subsampling, a fixed seed,
# and a single worker.
xgb_model = Pipeline(
    [
        ("preprocessor", preprocessor),
        (
            "regressor",
            XGBRegressor(
                n_estimators=500,
                learning_rate=0.5,
                max_depth=6,
                subsample=0.8,
                colsample_bytree=0.8,
                random_state=42,
                n_jobs=1,
            ),
        ),
    ]
)

## Step 3: Train & Evaluate XGBoost

In [26]:
# Fit the XGBoost pipeline on the training rows, then predict the held-out rows.
y_pred_xgb = xgb_model.fit(X_train, y_train).predict(X_test)

# Test-set metrics for XGBoost.
rmse_xgb = np.sqrt(mean_squared_error(y_true=y_test, y_pred=y_pred_xgb))
mae_xgb = mean_absolute_error(y_true=y_test, y_pred=y_pred_xgb)
r2_xgb = r2_score(y_true=y_test, y_pred=y_pred_xgb)

print("XGBoost Results:")
print(f"RMSE: {rmse_xgb:.3f}")
print(f"MAE: {mae_xgb:.3f}")
print(f"R²: {r2_xgb:.3f}")

XGBoost Results:
RMSE: 0.235
MAE: 0.076
R²: 0.931


## Comparison between models

In [28]:
# Side-by-side test-set metrics for the baseline, random forest and XGBoost.
# Every row uses the same "MAE:" label and the model names are padded to a
# common width so the columns line up.
print("\nModel Comparison:")
print(f"Baseline      -> RMSE: {rmse:.3f}, MAE: {mae:.3f}, R²: {r2:.3f}")
print(f"Random Forest -> RMSE: {rmse_rf:.3f}, MAE: {mae_rf:.3f}, R²: {r2_rf:.3f}")
print(f"XGBoost       -> RMSE: {rmse_xgb:.3f}, MAE: {mae_xgb:.3f}, R²: {r2_xgb:.3f}")



Model Comparison:
Baseline -> RMSE: 0.580, MAE 0.481, R²: 0.582
Random Forest -> RMSE: 0.289, MAE: 0.230, R²: 0.897
XGBoost       -> RMSE: 0.235, MAE: 0.076, R²: 0.931


# Fine-Tune XGBoost

## Step 1: Import

In [29]:
from sklearn.model_selection import GridSearchCV

## Step 2: Define Hyperparameter Grid

In [30]:
# Candidate values for each tuned hyperparameter. The "regressor__" prefix
# addresses the pipeline step named "regressor".
param_grid = {
    f"regressor__{name}": candidates
    for name, candidates in (
        ("n_estimators", [200, 500, 800]),
        ("max_depth", [4, 6, 8]),
        ("learning_rate", [0.01, 0.05, 0.1]),
        ("subsample", [0.7, 0.8, 1.0]),
        ("colsample_bytree", [0.7, 0.8, 1.0]),
    )
}

## Step 3: Wrap in GridSearchCV

In [31]:
# Exhaustive grid search over param_grid with 5-fold cross-validation,
# scored by negative mean squared error.
# 3**5 = 243 parameter combinations x 5 folds = 1215 fits, run on a single
# worker (n_jobs=1); verbose=2 logs one line per fit.
grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid,
    scoring="neg_mean_squared_error",
    cv=5,
    n_jobs=1,
    verbose=2,
)

grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 243 candidates, totalling 1215 fits
[CV] END regressor__colsample_bytree=0.7, regressor__learning_rate=0.01, regressor__max_depth=4, regressor__n_estimators=200, regressor__subsample=0.7; total time=   0.2s
[CV] END regressor__colsample_bytree=0.7, regressor__learning_rate=0.01, regressor__max_depth=4, regressor__n_estimators=200, regressor__subsample=0.7; total time=   0.1s
[CV] END regressor__colsample_bytree=0.7, regressor__learning_rate=0.01, regressor__max_depth=4, regressor__n_estimators=200, regressor__subsample=0.7; total time=   0.2s
[CV] END regressor__colsample_bytree=0.7, regressor__learning_rate=0.01, regressor__max_depth=4, regressor__n_estimators=200, regressor__subsample=0.7; total time=   0.1s
[CV] END regressor__colsample_bytree=0.7, regressor__learning_rate=0.01, regressor__max_depth=4, regressor__n_estimators=200, regressor__subsample=0.7; total time=   0.1s
[CV] END regressor__colsample_bytree=0.7, regressor__learning_rate=0.01, regresso

0,1,2
,estimator,"Pipeline(step...=None, ...))])"
,param_grid,"{'regressor__colsample_bytree': [0.7, 0.8, ...], 'regressor__learning_rate': [0.01, 0.05, ...], 'regressor__max_depth': [4, 6, ...], 'regressor__n_estimators': [200, 500, ...], ...}"
,scoring,'neg_mean_squared_error'
,n_jobs,1
,refit,True
,cv,5
,verbose,2
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,transformers,"[('num', ...), ('cat', ...), ...]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,objective,'reg:squarederror'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,0.8
,device,
,early_stopping_rounds,
,enable_categorical,False


In [33]:
# Parameter combination selected by the grid search.
print("Best Parameters:", grid_search.best_params_)

# Pipeline corresponding to the selected parameter combination.
best_xgb = grid_search.best_estimator_

# Score the tuned pipeline on the held-out test rows.
y_pred_best = best_xgb.predict(X_test)

rmse_best = np.sqrt(mean_squared_error(y_true=y_test, y_pred=y_pred_best))
mae_best = mean_absolute_error(y_true=y_test, y_pred=y_pred_best)
r2_best = r2_score(y_true=y_test, y_pred=y_pred_best)

Best Parameters: {'regressor__colsample_bytree': 0.8, 'regressor__learning_rate': 0.05, 'regressor__max_depth': 8, 'regressor__n_estimators': 800, 'regressor__subsample': 0.7}


In [34]:
# Report the tuned model's test-set metrics, rounded to three decimals.
print("Tuned XGBoost Results:")
print(f"RMSE: {rmse_best:.3f}")
print(f"MAE: {mae_best:.3f}")
print(f"R²: {r2_best:.3f}")

Tuned XGBoost Results:
RMSE: 0.174
MAE: 0.055
R²: 0.962
