In [None]:
import os
import pickle  # pickle → saves the trained model for later use
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer  # TfidfVectorizer → converts text into numeric features
# ML models → Logistic Regression, Decision Tree, Random Forest
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import StratifiedKFold, cross_validate  # StratifiedKFold → cross-validation folds that preserve class proportions
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
import numpy as np

In [None]:
def _build_models(random_state=42):
    """Return the candidate classifiers keyed by display name.

    Dict insertion order is the evaluation order, which also decides ties
    (the first model to reach the best score is kept).
    """
    return {
        'Logistic_Regression': LogisticRegression(
            max_iter=1000,
            class_weight='balanced',
            random_state=random_state
        ),
        'Decision_Tree': DecisionTreeClassifier(
            max_depth=25,
            random_state=random_state
        ),
        'Random_Forest': RandomForestClassifier(
            n_estimators=200,
            max_depth=25,
            random_state=random_state
        )
    }


def _build_pipeline(model):
    """Chain a TF-IDF vectorizer with ``model`` so vectorization is fitted per training fold."""
    return Pipeline([
        ('tfidf', TfidfVectorizer(
            ngram_range=(1, 2),
            max_features=5000,
            stop_words='english'
        )),
        ('classifier', model)
    ])


def train_and_evaluate(input_file, models_dir, n_splits=5, random_state=42):
    """Cross-validate the candidate text classifiers and persist the best one.

    Reads ``input_file`` as CSV, uses the ``cleaned_text`` column as input text
    and ``risk_level`` as the label, scores each candidate with stratified
    k-fold cross-validation, refits the candidate with the highest mean
    weighted F1 on all labelled rows and pickles it to
    ``<models_dir>/best_model.pkl``.

    Args:
        input_file: Path of the CSV file to load.
        models_dir: Directory for the pickled pipeline; created if missing.
        n_splits: Number of stratified folds (default 5).
        random_state: Seed shared by the fold shuffling and the classifiers
            (default 42).

    Returns:
        dict mapping model name to ``{"CV Accuracy": float,
        "CV F1 (Weighted)": float}``, both rounded to 4 decimals.

    Raises:
        KeyError: if ``cleaned_text`` or ``risk_level`` is not a column.
        ValueError: if no row has a ``risk_level`` value.
        RuntimeError: if no candidate produced a valid (non-NaN) F1 score.
    """
    print(f"Loading data from {input_file}...")
    df = pd.read_csv(input_file)

    # Validate up front so bad input fails with a clear message, before any
    # directory is created or any expensive training starts.
    missing_columns = [
        column for column in ('cleaned_text', 'risk_level')
        if column not in df.columns
    ]
    if missing_columns:
        raise KeyError(
            f"{input_file} is missing required column(s): {', '.join(missing_columns)}"
        )

    # Rows without a label cannot be used for stratification or training.
    labelled = df.dropna(subset=['risk_level'])
    dropped = len(df) - len(labelled)
    if dropped:
        print(f"Dropping {dropped} row(s) with missing risk_level.")
    if labelled.empty:
        raise ValueError(f"{input_file} contains no rows with a risk_level value.")

    X = labelled['cleaned_text'].fillna('')  # missing text becomes an empty document
    y = labelled['risk_level']

    os.makedirs(models_dir, exist_ok=True)  # make folder if it doesn't exist

    # Stratified k-fold keeps the class proportions of y in every fold.
    cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    # Start below any reachable score so a model with F1 == 0.0 can still win.
    best_score = float('-inf')
    best_name = ""
    best_pipeline = None

    results = {}

    for name, model in _build_models(random_state).items():
        print(f"\n🔹 Evaluating {name} with {n_splits}-Fold Cross Validation...")

        # TF-IDF lives inside the pipeline, so in every fold it is fitted on
        # the training part only; fitting it on the whole dataset beforehand
        # would leak test-fold vocabulary into training.
        pipeline = _build_pipeline(model)

        scores = cross_validate(
            pipeline,
            X,
            y,
            cv=cv,
            scoring=['accuracy', 'f1_weighted'],
            return_train_score=False
        )

        # float() keeps numpy scalar types out of the returned dict.
        mean_acc = float(np.mean(scores['test_accuracy']))
        mean_f1 = float(np.mean(scores['test_f1_weighted']))

        results[name] = {
            "CV Accuracy": round(mean_acc, 4),
            "CV F1 (Weighted)": round(mean_f1, 4)
        }

        print(f"{name} → Accuracy: {mean_acc:.4f}")
        print(f"{name} → F1 Score: {mean_f1:.4f}")

        # A NaN mean (cross_validate's default score for a failed fit) never
        # compares greater, so such a model is reported but cannot be chosen.
        if np.isnan(mean_f1):
            print(f"{name} produced no valid F1 score and is skipped for model selection.")
            continue

        # Choose best model based on weighted F1; strict '>' keeps the first
        # model on ties.
        if mean_f1 > best_score:
            best_score = mean_f1
            best_name = name
            best_pipeline = pipeline

    if best_pipeline is None:
        raise RuntimeError(
            "No model produced a valid cross-validation F1 score; nothing to save."
        )

    print(f"\n🏆 Best Model: {best_name} (F1: {best_score:.4f})")

    # cross_validate works on clones, so the chosen pipeline is still
    # unfitted here; train it on the full labelled dataset before saving.
    best_pipeline.fit(X, y)

    # Save best model
    with open(os.path.join(models_dir, 'best_model.pkl'), 'wb') as f:
        pickle.dump(best_pipeline, f)

    print("✅ Best model saved successfully.")

    return results


if __name__ == "__main__":
    # Locate the project root: the parent of the directory containing this
    # file when run as a script. In a notebook `__file__` is undefined, so
    # the parent of the current working directory is used instead.
    try:
        base_dir = os.path.abspath(__file__)
    except NameError:
        base_dir = os.getcwd()
    else:
        base_dir = os.path.dirname(base_dir)
    base_dir = os.path.dirname(base_dir)

    models_dir = os.path.join(base_dir, "models")
    input_path = os.path.join(base_dir, "data", "processed", "processed_contracts.csv")

    # Run the full evaluation and show the per-model cross-validation metrics.
    results = train_and_evaluate(input_file=input_path, models_dir=models_dir)
    print("\nFinal Results:", results)

Loading data from /Users/ash/CascadeProjects/projects..../contrack_risk_analyser/data/processed/processed_contracts.csv...

üîπ Evaluating Logistic_Regression with 5-Fold Cross Validation...
Logistic_Regression ‚Üí Accuracy: 0.8856
Logistic_Regression ‚Üí F1 Score: 0.8859

üîπ Evaluating Decision_Tree with 5-Fold Cross Validation...
Decision_Tree ‚Üí Accuracy: 0.8117
Decision_Tree ‚Üí F1 Score: 0.8084

üîπ Evaluating Random_Forest with 5-Fold Cross Validation...
Random_Forest ‚Üí Accuracy: 0.8138
Random_Forest ‚Üí F1 Score: 0.8058

üèÜ Best Model: Logistic_Regression (F1: 0.8859)
‚úÖ Best model saved successfully.

Final Results: {'Logistic_Regression': {'CV Accuracy': np.float64(0.8856), 'CV F1 (Weighted)': np.float64(0.8859)}, 'Decision_Tree': {'CV Accuracy': np.float64(0.8117), 'CV F1 (Weighted)': np.float64(0.8084)}, 'Random_Forest': {'CV Accuracy': np.float64(0.8138), 'CV F1 (Weighted)': np.float64(0.8058)}}
