<a href="https://colab.research.google.com/github/builtbypyro/builtbypyro/blob/main/Untitled0.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#@title Step 1 -- press play if you are on phone{ display-mode: "form" }
%%html
<!-- Looping player for a silent audio clip; presumably keeps a mobile browser tab from being suspended while it plays (confirm). -->
<audio autoplay="" src="https://raw.githubusercontent.com/KoboldAI/KoboldAI-Client/main/colab/silence.m4a" loop controls></audio>

In [None]:
# Install the third-party packages that script.py imports.
# `%pip` (rather than `!pip`) installs into the environment of the running kernel.
%pip install -q optuna xgboost lightgbm transformers

Collecting optuna
  Downloading optuna-4.2.1-py3-none-any.whl.metadata (17 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.15.1-py3-none-any.whl.metadata (7.2 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.3.9-py3-none-any.whl.metadata (2.9 kB)
Downloading optuna-4.2.1-py3-none-any.whl (383 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m383.6/383.6 kB[0m [31m25.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.15.1-py3-none-any.whl (231 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m231.8/231.8 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Downloading Mako-1.3.9-py3-none-any.whl (78 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.5/78.5 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: Ma

In [None]:
%%writefile script.py
import os
os.environ["TORCHDYNAMO_DISABLE"] = "1"

import json
import os
import re
import warnings
import joblib
import numpy as np
import pandas as pd
import torch
import optuna

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import StackingClassifier

import xgboost as xgb
import lightgbm as lgb

from transformers import DistilBertTokenizer, DistilBertModel

warnings.filterwarnings("ignore")


def normalize_schools(schools):
    """Coerce the ``Schools`` field of an entry into a list of school records.

    * A mapping ``{university_name: details}`` becomes a list of dicts where
      the name is stored under ``"University"`` and the details are merged in
      afterwards (so a ``"University"`` key inside the details wins).
    * A list is handed back unchanged (the very same object).
    * Anything else yields an empty list.
    """
    if isinstance(schools, list):
        return schools
    if not isinstance(schools, dict):
        return []

    records = []
    for name, details in schools.items():
        records.append({"University": name, **details})
    return records


def safe_extract(data, path, default=np.nan):
    """Follow ``path`` (a sequence of keys) down through nested dicts.

    Returns the value reached after the last key.  ``default`` is returned as
    soon as a non-dict is encountered while keys remain, and it is also what a
    missing key resolves to.  An empty ``path`` returns ``data`` itself.
    """
    node = data
    for step in path:
        if not isinstance(node, dict):
            return default
        node = node.get(step, default)
    return node


def _to_float(value):
    """Best-effort float conversion; returns NaN when ``value`` cannot be parsed."""
    try:
        return float(value)
    except (TypeError, ValueError):
        return np.nan


def extract_features(entry):
    """Flatten one applicant record into a dict of model features.

    Args:
        entry: dict that may contain the keys ``"Schools"``,
            ``"Demographics"``, ``"Academics"`` and ``"Extracurriculars"``.

    Returns:
        dict with the keys ``num_applications``, ``gender``, ``race``,
        ``school_type``, ``act``, ``sat``, ``unweighted_gpa``,
        ``weighted_gpa``, ``coursework``, ``extracurriculars`` and
        ``num_extracurriculars``.  Numeric values that are missing or
        unparseable are ``np.nan``.
    """
    features = {}

    schools = normalize_schools(entry.get("Schools", []))
    features["num_applications"] = len(schools)

    # A section that is missing, null or not a dict is treated as empty
    # instead of crashing on `.get`.
    demo = entry.get("Demographics")
    if not isinstance(demo, dict):
        demo = {}
    features.update({
        "gender": str(demo.get("Gender", "missing")).lower(),
        "race": str(demo.get("Race/Ethnicity", "missing")).lower(),
        "school_type": str(demo.get("Type of School", "missing")).lower()
    })

    acad = entry.get("Academics")
    if not isinstance(acad, dict):
        acad = {}

    for test in ["ACT", "SAT"]:
        raw_score = acad.get(test)
        # Use None (not NaN) as the "missing" marker: NaN is truthy, so an
        # `or` fallback on it never fires and flat scores such as
        # {"ACT": 34} would be silently discarded.
        composite = safe_extract(acad, [test, "Composite"], default=None)
        if composite is None and not isinstance(raw_score, dict):
            composite = raw_score
        # Keep only digits and dots so values like "34 " or "1520pts" parse.
        composite_num = re.sub(r"[^\d.]", "", str(composite))
        features[test.lower()] = _to_float(composite_num) if composite_num else np.nan

    gpa_data = acad.get("UW/W GPA", {})
    if isinstance(gpa_data, dict):
        features["unweighted_gpa"] = _to_float(gpa_data.get("UW", np.nan))
        features["weighted_gpa"] = _to_float(gpa_data.get("W", np.nan))
    else:
        # A bare scalar is stored as the weighted GPA; falsy values and
        # unparseable markers such as "N/A" become NaN.
        features["weighted_gpa"] = _to_float(gpa_data) if gpa_data else np.nan
        features["unweighted_gpa"] = np.nan

    coursework = acad.get("Coursework", "")
    if isinstance(coursework, list):
        # str() each item so a non-string entry cannot break the join.
        features["coursework"] = " ".join(str(course) for course in coursework)
    else:
        features["coursework"] = str(coursework)

    ecs = entry.get("Extracurriculars", [])
    if not isinstance(ecs, list):
        ecs = [str(ecs)] if ecs not in [None, "N/A", ""] else []
    features["extracurriculars"] = " | ".join([str(e).strip() for e in ecs if e]) or "none"
    features["num_extracurriculars"] = len(ecs)

    return features


class BertVectorizer(BaseEstimator, TransformerMixin):
    """Scikit-learn transformer that turns texts into DistilBERT embeddings.

    Each text is tokenized (truncated to ``max_length`` tokens) and passed
    through the model in batches of ``batch_size``.  The last-layer hidden
    state at sequence position 0 is used as the embedding of the text.

    The tokenizer and model are loaded lazily on the first ``transform`` call.
    Embeddings are memoised per distinct text in ``self.cache``; the cache is
    never evicted, so it grows with the number of distinct texts seen.

    Args:
        model_name: Hugging Face model identifier passed to ``from_pretrained``.
        max_length: maximum number of tokens kept per text.
        batch_size: number of texts sent through the model at once.
    """

    def __init__(self, model_name='distilbert-base-uncased', max_length=128, batch_size=8):
        self.model_name = model_name
        self.max_length = max_length
        self.batch_size = batch_size
        self.tokenizer = None
        self.model = None
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.cache = {}  # text -> 1-D embedding row, filled by transform()

    def _lazy_init(self):
        """Load the tokenizer and model on first use and put the model in eval mode."""
        if self.tokenizer is None or self.model is None:
            self.tokenizer = DistilBertTokenizer.from_pretrained(self.model_name)
            self.model = DistilBertModel.from_pretrained(self.model_name)
            self.model.to(self.device)
            self.model.eval()

    def fit(self, X, y=None):
        """No-op: the encoder is pretrained and nothing is learned from ``X``."""
        return self

    def transform(self, X, y=None):
        """Return one embedding row per element of ``X``, in input order.

        Args:
            X: iterable of texts (anything exposing ``tolist()`` or iterable);
                every element is converted with ``str``.
            y: ignored.

        Returns:
            ``numpy.ndarray`` with one row per input text and
            ``model.config.hidden_size`` columns; zero rows for empty input.
        """
        self._lazy_init()

        if hasattr(X, 'tolist'):
            X = X.tolist()
        else:
            X = list(X)
        texts = [str(text) for text in X]

        # np.vstack([]) raises, so empty input gets an explicit empty matrix.
        if not texts:
            return np.empty((0, self.model.config.hidden_size), dtype=np.float32)

        # Embed each distinct uncached text exactly once.  dict.fromkeys
        # de-duplicates while preserving first-seen order; without it a text
        # repeated inside one call would be encoded once per occurrence.
        new_texts = [text for text in dict.fromkeys(texts) if text not in self.cache]

        for start in range(0, len(new_texts), self.batch_size):
            batch_texts = new_texts[start: start + self.batch_size]
            encoded = self.tokenizer(
                batch_texts,
                padding=True,
                truncation=True,
                max_length=self.max_length,
                return_tensors='pt'
            )
            encoded = {k: v.to(self.device) for k, v in encoded.items()}
            with torch.no_grad():
                outputs = self.model(**encoded)
            # Position 0 of the last hidden layer is taken as the text embedding.
            batch_embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy()
            for text, embedding in zip(batch_texts, batch_embeddings):
                self.cache[text] = embedding

        # Rebuild the output in the caller's original order (duplicates included).
        return np.vstack([self.cache[text] for text in texts])


def train_model(training_data, n_trials=2, model_path='admission_model_advanced.pkl'):
    """Tune, fit and persist the stacked admission model.

    Every (applicant, school) pair in ``training_data`` becomes one training
    row labelled 1 when the school's ``"Status"`` contains "accepted"
    (case-insensitive) and 0 otherwise.  Optuna searches XGBoost/LightGBM
    hyperparameters by 3-fold stratified CV ROC AUC; the best configuration is
    refit on all rows and written to ``model_path`` with joblib.

    Args:
        training_data: iterable of applicant dicts as accepted by
            ``extract_features``.
        n_trials: number of Optuna trials to run.
        model_path: where the fitted pipeline is saved.

    Returns:
        ``(best_pipeline, study)``: the fitted sklearn ``Pipeline`` and the
        Optuna study.

    Raises:
        ValueError: if no rows can be built or only one class is present.
    """
    app_data = []
    for entry in training_data:
        base_features = extract_features(entry)
        for school in normalize_schools(entry.get("Schools", [])):
            if not isinstance(school, dict):
                continue  # malformed school record: nothing to label
            features = base_features.copy()
            features.update({
                # `or` + str() guard against null / non-string values.
                "university": str(school.get("University") or "unknown").lower(),
                "application_type": str(school.get("ED/EA/RD") or "RD").lower(),
                "decision": 1 if "accepted" in str(school.get("Status", "")).lower() else 0
            })
            app_data.append(features)

    df = pd.DataFrame(app_data)
    if df.empty:
        raise ValueError("No training data available")

    X = df.drop(columns=['decision'])
    y = df['decision']
    if y.nunique() < 2:
        # ROC AUC and stratified CV are undefined with a single class.
        raise ValueError("Training data must contain both accepted and non-accepted applications")

    numeric_features = ['act', 'sat', 'weighted_gpa', 'unweighted_gpa', 'num_applications', 'num_extracurriculars']
    # 'university' must be listed here: the ColumnTransformer uses
    # remainder='drop', so an unlisted column never reaches the classifier and
    # predictions could not differ between schools.
    categorical_features = ['gender', 'race', 'school_type', 'application_type', 'university']
    text_features_coursework = 'coursework'
    text_features_ec = 'extracurriculars'
    random_seed = 42

    def make_preprocessor():
        """Build a fresh, unfitted feature preprocessor."""
        numeric_pipeline = Pipeline([
            ('imputer', SimpleImputer(strategy='median')),
            ('scaler', StandardScaler())
        ])
        categorical_pipeline = Pipeline([
            ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
            ('encoder', OneHotEncoder(handle_unknown='ignore'))
        ])
        bert_pipeline_coursework = Pipeline([
            ('bert', BertVectorizer(max_length=128, batch_size=8))
        ])
        bert_pipeline_ec = Pipeline([
            ('bert', BertVectorizer(max_length=128, batch_size=8))
        ])
        return ColumnTransformer([
            ('num', numeric_pipeline, numeric_features),
            ('cat', categorical_pipeline, categorical_features),
            ('course', bert_pipeline_coursework, text_features_coursework),
            ('ec', bert_pipeline_ec, text_features_ec)
        ], remainder='drop')

    def build_pipeline(params, n_splits):
        """Build the preprocessing + stacking pipeline for one hyperparameter set."""
        # `use_label_encoder` is omitted: it is deprecated and ignored by
        # current XGBoost releases.
        xgb_estimator = xgb.XGBClassifier(
            eval_metric='logloss',
            n_estimators=params['xgb_n_estimators'],
            max_depth=params['xgb_max_depth'],
            learning_rate=params['xgb_learning_rate'],
            random_state=random_seed
        )
        lgb_estimator = lgb.LGBMClassifier(
            n_estimators=params['lgb_n_estimators'],
            max_depth=params['lgb_max_depth'],
            learning_rate=params['lgb_learning_rate'],
            random_state=random_seed,
            verbose=-1,            # Suppress warnings
            min_split_gain=0.1     # Increase minimum gain to reduce unnecessary splits
        )
        stacking_model = StackingClassifier(
            estimators=[('xgb', xgb_estimator), ('lgb', lgb_estimator)],
            final_estimator=LogisticRegression(max_iter=1000, random_state=random_seed),
            cv=StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_seed),
            n_jobs=-1
        )
        return Pipeline([
            ('preprocessor', make_preprocessor()),
            ('classifier', stacking_model)
        ])

    def objective(trial):
        """Optuna objective: mean 3-fold CV ROC AUC for the sampled parameters."""
        params = {
            'xgb_n_estimators': trial.suggest_int('xgb_n_estimators', 100, 300),
            'xgb_max_depth': trial.suggest_int('xgb_max_depth', 3, 8),
            'xgb_learning_rate': trial.suggest_float('xgb_learning_rate', 0.01, 0.2, log=True),
            'lgb_n_estimators': trial.suggest_int('lgb_n_estimators', 100, 300),
            'lgb_max_depth': trial.suggest_int('lgb_max_depth', 3, 8),
            'lgb_learning_rate': trial.suggest_float('lgb_learning_rate', 0.01, 0.2, log=True),
        }
        pipeline = build_pipeline(params, n_splits=3)
        # NOTE(review): rows of the same applicant can land in different folds;
        # consider a grouped split if that leakage matters.
        cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=random_seed)
        scores = cross_val_score(pipeline, X, y, cv=cv, scoring='roc_auc', n_jobs=-1)
        mean_score = float(np.mean(scores))
        trial.set_user_attr("mean_cv_score", mean_score)
        return mean_score

    print("Starting hyperparameter optimization using Optuna...")
    # Seed the sampler so the search is reproducible like the rest of the run.
    study = optuna.create_study(
        direction='maximize',
        sampler=optuna.samplers.TPESampler(seed=random_seed)
    )
    study.optimize(objective, n_trials=n_trials, n_jobs=1)

    print("Best trial:")
    print("  Value (mean CV ROC AUC):", study.best_trial.value)
    print("  Params:", study.best_trial.params)

    best_params = study.best_trial.params
    # The final model uses 5 internal stacking folds (3 were used while tuning).
    best_pipeline = build_pipeline(best_params, n_splits=5)

    print("Training the final model on the full dataset...")
    best_pipeline.fit(X, y)

    joblib.dump(best_pipeline, model_path)
    print(f"Advanced model trained and saved as '{model_path}'")

    return best_pipeline, study


def predict_new_application(new_app_file, model_path='admission_model_advanced.pkl'):
    """Score every school listed in a new application with the saved model.

    Args:
        new_app_file: path to a JSON file holding one applicant record.
        model_path: path of the pipeline saved by ``train_model``.

    Returns:
        dict keyed by university name.  Each value is either
        ``{'probability': float, 'recommendation': 'Safety'|'Target'|'Reach'}``
        or ``{'error': str}`` when scoring that school failed.

    Raises:
        RuntimeError: if the saved model file does not exist.
    """
    try:
        # SECURITY: joblib.load unpickles the file and can execute arbitrary
        # code; only load model files from a trusted source.
        model = joblib.load(model_path)
    except FileNotFoundError as exc:
        raise RuntimeError("Advanced model not found. Please train the model first with training data.") from exc

    with open(new_app_file, encoding="utf-8") as f:
        new_data = json.load(f)

    results = {}
    base_features = extract_features(new_data)
    schools = normalize_schools(new_data.get("Schools", []))

    for school in schools:
        # Resolve the result key up front so the error path can never fail on
        # a missing "University" key or a non-dict record.
        if isinstance(school, dict):
            name = str(school.get("University") or "unknown")
        else:
            name = str(school)
        try:
            if not isinstance(school, dict):
                raise TypeError("school entry must be a JSON object")
            features = base_features.copy()
            features.update({
                "university": name.lower(),
                "application_type": str(school.get("ED/EA/RD") or "RD").lower()
            })
            df_new = pd.DataFrame([features])
            # Plain float so results print and serialise without np.float64(...).
            proba = float(model.predict_proba(df_new)[0][1])
            recommendation = (
                'Safety' if proba >= 0.7
                else 'Reach' if proba <= 0.3
                else 'Target'
            )
            results[name] = {
                'probability': round(proba, 3),
                'recommendation': recommendation
            }
        except Exception as e:
            # Best effort per school: record the failure and keep going.
            results[name] = {'error': str(e)}

    return results


# Command-line entry point:
#   python script.py                    -> train on applications.json
#   python script.py <application.json> -> predict for that application
if __name__ == "__main__":
    import sys

    argc = len(sys.argv)
    if argc not in (1, 2):
        # Wrong number of arguments: show the usage text.
        for usage_line in (
            "Usage:",
            "  Training: python script.py",
            "  Prediction: python script.py new_application.json",
        ):
            print(usage_line)
    elif argc == 2:
        new_app_file = sys.argv[1]
        print(f"Making predictions for {new_app_file}...")
        results = predict_new_application(new_app_file)
        print("\nPredictions:")
        for school, pred in results.items():
            print(f"{school}: {pred}")
    else:
        print("Training advanced model with provided data...")
        if not os.path.exists("applications.json"):
            raise FileNotFoundError("Training data file 'applications.json' not found.")
        with open("applications.json") as f:
            training_data = json.load(f)
        final_model, study = train_model(training_data, n_trials=3)


Overwriting script.py


In [None]:
# Train: with no arguments script.py loads applications.json, tunes and fits the model, and saves admission_model_advanced.pkl.
!python script.py

2025-03-19 21:54:27.107124: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1742421267.127250    1307 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1742421267.133378    1307 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-03-19 21:54:27.154302: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Training advanced model with provided data...
Starting hyperparameter optimization using Optuna...
[32m[I 2025-03-19

In [None]:
# A bare `!python` starts an interactive interpreter that waits for input and blocks "Run all";
# print the interpreter version instead.
!python --version

In [None]:
%%writefile new_application.json

{
  "Schools": [
    {
      "University": "Harvard University",
      "ED/EA/RD": "RD"
    },
    {
      "University": "Stanford University",
      "ED/EA/RD": "RD"
    }
  ],
  "Demographics": {
    "Gender": "Female",
    "Race/Ethnicity": "Asian",
    "Type of School": "Public"
  },
  "Academics": {
    "ACT": {
      "Composite": "34"
    },
    "SAT": {
      "Composite": "1520"
    },
    "UW/W GPA": {
      "UW": "3.95",
      "W": "4.5"
    },
    "Coursework": [
      "AP Calculus BC",
      "AP Physics C",
      "AP English Literature",
      "AP U.S. History",
      "AP Computer Science A"
    ]
  },
  "Extracurriculars": [
    "President of Science Club",
    "Varsity Tennis Team Captain",
    "Volunteer at Local Hospital",
    "Piano - State Level Competitions",
    "Internship at Tech Startup"
  ]
}


Writing new_application.json


In [None]:
# Predict: with one argument script.py loads the saved model and scores every school listed in new_application.json.
!python script.py new_application.json

2025-03-19 23:20:14.511478: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1742426414.531473   22356 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1742426414.537601   22356 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-03-19 23:20:14.557767: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Making predictions for new_application.json...

Predictions:
Harvard University: {'probability': np.float64(0.209), '

In [None]:
# Mount Google Drive at /content/drive (only works inside Google Colab).
# NOTE(review): no cell visible here reads from or writes to Drive -- confirm this mount is still needed.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
