In [3]:
import numpy as np
import pandas as pd
import joblib
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, f1_score, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
import warnings

warnings.filterwarnings('ignore')
sns.set_theme()


In [10]:
DATA_PATH = 'D:/ML_Project/Project1/sample_train_eval/PROJECT_CLUSTERED_SEVERITY_DATA.csv'  # Update for your chosen file
df = pd.read_csv(DATA_PATH)
print(df.shape)
print(df.columns)


(249048, 28)
Index(['primaryid', 'age', 'fda_dt_parsed', 'is_severe_outcome', 'drug_count',
       'indication_count', 'all_reaction_pts', 'reaction_count',
       'is_ineffective', 'is_failure', 'rept_cod_EXP', 'rept_cod_PER', 'sex_M',
       'sex_UNK', 'occp_cod_LW', 'occp_cod_MD', 'occp_cod_OT', 'occp_cod_PH',
       'occp_cod_UNK', 'reporter_country_COUNTRY NOT SPECIFIED',
       'reporter_country_GB', 'reporter_country_JP', 'reporter_country_OTHER',
       'reporter_country_US', 'severity_category', 'severity_weight',
       'failure_phenotype', 'failure_phenotype_label'],
      dtype='object')


In [15]:
drop_cols = ['primaryid']  # Add others if you do not want them in modeling
drop_cols = [c for c in drop_cols if c in df.columns]
df = df.drop(columns=drop_cols)


In [18]:

label_cols = ['failure_phenotype_label_Cluster_1', 'failure_phenotype_label_Cluster_2']

# Create a single multiclass target column from one-hot cluster indicators
df['target_label'] = df[label_cols].idxmax(axis=1)  # Gets the column name with maximum (assumes one-hot encoding)
df['target_label'] = df['target_label'].str.replace('failure_phenotype_label_', '')  # Extract only cluster number

# Input features: Drop the one-hot label columns and new target label
X = df.drop(columns=label_cols + ['target_label'])
y = df['target_label']

# Encode target as integers for ML models
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y_encoded = le.fit_transform(y)


In [19]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded)
print(X_train.shape, X_test.shape)


(199238, 26) (49810, 26)


In [20]:
from sklearn.preprocessing import StandardScaler

numeric_cols = [c for c in X_train.columns if pd.api.types.is_numeric_dtype(X_train[c])]
scaler = StandardScaler()
X_train_scaled = X_train.copy()
X_test_scaled = X_test.copy()
if numeric_cols:
    X_train_scaled[numeric_cols] = scaler.fit_transform(X_train[numeric_cols])
    X_test_scaled[numeric_cols] = scaler.transform(X_test[numeric_cols])


In [21]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

logreg = LogisticRegression(max_iter=1000, class_weight='balanced', n_jobs=-1, random_state=42)
logreg.fit(X_train_scaled, y_train)
y_pred_logreg = logreg.predict(X_test_scaled)
print("Logistic Regression Classification Report:")
print(classification_report(y_test, y_pred_logreg))


Logistic Regression Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     45856
           1       1.00      1.00      1.00      3954

    accuracy                           1.00     49810
   macro avg       1.00      1.00      1.00     49810
weighted avg       1.00      1.00      1.00     49810



In [22]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators=200, class_weight='balanced', n_jobs=-1, random_state=42)
rf.fit(X_train_scaled, y_train)
y_pred_rf = rf.predict(X_test_scaled)
print("Random Forest Classification Report:")
print(classification_report(y_test, y_pred_rf))


Random Forest Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     45856
           1       1.00      1.00      1.00      3954

    accuracy                           1.00     49810
   macro avg       1.00      1.00      1.00     49810
weighted avg       1.00      1.00      1.00     49810

