In [1]:
# Cell 1 - Imports & configuration
import os
import joblib
import pandas as pd
import numpy as np

# Paths (edit if needed)
BASE_PATH = r"D:\ML_Project\data\processed"
os.makedirs(BASE_PATH, exist_ok=True)

FINAL_MASTER = os.path.join(BASE_PATH, "FINAL_MASTER_DATASET.csv")

# Outputs
SAMPLE_TRAIN_EVAL = os.path.join(BASE_PATH, "sample_train_eval.csv")
SAMPLE_REALWORLD_TEST = os.path.join(BASE_PATH, "sample_realworld_test.csv")

# Auxiliary artifact (list of final feature columns used for training/classification)
FEATURE_LIST_PATH = os.path.join(r"D:\ML_Project\models", "training_features.joblib")
os.makedirs(os.path.dirname(FEATURE_LIST_PATH), exist_ok=True)

# Sampling / options
TRAIN_SAMPLE_FRAC = 0.25  # 25% → clustering + evaluation
TEST_SAMPLE_FRAC = 0.05  # 5% → real-world prediction test
RANDOM_STATE = 42

# Behavior flags
OVERWRITE_EXISTING_SAMPLES = True  # if False and files exist, will not overwrite
SAVE_FEATURE_LIST = True  # save final feature column list for later inference
INPLACE_MODIFY_MASTER = (
    False  # If True: will write back the cleaned MASTER file (use with caution)
)

print("Config:")
print(" FINAL_MASTER =", FINAL_MASTER)
print(" OUTPUTS ->", SAMPLE_TRAIN_EVAL, SAMPLE_REALWORLD_TEST)
print(" FEATURE_LIST ->", FEATURE_LIST_PATH)

Config:
 FINAL_MASTER = D:\ML_Project\data\processed\FINAL_MASTER_DATASET.csv
 OUTPUTS -> D:\ML_Project\data\processed\sample_train_eval.csv D:\ML_Project\data\processed\sample_realworld_test.csv
 FEATURE_LIST -> D:\ML_Project\models\training_features.joblib


In [2]:
# Cell 2 - Load FINAL_MASTER safely
if not os.path.exists(FINAL_MASTER):
    raise FileNotFoundError(f"FINAL_MASTER not found at {FINAL_MASTER}")

print("Loading FINAL_MASTER_DATASET (this may take some memory)...")
df = pd.read_csv(FINAL_MASTER, low_memory=False)
print(f"Loaded: {len(df):,} rows × {len(df.columns)} columns")

# Quick glance (first 2 rows) for verification
display(df.head(2))

Loading FINAL_MASTER_DATASET (this may take some memory)...
Loaded: 2,818,644 rows × 34 columns


Unnamed: 0,primaryid,caseid,caseversion,i_f_code,event_dt,mfr_dt,init_fda_dt,fda_dt,rept_cod,auth_num,...,occr_country,fda_dt_parsed,is_severe_outcome,drug_count,indication_count,all_reaction_pts,reaction_count,is_ineffective,therapy_duration_days,is_failure
0,33538582,3353858,2,F,20081126.0,20160225.0,19990716,20160307,EXP,,...,FR,2016-03-07,1,6,6,ESCHERICHIA SEPSIS ESCHERICHIA SEPSIS,2,0,,1
1,33568652,3356865,2,F,20080113.0,20150223.0,19990716,20160303,PER,,...,FR,2016-03-03,1,10,6,CARDIAC FAILURE CARDIAC FAILURE,2,0,,1


In [3]:
# Cell 3 - Drop non-predictive / leaking columns
cols_to_drop = [
    # FAERS metadata
    "caseid",
    "caseversion",
    "i_f_code",
    "auth_num",
    "mfr_num",
    "mfr_sndr",
    "lit_ref",
    "e_sub",
    "to_mfr",
    "occr_country",
    # date fields that leak event timing
    "event_dt",
    "mfr_dt",
    "init_fda_dt",
    "fda_dt",
    "rept_dt",
    # ambiguous or inconsistent
    "age_cod",
    "age_grp",
    "wt_cod",
    "wt",
    "therapy_duration_days",
]

cols_present = [c for c in cols_to_drop if c in df.columns]
print("Dropping columns (if present):", cols_present)
df.drop(columns=cols_present, inplace=True, errors="ignore")
print("Shape after drop:", df.shape)

Dropping columns (if present): ['caseid', 'caseversion', 'i_f_code', 'auth_num', 'mfr_num', 'mfr_sndr', 'lit_ref', 'e_sub', 'to_mfr', 'occr_country', 'event_dt', 'mfr_dt', 'init_fda_dt', 'fda_dt', 'rept_dt', 'age_cod', 'age_grp', 'wt_cod', 'wt', 'therapy_duration_days']
Shape after drop: (2818644, 14)


In [4]:
# Cell 4 - Clean numeric + categorical
# AGE
if "age" in df.columns:
    df["age"] = pd.to_numeric(df["age"], errors="coerce")
    median_age = float(df["age"].median(skipna=True))
    df["age"].fillna(median_age, inplace=True)
    print(f"Filled missing age with median: {median_age:.2f}")
else:
    print("Column 'age' not present.")

# SEX & OCCUPATION
for col in ["sex", "occp_cod"]:
    if col in df.columns:
        df[col] = df[col].fillna("UNK")
        df[col] = df[col].astype(str)
        print(f"Filled missing in {col} -> 'UNK'")

# REPORTER COUNTRY: drop missing then compress to top-5
if "reporter_country" in df.columns:
    before = len(df)
    df = df.dropna(subset=["reporter_country"])
    print(f"Dropped {before - len(df)} rows missing 'reporter_country'")
    top5 = df["reporter_country"].value_counts().head(5).index.tolist()
    df["reporter_country"] = df["reporter_country"].apply(
        lambda x: x if x in top5 else "OTHER"
    )
    print("Reporter country compressed to Top-5 + OTHER:", top5)
else:
    print("'reporter_country' not present; skipping compression")

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["age"].fillna(median_age, inplace=True)


Filled missing age with median: 59.00
Filled missing in sex -> 'UNK'
Filled missing in occp_cod -> 'UNK'
Dropped 8 rows missing 'reporter_country'
Reporter country compressed to Top-5 + OTHER: ['US', 'COUNTRY NOT SPECIFIED', 'GB', 'JP', 'CA']


In [5]:
# Cell 5 - Clean 'all_reaction_pts' text
if "all_reaction_pts" not in df.columns:
    raise KeyError("Missing required column 'all_reaction_pts' in FINAL_MASTER")

# normalize text: lower, remove non-letters, collapse spaces
df["all_reaction_pts"] = (
    df["all_reaction_pts"]
    .astype(str)
    .str.lower()
    .str.replace(r"[^a-z\s]", " ", regex=True)
    .str.replace(r"\s+", " ", regex=True)
    .str.strip()
)

before = len(df)
df = df[df["all_reaction_pts"].str.len() > 0].copy()
print(f"Removed {before - len(df)} rows with empty/blank 'all_reaction_pts'")

Removed 0 rows with empty/blank 'all_reaction_pts'


In [6]:
# Cell 6 - Invalid ages
if "age" in df.columns:
    before = len(df)
    df = df[(df["age"] >= 0) & (df["age"] <= 120)].copy()
    print(f"Removed {before - len(df)} rows with invalid ages")

# Verify target exists
if "is_failure" not in df.columns:
    raise KeyError(
        "'is_failure' target column not found in FINAL_MASTER after cleaning."
    )
print(
    "Target 'is_failure' found. Unique values:",
    df["is_failure"].value_counts(dropna=False).to_dict(),
)

print("Final shape after cleaning:", df.shape)

Removed 11977 rows with invalid ages
Target 'is_failure' found. Unique values: {0: 1811619, 1: 995040}
Final shape after cleaning: (2806659, 14)


In [7]:
# Cell 7 - One-hot encode categorical features (idempotent)
categorical_cols = ["rept_cod", "sex", "occp_cod", "reporter_country"]
existing = [c for c in categorical_cols if c in df.columns]

if existing:
    print("One-hot encoding:", existing)
    df_encoded = pd.get_dummies(df, columns=existing, drop_first=True)
else:
    print("No categorical cols to encode from", categorical_cols)
    df_encoded = df.copy()

print("Shape after encoding:", df_encoded.shape)

# Optional: save feature list (columns used for modeling) - exclude target and text field
feature_cols = [
    c for c in df_encoded.columns if c not in ("is_failure", "all_reaction_pts")
]
if SAVE_FEATURE_LIST:
    joblib.dump(feature_cols, FEATURE_LIST_PATH)
    print("Saved feature list (len={}):".format(len(feature_cols)), FEATURE_LIST_PATH)

One-hot encoding: ['rept_cod', 'sex', 'occp_cod', 'reporter_country']
Shape after encoding: (2806659, 24)
Saved feature list (len=22): D:\ML_Project\models\training_features.joblib


In [8]:
# Cell 8 - Shuffle and sample
df_shuffled = df_encoded.sample(frac=1, random_state=RANDOM_STATE).reset_index(
    drop=True
)

# TRAIN/EVAL sample
if not os.path.exists(SAMPLE_TRAIN_EVAL) or OVERWRITE_EXISTING_SAMPLES:
    train_eval = df_shuffled.sample(frac=TRAIN_SAMPLE_FRAC, random_state=RANDOM_STATE)
    train_eval.to_csv(SAMPLE_TRAIN_EVAL, index=False)
    print("Saved Train/Eval sample ->", SAMPLE_TRAIN_EVAL)
else:
    print("Train/Eval sample exists and overwrite disabled:", SAMPLE_TRAIN_EVAL)

# REALWORLD sample from remaining rows
remaining = df_shuffled.drop(train_eval.index)
real_frac_adjusted = TEST_SAMPLE_FRAC / (1 - TRAIN_SAMPLE_FRAC)
if not os.path.exists(SAMPLE_REALWORLD_TEST) or OVERWRITE_EXISTING_SAMPLES:
    realworld_test = remaining.sample(frac=real_frac_adjusted, random_state=99)
    realworld_test.to_csv(SAMPLE_REALWORLD_TEST, index=False)
    print("Saved Real-world test sample ->", SAMPLE_REALWORLD_TEST)
else:
    print("Real-world sample exists and overwrite disabled:", SAMPLE_REALWORLD_TEST)

print("\nSampling Summary:")
print(f"  Train/Eval (frac={TRAIN_SAMPLE_FRAC}): {len(train_eval):,}")
print(
    f"  Real-world test (approx frac={TEST_SAMPLE_FRAC} overall): {len(realworld_test):,}"
)

Saved Train/Eval sample -> D:\ML_Project\data\processed\sample_train_eval.csv
Saved Real-world test sample -> D:\ML_Project\data\processed\sample_realworld_test.csv

Sampling Summary:
  Train/Eval (frac=0.25): 701,665
  Real-world test (approx frac=0.05 overall): 140,333


In [9]:
# Cell 10 - Final summary & checks
print("Final dataset shape:", df_encoded.shape)
print("Columns (sample):", df_encoded.columns.tolist()[:40])
print("\nTarget distribution in original full cleaned frame:")
print(df_encoded["is_failure"].value_counts(dropna=False))

# Sanity: confirm feature list length matches columns saved
if SAVE_FEATURE_LIST and os.path.exists(FEATURE_LIST_PATH):
    loaded_feats = joblib.load(FEATURE_LIST_PATH)
    print("\nSaved feature list length:", len(loaded_feats))
    # Ensure all saved features are present in df_encoded
    missing = [c for c in loaded_feats if c not in df_encoded.columns]
    if missing:
        print(
            "WARNING: feature list contains columns not present in cleaned df:",
            missing[:10],
        )
    else:
        print("Feature list verified against cleaned dataframe.")

Final dataset shape: (2806659, 24)
Columns (sample): ['primaryid', 'age', 'fda_dt_parsed', 'is_severe_outcome', 'drug_count', 'indication_count', 'all_reaction_pts', 'reaction_count', 'is_ineffective', 'is_failure', 'rept_cod_EXP', 'rept_cod_PER', 'sex_M', 'sex_UNK', 'occp_cod_LW', 'occp_cod_MD', 'occp_cod_OT', 'occp_cod_PH', 'occp_cod_UNK', 'reporter_country_COUNTRY NOT SPECIFIED', 'reporter_country_GB', 'reporter_country_JP', 'reporter_country_OTHER', 'reporter_country_US']

Target distribution in original full cleaned frame:
is_failure
0    1811619
1     995040
Name: count, dtype: int64

Saved feature list length: 22
Feature list verified against cleaned dataframe.
