In [6]:
# ==============================================================
# PROJECT: Therapeutic Failure Phenotype Discovery
# PHASE: Data Cleaning and Sampling (Minimal Version)
# FILE: 02_preprocessing_MINIMAL.ipynb
# ==============================================================

import os
import pandas as pd
import numpy as np

In [7]:
# --------------------------------------------------------------
# 1) PATHS AND CONFIG
# --------------------------------------------------------------
# The data directory can be overridden with the ML_PROJECT_DATA_DIR environment
# variable so the notebook runs on machines without this exact D:\ layout.
# When the variable is unset the original location is used unchanged.
BASE_PATH = os.environ.get("ML_PROJECT_DATA_DIR", r"D:\ML_Project\data\processed")

# Input: the merged master table produced by an earlier step.
FINAL_MASTER = os.path.join(BASE_PATH, "FINAL_MASTER_DATASET.csv")

# ONLY TWO OUTPUTS
SAMPLE_TRAIN_EVAL = os.path.join(BASE_PATH, "sample_train_eval.csv")
SAMPLE_REALWORLD_TEST = os.path.join(BASE_PATH, "sample_realworld_test.csv")

# Fractions are expressed relative to the full cleaned dataset.
TRAIN_SAMPLE_FRAC = 0.25      # 25% → clustering + evaluation
TEST_SAMPLE_FRAC = 0.05       # 5% → real-world prediction test

# --------------------------------------------------------------
# 2) LOAD FINAL MASTER
# --------------------------------------------------------------
print("\n Loading FINAL_MASTER_DATASET...")

# low_memory=False: parse the file in one pass instead of in chunks.
df = pd.read_csv(FINAL_MASTER, low_memory=False)

n_rows, n_cols = df.shape
print(f" Loaded: {n_rows:,} rows | {n_cols} cols")


 Loading FINAL_MASTER_DATASET...
 Loaded: 2,818,644 rows | 34 cols


In [8]:
# --------------------------------------------------------------
# 3) REMOVE NON-PREDICTIVE / ADMINISTRATIVE / TIMING COLUMNS
# --------------------------------------------------------------
cols_to_drop = [
    # FAERS metadata
    "caseid","caseversion","i_f_code","auth_num","mfr_num","mfr_sndr",
    "lit_ref","e_sub","to_mfr","occr_country",

    # date fields that leak event timing
    "event_dt","mfr_dt","init_fda_dt","fda_dt","rept_dt",

    # ambiguous or inconsistent
    "age_cod","age_grp","wt_cod","wt","therapy_duration_days"
]

# errors="ignore" already skips names that are absent from the frame, so no
# separate membership filter is needed. Rebinding `df` (instead of
# inplace=True) keeps the step a plain expression with the same result.
df = df.drop(columns=cols_to_drop, errors="ignore")

print("After dropping metadata cols →", df.shape)

# --------------------------------------------------------------
# 4) CLEAN BASIC NUMERIC + CATEGORICAL
# --------------------------------------------------------------
# AGE
# Coerce to numeric (unparseable entries become NaN) and impute with the median.
# The result is assigned back to the column: calling
# df["age"].fillna(..., inplace=True) acts on an intermediate object, triggers
# a pandas FutureWarning, and stops updating `df` under pandas 3.0.
# NOTE(review): the median is computed before the 0–120 range filter applied in
# section 6, so out-of-range ages still take part in it — confirm this is intended.
if "age" in df.columns:
    age_numeric = pd.to_numeric(df["age"], errors="coerce")
    median_age = age_numeric.median()
    df["age"] = age_numeric.fillna(median_age)
    print("Filled missing age with median:", median_age)

# SEX & OCCUPATION
# Missing values in these categorical columns become the explicit label "UNK";
# a column that is not present in the frame is skipped.
for col in ("sex", "occp_cod"):
    if col not in df.columns:
        continue
    df[col] = df[col].fillna("UNK")

# COUNTRY — drop missing + compress to top-5
# Rows without a reporter country are removed; every value outside the five
# most frequent ones is replaced with "OTHER".
# NOTE(review): the recorded run lists 'COUNTRY NOT SPECIFIED' among the top-5,
# so that placeholder is kept as its own category while true NaNs are dropped —
# confirm this asymmetry is intended.
if "reporter_country" in df.columns:
    df = df.dropna(subset=["reporter_country"])
    top5 = df["reporter_country"].value_counts().head(5).index.tolist()
    # Vectorized membership test instead of a per-row Python lambda.
    in_top5 = df["reporter_country"].isin(top5)
    df["reporter_country"] = df["reporter_country"].where(in_top5, "OTHER")
    print("Reporter country compressed to Top-5 + OTHER:", top5)

After dropping metadata cols → (2818644, 14)
Filled missing age with median: 59.0


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["age"].fillna(median_age, inplace=True)


Reporter country compressed to Top-5 + OTHER: ['US', 'COUNTRY NOT SPECIFIED', 'GB', 'JP', 'CA']


In [9]:
# --------------------------------------------------------------
# 5) CLEAN REACTION TEXT
# --------------------------------------------------------------
# Normalise the reaction text: lowercase, replace everything except a–z and
# whitespace with a space, collapse whitespace runs, and trim the ends.
# Missing values are mapped to "" first: astype(str) would otherwise turn NaN
# into the literal text "nan", which survives the cleaning and slips past the
# empty-row filter below.
df["all_reaction_pts"] = (
    df["all_reaction_pts"]
      .fillna("")
      .astype(str)
      .str.lower()
      .str.replace(r"[^a-z\s]", " ", regex=True)
      .str.replace(r"\s+", " ", regex=True)
      .str.strip()
)

# Drop rows whose reaction text is empty after cleaning.
before = len(df)
df = df[df["all_reaction_pts"].str.len() > 0]
print("Removed empty reaction rows:", before - len(df))

# --------------------------------------------------------------
# 6) REMOVE INVALID AGE VALUES
# --------------------------------------------------------------
# Keep only rows whose age lies in the closed interval [0, 120].
before = len(df)
age_in_range = df["age"].between(0, 120)  # both bounds inclusive
df = df[age_in_range]
print("Removed invalid ages:", before - len(df))

# --------------------------------------------------------------
# 7) ONE-HOT ENCODE CATEGORICAL FEATURES
# --------------------------------------------------------------
# Only the categorical columns actually present in the frame are encoded.
categorical_cols = ["rept_cod", "sex", "occp_cod", "reporter_country"]
existing = [name for name in categorical_cols if name in df.columns]

# drop_first=True omits the first level of each encoded column.
df = pd.get_dummies(data=df, columns=existing, drop_first=True)

print("Encoded categorical:", existing)
print("Shape after encoding:", df.shape)

# --------------------------------------------------------------
# 8) VERIFY TARGET EXISTS
# --------------------------------------------------------------
# Stop with a KeyError if the target column is not in the frame.
target_present = "is_failure" in df.columns
if not target_present:
    raise KeyError("'is_failure' missing — check FINAL_MASTER_DATASET.")


Removed empty reaction rows: 0
Removed invalid ages: 11977
Encoded categorical: ['rept_cod', 'sex', 'occp_cod', 'reporter_country']
Shape after encoding: (2806659, 24)


In [10]:
# --------------------------------------------------------------
# 9) SHUFFLE + CREATE TRAIN/EVAL + REAL-WORLD TEST SAMPLES
# --------------------------------------------------------------
# Both samples are drawn without replacement from the same frame, so the two
# fractions must fit into it together. Checking here gives a clear message
# instead of a failure deep inside DataFrame.sample.
if not (0 < TRAIN_SAMPLE_FRAC < 1 and 0 < TEST_SAMPLE_FRAC <= 1 - TRAIN_SAMPLE_FRAC):
    raise ValueError(
        "TRAIN_SAMPLE_FRAC and TEST_SAMPLE_FRAC must be positive and sum to at most 1; "
        f"got {TRAIN_SAMPLE_FRAC} and {TEST_SAMPLE_FRAC}."
    )

# Shuffle all rows and rebuild a fresh 0..n-1 index.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

train_eval = df.sample(frac=TRAIN_SAMPLE_FRAC, random_state=42)

# The test sample is drawn only from rows not already in train_eval. The
# fraction is rescaled so that it equals TEST_SAMPLE_FRAC of the full frame.
remaining = df.drop(train_eval.index)
realworld_test = remaining.sample(
    frac=TEST_SAMPLE_FRAC / (1 - TRAIN_SAMPLE_FRAC),
    random_state=99
)

# Sanity check: the two samples must not share any row.
assert train_eval.index.intersection(realworld_test.index).empty, \
    "train/eval and real-world test samples overlap"

train_eval.to_csv(SAMPLE_TRAIN_EVAL, index=False)
realworld_test.to_csv(SAMPLE_REALWORLD_TEST, index=False)

# Labels are derived from the config constants so they cannot drift from the
# fractions actually used.
print("\n Sampling Summary:")
print(f"  Train/Eval ({TRAIN_SAMPLE_FRAC * 100:g}%):      {len(train_eval):,}")
print(f"  Real-world Test ({TEST_SAMPLE_FRAC * 100:g}%):  {len(realworld_test):,}")

# --------------------------------------------------------------
# 10) FINAL SUMMARY
# --------------------------------------------------------------
print("\n PREPROCESSING COMPLETE")
print("Files created:")
for written_path in (SAMPLE_TRAIN_EVAL, SAMPLE_REALWORLD_TEST):
    print(" →", written_path)

# Class counts are taken from `df`, i.e. the frame the samples were drawn
# from, not from the two saved sample files.
print("\nTarget Distribution (is_failure):")
target_counts = df["is_failure"].value_counts()
print(target_counts)


 Sampling Summary:
  Train/Eval (25%):      701,665
  Real-world Test (5%):  140,333

 PREPROCESSING COMPLETE
Files created:
 → D:\ML_Project\data\processed\sample_train_eval.csv
 → D:\ML_Project\data\processed\sample_realworld_test.csv

Target Distribution (is_failure):
is_failure
0    1811619
1     995040
Name: count, dtype: int64
