In [3]:
import pandas as pd
import glob
import os

data_path = r"D:\Python_practice\group-project-ml\data\raw"

In [4]:
file_types = ["DEMO", "DRUG", "INDI", "OUTC", "REAC", "RPSR", "THER"]
years = range(2015, 2026)
quarters = ["Q1", "Q2", "Q3", "Q4"]

# (MASTER_SCHEMAS dictionary is the same as before, omitted for brevity)
MASTER_SCHEMAS = {
    "DEMO": {
        "columns": [
            "primaryid",
            "caseid",
            "caseversion",
            "i_f_code",
            "event_dt",
            "mfr_dt",
            "init_fda_dt",
            "fda_dt",
            "rept_cod",
            "auth_num",
            "mfr_num",
            "mfr_sndr",
            "lit_ref",
            "age",
            "age_cod",
            "age_grp",
            "sex",
            "e_sub",
            "wt",
            "wt_cod",
            "rept_dt",
            "to_mfr",
            "occp_cod",
            "reporter_country",
            "occr_country",
        ],
        "rename_map": {},
    },
    "DRUG": {
        "columns": [
            "primaryid",
            "caseid",
            "drug_seq",
            "role_cod",
            "drugname",
            "prod_ai",
            "val_vbm",
            "route",
            "dose_vbm",
            "cum_dose_chr",
            "cum_dose_unit",
            "dechal",
            "rechal",
            "lot_num",
            "exp_dt",
            "nda_num",
            "dose_amt",
            "dose_unit",
            "dose_form",
            "dose_freq",
        ],
        "rename_map": {},
    },
    "INDI": {
        "columns": ["primaryid", "caseid", "indi_drug_seq", "indi_pt"],
        "rename_map": {},
    },
    "OUTC": {"columns": ["primaryid", "caseid", "outc_cod"], "rename_map": {}},
    "REAC": {
        "columns": ["primaryid", "caseid", "pt", "drug_rec_act"],
        "rename_map": {},
    },
    "RPSR": {"columns": ["primaryid", "caseid", "rpsr_cod"], "rename_map": {}},
    "THER": {
        "columns": [
            "primaryid",
            "caseid",
            "dsg_drug_seq",
            "start_dt",
            "end_dt",
            "dur",
            "dur_cod",
        ],
        "rename_map": {},
    },
}
# --- End Configuration ---


def process_and_align_file(filepath, master_columns, rename_map):
    """
    Loads a single quarterly file, renames columns, and aligns
    it to the master schema.
    """
    try:
        df = pd.read_csv(
            filepath, sep="$", dtype=str, on_bad_lines="skip", engine="python"
        )

        df.columns = df.columns.str.lower()
        df.rename(columns=rename_map, inplace=True)

        extra_cols = set(df.columns) - set(master_columns)
        if extra_cols:
            df.drop(columns=list(extra_cols), inplace=True)

        missing_cols = set(master_columns) - set(master_columns)
        if missing_cols:
            for col in missing_cols:
                df[col] = pd.NA

        df = df[master_columns]
        return df

    except Exception as e:
        print(f"  ‚ùå Error processing {os.path.basename(filepath)}: {e}")
        return None


# --- Main Execution ---

for ftype in file_types:
    print(f"\nüîç Processing {ftype} files...")

    master_cols = MASTER_SCHEMAS[ftype]["columns"]
    rename_map = MASTER_SCHEMAS[ftype]["rename_map"]

    out_file = os.path.join(data_path, f"MASTER_{ftype}.csv")
    is_first_file = True

    for year in years:
        for q in quarters:
            if year == 2025 and q == "Q4":
                break

            pattern_upper = os.path.join(
                data_path, f"{ftype.upper()}{str(year)[-2:]}{q.upper()}.txt"
            )
            matches = glob.glob(pattern_upper)

            if not matches:
                continue

            for file in matches:
                # This filter will correctly skip the PDFs
                if not file.lower().endswith(".txt"):
                    continue

                print(f"  Processing: {os.path.basename(file)}...")

                df = process_and_align_file(file, master_cols, rename_map)

                if df is not None and not df.empty:
                    if is_first_file:
                        df.to_csv(out_file, index=False, mode="w")
                        is_first_file = False
                    else:
                        df.to_csv(out_file, index=False, mode="a", header=False)

    if is_first_file:
        print(f"  No valid .txt files found or processed for {ftype}")
    else:
        print(f"Saved merged file: {out_file}")

print("\nAll processing complete.")


üîç Processing DEMO files...
  Processing: DEMO15Q1.txt...
  Processing: DEMO15Q2.txt...
  Processing: DEMO15Q3.txt...
  Processing: DEMO15Q4.txt...
  Processing: DEMO16Q1.txt...
  Processing: DEMO16Q2.txt...
  Processing: DEMO16Q3.txt...
  Processing: DEMO16Q4.txt...
  Processing: DEMO17Q1.txt...
  Processing: DEMO17Q2.txt...
  Processing: DEMO17Q3.txt...
  Processing: DEMO17Q4.txt...
  Processing: DEMO18Q1.txt...
  Processing: DEMO18Q2.txt...
  Processing: DEMO18Q3.txt...
  Processing: DEMO18Q4.txt...
  Processing: DEMO19Q1.txt...
  Processing: DEMO19Q3.txt...
  Processing: DEMO19Q4.txt...
  Processing: DEMO20Q1.txt...
  Processing: DEMO20Q2.txt...
  Processing: DEMO20Q3.txt...
  Processing: DEMO20Q4.txt...
  Processing: DEMO21Q1.txt...
  Processing: DEMO21Q2.txt...
  Processing: DEMO21Q3.txt...
  Processing: DEMO21Q4.txt...
  Processing: DEMO22Q1.txt...
  Processing: DEMO22Q2.txt...
  Processing: DEMO22Q3.txt...
  Processing: DEMO22Q4.txt...
  Processing: DEMO23Q1.txt...
  Process

In [5]:
MASTER_DEMO_FILE = r"D:\Python_practice\group-project-ml\data\raw\MASTER_DEMO.csv"
# ---------------------

print(f"Loading {MASTER_DEMO_FILE} to check for duplicates...")
print("This may take a moment...")

try:
    # Load only the 'caseid' column to save memory
    df_demo = pd.read_csv(MASTER_DEMO_FILE, usecols=["caseid"], dtype={"caseid": "str"})

    print(f"\nTotal records loaded: {len(df_demo)}")

    # --- The Check You Suggested ---

    # 1. Count how many times each caseid appears
    caseid_counts = df_demo["caseid"].value_counts()

    # 2. Filter this list to show only the duplicates
    #    (where the count is greater than 1)
    duplicate_caseids = caseid_counts[caseid_counts > 1]

    # --- Report the Findings ---

    print("\n--- Duplicate Report ---")
    if duplicate_caseids.empty:
        print("‚úÖ No duplicate caseids found!")
    else:
        print(
            f"üö® Found {len(duplicate_caseids)} unique cases that have duplicates (follow-ups)."
        )
        print("\nTop 50 most frequent cases (caseid: number_of_versions):")

        # Sort by count (descending) and print the top 50
        print(duplicate_caseids.sort_values(ascending=False).head(50))

        total_duplicate_rows = duplicate_caseids.sum()
        unique_rows = len(caseid_counts)
        print("\n--- Summary ---")
        print(f"Total rows in your file:  {len(df_demo)}")
        print(f"Unique patient stories: {unique_rows}")
        print(
            f"Total rows that are duplicates: {total_duplicate_rows - len(duplicate_caseids)}"
        )

except FileNotFoundError:
    print(f"Error: File not found at {MASTER_DEMO_FILE}")
except Exception as e:
    print(f"An error occurred: {e}")

Loading D:\Python_practice\group-project-ml\data\raw\MASTER_DEMO.csv to check for duplicates...
This may take a moment...

Total records loaded: 16935987

--- Duplicate Report ---
üö® Found 1745993 unique cases that have duplicates (follow-ups).

Top 50 most frequent cases (caseid: number_of_versions):
caseid
11020195    25
12209653    22
11692275    21
9745027     21
14698949    21
7322498     20
11565068    20
9272239     20
10637106    20
9030666     20
7056790     19
14162985    19
10338343    19
12139469    19
9383196     19
14719399    18
10990475    18
10231642    18
15775809    18
12122305    18
12612206    18
7059978     18
9368878     18
11090837    18
12943377    18
12220151    18
10720490    18
9695444     18
10982063    18
12913352    18
10428818    18
12304566    18
9459293     18
11687892    18
14899555    18
17829120    18
11072666    18
16215378    18
17440764    18
11663523    18
8244014     18
14752295    18
8303475     17
9869027     17
8203402     17
17332255    1

In [6]:
import pandas as pd

print("Starting deduplication...")

# --- Configuration ---
# ‚ö†Ô∏è Update these paths
MASTER_DEMO_FILE = r"D:\Python_practice\group-project-ml\data\raw\MASTER_DEMO.csv"
UNIQUE_DEMO_FILE = r"D:\Python_practice\group-project-ml\data\processed\UNIQUE_DEMO.csv"
# ---------------------

# Load the master demo file
try:
    df_demo = pd.read_csv(
        MASTER_DEMO_FILE,
        dtype=str,  # Load all as strings to avoid date/number issues
        low_memory=False,
    )
    print(f"Loaded {len(df_demo)} records from {MASTER_DEMO_FILE}")

    # --- Deduplication Logic ---

    # 1. Convert date columns for sorting.
    #    'fda_dt' is for the latest version of the report.
    df_demo["fda_dt"] = pd.to_datetime(
        df_demo["fda_dt"], format="%Y%m%d", errors="coerce"
    )

    # 2. Convert key IDs to numeric for correct sorting
    df_demo["primaryid"] = pd.to_numeric(df_demo["primaryid"], errors="coerce")
    df_demo["caseid"] = pd.to_numeric(df_demo["caseid"], errors="coerce")

    # Handle any conversion errors (rows with bad data)
    df_demo.dropna(subset=["caseid", "fda_dt", "primaryid"], inplace=True)

    print("Sorting records to find the latest versions...")

    # 3. Sort by Case ID, then newest date, then highest primaryid
    df_demo.sort_values(
        by=["caseid", "fda_dt", "primaryid"],
        ascending=[True, False, False],  # caseid (asc), fda_dt (desc), primaryid (desc)
        inplace=True,
    )

    # 4. Keep only the FIRST row for each 'caseid'. This is the latest report.
    df_deduplicated = df_demo.drop_duplicates(subset=["caseid"], keep="first")

    print(f"\nOriginal records: {len(df_demo)}")
    print(
        f"Unique reports found: {len(df_deduplicated)}"
    )  # This should match your 'Unique patient stories' number

    # 5. Save the unique, deduplicated reports
    df_deduplicated.to_csv(UNIQUE_DEMO_FILE, index=False)
    print(f"‚úÖ Successfully saved unique reports to {UNIQUE_DEMO_FILE}")

except FileNotFoundError:
    print(f"Error: File not found at {MASTER_DEMO_FILE}")
except Exception as e:
    print(f"An error occurred: {e}")

Starting deduplication...
Loaded 16935987 records from D:\Python_practice\group-project-ml\data\raw\MASTER_DEMO.csv
Sorting records to find the latest versions...

Original records: 16935987
Unique reports found: 14557323
‚úÖ Successfully saved unique reports to D:\Python_practice\group-project-ml\data\processed\UNIQUE_DEMO.csv


In [7]:
import pandas as pd
import numpy as np
import os

print("üöÄ Starting creation of FINAL_MASTER_DATASET.csv...")

# --- 1. Configuration: UPDATE ALL FILE PATHS ---

# Base file (the 14.5M unique reports)
UNIQUE_DEMO_FILE = r"D:\Python_practice\group-project-ml\data\processed\UNIQUE_DEMO.csv"


# Source master files (the ones you already built)
MASTER_OUTC_FILE = r"D:\Python_practice\group-project-ml\data\raw\MASTER_OUTC.csv"
MASTER_DRUG_FILE = r"D:\Python_practice\group-project-ml\data\raw\MASTER_DRUG.csv"
MASTER_REAC_FILE = r"D:\Python_practice\group-project-ml\data\raw\MASTER_REAC.csv"
MASTER_INDI_FILE = r"D:\Python_practice\group-project-ml\data\raw\MASTER_INDI.csv"
MASTER_THER_FILE = r"D:\Python_practice\group-project-ml\data\raw\MASTER_THER.csv"

# Final output file
FINAL_DATASET_FILE = r"D:\Python_practice\group-project-ml\data\processed\FINAL_MASTER_DATASET.csv"

# --- 2. Define Project Lists ---

# List of "Severe" outcomes for Project 1's target
SEVERE_OUTCOMES = ["DE", "LT", "HO", "DS"]

# List of "Inefficacy" terms for Project 2's target
# We must use uppercase for matching
INEFFICACY_TERMS = [
    "DRUG INEFFECTIVE",
    "LACK OF EFFECT",
    "THERAPEUTIC RESPONSE DECREASED",
    "CONDITION WORSENED",
]

# --- 3. Feature Engineering Functions ---


def process_outcomes(filepath):
    print("  Processing Outcomes (OUTC)...")
    try:
        df = pd.read_csv(filepath, dtype=str, usecols=["primaryid", "outc_cod"])
        df.dropna(inplace=True)

        # Mark '1' if the outcome is in our severe list
        df["is_severe_num"] = df["outc_cod"].isin(SEVERE_OUTCOMES).astype(int)

        # Aggregate: Group by primaryid and take the MAX.
        # (If any outcome is severe, the whole report is severe)
        df_agg = (
            df.groupby("primaryid")["is_severe_num"].max().to_frame("is_severe_outcome")
        )

        print(f"    Done. Found {len(df_agg)} aggregated outcomes.")
        return df_agg
    except FileNotFoundError:
        print(f"    ERROR: OUTC file not found at {filepath}")
        return None


def process_drugs(filepath):
    print("  Processing Drugs (DRUG)...")
    try:
        df = pd.read_csv(filepath, dtype=str, usecols=["primaryid"])

        # Aggregate: Just count the number of rows per primaryid
        df_agg = df.groupby("primaryid").size().to_frame("drug_count")

        print(f"    Done. Found {len(df_agg)} aggregated drug counts.")
        return df_agg
    except FileNotFoundError:
        print(f"    ERROR: DRUG file not found at {filepath}")
        return None


def process_indications(filepath):
    print("  Processing Indications (INDI)...")
    try:
        df = pd.read_csv(filepath, dtype=str, usecols=["primaryid"])

        # Aggregate: Count number of indications per primaryid
        df_agg = df.groupby("primaryid").size().to_frame("indication_count")

        print(f"    Done. Found {len(df_agg)} aggregated indication counts.")
        return df_agg
    except FileNotFoundError:
        print(f"    ERROR: INDI file not found at {filepath}")
        return None


def process_reactions(filepath):
    print("  Processing Reactions (REAC)...")
    try:
        df = pd.read_csv(filepath, dtype=str, usecols=["primaryid", "pt"])
        df.dropna(inplace=True)

        # Standardize: Make all reaction terms uppercase for matching
        df["pt_upper"] = df["pt"].str.upper()

        # Aggregate 1: Count reactions
        agg_count = df.groupby("primaryid").size().to_frame("reaction_count")

        # Aggregate 2: Create 'all_reaction_pts' for Project 2 clustering
        agg_pts = (
            df.groupby("primaryid")["pt_upper"]
            .apply(" ".join)
            .to_frame("all_reaction_pts")
        )

        # Aggregate 3: Create 'is_ineffective' for Project 2 target
        def check_ineffective(x):
            return any(term in INEFFICACY_TERMS for term in x)

        agg_ineffective = (
            df.groupby("primaryid")["pt_upper"]
            .apply(check_ineffective)
            .astype(int)
            .to_frame("is_ineffective")
        )

        # Merge the 3 aggregated DFs
        df_agg = agg_count.join(agg_pts, how="outer").join(agg_ineffective, how="outer")

        print(f"    Done. Found {len(df_agg)} aggregated reaction features.")
        return df_agg
    except FileNotFoundError:
        print(f"    ERROR: REAC file not found at {filepath}")
        return None


def process_therapy(filepath):
    print("  Processing Therapy (THER)...")
    try:
        df = pd.read_csv(filepath, dtype=str, usecols=["primaryid", "dur", "dur_cod"])
        df.dropna(inplace=True)

        # Convert duration to a number
        df["dur"] = pd.to_numeric(df["dur"], errors="coerce")
        df.dropna(subset=["dur"], inplace=True)

        # Map codes to day multipliers
        duration_map = {
            "DY": 1,
            "WK": 7,
            "MO": 30.4375,  # Avg days in month
            "YR": 365.25,  # Account for leap year
        }

        df["multiplier"] = df["dur_cod"].map(duration_map)
        df.dropna(subset=["multiplier"], inplace=True)

        # Calculate duration in days
        df["duration_in_days"] = df["dur"] * df["multiplier"]

        # Aggregate: Find the *maximum* therapy duration reported for a case
        df_agg = (
            df.groupby("primaryid")["duration_in_days"]
            .max()
            .to_frame("therapy_duration_days")
        )

        print(f"    Done. Found {len(df_agg)} aggregated therapy durations.")
        return df_agg
    except FileNotFoundError:
        print(f"    ERROR: THER file not found at {filepath}")
        return None


# --- 4. Main Execution ---

print("\n--- Phase 1: Aggregating Relational Files ---")
df_outc = process_outcomes(MASTER_OUTC_FILE)
df_drug = process_drugs(MASTER_DRUG_FILE)
df_indi = process_indications(MASTER_INDI_FILE)
df_reac = process_reactions(MASTER_REAC_FILE)
df_ther = process_therapy(MASTER_THER_FILE)

# Create a list of all aggregated dataframes that exist
aggregated_dfs = [df_outc, df_drug, df_indi, df_reac, df_ther]
aggregated_dfs = [df for df in aggregated_dfs if df is not None]

print("\n--- Phase 2: Loading Base `UNIQUE_DEMO` File ---")
try:
    df_base = pd.read_csv(UNIQUE_DEMO_FILE, dtype=str)
    # Set primaryid as index for efficient joining
    df_base["primaryid"] = pd.to_numeric(df_base["primaryid"], errors="coerce")
    df_base.set_index("primaryid", inplace=True)
    print(f"Loaded {len(df_base)} unique reports from UNIQUE_DEMO.csv")
except FileNotFoundError:
    print(f"FATAL ERROR: UNIQUE_DEMO_FILE not found at {UNIQUE_DEMO_FILE}")
    exit()

print("\n--- Phase 3: Joining All DataFrames ---")
# Make sure aggregated DFs have the same index type
for df in aggregated_dfs:
    df.index = pd.to_numeric(df.index, errors="coerce")

# Join all aggregated DFs to the base demo file
df_final = df_base.join(aggregated_dfs, how="left")

print("  All joins complete.")

print("\n--- Phase 4: Final Cleaning and Saving ---")

# Fill NaNs created by joins
# 'count' features should be 0 (0 drugs, 0 reactions)
count_cols = ["drug_count", "indication_count", "reaction_count"]
for col in count_cols:
    if col in df_final.columns:
        df_final[col] = df_final[col].fillna(0).astype(int)

# 'outcome' features should be 0 (not severe, not ineffective)
outcome_cols = ["is_severe_outcome", "is_ineffective"]
for col in outcome_cols:
    if col in df_final.columns:
        df_final[col] = df_final[col].fillna(0).astype(int)

# 'text' features should be empty string
if "all_reaction_pts" in df_final.columns:
    df_final["all_reaction_pts"] = df_final["all_reaction_pts"].fillna("")

# 'therapy_duration_days' should stay as NaN (it's a regression target)

# Create the final target for Project 2: 'is_failure'
if "is_severe_outcome" in df_final.columns and "is_ineffective" in df_final.columns:
    df_final["is_failure"] = (
        (df_final["is_severe_outcome"] == 1) | (df_final["is_ineffective"] == 1)
    ).astype(int)
    print("  Created final 'is_failure' target column.")

# Save the final dataset
df_final.to_csv(FINAL_DATASET_FILE)

print(f"\nüéâ SUCCESS! üéâ\nFinal flattened dataset saved to: {FINAL_DATASET_FILE}")
print(f"It contains {len(df_final)} rows and {len(df_final.columns)} columns.")

üöÄ Starting creation of FINAL_MASTER_DATASET.csv...

--- Phase 1: Aggregating Relational Files ---
  Processing Outcomes (OUTC)...
    Done. Found 9693475 aggregated outcomes.
  Processing Drugs (DRUG)...
    Done. Found 16482610 aggregated drug counts.
  Processing Indications (INDI)...
    Done. Found 15544380 aggregated indication counts.
  Processing Reactions (REAC)...
    Done. Found 16935474 aggregated reaction features.
  Processing Therapy (THER)...
    Done. Found 82794 aggregated therapy durations.

--- Phase 2: Loading Base `UNIQUE_DEMO` File ---
Loaded 14557323 unique reports from UNIQUE_DEMO.csv

--- Phase 3: Joining All DataFrames ---
  All joins complete.

--- Phase 4: Final Cleaning and Saving ---
  Created final 'is_failure' target column.

üéâ SUCCESS! üéâ
Final flattened dataset saved to: D:\Python_practice\group-project-ml\data\processed\FINAL_MASTER_DATASET.csv
It contains 14557323 rows and 32 columns.
