In [1]:
import pandas as pd
import glob
import os
import re
import networkx as nx

In [2]:
import pm4py
from pm4py.objects.log.util import dataframe_utils
from pm4py.objects.log.importer.xes import importer as xes_importer
from pm4py.objects.conversion.log import converter as log_converter
from pm4py.objects.log.exporter.xes import exporter as xes_exporter

In [3]:
AD_Expo_path = 'data/AD_records_subcohort.csv'
Descendants_path = 'data/AD_descendants.csv'
df_analysis = pd.read_csv(AD_Expo_path)
descendants = pd.read_csv(Descendants_path)

In [4]:
unique_patients = df_analysis['PERSON_ID'].nunique()
print("Number of unique patients:", unique_patients)

Number of unique patients: 31941


In [5]:
df_analysis.shape

(436476, 16)

In [6]:
df = df_analysis

Cleaning

In [7]:
df = df.drop_duplicates(
    subset=[
        'PERSON_ID',
        'DRUG_CONCEPT_NAME',
        'DRUG_CONCEPT_ID',
        'DRUG_EXPOSURE_START_DATE',
        'DRUG_EXPOSURE_END_DATE'
    ],
    keep='first'
)

df.to_csv('data/AD_records_subcohort_deduped.csv', index=False)


In [8]:
unique_patients = df['PERSON_ID'].nunique()
print("Number of unique patients:", unique_patients)

Number of unique patients: 31941


In [9]:
df.shape

(415835, 16)

In [10]:
df = pd.read_csv('data/AD_records_subcohort_deduped.csv')

In [11]:
df['DRUG_EXPOSURE_START_DATE'] = pd.to_datetime(
    df['DRUG_EXPOSURE_START_DATE'], errors='coerce'
)

patient_stats = (
    df.groupby('PERSON_ID')
      .agg(
          first_start=('DRUG_EXPOSURE_START_DATE', 'min'),
          last_start=('DRUG_EXPOSURE_START_DATE', 'max'),
          n_prescriptions=('DRUG_CONCEPT_ID', 'nunique')
      )
      .assign(case_duration_days=lambda x: (x['last_start'] - x['first_start']).dt.days)
      .reset_index()
)

# Odd patients: duration = 0 and >1 prescription
odd_patients = patient_stats[
    (patient_stats['case_duration_days'] == 0) &
    (patient_stats['n_prescriptions'] > 2)
]

# All others
normal_patients = patient_stats[
    ~((patient_stats['case_duration_days'] == 0) & (patient_stats['n_prescriptions'] > 2))
]

df_normal = df[~df['PERSON_ID'].isin(odd_patients['PERSON_ID'])]
df_normal.to_csv('data/normal_patients.csv', index=False)

df_odd = df[df['PERSON_ID'].isin(odd_patients['PERSON_ID'])]
df_odd.to_csv('data/odd_patients.csv', index=False)

In [12]:
df_odd['DRUG_EXPOSURE_START_DATE'].isnull().sum()

np.int64(0)

In [13]:
df_normal['DRUG_EXPOSURE_START_DATE'].isnull().sum()

np.int64(0)

In [None]:
df_normal.head()

In [15]:
unique_patients = df_normal['PERSON_ID'].nunique()
print("Number of unique patients:", unique_patients)

Number of unique patients: 31881


In [16]:
df_normal.shape

(415641, 16)

Simplification to Level 2(Ingredients)

In [17]:
df = pd.read_csv('data/normal_patients.csv')

In [18]:
df.shape

(415641, 16)

In [19]:
merged = df.merge(
    descendants[['DESCENDANT_CONCEPT_ID', 'ANCESTOR_CONCEPT_NAME']],
    left_on='DRUG_CONCEPT_ID',
    right_on='DESCENDANT_CONCEPT_ID',
    how='left'
)

In [27]:
merged['DRUG_EXPOSURE_START_DATE'].isnull().sum()

np.int64(0)

In [20]:
merged.shape

(415641, 18)

In [None]:
merged.head()

In [22]:
merged.to_csv('data/simple2.csv', index=False)

Labeling

In [23]:
df = pd.read_csv('data/simple2.csv')

In [24]:
df.shape

(415641, 18)

In [25]:
unique_patients = df['PERSON_ID'].nunique()
print("Number of unique patients:", unique_patients)

Number of unique patients: 31881


In [26]:
def get_trd_status(patient_df, min_days=56):
    patient_df = patient_df.sort_values('DRUG_EXPOSURE_START_DATE')
    results = []
    curr_ad = None
    curr_start = None
    curr_end = None

    for _, row in patient_df.iterrows():
        # Use ancestor name instead of full concept name
        ad = row.get('ANCESTOR_CONCEPT_NAME')
        start = pd.to_datetime(row['DRUG_EXPOSURE_START_DATE'])
        end = pd.to_datetime(row['DRUG_EXPOSURE_END_DATE'])

        if curr_ad is None:
            curr_ad = ad
            curr_start = start
            curr_end = end
        elif ad == curr_ad:
            # If same drug, extend exposure window
            curr_end = max(curr_end, end)
        else:
            # New drug, save previous
            duration = (curr_end - curr_start).days
            results.append((curr_ad, duration))
            curr_ad = ad
            curr_start = start
            curr_end = end

    # Save the last one
    if curr_ad is not None:
        duration = (curr_end - curr_start).days
        results.append((curr_ad, duration))

    # Count number of unique adequate trials
    count = sum(1 for ad, dur in results if dur >= min_days)
    return 1 if count > 2 else 0

In [27]:
trd_status = (
    df.groupby('PERSON_ID').apply(get_trd_status).reset_index().rename(columns={0: 'TRD_STATUS'})
)
df = df.merge(trd_status, on='PERSON_ID', how='left')

  df.groupby('PERSON_ID').apply(get_trd_status).reset_index().rename(columns={0: 'TRD_STATUS'})


In [28]:
df_trd = df[df['TRD_STATUS'] == 1].copy()
df_nontrd = df[df['TRD_STATUS'] == 0].copy()

In [None]:
df_trd.head()

In [30]:
df_trd.shape

(184979, 19)

In [31]:
df_nontrd.shape

(230662, 19)

In [32]:
df_trd.to_csv('data/simple2_trd.csv', index=False)
df_nontrd.to_csv('data/simple2_nontrd.csv', index=False)

In [33]:
unique_trd_patients = df_trd['PERSON_ID'].nunique()
print("Number of unique trd patients:", unique_trd_patients)
unique_nontrd_patients = df_nontrd['PERSON_ID'].nunique()
print("Number of unique nontrd patients:", unique_nontrd_patients)

Number of unique trd patients: 4630
Number of unique nontrd patients: 27251


Categorization

In [56]:
path = 'data/AD_ancestors.csv'
anc_df = pd.read_csv(path)

In [57]:
trd_path = "data/simple2_trd.csv"
nontrd_path = "data/simple2_nontrd.csv"

df_trd = pd.read_csv(trd_path)
df_nontrd = pd.read_csv(nontrd_path)

In [58]:
df_trd.shape

(184979, 19)

In [59]:
df_nontrd.shape

(230662, 19)

In [62]:
print(anc_df.columns.tolist())

['ANCESTOR_CONCEPT_NAME', 'category', 'CONCEPT_ID']


In [63]:
anc_df['ANCESTOR_CONCEPT_NAME'] = (
    anc_df['ANCESTOR_CONCEPT_NAME']
      .str.strip()
      .str.lower()
)

mapping_df = anc_df.copy()  


In [64]:
mapping_df

Unnamed: 0,ANCESTOR_CONCEPT_NAME,category,CONCEPT_ID
0,citalopram,SSRI,797617
1,clopenthixol/zuclopenthixol,Augmentation,19010886
2,sertraline,SSRI,797617
3,escitalopram,SSRI,715939
4,fluoxetine,SSRI,755695
5,fluvoxamine,SSRI,751412
6,nefazodone,Atypical Antidepressants,714684
7,paroxetine,SSRI,722031
8,duloxetine,SNRIs,715259
9,venlafaxine,SNRIs,743670


In [65]:
dupes = (
    mapping_df['ANCESTOR_CONCEPT_NAME']
      .value_counts()
      .loc[lambda s: s > 1]
)
print(dupes.head())


Series([], Name: count, dtype: int64)


In [66]:
df_trd['ANCESTOR_CONCEPT_NAME'] = df_trd['ANCESTOR_CONCEPT_NAME'].str.strip().str.lower()
df_nontrd['ANCESTOR_CONCEPT_NAME'] = df_nontrd['ANCESTOR_CONCEPT_NAME'].str.strip().str.lower()

In [67]:
df_trd.shape

(184979, 19)

In [68]:
df_nontrd.shape

(230662, 19)

In [69]:
df_trd_cat = (
    df_trd
    .merge(mapping_df, on='ANCESTOR_CONCEPT_NAME', how='left')
    .loc[:, ['PERSON_ID', 'category', 'DRUG_EXPOSURE_START_DATE', 'DRUG_EXPOSURE_END_DATE']]
)


df_nontrd_cat = (
    df_nontrd
    .merge(mapping_df, on='ANCESTOR_CONCEPT_NAME', how='left')
    .loc[:, ['PERSON_ID', 'category', 'DRUG_EXPOSURE_START_DATE', 'DRUG_EXPOSURE_END_DATE']]
)

In [70]:
df_trd_cat.shape

(184979, 4)

In [71]:
df_nontrd_cat.shape

(230662, 4)

In [72]:
trd_path = "data/trd_category.csv"
nontrd_path = "data/nontrd_category.csv"

df_trd_cat.to_csv(trd_path, index=False)
df_nontrd_cat.to_csv(nontrd_path, index=False)

Event Log Generation

In [73]:
trd_path = "data/trd_category.csv"
nontrd_path = "data/nontrd_category.csv"

cat_trd_df = pd.read_csv(trd_path)
cat_nontrd_df = pd.read_csv(nontrd_path)

In [74]:
cols = ['PERSON_ID', 'category', 'DRUG_EXPOSURE_START_DATE']


print("cat_trd_df missing values:")
print(cat_trd_df[cols].isnull().sum())

print("\ncat_nontrd_df missing values:")
print(cat_nontrd_df[cols].isnull().sum())

cat_trd_df missing values:
PERSON_ID                   0
category                    0
DRUG_EXPOSURE_START_DATE    0
dtype: int64

cat_nontrd_df missing values:
PERSON_ID                   0
category                    0
DRUG_EXPOSURE_START_DATE    0
dtype: int64


In [75]:
trd_event_log = pm4py.format_dataframe(cat_trd_df, case_id='PERSON_ID', activity_key='category', timestamp_key='DRUG_EXPOSURE_START_DATE')
trd_event_log = pm4py.convert_to_event_log(trd_event_log)

In [76]:
nontrd_event_log = pm4py.format_dataframe(cat_nontrd_df, case_id='PERSON_ID', activity_key='category', timestamp_key='DRUG_EXPOSURE_START_DATE')
nontrd_event_log = pm4py.convert_to_event_log(nontrd_event_log)

In [77]:
xes_exporter.apply(trd_event_log, 'data/trd_event_log.xes')
xes_exporter.apply(nontrd_event_log, 'data/nontrd_event_log.xes')

exporting log, completed traces ::   0%|          | 0/4630 [00:00<?, ?it/s]

exporting log, completed traces ::   0%|          | 0/27251 [00:00<?, ?it/s]

In [78]:
cat_trd_df.shape

(184979, 7)

In [79]:
cat_nontrd_df.shape

(230662, 7)