In [22]:
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf
from scipy import stats

In [None]:
# The CSV and DTA exports of this dataset are not saved 1-1 (their drug labels
# differ), so the Stata file is the one loaded here rather than 'cancer.csv'.
df = pd.read_stata(filepath_or_buffer='data/cancer.dta')

In [59]:
# Quick look at the loaded frame: schema, summary statistics and first rows.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() appended a stray "None" line.
df.info()
print("\n", df.describe())
print("\n", df.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48 entries, 0 to 47
Data columns (total 8 columns):
 #   Column     Non-Null Count  Dtype   
---  ------     --------------  -----   
 0   studytime  48 non-null     int8    
 1   died       48 non-null     category
 2   drug       48 non-null     category
 3   age        48 non-null     int8    
 4   _st        48 non-null     int8    
 5   _d         48 non-null     int8    
 6   _t         48 non-null     int8    
 7   _t0        48 non-null     int8    
dtypes: category(2), int8(6)
memory usage: 772.0 bytes
None

        studytime        age   _st         _d        _t   _t0
count   48.00000  48.000000  48.0  48.000000  48.00000  48.0
mean    15.50000  55.875000   1.0   0.645833  15.50000   0.0
std     10.25629   5.659205   0.0   0.483321  10.25629   0.0
min      1.00000  47.000000   1.0   0.000000   1.00000   0.0
25%      7.75000  50.750000   1.0   0.000000   7.75000   0.0
50%     12.50000  56.000000   1.0   1.000000  12.50000   0.0

In [68]:
# Distinct labels currently present in the drug column.
# NOTE(review): this cell's execution count is later than that of the
# relabelling cell below it, so the recorded output presumably shows the
# labels *after* relabelling — confirm with Restart & Run All.
set(df['drug'].values)

{'Drug A', 'Drug B', 'Placebo'}

In [67]:
# Fix drug labels based on the actual mapping:
#   'Other' -> 'Drug A', 'NA' -> 'Drug B', 'Placebo' stays as it is.
# The target labels also map to themselves so that re-running this cell is a
# no-op; with only the raw labels as keys, a second run would turn every
# 'Drug A' / 'Drug B' row into NaN.
drug_map = {
    'Placebo': 'Placebo',
    'Other': 'Drug A',
    'NA': 'Drug B',
    'Drug A': 'Drug A',
    'Drug B': 'Drug B',
}

# Series.map() silently yields NaN for labels missing from the mapping, so
# fail loudly instead of dropping patients from every later group-by.
unmapped_drugs = set(df['drug'].dropna().unique()) - set(drug_map)
if unmapped_drugs:
    raise ValueError(
        f"Unexpected drug labels: {sorted(str(label) for label in unmapped_drugs)}"
    )

df['drug'] = df['drug'].map(drug_map)

In [69]:
# Convert died from Yes/No to 1/0.
died_codes = {'Yes': 1, 'No': 0}
died_mapped = df['died'].map(died_codes)

# Any label other than Yes/No (or a missing value) maps to NaN. Filling those
# with 0 would silently record the patient as a survivor and bias every death
# count downstream, so stop here and report the offending values instead.
unrecognised = died_mapped.isna()
if unrecognised.any():
    bad_labels = sorted({str(value) for value in df.loc[unrecognised, 'died'].unique()})
    raise ValueError(
        f"Cannot encode 'died' as 1/0; unexpected or missing values: {bad_labels}"
    )

df['died_numeric'] = died_mapped.astype(int)

# Ensure age and studytime are numeric (non-numeric entries become NaN).
df['age'] = pd.to_numeric(df['age'], errors='coerce')
df['studytime'] = pd.to_numeric(df['studytime'], errors='coerce')

In [70]:
# Create age groups: left-closed bins [..., 50), [50, 60), [60, 70), [70, ...).
# pd.cut is used instead of np.select(..., default=np.nan): np.select cannot
# keep a real NaN next to string labels — depending on the NumPy version the
# default is either stringified to 'nan' or rejected with a dtype-promotion
# TypeError. pd.cut leaves a missing age as a genuine missing value.
age_bin_edges = [-np.inf, 50, 60, 70, np.inf]
age_bin_labels = ['Under 50', '50-59', '60-69', '70+']
df['age_group'] = pd.cut(
    df['age'],
    bins=age_bin_edges,
    labels=age_bin_labels,
    right=False,
).astype(object)  # plain strings, so later group-bys only see groups that occur

# Create survival time in years (studytime divided by 12).
df['survival_years'] = df['studytime'] / 12

# Generate patient ID (1-based row number) and enrollment period
# (consecutive blocks of 12 patient IDs: 1-12 -> 1, 13-24 -> 2, ...).
df['patient_id'] = range(1, len(df) + 1)
df['enrollment_period'] = np.ceil(df['patient_id'] / 12)

In [71]:
# Per-drug summary: mean survival (years), mean age, deaths and group size.
# `drug` is a categorical column, so `observed` is passed explicitly: this
# avoids pandas' FutureWarning about the changing default and keeps the table
# limited to drug arms that actually occur in the data.
# Named aggregation ties each output name to its source column and function,
# instead of renaming the flattened MultiIndex columns by position afterwards.
summary_stats = df.groupby('drug', observed=True).agg(
    mean_survival_years=('survival_years', 'mean'),
    mean_age=('age', 'mean'),
    total_died=('died_numeric', 'sum'),
    n_patients=('died_numeric', 'count'),
)
print(summary_stats)

         mean_survival_years   mean_age  total_died  n_patients
drug                                                           
Placebo             0.750000  56.050000          19          20
Drug A              1.244048  56.928571           6          14
Drug B              2.113095  54.571429           6          14


  summary_stats = df.groupby('drug').agg({


In [75]:
# Collapse data by drug and age group: one row per combination that occurs.
# `observed=True` keeps only combinations present in the data and avoids
# pandas' FutureWarning about the changing default for categorical groupers.
# Named aggregation yields the final flat column names directly.
collapsed = (
    df.groupby(['drug', 'age_group'], observed=True)
    .agg(
        mean_survival=('survival_years', 'mean'),
        mean_age=('age', 'mean'),
        n_died=('died_numeric', 'sum'),
        n_patients=('died_numeric', 'count'),
    )
    .reset_index()
)

# Sort to match STATA output order - specify both drug and age_group order.
# '70+' is listed so that, should such a group ever appear, its label is kept;
# pd.Categorical turns any value missing from `categories` into NaN.
# (The STATA output has no 70+ rows.)
drug_order = ['Placebo', 'Drug A', 'Drug B']
age_order = ['Under 50', '50-59', '60-69', '70+']  # Explicit age group order

collapsed['drug'] = pd.Categorical(collapsed['drug'], categories=drug_order, ordered=True)
collapsed['age_group'] = pd.Categorical(collapsed['age_group'], categories=age_order, ordered=True)
collapsed = collapsed.sort_values(['drug', 'age_group'])

  collapsed = df.groupby(['drug', 'age_group']).agg({


In [None]:
# Show the collapsed table, then write it to CSV without the index column so
# it can be compared with the Stata export.
print(collapsed)
collapsed.to_csv(path_or_buf='output/cancer_summary_results_jupyter.csv', index=False)

      drug age_group  mean_survival   mean_age  n_died  n_patients
2  Placebo  Under 50       1.027778  49.000000       3           3
0  Placebo     50-59       0.833333  54.666667      11          12
1  Placebo     60-69       0.383333  63.600000       5           5
5   Drug A  Under 50       1.375000  48.000000       1           2
3   Drug A     50-59       1.476190  54.142857       2           7
4   Drug A     60-69       0.866667  64.400000       3           5
8   Drug B  Under 50       2.277778  48.333333       0           3
6   Drug B     50-59       1.968750  54.625000       5           8
7   Drug B     60-69       2.333333  60.666667       1           3


In [None]:
# Final check - import both CSVs and verify they hold the same values.
jupyter_df = pd.read_csv('output/cancer_summary_results_jupyter.csv')
stata_df = pd.read_csv('output/cancer_summary_results_stata.csv')

# Put both tables in the same row order before comparing position by position.
sort_keys = ['drug', 'age_group']
jupyter_sorted = jupyter_df.sort_values(sort_keys).reset_index(drop=True)
stata_sorted = stata_df.sort_values(sort_keys).reset_index(drop=True)

# A row-count mismatch would otherwise surface as an opaque NumPy shape error
# (or broadcast silently if one table had a single row).
assert len(jupyter_sorted) == len(stata_sorted), (
    f"Row count differs: jupyter={len(jupyter_sorted)}, stata={len(stata_sorted)}"
)

# Numerical comparison - same mean values but w/ small floating point difference.
# The tolerance is chosen just above the differences seen so far (on the order
# of 1e-6); anything larger makes the cell fail instead of only being printed.
MAX_ABS_DIFF = 1e-5
for col in ['mean_survival', 'mean_age']:
    abs_diff = np.abs(jupyter_sorted[col].values - stata_sorted[col].values)
    max_diff = np.max(abs_diff)
    mean_diff = np.mean(abs_diff)
    print(f"{col}: max_diff={max_diff:.8f}, mean_diff={mean_diff:.8f}")
    # A NaN difference also fails this comparison, which is intended.
    assert max_diff <= MAX_ABS_DIFF, (
        f"{col} differs from the Stata output by up to {max_diff} (> {MAX_ABS_DIFF})"
    )

mean_survival: max_diff=0.00000008, mean_diff=0.00000003
mean_age: max_diff=0.00000200, mean_diff=0.00000090
