Migrant cohort validation study

In [12]:
import pandas as pd

# Read the cohort extract; pandas infers gzip compression from the ".gz" suffix.
df = pd.read_csv("/workspaces/migrant-cohort-validation/output/dataset.csv.gz")

# --- 1. Continuous variables: one row per variable, selected describe() stats
numeric_cols = ['number_of_migration_codes', 'time_to_first_migration_code']

# describe() statistic -> display heading; dict order fixes the column order.
stat_headings = {
    'count': 'Count',
    'mean': 'Mean',
    'std': 'SD',
    'min': 'Min',
    '50%': 'Median',
    'max': 'Max',
}
numeric_summary = (
    df[numeric_cols]
    .describe()
    .T[list(stat_headings)]
    .rename(columns=stat_headings)
)

# --- 2. Categorical variables: frequency and share of all rows
categorical_cols = [
    'sex',
    'latest_ethnicity_group',
    'has_cob_migrant_code',
    'has_asylum_or_refugee_migrant_code',
    'has_interpreter_migrant_code',
]

# Every percentage uses the full row count as its denominator; missing values
# are kept as a category of their own (dropna=False), so each variable's
# percentages cover all rows.
total_rows = len(df)
categorical_summary = []
for col in categorical_cols:
    counts = df[col].value_counts(dropna=False)
    # Name the columns after the variable up front so that the frames stay
    # distinguishable once they are placed side by side.
    freq = counts.to_frame(name=f'{col} Count')
    freq[f'{col} Percent'] = counts * 100 / total_rows
    categorical_summary.append(freq)

# --- 3. Assemble the combined table

# Side-by-side concatenation aligns the per-variable frames on their category
# labels (outer join): a label that occurs in more than one variable ends up
# on a single shared row.
categorical_table = pd.concat(categorical_summary, axis=1)

# Numeric rows go on top, category rows below; cells with no value for a given
# row/column combination are NaN.
final_summary = pd.concat([numeric_summary, categorical_table])

# --- 4. Show the result (last expression of the cell is rendered by Jupyter)
final_summary


Unnamed: 0,Count,Mean,SD,Min,Median,Max,sex Count,sex Percent,latest_ethnicity_group Count,latest_ethnicity_group Percent,has_cob_migrant_code Count,has_cob_migrant_code Percent,has_asylum_or_refugee_migrant_code Count,has_asylum_or_refugee_migrant_code Percent,has_interpreter_migrant_code Count,has_interpreter_migrant_code Percent
number_of_migration_codes,1000.0,4.305,3.909893,1.0,3.0,29.0,,,,,,,,,,
time_to_first_migration_code,997.0,6111.237713,7368.420385,0.0,3267.0,44886.0,,,,,,,,,,
male,,,,,,,512.0,51.2,,,,,,,,
female,,,,,,,488.0,48.8,,,,,,,,
,,,,,,,,,505.0,50.5,,,,,,
5.0,,,,,,,,,134.0,13.4,,,,,,
1.0,,,,,,,,,125.0,12.5,,,,,,
4.0,,,,,,,,,99.0,9.9,,,,,,
3.0,,,,,,,,,92.0,9.2,,,,,,
2.0,,,,,,,,,45.0,4.5,,,,,,
