In [2]:
import pandas as pd

In [5]:
# Preview the clinical table (pandas truncates the display of long frames).
# NOTE(review): in the cells visible here, `data` is only assigned by the later
# read_csv cell, so this cell relies on prior kernel state — confirm run order.
data

Unnamed: 0,PatientID,age,clinical.T.Stage,Clinical.N.Stage,Clinical.M.Stage,Overall.Stage,Histology,gender,Survival.time,deadstatus.event
0,LUNG1-001,78.7515,2.0,3,0,IIIb,large cell,male,2165,1
1,LUNG1-002,83.8001,2.0,0,0,I,squamous cell carcinoma,male,155,1
2,LUNG1-003,68.1807,2.0,3,0,IIIb,large cell,male,256,1
3,LUNG1-004,70.8802,2.0,1,0,II,squamous cell carcinoma,male,141,1
4,LUNG1-005,80.4819,4.0,2,0,IIIb,squamous cell carcinoma,male,353,1
...,...,...,...,...,...,...,...,...,...,...
417,LUNG1-418,53.6712,2.0,0,0,I,adenocarcinoma,male,346,1
418,LUNG1-419,66.5096,4.0,1,0,IIIb,squamous cell carcinoma,male,2772,0
419,LUNG1-420,73.3808,2.0,1,0,II,squamous cell carcinoma,male,2429,1
420,LUNG1-421,61.7041,2.0,2,0,IIIa,squamous cell carcinoma,female,369,1


In [13]:
# Show the rows whose clinical T stage is recorded as 5.
data.loc[data['clinical.T.Stage'].eq(5)]

Unnamed: 0,PatientID,age,clinical.T.Stage,Clinical.N.Stage,Clinical.M.Stage,Overall.Stage,Histology,gender,Survival.time,deadstatus.event
55,LUNG1-056,,5.0,2,0,IIIa,,female,4454,0
271,LUNG1-272,60.1396,5.0,2,0,,large cell,male,288,1


In [15]:
# Load the clinical CSV into `data`; the path is relative to the notebook's
# working directory.
data = pd.read_csv(
    "../data/NSCLC-Radiomics-Lung1.clinical-version3-Oct-2019.csv"
)

def calculate_percentage(value, total):
    """Return ``value`` as a share of ``total``, formatted like ``"12.3%"``.

    Args:
        value: Numerator (count of interest).
        total: Denominator (overall count).

    Returns:
        A string with the percentage rounded to one decimal place and a
        trailing ``%`` sign.
    """
    share = value / total * 100
    return "{:.1f}%".format(share)

# Cohort-level summary statistics for the clinical table in `data`.
#
# Denominators: the rows built with calculate_percentage divide by the full
# row count, whereas the value_counts(normalize=True) shares are relative to
# the non-missing entries of their column (value_counts drops NaN by default).
total_patients = len(data)
mean_age = data['age'].mean()
std_age = data['age'].std()

gender_counts = data['gender'].value_counts()
# .get(..., 0) keeps the summary working when a category is absent from the
# column instead of raising KeyError.
gender_distribution = (
    f"{calculate_percentage(gender_counts.get('male', 0), total_patients)}"
    f"/{calculate_percentage(gender_counts.get('female', 0), total_patients)}"
)

deceased_percentage = calculate_percentage(data['deadstatus.event'].sum(), total_patients)

# NOTE(review): these three rows are derived from staging thresholds
# (N >= 2, T >= 4, M == 1), not from follow-up outcome columns — confirm that
# the "failure"/"metastasis" row labels describe what is intended.
nodal_failure_percentage = calculate_percentage((data['Clinical.N.Stage'] >= 2).sum(), total_patients)
local_failure_percentage = calculate_percentage((data['clinical.T.Stage'] >= 4).sum(), total_patients)
metastasis_percentage = calculate_percentage((data['Clinical.M.Stage'] == 1).sum(), total_patients)

t_stage_distribution = data['clinical.T.Stage'].value_counts(normalize=True).sort_index().apply(lambda x: f"{x:.1%}").to_dict()

# Keep the raw shares so the table row can be assembled in a fixed order;
# the formatted dict keeps its original name and contents for later cells.
histology_share = data['Histology'].value_counts(normalize=True)
histology_distribution = histology_share.apply(lambda x: f"{x:.1%}").to_dict()

# The histology row label names three categories explicitly plus "others".
# Build the values in exactly that order rather than relying on value_counts'
# frequency ordering, so each value always sits under its own name. Every
# category that is not explicitly named is pooled into the "others" slot.
NAMED_HISTOLOGIES = ['squamous cell carcinoma', 'large cell', 'adenocarcinoma']
others_share = histology_share.drop(labels=NAMED_HISTOLOGIES, errors='ignore').sum()
histology_row = [
    f"{histology_share.get('squamous cell carcinoma', 0.0):.1%}",
    f"{histology_share.get('large cell', 0.0):.1%}",
    f"{others_share:.1%}",
    f"{histology_share.get('adenocarcinoma', 0.0):.1%}",
]

# Prepare table
table = {
    "Age (years)": f"{mean_age:.1f} ± {std_age:.1f}",
    "Sex (M/F)": gender_distribution,
    "Deceased": deceased_percentage,
    "Nodal failure": nodal_failure_percentage,
    "Local failure": local_failure_percentage,
    "Metastasis": metastasis_percentage,
    # Only stages 1-4 are reported; any other stage value present in the column
    # is left out of this row. The fallback uses the same one-decimal format as
    # the computed shares.
    "T stage (T1/T2/T3/T4)": " / ".join([t_stage_distribution.get(i, '0.0%') for i in range(1, 5)]),
    "Histology (squamous cell carcinoma/large cell/others/adenocarcinoma)": " / ".join(histology_row),
}

# Convert table to DataFrame for better display
summary_table = pd.DataFrame.from_dict(table, orient='index', columns=["Value"])
# Side effect: writes the summary to data_stats.xlsx in the working directory.
summary_table.to_excel('data_stats.xlsx')


In [14]:
# Inspect the formatted T-stage shares, keyed by stage value and sorted by stage.
# The summary table's T-stage row only reads keys 1-4, so any other key shown
# here does not appear in that row.
t_stage_distribution

{1.0: '22.1%', 2.0: '37.1%', 3.0: '12.6%', 4.0: '27.8%', 5.0: '0.5%'}

In [8]:
# Inspect the formatted histology shares, in the order value_counts returned
# them (most frequent category first).
histology_distribution

{'squamous cell carcinoma': '40.0%',
 'large cell': '30.0%',
 'nos': '16.6%',
 'adenocarcinoma': '13.4%'}