# TM10007 Assignment template

In [None]:
# Run this to use from colab environment
#!pip install -q --upgrade git+https://github.com/jveenland/tm10007_ml.git

## Data loading and cleaning

Below are functions to load the dataset of your choice. After that, it is all up to you to create and evaluate a classification method. Beware, there may be missing values in these datasets. Good luck!

In [10]:
# Data loading functions. Uncomment the one you want to use
from worcliver.load_data import load_data


data = load_data()
print(f'The number of samples: {len(data.index)}')
print(f'The number of columns: {len(data.columns)}')


#Data deviden in bengign and malign
benign = data[data['label'] == 'benign']
malignant = data[data['label'] == 'malignant']

print(benign)
print(malignant)



The number of samples: 186
The number of columns: 494
              label  PREDICT_original_sf_compactness_avg_2.5D  \
ID                                                              
Liver-001_0  benign                                  0.878471   
Liver-002_0  benign                                  0.878945   
Liver-003_0  benign                                  0.766162   
Liver-006_0  benign                                  0.847468   
Liver-007_0  benign                                  0.774819   
...             ...                                       ...   
Liver-177_0  benign                                  0.771360   
Liver-178_0  benign                                  0.889657   
Liver-180_0  benign                                  0.689131   
Liver-183_0  benign                                  0.784611   
Liver-184_0  benign                                  0.811192   

             PREDICT_original_sf_compactness_std_2.5D  \
ID                                         

In [8]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from scipy.stats import ttest_ind, mannwhitneyu, shapiro
from statsmodels.stats.multitest import multipletests

# Check column names to ensure correct splitting
print("\nColumns in the dataset:")
print(data.columns)

# Separate features and labels
# Use .copy() to avoid SettingWithCopyWarning
X = data.drop(columns=['label']).copy()  # Drop label column for features
y = data['label'].copy()  # Use label column for target

# Train-test split with stratification
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42, stratify=y)

# Verify split
print("\nTrain-Test Split:")
print(f"Train set: {len(X_train)} samples")
print(f"Test set: {len(X_test)} samples")
print(f"Klasseverdeling in train:\n{y_train.value_counts(normalize=True)}")
print(f"Klasseverdeling in test:\n{y_test.value_counts(normalize=True)}")

# Separate benign and malignant samples
benign = data[data['label'] == 'benign']
malignant = data[data['label'] == 'malignant']

# Perform statistical analysis
results = []
significant_features = []

# Loop through all features except 'label'
for feature in X.columns:
    # Data for current feature
    benign_values = benign[feature].dropna()
    malignant_values = malignant[feature].dropna()

    # Normality test (Shapiro-Wilk test, p < 0.05 means NOT normally distributed)
    _, p_benign = shapiro(benign_values)
    _, p_malignant = shapiro(malignant_values)

    # Determine which test to use
    if p_benign > 0.05 and p_malignant > 0.05:  # Both distributions are normal
        test_type = "t-test"
        stat, p_value = ttest_ind(benign_values, malignant_values, equal_var=False)  # Welch's t-test
    else:
        test_type = "Mann-Whitney U-test"
        stat, p_value = mannwhitneyu(benign_values, malignant_values, alternative='two-sided')

    # Save results
    result_entry = {
        "Feature": feature, 
        "Test": test_type, 
        "p_value": p_value
    }
    results.append(result_entry)

    # Check for significance (p <= 0.05)
    if p_value <= 0.05:
        significant_entry = result_entry.copy()
        significant_entry['benign_mean'] = benign_values.mean()
        significant_entry['malignant_mean'] = malignant_values.mean()
        significant_entry['benign_std'] = benign_values.std()
        significant_entry['malignant_std'] = malignant_values.std()
        significant_features.append(significant_entry)

# Convert to DataFrames
results_df = pd.DataFrame(results)
significant_features_df = pd.DataFrame(significant_features)

# Multiple testing correction (False Discovery Rate - Benjamini-Hochberg)
_, p_corrected, _, _ = multipletests(results_df["p_value"], method='fdr_bh')
results_df["p_value_corrected"] = p_corrected

# Sort significant features by p-value
significant_features_df = significant_features_df.sort_values(by="p_value")

# Print and save results
print("\nTotal Significant Features:")
print(significant_features_df)

# Optional: Save to CSV
significant_features_df.to_csv('significant_features.csv', index=False)
print("\nSignificant features saved to 'significant_features.csv'")

# Quick summary
print(f"\nNumber of significant features: {len(significant_features_df)}")


Columns in the dataset:
Index(['label', 'PREDICT_original_sf_compactness_avg_2.5D',
       'PREDICT_original_sf_compactness_std_2.5D',
       'PREDICT_original_sf_rad_dist_avg_2.5D',
       'PREDICT_original_sf_rad_dist_std_2.5D',
       'PREDICT_original_sf_roughness_avg_2.5D',
       'PREDICT_original_sf_roughness_std_2.5D',
       'PREDICT_original_sf_convexity_avg_2.5D',
       'PREDICT_original_sf_convexity_std_2.5D',
       'PREDICT_original_sf_cvar_avg_2.5D',
       ...
       'PREDICT_original_phasef_phasesym_median_WL3_N5',
       'PREDICT_original_phasef_phasesym_std_WL3_N5',
       'PREDICT_original_phasef_phasesym_skewness_WL3_N5',
       'PREDICT_original_phasef_phasesym_kurtosis_WL3_N5',
       'PREDICT_original_phasef_phasesym_peak_WL3_N5',
       'PREDICT_original_phasef_phasesym_peak_position_WL3_N5',
       'PREDICT_original_phasef_phasesym_range_WL3_N5',
       'PREDICT_original_phasef_phasesym_energy_WL3_N5',
       'PREDICT_original_phasef_phasesym_quartile_range_

  res = hypotest_fun_out(*samples, **kwds)
  res = hypotest_fun_out(*samples, **kwds)



Total Significant Features:
                                              Feature                 Test  \
34                PREDICT_original_tf_LBP_peak_R3_P12               t-test   
18              PREDICT_original_logf_kurtosis_sigma1  Mann-Whitney U-test   
32                 PREDICT_original_tf_LBP_std_R3_P12  Mann-Whitney U-test   
92  PREDICT_original_vf_Frangi_full_quartile_range...  Mann-Whitney U-test   
98  PREDICT_original_vf_Frangi_edge_quartile_range...  Mann-Whitney U-test   
..                                                ...                  ...   
77  PREDICT_original_tf_Gabor_quartile_range_F0.5_...  Mann-Whitney U-test   
36             PREDICT_original_tf_LBP_entropy_R3_P12  Mann-Whitney U-test   
73           PREDICT_original_tf_Gabor_min_F0.2_A2.36  Mann-Whitney U-test   
48  PREDICT_original_tf_GLCMMS_homogeneityd3.0A2.3...  Mann-Whitney U-test   
35              PREDICT_original_tf_LBP_energy_R3_P12  Mann-Whitney U-test   

         p_value   benign_mean  ma

In [None]:


# Lijst om resultaten op te slaan
results = []

# Over alle numerieke features loopen (label overslaan)
for feature in data.columns:
    if feature == 'label':  # Skip de label kolom
        continue
    
    # Data voor huidige feature
    benign_values = benign[feature].dropna()
    malignant_values = malignant[feature].dropna()

    # Test op normaliteit (Shapiro-Wilk test, p < 0.05 betekent NIET normaal verdeeld)
    _, p_benign = shapiro(benign_values)  # Normaliteitstest voor benign
    _, p_malignant = shapiro(malignant_values)  # Normaliteitstest voor malignant

    # Bepaal welke test uit te voeren
    if p_benign > 0.05 and p_malignant > 0.05:  # Beide verdelingen zijn normaal
        test_type = "t-test"
        stat, p_value = ttest_ind(benign_values, malignant_values, equal_var=False)  # Welch’s t-test
    else:
        test_type = "Mann-Whitney U-test"
        stat, p_value = mannwhitneyu(benign_values, malignant_values, alternative='two-sided')

    # Opslaan van resultaten
    results.append({"Feature": feature, "Test": test_type, "p_value": p_value})

# Omzetten naar DataFrame
results_df = pd.DataFrame(results)

# Multiple testing correctie (False Discovery Rate - Benjamini-Hochberg)
_, p_corrected, _, _ = multipletests(results_df["p_value"], method='fdr_bh')
results_df["p_value_corrected"] = p_corrected

# Sorteren op significantie
sig_results_df = results_df.sort_values(by="p_value_corrected")

# Print de top 10 meest significante features
print(sig_results_df.head(10))

# t-test

# wil... test

# missing data