# TM10007 Assignment template

In [None]:
# Run this to use from colab environment
#!pip install -q --upgrade git+https://github.com/jveenland/tm10007_ml.git

## Data loading and cleaning

Below are functions to load the dataset of your choice. After that, it is all up to you to create and evaluate a classification method. Beware, there may be missing values in these datasets. Good luck!

In [10]:
#Load packages

import pandas as pd
import numpy as np
from sklearn.decomposition import PCA

# Classifiers
from sklearn import model_selection


# Importing Data loading functions. Uncomment the one you want to use
from worcliver.load_data import load_data

from scipy.stats import ttest_ind, mannwhitneyu, shapiro
from statsmodels.stats.multitest import multipletests


In [None]:
# Loading the data
data = load_data()


In [None]:
# Description of the data
print(f'The number of samples: {len(data.index)}')
print(f'The number of columns: {len(data.columns)}')

# Display basic info
print("First five rows of the dataset:")
print(data.head(), "\n")

print("Dataset info:")
print(data.info(), "\n")

print("Summary statistics:")
print(data.describe(), "\n")

# Counting missing values
missing_values = data.isnull().sum()
total_missing = missing_values.sum()

print(f"Total missing values in the dataset: {total_missing}\n")
print(f"Missing values per column:\n{missing_values[missing_values > 0]}\n")

# Counting categorical and numerical columns
categorical_columns = data.select_dtypes(include=['object']).columns
numerical_columns = data.select_dtypes(exclude=['object']).columns

print(f"Number of categorical columns: {len(categorical_columns)}")
print(f"Number of numerical columns: {len(numerical_columns)}\n")

# Count of each label (benign/malignant)
label_counts = data['label'].value_counts()

print("Label distribution:")
print(label_counts)


In [4]:
# Configure numerical data
# Create a copy of the dataset
num_data = data.copy()

# Transform labels: benign -> 1, malignant -> 0
num_data['label'] = num_data['label'].map({'benign': 1, 'malignant': 0})
print(num_data)

             label  PREDICT_original_sf_compactness_avg_2.5D  \
ID                                                             
Liver-001_0      1                                  0.878471   
Liver-002_0      1                                  0.878945   
Liver-003_0      1                                  0.766162   
Liver-004_0      0                                  0.825737   
Liver-005_0      0                                  0.828831   
...            ...                                       ...   
Liver-182_0      0                                  0.720708   
Liver-183_0      1                                  0.784611   
Liver-184_0      1                                  0.811192   
Liver-185_0      0                                  0.753895   
Liver-186_0      0                                  0.755811   

             PREDICT_original_sf_compactness_std_2.5D  \
ID                                                      
Liver-001_0                                  0.023468

In [6]:
#Splitting data in test and train set 
y = num_data['label']
x = num_data[:]

#x is features, y  = maligne / benign
x_train, x_test, y_train, y_test = model_selection.train_test_split(x, y, test_size=0.2, stratify=y, random_state=42)

#print the shape of the data sets  
print(f'The shape of the train data: {x_train.shape}')
print(f'The shape of the test data: {x_test.shape}')


The shape of the train data: (148, 494)
The shape of the test data: (38, 494)


In [None]:
# #Data deviden in bengign and malignant 
# benign = data[data['label'] == 'benign']
# malignant = data[data['label'] == 'malignant']

# benign.to_csv('benign_data.csv', index=True)
# malignant.to_csv('malignant_data.csv', index=True)


In [None]:
#Decription 
## Dit nog aanpassen naar berekenen op basis van 'label' ipv deze nieuwe csv files
# Mean
benign_mean = benign.mean(numeric_only=True)
malignant_mean = malignant.mean(numeric_only=True)

# Standard deviation
benign_std = benign.std(numeric_only=True)
malignant_std = malignant.std(numeric_only=True)

#Print statistics
benign_stats = pd.DataFrame({
    'mean': benign.mean(numeric_only=True),
    'std': benign.std(numeric_only=True)
})
print(benign_stats)
benign_stats.T.to_csv('benign_stats.csv')


malignant_stats = pd.DataFrame({
    'mean': malignant.mean(numeric_only=True),
    'std': malignant.std(numeric_only=True)
})
malignant_stats.T.to_csv('malignant_stats.csv')

In [None]:
# Select significant features

# Add the label column back to x_train
x_train_with_label = x_train.copy()
x_train_with_label['label'] = y_train

# Separate benign and malignant samples in training data
benign = x_train_with_label[x_train_with_label['label'] == 1]
malignant = x_train_with_label[x_train_with_label['label'] == 0]

# Perform statistical analysis
features = []
sig_features = []

# Loop through all features except 'label'
for feature in x_train.columns:
    # Data for current feature
    benign_values = benign[feature].dropna()
    malignant_values = malignant[feature].dropna()

    # Normality test (Shapiro-Wilk test, p < 0.05 means NOT normally distributed)
    _, p_benign = shapiro(benign_values)
    _, p_malignant = shapiro(malignant_values)

    # Determine which test to use
    if p_benign > 0.05 and p_malignant > 0.05:  # Both distributions are normal
        test_type = "t-test"
        stat, p_value = ttest_ind(benign_values, malignant_values, equal_var=False)  # Welch's t-test
    else:
        test_type = "Mann-Whitney U-test"
        stat, p_value = mannwhitneyu(benign_values, malignant_values, alternative='two-sided')

    # Save results
    feature_entry = {
        "Feature": feature, 
        "Test": test_type, 
        "p_value": p_value
    }
    features.append(feature_entry)

    # Check for significance (p <= 0.05)
    if p_value <= 0.05:
        sig_entry = feature_entry.copy()
        # sig_entry['benign_mean'] = benign_values.mean()
        # sig_entry['malignant_mean'] = malignant_values.mean()
        # sig_entry['benign_std'] = benign_values.std()
        # sig_entry['malignant_std'] = malignant_values.std()
        sig_features.append(sig_entry)

# Convert to DataFrames
features_df = pd.DataFrame(features)
sig_features_df = pd.DataFrame(sig_features)

# Multiple testing correction (False Discovery Rate - Benjamini-Hochberg)
## checken of nodig
_, p_corrected, _, _ = multipletests(features_df["p_value"], method='fdr_bh')
features_df["p_value_corrected"] = p_corrected

# Sort significant features by p-value
sig_features_df = sig_features_df.sort_values(by="p_value")

# Print and save results
print("\nTotal Significant Features:")
print(sig_features_df)
print(f"\nNumber of significant features: {len(sig_features_df)}")

# Get the list of significant feature names
sig_feature_names = sig_features_df['Feature'].tolist()

# Select only significant features for training data
x_train_sig = x_train[sig_feature_names].copy()

# # Print the shape of the datasets
print("\nDataset Shapes:")
print(f'Original training data shape: {x_train.shape}')
print(f'Significant features training data shape: {x_train_sig.shape}')

# # Print significant feature names
# print("\nSignificant Features:")
# print(sig_feature_names)

#sig_features_df.to_csv('sig.csv')
x_train_sig.to_csv('x_train_sig.csv')


  res = hypotest_fun_out(*samples, **kwds)
  res = hypotest_fun_out(*samples, **kwds)



Total Significant Features:
                                               Feature                 Test  \
0                                                label               t-test   
51                PREDICT_original_tf_LBP_peak_R15_P36               t-test   
141  PREDICT_original_vf_Frangi_edge_quartile_range...  Mann-Whitney U-test   
135  PREDICT_original_vf_Frangi_full_quartile_range...  Mann-Whitney U-test   
147  PREDICT_original_vf_Frangi_inner_range_SR(1.0,...  Mann-Whitney U-test   
..                                                 ...                  ...   
91            PREDICT_original_tf_Gabor_min_F0.2_A0.79  Mann-Whitney U-test   
13                            PREDICT_original_hf_peak  Mann-Whitney U-test   
52              PREDICT_original_tf_LBP_energy_R15_P36  Mann-Whitney U-test   
28                    PREDICT_original_logf_min_sigma5  Mann-Whitney U-test   
121         PREDICT_original_tf_Gabor_range_F0.5_A1.57  Mann-Whitney U-test   

          p_value  
0 