# TM10007 Assignment template

In [None]:
# Uncomment and run the line below when working in a Google Colab environment
#!pip install -q --upgrade git+https://github.com/jveenland/tm10007_ml.git

## Data loading and cleaning

Below are functions to load the dataset of your choice. After that, it is all up to you to create and evaluate a classification method. Beware, there may be missing values in these datasets. Good luck!

In [4]:
import pandas as pd
import sklearn
import matplotlib
import torch
import numpy
import seaborn

# Importing Data loading functions. Uncomment the one you want to use
from worcliver.load_data import load_data

# Loading the data
data = load_data()

# Description of the data
print(f'The number of samples: {len(data.index)}')
print(f'The number of columns: {len(data.columns)}')

# Display basic info
print("First five rows of the dataset:")
print(data.head(), "\n")

print("Dataset info:")
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would emit an extra "None" line after the report.
data.info()
print()

print("Summary statistics:")
print(data.describe(), "\n")

# Counting missing values (per column, then summed over all columns)
missing_values = data.isnull().sum()
total_missing = missing_values.sum()

print(f"Total missing values in the dataset: {total_missing}\n")
# Only list the columns that actually contain missing values
print(f"Missing values per column:\n{missing_values[missing_values > 0]}\n")

# Counting categorical (object dtype) and numerical (everything else) columns
categorical_columns = data.select_dtypes(include=['object']).columns
numerical_columns = data.select_dtypes(exclude=['object']).columns

print(f"Number of categorical columns: {len(categorical_columns)}")
print(f"Number of numerical columns: {len(numerical_columns)}\n")

# Count of each label (benign/malignant)
label_counts = data['label'].value_counts()

print("Label distribution:")
print(label_counts)

The number of samples: 186
The number of columns: 494
First five rows of the dataset:
                 label  PREDICT_original_sf_compactness_avg_2.5D  \
ID                                                                 
Liver-001_0     benign                                  0.878471   
Liver-002_0     benign                                  0.878945   
Liver-003_0     benign                                  0.766162   
Liver-004_0  malignant                                  0.825737   
Liver-005_0  malignant                                  0.828831   

             PREDICT_original_sf_compactness_std_2.5D  \
ID                                                      
Liver-001_0                                  0.023468   
Liver-002_0                                  0.039922   
Liver-003_0                                  0.064140   
Liver-004_0                                  0.062047   
Liver-005_0                                  0.062635   

             PREDICT_original_sf_rad_

In [None]:
# Divide the data into a benign and a malignant subset based on the label column
is_benign = data['label'] == 'benign'
is_malignant = data['label'] == 'malignant'
benign = data.loc[is_benign]
malignant = data.loc[is_malignant]

# Write each subset to its own CSV file, including the row index
for subset, csv_path in ((benign, 'benign_data.csv'), (malignant, 'malignant_data.csv')):
    subset.to_csv(csv_path, index=True)


In [None]:
# Per-class descriptive statistics of the numeric columns.
# Each statistic is computed once and reused for the summary tables below
# (previously mean/std were recomputed and the first results went unused).

# Mean
benign_mean = benign.mean(numeric_only=True)
malignant_mean = malignant.mean(numeric_only=True)

# Standard deviation
benign_std = benign.std(numeric_only=True)
malignant_std = malignant.std(numeric_only=True)

# Print statistics: one row per feature, columns 'mean' and 'std'
benign_stats = pd.DataFrame({
    'mean': benign_mean,
    'std': benign_std
})
print(benign_stats)
# Transposed on disk: rows 'mean'/'std', one column per feature
benign_stats.T.to_csv('benign_stats.csv')


malignant_stats = pd.DataFrame({
    'mean': malignant_mean,
    'std': malignant_std
})
malignant_stats.T.to_csv('malignant_stats.csv')

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from scipy.stats import ttest_ind, mannwhitneyu, shapiro
from statsmodels.stats.multitest import multipletests

# Show the available columns so the feature/label separation below can be checked
print("\nColumns in the dataset:")
print(data.columns)

# Target is the label column; features are every other column.
# Explicit copies keep X and y independent of `data`.
y = data['label'].copy()
X = data.drop(columns=['label']).copy()

# Stratified train-test split (25% held out) with a fixed random seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
    stratify=y,
)

# Report partition sizes and the relative class frequencies in each partition
# ("Klasseverdeling" is Dutch for "class distribution")
train_balance = y_train.value_counts(normalize=True)
test_balance = y_test.value_counts(normalize=True)

print("\nTrain-Test Split:")
print(f"Train set: {len(X_train)} samples")
print(f"Test set: {len(X_test)} samples")
print(f"Klasseverdeling in train:\n{train_balance}")
print(f"Klasseverdeling in test:\n{test_balance}")

# Separate benign and malignant samples of the full dataset.
# These two frames are assigned exactly as before; the statistical screening
# below deliberately does NOT use them (see the training-only frames).
benign = data[data['label'] == 'benign']
malignant = data[data['label'] == 'malignant']

# Feature screening is part of model building, so it may only look at the
# training split. Running the tests on `data` would let the held-out test
# samples influence which features are reported as significant (data leakage).
benign_train = X_train[y_train == 'benign']
malignant_train = X_train[y_train == 'malignant']

ALPHA = 0.05  # significance level for the normality check and the final decision
MIN_SHAPIRO_SAMPLES = 3  # scipy's shapiro() raises for fewer observations

# Perform statistical analysis
results = []      # one {"Feature", "Test", "p_value"} record per feature
group_stats = {}  # feature -> per-class mean/std, attached to significant features later

# Loop through all features (the label column is not part of X_train)
for feature in X_train.columns:
    # Training data for the current feature, missing values removed per class
    benign_values = benign_train[feature].dropna()
    malignant_values = malignant_train[feature].dropna()

    if benign_values.empty or malignant_values.empty:
        # Nothing to compare; NaN keeps the feature out of the significant set
        test_type = "not tested"
        p_value = np.nan
    else:
        # Normality test (Shapiro-Wilk, p <= ALPHA means NOT normally distributed).
        # Groups too small for Shapiro-Wilk are treated as non-normal so that the
        # non-parametric test is used instead of crashing.
        both_normal = False
        if min(len(benign_values), len(malignant_values)) >= MIN_SHAPIRO_SAMPLES:
            _, p_benign = shapiro(benign_values)
            _, p_malignant = shapiro(malignant_values)
            both_normal = p_benign > ALPHA and p_malignant > ALPHA

        # Determine which test to use
        if both_normal:
            test_type = "t-test"
            # Welch's t-test: does not assume equal variances
            _, p_value = ttest_ind(benign_values, malignant_values, equal_var=False)
        else:
            test_type = "Mann-Whitney U-test"
            _, p_value = mannwhitneyu(benign_values, malignant_values, alternative='two-sided')

    # Save results
    results.append({
        "Feature": feature,
        "Test": test_type,
        "p_value": float(p_value)
    })
    group_stats[feature] = {
        "benign_mean": benign_values.mean(),
        "malignant_mean": malignant_values.mean(),
        "benign_std": benign_values.std(),
        "malignant_std": malignant_values.std()
    }

# Convert to DataFrame (explicit columns so an empty result still has a schema)
results_df = pd.DataFrame(results, columns=["Feature", "Test", "p_value"])

# Multiple testing correction (False Discovery Rate - Benjamini-Hochberg).
# NaN p-values (e.g. from constant features) are excluded from the correction:
# a single NaN would otherwise turn every corrected p-value into NaN.
raw_p_values = results_df["p_value"].to_numpy(dtype=float)
testable = np.isfinite(raw_p_values)
p_corrected = np.full(raw_p_values.shape, np.nan)
if testable.any():
    p_corrected[testable] = multipletests(raw_p_values[testable], alpha=ALPHA, method='fdr_bh')[1]
results_df["p_value_corrected"] = p_corrected

# Significance is decided on the corrected p-values; with hundreds of features,
# thresholding raw p-values would report many false positives.
# Comparisons with NaN are False, so untestable features are never selected.
significant_mask = results_df["p_value_corrected"] <= ALPHA
significant_features = [
    {**row, **group_stats[row["Feature"]]}
    for row in results_df.loc[significant_mask].to_dict("records")
]

# Original column order is preserved; the corrected p-value is appended last
significant_columns = [
    "Feature", "Test", "p_value",
    "benign_mean", "malignant_mean", "benign_std", "malignant_std",
    "p_value_corrected"
]
significant_features_df = pd.DataFrame(significant_features, columns=significant_columns)

# Sort significant features by p-value (safe even when no feature is significant)
significant_features_df = significant_features_df.sort_values(by="p_value")

# Print and save results
print("\nTotal Significant Features:")
print(significant_features_df)

# Optional: Save to CSV
significant_features_df.to_csv('significant_features.csv', index=False)
print("\nSignificant features saved to 'significant_features.csv'")

# Quick summary
print(f"\nNumber of significant features: {len(significant_features_df)}")