# TM10007 Assignment template

In [None]:
# Run this to use from colab environment
#!pip install -q --upgrade git+https://github.com/jveenland/tm10007_ml.git

## Data loading and cleaning

Below are functions to load the dataset of your choice. After that, it is all up to you to create and evaluate a classification method. Beware, there may be missing values in these datasets. Good luck!

In [None]:
import pandas as pd
import sklearn
import matplotlib
import torch
import numpy
import seaborn

# Importing Data loading functions. Uncomment the one you want to use
from worcliver.load_data import load_data

# Loading the data
data = load_data()

# Description of the data
print(f'The number of samples: {len(data.index)}')
print(f'The number of columns: {len(data.columns)}')

# Display basic info
# NOTE: the separator must be "\n"; "\5" is an octal escape that emits the
# control character U+0005 instead of a blank line.
print("First five rows of the dataset:")
print(data.head(), "\n")

print("Dataset info:")
# DataFrame.info() writes its report itself and returns None, so it is called
# directly instead of being wrapped in print() (which would print "None").
data.info()
print()

print("Summary statistics:")
print(data.describe(), "\n")

print("Checking for missing values:")
print(data.isnull().sum(), "\n")

# Counting the number of benign and malignant samples
label_counts = data['label'].value_counts()

print("Label distribution:")
print(label_counts)


The number of samples: 186
The number of columns: 494
The number of missing data: 0
                                                          mean          std
PREDICT_original_sf_compactness_avg_2.5D              0.805954     0.076541
PREDICT_original_sf_compactness_std_2.5D              0.068240     0.050810
PREDICT_original_sf_rad_dist_avg_2.5D                28.539695    11.786114
PREDICT_original_sf_rad_dist_std_2.5D                 4.498306     2.419078
PREDICT_original_sf_roughness_avg_2.5D                8.326638     3.338394
...                                                        ...          ...
PREDICT_original_phasef_phasesym_peak_position_...    0.000000     0.000000
PREDICT_original_phasef_phasesym_range_WL3_N5         0.434737     0.103427
PREDICT_original_phasef_phasesym_energy_WL3_N5      681.212476  1050.837965
PREDICT_original_phasef_phasesym_quartile_range...    0.088811     0.094786
PREDICT_original_phasef_phasesym_entropy_WL3_N5      11.990334     1.849521

[49

In [None]:
# Configuring numerical data

# Function to check column types
def check_numerical_columns(df):
    """Return the labels of the columns in ``df`` that cannot be parsed as numbers.

    Every column is passed to ``pd.to_numeric(..., errors='raise')``; a column
    is reported when that call raises ``ValueError`` or ``TypeError``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are checked.

    Returns
    -------
    list
        Column labels, in column order, for which the conversion failed.
    """
    def _converts(label):
        # True when the whole column converts without raising.
        try:
            pd.to_numeric(df[label], errors='raise')
        except (ValueError, TypeError):
            return False
        return True

    return [label for label in df.columns if not _converts(label)]

# Identify the columns that pd.to_numeric cannot convert
non_numerical_columns = check_numerical_columns(data)

# Report the findings
total_columns = len(data.columns)
print("Total columns:", total_columns)
print("\nNon-Numerical Columns:")
print(f"Count: {len(non_numerical_columns)}")
print(non_numerical_columns)

# Numeric label encoding: benign -> 1, malignant -> 0
label_encoding = {'benign': 1, 'malignant': 0}

# assign() returns a new frame, so the original `data` is left untouched
num_data = data.assign(label=data['label'].map(label_encoding))

num_data.to_csv('num_data.csv', index=False)


Total columns: 494

Non-Numerical Columns:
Count: 1
['label']
Original unique labels:
['benign' 'malignant']

Transformed unique labels:
[1 0]

Class distribution:
label
0    94
1    92
Name: count, dtype: int64


In [None]:
# Splitting the data into train and test sets


In [None]:
# Divide the data into benign and malignant samples
is_benign = data['label'] == 'benign'
is_malignant = data['label'] == 'malignant'
benign = data[is_benign]
malignant = data[is_malignant]

# Write each subset to its own CSV file, keeping the row index
for subset, path in ((benign, 'benign_data.csv'), (malignant, 'malignant_data.csv')):
    subset.to_csv(path, index=True)


In [None]:
# Descriptive statistics per class (numeric columns only)

# Mean
benign_mean = benign.mean(numeric_only=True)
malignant_mean = malignant.mean(numeric_only=True)

# Standard deviation
benign_std = benign.std(numeric_only=True)
malignant_std = malignant.std(numeric_only=True)

# Summary tables indexed by column name, with a 'mean' and a 'std' column.
# The series computed above are reused instead of being recomputed.
benign_stats = pd.DataFrame({'mean': benign_mean, 'std': benign_std})
print(benign_stats)
# Transposed on export: the CSV has one row per statistic
benign_stats.T.to_csv('benign_stats.csv')

malignant_stats = pd.DataFrame({'mean': malignant_mean, 'std': malignant_std})
malignant_stats.T.to_csv('malignant_stats.csv')

In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from scipy.stats import ttest_ind, mannwhitneyu, shapiro
from statsmodels.stats.multitest import multipletests

# Check column names to ensure correct splitting
print("\nColumns in the dataset:")
print(data.columns)

# Significance level shared by the normality tests and the group comparison
ALPHA = 0.05

# Separate features and labels
# Use .copy() to avoid SettingWithCopyWarning
X = data.drop(columns=['label']).copy()  # Drop label column for features
y = data['label'].copy()  # Use label column for target

# Train-test split with stratification
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42, stratify=y
)

# Verify split
print("\nTrain-Test Split:")
print(f"Train set: {len(X_train)} samples")
print(f"Test set: {len(X_test)} samples")
print(f"Klasseverdeling in train:\n{y_train.value_counts(normalize=True)}")
print(f"Klasseverdeling in test:\n{y_test.value_counts(normalize=True)}")

# Separate benign and malignant samples (full dataset, kept for descriptive use)
benign = data[data['label'] == 'benign']
malignant = data[data['label'] == 'malignant']

# Feature screening may only look at the training samples: running the tests on
# the full dataset would leak information from the held-out test set into the
# feature selection.
benign_train = X_train[y_train == 'benign']
malignant_train = X_train[y_train == 'malignant']

# Perform statistical analysis
RESULT_COLUMNS = [
    "Feature", "Test", "p_value",
    "benign_mean", "malignant_mean", "benign_std", "malignant_std",
]
results = []

# Loop through all features except 'label'
for feature in X.columns:
    # Training data for the current feature
    benign_values = benign_train[feature].dropna()
    malignant_values = malignant_train[feature].dropna()

    smallest_group = min(len(benign_values), len(malignant_values))
    has_variation = pd.concat([benign_values, malignant_values]).nunique() > 1

    if smallest_group < 3 or not has_variation:
        # Shapiro-Wilk needs at least 3 observations per group, and a feature
        # with a single distinct value cannot separate the classes. Record it
        # as untested instead of feeding NaN p-values into the correction.
        test_type, p_value = "not tested", np.nan
    else:
        # Normality test (Shapiro-Wilk, p <= ALPHA means NOT normally distributed).
        # A constant group is treated as non-normal so the rank test is used.
        both_normal = all(
            values.nunique() > 1 and shapiro(values)[1] > ALPHA
            for values in (benign_values, malignant_values)
        )
        if both_normal:
            test_type = "t-test"
            # Welch's t-test (no equal-variance assumption)
            _, p_value = ttest_ind(benign_values, malignant_values, equal_var=False)
        else:
            test_type = "Mann-Whitney U-test"
            _, p_value = mannwhitneyu(benign_values, malignant_values, alternative='two-sided')

    # Save results together with the per-class summary of the tested values
    results.append({
        "Feature": feature,
        "Test": test_type,
        "p_value": p_value,
        "benign_mean": benign_values.mean(),
        "malignant_mean": malignant_values.mean(),
        "benign_std": benign_values.std(),
        "malignant_std": malignant_values.std(),
    })

# Convert to DataFrame; explicit columns keep the frame well-formed even when
# there are no features at all.
results_df = pd.DataFrame(results, columns=RESULT_COLUMNS)

# Multiple testing correction (False Discovery Rate - Benjamini-Hochberg),
# applied only to the features that were actually tested.
tested = results_df["p_value"].notna()
results_df["p_value_corrected"] = np.nan
if tested.any():
    _, p_corrected, _, _ = multipletests(
        results_df.loc[tested, "p_value"].to_numpy(), alpha=ALPHA, method='fdr_bh'
    )
    results_df.loc[tested, "p_value_corrected"] = p_corrected

# Significance is decided on the corrected p-values; with hundreds of features
# the raw p-values would flag many false positives.
significant_features_df = (
    results_df[results_df["p_value_corrected"] <= ALPHA]
    .sort_values(by="p_value")
    .reset_index(drop=True)
)
# Kept as a list of dicts for any code that still expects the original variable
significant_features = significant_features_df.to_dict('records')

# Print and save results
print("\nTotal Significant Features:")
print(significant_features_df)

# Optional: Save to CSV
significant_features_df.to_csv('significant_features.csv', index=False)
print("\nSignificant features saved to 'significant_features.csv'")

# Quick summary
print(f"\nNumber of significant features: {len(significant_features_df)}")

ModuleNotFoundError: No module named 'statsmodels'

Total columns: 494

Numerical Columns:
Count: 493
['PREDICT_original_sf_compactness_avg_2.5D', 'PREDICT_original_sf_compactness_std_2.5D', 'PREDICT_original_sf_rad_dist_avg_2.5D', 'PREDICT_original_sf_rad_dist_std_2.5D', 'PREDICT_original_sf_roughness_avg_2.5D', 'PREDICT_original_sf_roughness_std_2.5D', 'PREDICT_original_sf_convexity_avg_2.5D', 'PREDICT_original_sf_convexity_std_2.5D', 'PREDICT_original_sf_cvar_avg_2.5D', 'PREDICT_original_sf_cvar_std_2.5D', 'PREDICT_original_sf_prax_avg_2.5D', 'PREDICT_original_sf_prax_std_2.5D', 'PREDICT_original_sf_evar_avg_2.5D', 'PREDICT_original_sf_evar_std_2.5D', 'PREDICT_original_sf_solidity_avg_2.5D', 'PREDICT_original_sf_solidity_std_2.5D', 'PREDICT_original_sf_area_avg_2.5D', 'PREDICT_original_sf_area_max_2.5D', 'PREDICT_original_sf_area_min_2.5D', 'PREDICT_original_sf_area_std_2.5D', 'PREDICT_original_sf_volume_2.5D', 'PREDICT_original_of_theta_x', 'PREDICT_original_of_theta_y', 'PREDICT_original_of_theta_z', 'PREDICT_original_hf_min', 'PRE