# Task 1: Download and import dataset

In [1]:
import pandas as pd

In [2]:
# Read the mammographic masses CSV (relative path) into a DataFrame
csv_path = 'mammographic_masses_data.csv'
df = pd.read_csv(csv_path)

# Task 2: Get dataset on screen

In [3]:
# Preview the top five rows of the DataFrame
df.head(n=5)

Unnamed: 0,BA,Age,Shape,Margin,Density,Severity
0,5.0,67.0,3.0,5.0,3.0,1
1,4.0,43.0,1.0,1.0,,1
2,5.0,58.0,4.0,5.0,3.0,1
3,4.0,28.0,1.0,1.0,3.0,0
4,5.0,74.0,1.0,5.0,,1


In [4]:
# Print each column's dtype and non-null count, which exposes the missing values
df.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 961 entries, 0 to 960
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   BA        959 non-null    float64
 1   Age       956 non-null    float64
 2   Shape     930 non-null    float64
 3   Margin    913 non-null    float64
 4   Density   885 non-null    float64
 5   Severity  961 non-null    int64  
dtypes: float64(5), int64(1)
memory usage: 45.2 KB


In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) for every numeric column
df.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,BA,Age,Shape,Margin,Density,Severity
count,959.0,956.0,930.0,913.0,885.0,961.0
mean,4.300313,55.487448,2.721505,2.796276,2.910734,0.463059
std,0.683469,14.480131,1.242792,1.566546,0.380444,0.498893
min,0.0,18.0,1.0,1.0,1.0,0.0
25%,4.0,45.0,2.0,1.0,3.0,0.0
50%,4.0,57.0,3.0,3.0,3.0,0.0
75%,5.0,66.0,4.0,4.0,3.0,1.0
max,6.0,96.0,4.0,5.0,4.0,1.0


In [6]:
# Select the Margin value of every row whose Severity equals 1, via a boolean mask and .loc
severity_is_one = df['Severity'].eq(1)
df.loc[severity_is_one, 'Margin']

0      5.0
1      1.0
2      5.0
4      5.0
8      5.0
      ... 
951    5.0
952    4.0
955    4.0
957    5.0
959    5.0
Name: Margin, Length: 445, dtype: float64

In [7]:
import plotly.express as px

In [8]:
# Scatter plot with the Shape attribute on the x axis and the Age attribute on
# the y axis; the points are coloured by Severity. (The cell previously drew a
# box plot, which did not match the scatter plot requested here.)
fig = px.scatter(df, x='Shape', y='Age', color='Severity',
                 title='Age Distribution by Shape and Severity',
                 labels={'Shape': 'Mass Shape', 'Age': 'Patient Age'})
fig.show()

# Histogram of the Age attribute
fig = px.histogram(df, x='Age', title='Histogram of Age')
fig.show()

# Heatmap of the pairwise correlation matrix of all columns, with the
# coefficient printed in each cell
corr = df.corr()
fig = px.imshow(corr, text_auto=True, title='Correlation Matrix')
fig.show()

# Task 3: Preprocessing

In [9]:
# Count the missing values in each column
df.isna().sum()

BA           2
Age          5
Shape       31
Margin      48
Density     76
Severity     0
dtype: int64

In [10]:
# Preprocess on an independent copy so the original DataFrame stays untouched
df_copy = df.copy(deep=True)

In [11]:
# Keep only the rows that have a BA value
df_copy = df_copy[df_copy['BA'].notna()]

In [12]:
# Fill missing Age values with the column mean.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on the selected column: that form is a chained in-place
# operation, which emits a FutureWarning in pandas 2.x and leaves df_copy
# unchanged once Copy-on-Write is enabled.
df_copy['Age'] = df_copy['Age'].fillna(df_copy['Age'].mean())

In [13]:
# Discard every row that is missing Shape, Margin or Density
required_columns = ['Shape', 'Margin', 'Density']
has_all_required = df_copy[required_columns].notna().all(axis=1)
df_copy = df_copy[has_all_required]

In [14]:

# Draw the Age histogram for the original and for the cleaned DataFrame so the
# effect of the cleaning steps can be compared.
fig1, fig2 = (
    px.histogram(frame, x='Age', title=title)
    for frame, title in (
        (df, 'Histogram of Age (Original Data)'),
        (df_copy, 'Histogram of Age (Cleaned Data)'),
    )
)

for figure in (fig1, fig2):
    figure.show()

In [15]:
def normalize(column):
    """Min-max scale ``column`` so its values lie in the range [0, 1].

    Parameters
    ----------
    column : pandas.Series
        Numeric values to rescale. Any object offering ``min()``, ``max()``
        and element-wise arithmetic is handled the same way.

    Returns
    -------
    Same kind of object as ``column``
        ``(column - min) / (max - min)``. When every value is identical
        (``max == min``) the result is 0.0 for each value instead of the NaN
        that a 0 / 0 division would produce. Missing values stay missing.
    """
    min_val = column.min()
    max_val = column.max()
    value_range = max_val - min_val
    shifted = column - min_val

    # A constant column has a zero range. ``shifted`` is then already all
    # zeros, so only convert it to floating point instead of dividing by zero.
    # The ndim check restricts this guard to scalar ranges; non-scalar ranges
    # (e.g. per-column ranges of a DataFrame) keep the plain division.
    if getattr(value_range, "ndim", 0) == 0 and value_range == 0:
        return shifted / 1.0

    return shifted / value_range


In [16]:
# Separate frame for the normalized values; the cleaned frame is left as is
df_copy_normalized = df_copy.copy(deep=True)

In [17]:

# Min-max scale the Age, BA and Density columns of the cleaned frame
for column_name in ('Age', 'BA', 'Density'):
    df_copy_normalized[column_name] = normalize(df_copy[column_name])

In [18]:
# Display the DataFrame whose Age, BA and Density columns were min-max scaled
df_copy_normalized

Unnamed: 0,BA,Age,Shape,Margin,Density,Severity
0,0.833333,0.628205,3.0,5.0,0.666667,1
2,0.833333,0.512821,4.0,5.0,0.666667,1
3,0.666667,0.128205,1.0,1.0,0.666667,0
8,0.833333,0.500000,1.0,5.0,0.666667,1
10,0.833333,0.743590,1.0,4.0,0.666667,1
...,...,...,...,...,...,...
956,0.666667,0.371795,2.0,1.0,0.666667,0
957,0.666667,0.487179,4.0,5.0,0.666667,1
958,0.666667,0.589744,4.0,5.0,0.666667,0
959,0.833333,0.615385,4.0,5.0,0.666667,1


In [19]:
# Compare the Age distribution of the cleaned data before and after min-max
# normalization. df_copy holds the cleaned data, not the original data, so the
# first title is labelled accordingly.
fig1 = px.histogram(df_copy, x='Age', title='Histogram of Age (Cleaned Data)')
fig2 = px.histogram(df_copy_normalized, x='Age', title='Histogram of Age (Normalized Data)')
fig1.show()
fig2.show()

# Task 4: Feature engineering


In [20]:
# Import required modules
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.preprocessing import LabelEncoder
import plotly.graph_objects as go

# Number of features SelectKBest keeps. The elimination threshold further down
# is derived from this value so the two can never drift apart.
k_best = 3

# Prepare data for feature selection
df_encoded = df_copy_normalized.copy()

# Re-code Shape, Margin and Density as integer class labels.
# NOTE(review): these columns are already numeric (float64), and scikit-learn
# documents LabelEncoder for target values rather than features - confirm that
# this re-coding is actually wanted.
le_shape = LabelEncoder()
le_margin = LabelEncoder()
le_density = LabelEncoder()

df_encoded['Shape'] = le_shape.fit_transform(df_encoded['Shape'].astype(str))
df_encoded['Margin'] = le_margin.fit_transform(df_encoded['Margin'].astype(str))
df_encoded['Density'] = le_density.fit_transform(df_encoded['Density'].astype(str))

# Separate features (X) and target (y)
X = df_encoded.drop('Severity', axis=1)
y = df_encoded['Severity']

# Score every feature with the ANOVA F-test and keep the k_best highest
selector = SelectKBest(score_func=f_classif, k=k_best)
X_selected = selector.fit_transform(X, y)

# Get results
feature_scores = selector.scores_
selected_mask = selector.get_support()
selected_features = X.columns[selected_mask]
eliminated_features = X.columns[~selected_mask]

# F-score keyed by feature name, so the plots can look scores up directly
# instead of searching the column list for every feature
score_by_feature = dict(zip(X.columns, feature_scores))

# Elimination threshold = the k_best-th highest score, i.e. the lowest score
# that is still selected
threshold = sorted(feature_scores, reverse=True)[k_best - 1]

# Print results
print("Feature Selection Results:")
print("Selected Features:", list(selected_features))
print("Eliminated Features:", list(eliminated_features))
print(f"Elimination Threshold: {threshold:.2f}")

# Plot 1: F-score per feature, coloured by selection status
selection_status = ['Selected' if keep else 'Eliminated' for keep in selected_mask]

fig1 = px.bar(x=X.columns, y=feature_scores,
              color=selection_status,
              title='Feature Selection Results - F-Scores',
              labels={'x': 'Features', 'y': 'F-Score'},
              color_discrete_map={'Selected': 'green', 'Eliminated': 'red'})
fig1.show()

# Plot 2: selected and eliminated features as separate traces, with the
# threshold drawn as a horizontal line
fig2 = go.Figure()

fig2.add_trace(go.Bar(name='Above Threshold (Selected)',
                      x=selected_features,
                      y=[score_by_feature[f] for f in selected_features],
                      marker_color='green',
                      opacity=0.8))

fig2.add_trace(go.Bar(name='Below Threshold (Eliminated)',
                      x=eliminated_features,
                      y=[score_by_feature[f] for f in eliminated_features],
                      marker_color='red',
                      opacity=0.8))

# Add threshold line
fig2.add_hline(y=threshold, line_dash="dash", line_color="black", line_width=2,
               annotation_text=f"Elimination Threshold: {threshold:.1f}")

fig2.update_layout(title='Feature Elimination: Above vs Below Threshold',
                   xaxis_title='Features',
                   yaxis_title='F-Score')
fig2.show()

# Plot 3: mean F-score of all features versus mean F-score of the selected ones.
# This compares univariate F-scores only; no model is trained or evaluated here.
avg_all = feature_scores.mean()
avg_selected = feature_scores[selected_mask].mean()
improvement_pct = ((avg_selected - avg_all) / avg_all) * 100

metrics = ['Average F-Score (All)', 'Average F-Score (Selected)']
values = [avg_all, avg_selected]

fig3 = px.bar(x=metrics, y=values,
              title='Feature Selection Performance Improvement',
              labels={'x': 'Metrics', 'y': 'Score/Percentage'},
              color=metrics,
              color_discrete_sequence=['lightcoral', 'lightgreen'])

# Add value labels on bars
fig3.update_traces(texttemplate='%{y:.1f}', textposition='outside')
fig3.show()

# Summary stats
print("\nPerformance Improvement:")
print(f"Average F-Score (All Features): {avg_all:.2f}")
print(f"Average F-Score (Selected): {avg_selected:.2f}")
print(f"Improvement: {improvement_pct:.1f}%")
print(f"Features Eliminated: {len(eliminated_features)}/{len(X.columns)} ({len(eliminated_features)/len(X.columns)*100:.0f}%)")

Feature Selection Results:
Selected Features: ['BA', 'Shape', 'Margin']
Eliminated Features: ['Age', 'Density']
Elimination Threshold: 362.84



Performance Improvement:
Average F-Score (All Features): 277.86
Average F-Score (Selected): 389.79
Improvement: 40.3%
Features Eliminated: 2/5 (40%)


In [21]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Standardize the features first so that PCA works on comparably scaled columns
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Reduce to three principal components (the same count as the selected features)
pca = PCA(n_components=3)
X_pca = pca.fit_transform(X_scaled)

variance_ratios = pca.explained_variance_ratio_
print(f"PCA reduced data shape: {X_pca.shape}")
print(f"Explained variance ratio: {variance_ratios}")
print(f"Total explained variance: {sum(variance_ratios):.3f}")

# Table of component weights: one row per original feature, one column per component
component_names = [f'PC{i+1}' for i in range(3)]
components_df = pd.DataFrame(
    pca.components_.T,
    columns=component_names,
    index=X.columns,
)

# Heatmap with the components on the y axis and the original features on the x axis
weights_by_component = components_df.T
fig = px.imshow(
    weights_by_component,
    text_auto=True,
    aspect="auto",
    title="PCA Components Heatmap",
    labels={'x': 'Original Features', 'y': 'Principal Components'},
)
fig.show()


PCA reduced data shape: (835, 3)
Explained variance ratio: [0.48379565 0.1970085  0.13888685]
Total explained variance: 0.820


In [23]:
import plotly.express as px
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import StandardScaler

# NOTE(review): df_encoded2 is not used anywhere in this cell (the features
# below are taken from df_encoded) - confirm whether it is needed at all.
df_encoded2 = df_copy.copy()
scaler = StandardScaler()

# Separate features (X) and target (y)
X = df_encoded.drop('Severity', axis=1)
y = df_encoded['Severity']

# Standardize the features before the decomposition
X_scaled = scaler.fit_transform(X)

# Truncated SVD down to three components. The default solver is randomized,
# so the seed is fixed to make the result reproducible on every run.
svd = TruncatedSVD(n_components=3, random_state=42)
reduced_data = svd.fit_transform(X_scaled)

# How much of the variance in the data is carried over into the new features
print("Explained variance ratio:", svd.explained_variance_ratio_)
print("Total explained variance:", svd.explained_variance_ratio_.sum())

explained = svd.explained_variance_ratio_

# Scree plot (bar chart): explained variance ratio per component
fig1 = px.bar(
    x=list(range(1, len(explained)+1)),
    y=explained,
    labels={'x': "Component", 'y': "Explained Variance Ratio"},
    title="Scree Plot - Truncated SVD"
)
fig1.show()

# Scatter plot of the first two components, coloured by Severity
fig2 = px.scatter(
    x=reduced_data[:,0],
    y=reduced_data[:,1],
    color=y,
    labels={"x": "SVD Component 1", "y": "SVD Component 2", "color": "Severity"},
    title="SVD Reduced Data (first 2 components)",
    color_continuous_scale="viridis"
)
fig2.show()


Explained variance ratio: [0.48379565 0.1970085  0.13888685]
Total explained variance: 0.8196909999354122
