In [None]:
# Import necessary modules
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler

In [None]:
# Show every column when a frame is rendered (no column truncation)
pd.set_option('display.max_columns', None)

# Column names: two leading columns, then the ten base measurements repeated
# three times -- bare name, '_std' suffix and '_ext' suffix
col_names = ['ID', 'Diagnosis'] + [
    feature + suffix
    for suffix in ('', '_std', '_ext')
    for feature in ('radius', 'texture', 'perimeter', 'area', 'smoothness',
                    'compactness', 'concavity', 'concave_pts', 'symmetry',
                    'fractal_dim')
]

# Load the comma-separated file, labelling its columns with col_names
df = pd.read_table('wdbc.data', sep=',', names=col_names)
df.head()

In [None]:
# Examine data info and shape.
# df.info() prints its summary itself and returns None, so it is called on its
# own rather than passed to display() (which would render a stray "None");
# the shape is shown as the cell's output.
df.info()
df.shape

In [None]:
# Look for repeated IDs: counts are sorted from most to least frequent,
# so any count above 1 in the top five rows signals a duplicate
df['ID'].value_counts().head(5)

In [None]:
# Number of rows per Diagnosis value (class balance)
df['Diagnosis'].value_counts()

In [None]:
# List the distinct Diagnosis values to confirm there are exactly 2 classes
df['Diagnosis'].unique()

In [None]:
# Pie chart for class balance
plt.rcParams['font.size'] = 12

# value_counts() orders the classes by frequency, so labels and colours are
# derived from its index instead of being hard-coded; this guarantees every
# slice is annotated with its own class and its actual share of the rows.
class_counts = df['Diagnosis'].value_counts()
class_pct = class_counts / class_counts.sum() * 100
slice_labels = ['{}: {:.0f}%'.format(cls, pct) for cls, pct in class_pct.items()]

# Fixed colour per class (grey for any unexpected label)
class_colors = {'M': 'darkred', 'B': 'skyblue'}
slice_colors = [class_colors.get(cls, 'grey') for cls in class_counts.index]

plt.pie(class_counts, colors = slice_colors, shadow = True, labels = slice_labels);

In [None]:
# Count missing values in each column
df.isna().sum()

In [None]:
# Also show every row that contains at least one value equal to 0
df.loc[(df == 0).any(axis = 'columns')]

The rows containing a zero value represent 13/569 = 2.3% of the sample, and all of them are B (false, i.e. negative-class) cases. Because they are not in the class of interest, they are left as is.

In [None]:
# Scale numerical features to examine distribution
num_cols = [i for i in col_names if i != 'Diagnosis']
# 'ID' is an identifier, not a measurement: standardizing it would replace the
# real IDs with z-scores, so only the remaining numeric columns are scaled.
feature_cols = [c for c in num_cols if c != 'ID']

scaler = StandardScaler()
scaled_df = pd.DataFrame(scaler.fit_transform(df[feature_cols]),
                         columns = feature_cols, index = df.index)
# Put the untouched ID back as the first column so the layout matches num_cols
scaled_df.insert(0, 'ID', df['ID'])

# Preview only the first rows instead of rendering the whole frame
scaled_df.head()

In [None]:
# Plot scaled feature distributions: one box-plot panel per feature group
plt.rcParams['font.size'] = 14

means = ['radius','texture','perimeter','area','smoothness','compactness','concavity',
         'concave_pts','symmetry','fractal_dim']
stds = ['radius_std','texture_std','perimeter_std','area_std', 'smoothness_std',
        'compactness_std','concavity_std','concave_pts_std','symmetry_std','fractal_dim_std']
exts = ['radius_ext','texture_ext','perimeter_ext','area_ext','smoothness_ext',
        'compactness_ext','concavity_ext','concave_pts_ext','symmetry_ext','fractal_dim_ext']

# Long format: one row per (ID, feature) pair, so each feature becomes one box
df_means = pd.melt(scaled_df, id_vars = 'ID', var_name = 'Means', value_vars = means)
df_stds = pd.melt(scaled_df, id_vars = 'ID', var_name = 'Std errors', value_vars = stds)
df_exts = pd.melt(scaled_df, id_vars = 'ID', var_name = 'Extremes', value_vars = exts)

# (melted frame, column holding the feature names, panel title) for each panel
panels = [(df_means, 'Means', 'Feature means'),
          (df_stds, 'Std errors', 'Feature standard errors'),
          (df_exts, 'Extremes', 'Feature extremes')]

# Panels are stacked in a single column, so no horizontal spacing is needed
fig, ax = plt.subplots(len(panels), 1, figsize = (18, 38))

for panel_ax, (melted, group_col, panel_title) in zip(ax, panels):
    sns.boxplot(ax = panel_ax, x = group_col, y = 'value', data = melted)
    panel_ax.set_title(panel_title)
    # Rotate only the feature names; the default axis='both' would also tilt
    # the numeric y tick labels
    panel_ax.tick_params(axis = 'x', labelrotation = 30)
    panel_ax.set_ylabel('Spread')
    panel_ax.set_xlabel('Feature')

plt.show()

In [None]:
# Plot unscaled feature distributions: same three panels, built from the raw values
# Long format: one row per (ID, feature) pair, so each feature becomes one box
df_means = pd.melt(df, id_vars = 'ID', var_name = 'Means', value_vars = means)
df_stds = pd.melt(df, id_vars = 'ID', var_name = 'Std errors', value_vars = stds)
df_exts = pd.melt(df, id_vars = 'ID', var_name = 'Extremes', value_vars = exts)

# (melted frame, column holding the feature names, panel title) for each panel
panels = [(df_means, 'Means', 'Feature means'),
          (df_stds, 'Std errors', 'Feature standard errors'),
          (df_exts, 'Extremes', 'Feature extremes')]

# Panels are stacked in a single column, so no horizontal spacing is needed
fig, ax = plt.subplots(len(panels), 1, figsize = (18, 38))

for panel_ax, (melted, group_col, panel_title) in zip(ax, panels):
    sns.boxplot(ax = panel_ax, x = group_col, y = 'value', data = melted)
    panel_ax.set_title(panel_title)
    # Rotate only the feature names; the default axis='both' would also tilt
    # the numeric y tick labels
    panel_ax.tick_params(axis = 'x', labelrotation = 30)
    panel_ax.set_ylabel('Spread')
    panel_ax.set_xlabel('Feature')

plt.show()