In [32]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import cross_val_score

In [33]:
%matplotlib inline
%config Completer.use_jedi = False

In [136]:
dataset_path = "data/mammographic_masses.data"
dataset_columns = ["BI-RADS_Assessment", "Age", "Shape", "Margin", "Density", "Severity"]

In [137]:
dataset = pd.read_csv(dataset_path, names=dataset_columns)

# 1. Preprocessing

## a. Data exploration

In [138]:
dataset.drop(columns=["BI-RADS_Assessment"], axis=1, inplace=True)

In [139]:
dataset.head()

Unnamed: 0,Age,Shape,Margin,Density,Severity
0,67,3,5,3,1
1,43,1,1,?,1
2,58,4,5,3,1
3,28,1,1,3,0
4,74,1,5,?,1


In [140]:
dataset.describe()

Unnamed: 0,Severity
count,961.0
mean,0.463059
std,0.498893
min,0.0
25%,0.0
50%,0.0
75%,1.0
max,1.0


## b. Missing values

In [141]:
dataset = dataset.replace("?", np.NaN).astype("float")

In [142]:
dataset.isna().sum()

Age          5
Shape       31
Margin      48
Density     76
Severity     0
dtype: int64

In [143]:
# alternative: dropping rows with missing values
# dataset.dropna(inplace=True)

In [144]:
# impute age and density cols (ordinal)
ordinal_cols = ["Age", "Density"]
for col in ordinal_cols:
    dataset[col] = dataset[col].fillna(dataset[col].mean().round(0))

In [145]:
# impute nominal cols
from sklearn.impute import SimpleImputer
simple_imputer = SimpleImputer(strategy="most_frequent")
imputed_data = simple_imputer.fit_transform(dataset.values)
dataset = pd.DataFrame.from_records(imputed_data, columns=dataset.columns)

In [146]:
all(dataset.notnull())

True

## c. Nominal data

In [147]:
# Dummy encoding
nominal_cols = ["Shape", "Margin"]
dataset = pd.concat([dataset, pd.get_dummies(dataset[nominal_cols].astype(str), drop_first=True)], axis=1)
dataset.drop(columns=nominal_cols, axis=1, inplace=True)

## d. Label extraction

In [148]:
y_labels = ["Severity"]
X_features = [col for col in dataset.columns if col not in y_labels]
X = dataset[X_features].values
y = dataset[y_labels].values.reshape(-1,)

In [149]:
print(f"X: {X.shape}, y: {y.shape}")

X: (961, 9), y: (961,)


## e. Feature scaling

In [150]:
from sklearn.preprocessing import StandardScaler

standard_scalar = StandardScaler()
X_scaled = standard_scalar.fit_transform(X)

In [151]:
from sklearn.preprocessing import MinMaxScaler

min_max_scaler = MinMaxScaler()
X_scaled_mm = min_max_scaler.fit_transform(X)

# 2. Model Selection

In [50]:
from sklearn import tree

decision_tree_clf = tree.DecisionTreeClassifier(random_state=0)
score_ = cross_val_score(decision_tree_clf, X_scaled, y, cv=10)
score_.mean()

0.7378329037800688

In [51]:
from sklearn.ensemble import RandomForestClassifier

random_forest_clf = RandomForestClassifier(n_estimators=10, random_state=0)
score_ = cross_val_score(random_forest_clf, X_scaled, y, cv=10)
score_.mean()

0.7596649484536082

In [52]:
from sklearn import svm

kernels = ["linear", "rbf", "poly", "sigmoid"]
for kernel in kernels:
    svm_clf = svm.SVC(kernel=kernel)
    mean_score_ = cross_val_score(svm_clf, X_scaled, y, cv=10).mean()
    print(kernel, mean_score_)

linear 0.7856314432989692
rbf 0.786683848797251
poly 0.7866838487972508
sigmoid 0.7253436426116838


In [53]:
from sklearn import neighbors

best_n = 0
best_score = -1
for n in range(2, 40):
    knn_clf = neighbors.KNeighborsClassifier(n_neighbors=n)
    mean_score_ = cross_val_score(knn_clf, X_scaled, y, cv=10).mean()
    if mean_score_ > best_score:
        best_n = n
        best_score = mean_score_
print(best_n, best_score)

9 0.7939647766323025


In [54]:
from sklearn.naive_bayes import MultinomialNB

nb_clf = MultinomialNB()
score_ = cross_val_score(nb_clf, X_scaled_mm, y, cv=10)
score_.mean()

0.7877040378006873

In [152]:
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential
import tensorflow.keras.backend as K

def gaussian(x):
    return K.exp(-K.pow(x,2))

def setup_model():
    model = Sequential()
    model.add(Dense(19, input_dim=9, kernel_initializer="normal", activation="relu"))
    model.add(Dense(1, kernel_initializer="normal", activation="sigmoid"))
    model.compile(loss="binary_crossentropy", optimizer="rmsprop", metrics=["accuracy"])
    return model

In [153]:
model = setup_model()
model.summary()

Model: "sequential_183"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_431 (Dense)            (None, 19)                190       
_________________________________________________________________
dense_432 (Dense)            (None, 1)                 20        
Total params: 210
Trainable params: 210
Non-trainable params: 0
_________________________________________________________________


In [154]:
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
nn_clf = KerasClassifier(build_fn=setup_model, nb_epoch=100, verbose=1)
score_ = cross_val_score(nn_clf, X_scaled, y, cv=10)
score_.mean()



0.768932557106018