### Importing Data from UCI repo
###### Source : https://archive-beta.ics.uci.edu/ml/datasets/161

In [5]:
import pandas as pd

# Naive load with default options: no column names are supplied, so pandas
# takes the first data record as the header, and '?' markers stay as text.
raw_path = 'data/mammographic_mass/mammographic_masses.data'
masses_data = pd.read_csv(raw_path)
masses_data.head(n=10)

Unnamed: 0,5,67,3,5.1,3.1,1
0,4,43,1,1,?,1
1,5,58,4,5,3,1
2,4,28,1,1,3,0
3,5,74,1,5,?,1
4,4,65,1,?,3,0
5,4,70,?,?,3,0
6,5,42,1,?,3,0
7,5,57,1,5,3,1
8,5,60,?,5,1,1
9,5,76,1,4,3,1


Make sure you use the optional parameters of `read_csv` to convert missing data (indicated by a `?`) into NaN, and to add the appropriate column names (BI_RADS, age, shape, margin, density and severity).

In [6]:
# Reload, this time telling pandas that a '?' entry means "missing" so it is
# parsed as NaN. Column names are still not supplied at this step.
masses_data = pd.read_csv(
    'data/mammographic_mass/mammographic_masses.data',
    na_values=['?'],
)
masses_data.head(5)

Unnamed: 0,5,67,3,5.1,3.1,1
0,4.0,43.0,1.0,1.0,,1
1,5.0,58.0,4.0,5.0,3.0,1
2,4.0,28.0,1.0,1.0,3.0,0
3,5.0,74.0,1.0,5.0,,1
4,4.0,65.0,1.0,,3.0,0


Add the appropriate column names (BI_RADS, age, shape, margin, density and severity)

In [7]:
# Final load: '?' is treated as NaN and explicit column names are supplied,
# so the first record of the file is kept as data instead of becoming the header.
column_names = ['BI_RADS', 'age', 'shape', 'margin', 'density', 'severity']
masses_data = pd.read_csv(
    'data/mammographic_mass/mammographic_masses.data',
    na_values=['?'],
    names=column_names,
)
masses_data.head(5)

Unnamed: 0,BI_RADS,age,shape,margin,density,severity
0,5.0,67.0,3.0,5.0,3.0,1
1,4.0,43.0,1.0,1.0,,1
2,5.0,58.0,4.0,5.0,3.0,1
3,4.0,28.0,1.0,1.0,3.0,0
4,5.0,74.0,1.0,5.0,,1


Evaluate whether the data needs cleaning: your model is only as good as the data it is given. Use `describe()` on the data frame.

In [8]:
# Summary statistics per column; compare the `count` row across columns to
# spot columns with missing values, and min/max to spot implausible entries.
masses_data.describe()

Unnamed: 0,BI_RADS,age,shape,margin,density,severity
count,959.0,956.0,930.0,913.0,885.0,961.0
mean,4.348279,55.487448,2.721505,2.796276,2.910734,0.463059
std,1.783031,14.480131,1.242792,1.566546,0.380444,0.498893
min,0.0,18.0,1.0,1.0,1.0,0.0
25%,4.0,45.0,2.0,1.0,3.0,0.0
50%,4.0,57.0,3.0,3.0,3.0,0.0
75%,5.0,66.0,4.0,4.0,3.0,1.0
max,55.0,96.0,4.0,5.0,4.0,1.0


There are quite a few missing values in the data set. Before we simply drop every row that has missing data, we need to make sure we don't introduce any bias by doing that. You can check this by inspecting whether the missing data correlates with anything else in the rows.

In [9]:
# Show (up to 50) rows that contain at least one missing value in ANY column.
# The previous hand-written mask tested 'margin' twice and never tested
# 'BI_RADS', so it did not match the set of rows that dropna() removes;
# isnull().any(axis=1) covers every column without listing them by hand.
rows_with_missing = masses_data.isnull().any(axis=1)
masses_data.loc[rows_with_missing].head(50)

Unnamed: 0,BI_RADS,age,shape,margin,density,severity
1,4.0,43.0,1.0,1.0,,1
4,5.0,74.0,1.0,5.0,,1
5,4.0,65.0,1.0,,3.0,0
6,4.0,70.0,,,3.0,0
7,5.0,42.0,1.0,,3.0,0
9,5.0,60.0,,5.0,1.0,1
12,4.0,64.0,1.0,,3.0,0
19,4.0,40.0,1.0,,,0
20,,66.0,,,1.0,1
22,4.0,43.0,1.0,,,0


Now we will drop the rows with missing values so that we can move forward with clean data.

In [10]:
# Discard every row that has a NaN in any column (this mutates masses_data),
# then re-check the summary statistics on the cleaned frame.
masses_data.dropna(axis='index', how='any', inplace=True)
masses_data.describe()

Unnamed: 0,BI_RADS,age,shape,margin,density,severity
count,830.0,830.0,830.0,830.0,830.0,830.0
mean,4.393976,55.781928,2.781928,2.813253,2.915663,0.485542
std,1.888371,14.671782,1.242361,1.567175,0.350936,0.500092
min,0.0,18.0,1.0,1.0,1.0,0.0
25%,4.0,46.0,2.0,1.0,3.0,0.0
50%,4.0,57.0,3.0,3.0,3.0,0.0
75%,5.0,66.0,4.0,4.0,3.0,1.0
max,55.0,96.0,4.0,5.0,4.0,1.0


Now you will need to convert the pandas data frame into NumPy arrays that can be used by scikit-learn. Create an array that extracts only the feature data we want to work with (age, shape, margin and density) and another array that contains the classes (severity). You will also need a list of the feature names.

In [42]:
# Names of the predictor columns, defined once and reused for the selection.
feature_names = ['age', 'shape', 'margin', 'density']
# Feature matrix as a NumPy array, with columns in the order of feature_names.
all_features = masses_data[feature_names].values
print(all_features[1])

[58.  4.  5.  3.]


In [12]:
# Target vector: the 'severity' column extracted as a NumPy array.
all_classes = masses_data.loc[:, 'severity'].values
all_classes

array([1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0,
       0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0,
       1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0,
       0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1,
       1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0,
       1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0,
       0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
       1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0,
       0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0,
       1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1,
       0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0,
       1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1,
       0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1,

Now we need to normalize the data; for that we can use scikit-learn's `preprocessing` module (`StandardScaler`).
To learn more about data normalization: https://www.import.io/post/what-is-data-normalization-and-why-is-it-important/

In [13]:
from sklearn import preprocessing

# Standardize each feature column: fit the scaler on the feature matrix,
# then apply the fitted transform to that same matrix.
scalar = preprocessing.StandardScaler()
scalar.fit(all_features)
all_features_scaled = scalar.transform(all_features)
all_features_scaled

array([[ 0.7650629 ,  0.17563638,  1.39618483,  0.24046607],
       [ 0.15127063,  0.98104077,  1.39618483,  0.24046607],
       [-1.89470363, -1.43517241, -1.157718  ,  0.24046607],
       ...,
       [ 0.56046548,  0.98104077,  1.39618483,  0.24046607],
       [ 0.69686376,  0.98104077,  1.39618483,  0.24046607],
       [ 0.42406719,  0.17563638,  0.11923341,  0.24046607]])

Now let's set up an MLP model using Keras.

In [23]:
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential

def create_model():
    """Build and compile a small MLP for binary classification.

    Architecture: 4 input features -> Dense(6, relu) -> Dense(1, sigmoid).
    Compiled with binary cross-entropy loss, the Adam optimizer and the
    accuracy metric.

    Returns:
        The compiled Sequential model.
    """
    # Hidden layer: 4 features in, 6 units out (the original author notes that
    # more units did not seem to help).
    hidden_layer = Dense(6, input_dim=4, kernel_initializer='normal', activation='relu')
    # Single sigmoid unit for the binary class output.
    output_layer = Dense(1, kernel_initializer='normal', activation='sigmoid')

    network = Sequential([hidden_layer, output_layer])
    network.compile(
        loss='binary_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )
    return network

In [24]:
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier

# Wrap the Keras model builder so scikit-learn can treat it as an estimator.
estimator = KerasClassifier(build_fn=create_model, epochs=100, verbose=0)

# Scale INSIDE the cross-validation loop. Scoring on features that were
# standardized with statistics from the whole data set leaks information from
# each held-out fold into training. With a pipeline, every fold fits its own
# scaler on its training split only and applies it to the held-out split.
scaled_estimator = make_pipeline(StandardScaler(), estimator)

# 10-fold cross-validation on the unscaled features; the mean of the per-fold
# scores is shown as the cell output.
cv_scores = cross_val_score(scaled_estimator, all_features, all_classes, cv=10)
cv_scores.mean()

2021-08-29 13:10:33.723883: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:116] None of the MLIR optimization passes are enabled (registered 2)
2021-08-29 13:10:33.766712: I tensorflow/core/platform/profile_utils/cpu_utils.cc:112] CPU Frequency: 2899885000 Hz




0.8