In [38]:
import pandas as pd
import numpy as np
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import StandardScaler

In [39]:
from sklearn.impute import SimpleImputer

data = pd.read_csv("../resource/asnlib/publicdata/cleveland.data.csv")

# PART 1. Clean the data as described in the document. Nothing will be auto-graded here.

###
### YOUR CODE HERE
###

# data.dtypes

# taking care of missing values(using very high value to be ignored)
imp = SimpleImputer(missing_values=np.NaN, strategy='constant', fill_value=10000)
np_2 = imp.fit_transform(data)

# back to pandas dataframe
column_values = ['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach', 'exang', 'oldpeak', 
                 'slope', 'ca', 'thal', 'num'] 

df_2 = pd.DataFrame(data=np_2,    
                  columns=column_values)

# converting 'num' to binary
df_2['num'] = (df_2['num'] > 0).astype(int)

df_3 = df_2

# getting rid of rows with '?' value
df_4 = df_3.replace("?", np.nan).dropna()

# standardizing values
num = df_4['num'].to_numpy()
df_5 = df_4.drop(['num'], axis=1)

scaler = StandardScaler()
df_stand = scaler.fit_transform(df_5)

# split data into testing and training
X_train, X_test, y_train, y_test = train_test_split(df_stand, num, test_size=0.1, random_state=0)

In [40]:
# PART 2. Build the classifiers.
# As the document describes, we expect a 4x4 Numpy array.
# Store that in a variable called `all_models`.
# Note: don't worry if you get a `ConvergenceWarning`.
# Note: remember to set `random_state=0` for the train/test split AND the classifier models.

###
### YOUR CODE HERE
###

all_models = np.empty((4,4))

clf = MLPClassifier(random_state=0, max_iter=1000).fit(X_train, y_train)

# function for creating classifiers
def metrics(model):
    
    y_pred = model.predict(X_test)
    
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score (y_test, y_pred)
    recall = recall_score (y_test, y_pred)
    f1 = f1_score (y_test, y_pred)
    
    matrix = np.array([accuracy, precision, recall, f1])
    
    print(matrix)
    
    return matrix

all_models[0] = metrics(MLPClassifier(hidden_layer_sizes=(5), activation='logistic', random_state=0, max_iter=1000).fit(X_train, y_train))
all_models[1] = metrics(MLPClassifier(hidden_layer_sizes=(10,10), activation='logistic', random_state=0, max_iter=1000).fit(X_train, y_train))
all_models[2] = metrics(MLPClassifier(hidden_layer_sizes=(5), activation='relu', random_state=0, max_iter=1000).fit(X_train, y_train))
all_models[3] = metrics(MLPClassifier(hidden_layer_sizes=(10,10), activation='relu', random_state=0, max_iter=1000).fit(X_train, y_train))

all_models

[0.8        0.9        0.64285714 0.75      ]
[0.8        0.9        0.64285714 0.75      ]
[0.73333333 0.8        0.57142857 0.66666667]
[0.76666667 0.88888889 0.57142857 0.69565217]




array([[0.8       , 0.9       , 0.64285714, 0.75      ],
       [0.8       , 0.9       , 0.64285714, 0.75      ],
       [0.73333333, 0.8       , 0.57142857, 0.66666667],
       [0.76666667, 0.88888889, 0.57142857, 0.69565217]])

In [41]:
assert all_models.shape == (4,4)

###
### AUTOGRADER TEST - DO NOT REMOVE
###


In [42]:
# PART 3. Discussion questions to be answered in a block comment in this cell.

# My code produced a ConvergenceWarning, indicating that the specified number of maximum
# iterations was not high enough for at least one of the models' optimization to converge.
# This means that the learning has not stabilized yet. When I commented out the models which
# have two layers of ten neurons each, this warning disappeared. This gives me cause to
# believe that this warning can arise when a model is too complex for the given context/data.

# In terms of accuracy, both the logistic models performed the best at accuracy = 0.8. This
# score might be considered good in some cases, but in the case of predicting heart disease
# or heart failure, it's really important to have this metric be as high as possible. Also,
# it is hard to separate one metric from the others, since accuracy only measures how many
# positive cases are correctly predicted. It doesn't account for the amount of heart disease
# cases that are missed, which is crucial. Focusing on accuracy alone, I do not believe we
# can say that it is good enough. Then given the context of the other metrics, I would say
# that it is certainly not good enough.

In [91]:
# PART 4 (OPTIONAL). 
# As the document describes, we expect a 1x4 Numpy array.
# Store that in a variable called `top_model`.

###
### YOUR CODE HERE
###

top_model = np.empty((1,4))

top = metrics(MLPClassifier(
    hidden_layer_sizes=(9), 
    activation='relu', 
    random_state=0, 
    max_iter=1000).fit(X_train, y_train))

top_model[0] = top

# This architecture produces an accuracy score of about 0.83, which is the highest so far.

[0.83333333 0.90909091 0.71428571 0.8       ]




In [90]:
assert top_model.shape == (1,4)

# Note: we check if the array makes sense, but we will also look at your code manually. 

###
### AUTOGRADER TEST - DO NOT REMOVE
###
