# In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.naive_bayes import GaussianNB
from sklearn.decomposition import PCA
import seaborn as sns
from sklearn.metrics import confusion_matrix

####################################################################

# Problem 1

####################################################################

###########################
# Importing Dataset
###########################

# NOTE(review): absolute, machine-specific path; adjust when running elsewhere.
dataset = pd.read_csv('/Users/benjaminslay/Downloads/diabetes.csv')
dataset.head()

###########################
# Preprocessing
###########################

# Fail early with an actionable message when the label column is absent
# (for example a CSV with a missing header row or different column names).
target_column = 'Outcome'
if target_column not in dataset.columns:
    raise KeyError(
        f"{target_column!r} not found in diabetes.csv; "
        f"available columns: {list(dataset.columns)}"
    )

# Features are every column except the label column.
X = dataset.drop(target_column, axis=1)
Y = dataset[target_column]

# Split the dataset 80% / 20%: the training part fits the model, the test part
# is held out to evaluate it. random_state fixes the shuffle.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, random_state = 0)

# Standardize the features (zero mean, unit variance). The scaler is fitted on
# the training split only and then applied unchanged to the test split.
scaler_X = StandardScaler()
X_train = scaler_X.fit_transform(X_train)
X_test = scaler_X.transform(X_test)

# Logistic regression classifier with at most 1000 solver iterations and random state of 0
classifier = LogisticRegression(max_iter=1000, random_state=0)
classifier.fit(X_train, Y_train)

###########################
# Results
###########################

# Predictions of the fitted classifier on the held-out test split
Y_pred = classifier.predict(X_test)

# Report accuracy, precision, recall and F1 score on the test split
for caption, scorer in (
    ("Accuracy:", metrics.accuracy_score),
    ("Precision:", metrics.precision_score),
    ("Recall:", metrics.recall_score),
    ("F1 Score:", metrics.f1_score),
):
    print(caption, scorer(Y_test, Y_pred))

###########################
# Training Results
###########################

# Training accuracy and loss after 1, 2, ..., n solver iterations, where n is
# the number of iterations the fitted classifier above needed.
accuracy = []
iterations = classifier.n_iter_[0]
loss = []

for i in range(1, iterations + 1):
    # A separate estimator is refitted with the iteration cap raised by one each
    # time, so the already-fitted `classifier` is not overwritten. It keeps the
    # same settings as `classifier` so the iteration counts are comparable.
    # Fits stopped before convergence are expected to emit ConvergenceWarning.
    step_classifier = LogisticRegression(max_iter=i, random_state=0)
    step_classifier.fit(X_train, Y_train)
    Y_train_prediction = step_classifier.predict(X_train)
    accuracy.append(metrics.accuracy_score(Y_train, Y_train_prediction))
    # Loss is the mean cross-entropy (log loss) of the predicted probabilities,
    # not the negated accuracy.
    loss.append(metrics.log_loss(Y_train, step_classifier.predict_proba(X_train)))

###########################
# Plotting Training Results
###########################

# Loss (red, left axis) and accuracy (green, right axis) share the iteration axis.
iteration_axis = range(1, iterations + 1)

fig, loss_axis = plt.subplots()
accuracy_axis = loss_axis.twinx()

for axes, series, colour, caption in (
    (loss_axis, loss, 'r', 'Loss'),
    (accuracy_axis, accuracy, 'g', 'Accuracy'),
):
    axes.plot(iteration_axis, series, colour + '-')
    axes.set_ylabel(caption, color=colour)

loss_axis.set_xlabel('Iteration')

plt.title('Training Results - Accuracy and Loss over Iterations')

###########################
# Confusion Matrix
###########################

# Counts of test samples per (actual, predicted) class pair
cnf_matrix = confusion_matrix(Y_test, Y_pred)

class_names = [0, 1]  # numeric class labels
tick_marks = np.arange(len(class_names))
outcome_labels = ['No Diabetes', 'Diabetes']

fig, ax = plt.subplots()
plt.xticks(tick_marks, class_names)
plt.yticks(tick_marks, class_names)

# Annotated heatmap of the matrix
matrix_frame = pd.DataFrame(cnf_matrix)
sns.heatmap(
    matrix_frame,
    annot=True,
    fmt='d',
    cmap='summer',
    xticklabels=outcome_labels,
    yticklabels=outcome_labels,
)
ax.xaxis.set_label_position("top")
plt.tight_layout()
plt.title('Confusion Matrix')
plt.ylabel('Actual label')
plt.xlabel('Predicted label')



####################################################################

# Problem 2 - Part 1

####################################################################

###########################
# Importing new Dataset
###########################

dataset = pd.read_csv('/Users/benjaminslay/Downloads/cancer.csv')
dataset.head()

###########################
# Preprocessing
###########################

# Cleaning data: remove the 'id' and 'Unnamed: 32' columns in place
dataset.drop(columns=['id', 'Unnamed: 32'], inplace=True)

# Encode the diagnosis label: 'M' becomes 1 and 'B' becomes 0
diagnosis_codes = {'M': 1, 'B': 0}
dataset['diagnosis'] = dataset['diagnosis'].map(diagnosis_codes)

# Target is the encoded diagnosis; features are all remaining columns
Y = dataset['diagnosis']
X = dataset.drop(columns='diagnosis')

# Hold out 20% of the samples for evaluation; the other 80% fit the model.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=0
)

# Standardize the features using statistics learned from the training split only.
scaler_X = StandardScaler().fit(X_train)
X_train = scaler_X.transform(X_train)
X_test = scaler_X.transform(X_test)

# Logistic regression classifier with at most 1000 iterations and random state of 0
classifier = LogisticRegression(max_iter=1000, random_state=0).fit(X_train, Y_train)

###########################
# Results
###########################

# Predictions of the fitted classifier on the held-out test split
Y_pred = classifier.predict(X_test)

# Report accuracy, precision, recall and F1 score on the test split
for caption, scorer in (
    ("Accuracy:", metrics.accuracy_score),
    ("Precision:", metrics.precision_score),
    ("Recall:", metrics.recall_score),
    ("F1 Score:", metrics.f1_score),
):
    print(caption, scorer(Y_test, Y_pred))

###########################
# Training Results
###########################

# Training accuracy and loss after 1, 2, ..., n solver iterations, where n is
# the number of iterations the fitted classifier above needed.
accuracy = []
iterations = classifier.n_iter_[0]
loss = []

for i in range(1, iterations + 1):
    # A separate estimator is refitted with the iteration cap raised by one each
    # time, so the already-fitted `classifier` is not overwritten. It keeps the
    # same settings as `classifier` so the iteration counts are comparable.
    # Fits stopped before convergence are expected to emit ConvergenceWarning.
    step_classifier = LogisticRegression(max_iter=i, random_state=0)
    step_classifier.fit(X_train, Y_train)
    Y_train_prediction = step_classifier.predict(X_train)
    accuracy.append(metrics.accuracy_score(Y_train, Y_train_prediction))
    # Loss is the mean cross-entropy (log loss) of the predicted probabilities,
    # not the negated accuracy.
    loss.append(metrics.log_loss(Y_train, step_classifier.predict_proba(X_train)))

###########################
# Plotting Training Results
###########################

# Loss (red, left axis) and accuracy (green, right axis) share the iteration axis.
iteration_axis = range(1, iterations + 1)

fig, loss_axis = plt.subplots()
accuracy_axis = loss_axis.twinx()

for axes, series, colour, caption in (
    (loss_axis, loss, 'r', 'Loss'),
    (accuracy_axis, accuracy, 'g', 'Accuracy'),
):
    axes.plot(iteration_axis, series, colour + '-')
    axes.set_ylabel(caption, color=colour)

loss_axis.set_xlabel('Iteration')

plt.title('Training Results - Accuracy and Loss over Iterations')

###########################
# Confusion Matrix
###########################

# Counts of test samples per (actual, predicted) class pair
cnf_matrix = confusion_matrix(Y_test, Y_pred)

# The diagnosis column was encoded as 0 = 'B' (benign) and 1 = 'M' (malignant),
# so the tick labels follow that order.
class_labels = ['Benign', 'Malignant']
fig, ax = plt.subplots()

# Create heatmap (the heatmap sets its own ticks from the label lists)
sns.heatmap(pd.DataFrame(cnf_matrix), annot=True, fmt='d', cmap='summer',
            xticklabels=class_labels, yticklabels=class_labels, ax=ax)
ax.xaxis.set_label_position("top")
plt.title('Confusion Matrix')
plt.ylabel('Actual label')
plt.xlabel('Predicted label')
# Adjust the layout last so the title and axis labels are taken into account.
plt.tight_layout()

####################################################################

# Problem 2 - Part 2 - Weight Penalty

####################################################################

# Logistic regression with an explicit L2 weight penalty (C is the inverse
# regularization strength).
# NOTE(review): penalty='l2' and C=1.0 appear to be scikit-learn's defaults, so
# this model should match Part 1 unless C is changed - confirm against the
# installed version.
classifier_l2 = LogisticRegression(max_iter=1000, penalty = 'l2', C=1.0, random_state=0)
classifier_l2.fit(X_train, Y_train)

###########################
# Results
###########################

# Predict the Test set results with the penalized model fitted above
Y_pred = classifier_l2.predict(X_test)

# Evaluate the model using model evaluation metrics accuracy, precision, recall, and F1 score
print("Accuracy:",metrics.accuracy_score(Y_test, Y_pred))
print("Precision:",metrics.precision_score(Y_test, Y_pred))
print("Recall:",metrics.recall_score(Y_test, Y_pred))
print("F1 Score:",metrics.f1_score(Y_test, Y_pred))

###########################
# Training Results
###########################

# Training accuracy and loss after 1, 2, ..., n solver iterations, where n is
# the number of iterations the penalized model `classifier_l2` needed.
accuracy = []
iterations = classifier_l2.n_iter_[0]
loss = []

for i in range(1, iterations + 1):
    # A separate estimator with the same settings as `classifier_l2` is refitted
    # with the iteration cap raised by one each time. Fits stopped before
    # convergence are expected to emit ConvergenceWarning.
    step_classifier = LogisticRegression(penalty='l2', C=1.0, max_iter=i, random_state=0)
    step_classifier.fit(X_train, Y_train)
    Y_train_prediction = step_classifier.predict(X_train)
    accuracy.append(metrics.accuracy_score(Y_train, Y_train_prediction))
    # Loss is the mean cross-entropy (log loss) of the predicted probabilities,
    # not the negated accuracy.
    loss.append(metrics.log_loss(Y_train, step_classifier.predict_proba(X_train)))

###########################
# Plotting Training Results
###########################

# Loss (red, left axis) and accuracy (green, right axis) share the iteration axis.
iteration_axis = range(1, iterations + 1)

fig, loss_axis = plt.subplots()
accuracy_axis = loss_axis.twinx()

for axes, series, colour, caption in (
    (loss_axis, loss, 'r', 'Loss'),
    (accuracy_axis, accuracy, 'g', 'Accuracy'),
):
    axes.plot(iteration_axis, series, colour + '-')
    axes.set_ylabel(caption, color=colour)

loss_axis.set_xlabel('Iteration')

plt.title('Training Results - Accuracy and Loss over Iterations')

###########################
# Confusion Matrix
###########################

# Counts of test samples per (actual, predicted) class pair
cnf_matrix = confusion_matrix(Y_test, Y_pred)

# Visualize the confusion matrix as an annotated heatmap using matplotlib and seaborn.
# The diagnosis column was encoded as 0 = 'B' (benign) and 1 = 'M' (malignant),
# so the tick labels follow that order.
class_labels = ['Benign', 'Malignant']
fig, ax = plt.subplots()

# Heatmap (the heatmap sets its own ticks from the label lists)
sns.heatmap(pd.DataFrame(cnf_matrix), annot=True, fmt='d', cmap='summer',
            xticklabels=class_labels, yticklabels=class_labels, ax=ax)
ax.xaxis.set_label_position("top")
plt.title('Confusion Matrix')
plt.ylabel('Actual label')
plt.xlabel('Predicted label')
# Adjust the layout last so the title and axis labels are taken into account.
plt.tight_layout()



####################################################################

# Problem 3

####################################################################

# Hold out 20% of the samples for evaluation; the other 80% fit the model.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=0
)

# Standardize the features using statistics learned from the training split only.
scaler_X = StandardScaler().fit(X_train)
X_train = scaler_X.transform(X_train)
X_test = scaler_X.transform(X_test)

# Gaussian Naive Bayes classifier fitted on the standardized training data
classifier = GaussianNB().fit(X_train, Y_train)

##################
# Results
##################

# Predict the Test set results
Y_pred = classifier.predict(X_test)

# Evaluate the model using model evaluation metrics accuracy, precision, recall, and F1 score
accuracy = metrics.accuracy_score(Y_test, Y_pred)
precision = metrics.precision_score(Y_test, Y_pred)
recall = metrics.recall_score(Y_test, Y_pred)
f1_score = metrics.f1_score(Y_test, Y_pred)

print("Accuracy:",accuracy)
print("Precision:",precision)
print("Recall:",recall)
print("F1 Score:",f1_score)

# Plot the evaluation metrics as a bar chart.
# The label list is deliberately not called `metrics`, so the imported
# sklearn `metrics` module stays usable afterwards.
metric_names = ['Accuracy', 'Precision', 'Recall', 'F1 Score']
values = [accuracy, precision, recall, f1_score]

plt.figure(figsize=(14, 6))
plt.bar(metric_names, values, color=['blue', 'green', 'red', 'purple'])
plt.ylim(0, 1)
plt.ylabel('Score')
plt.title('Naive Bayes Classifier Performance Metrics')
plt.show()



####################################################################

# Problem 4

####################################################################

# Hold out 20% of the samples for evaluation; the other 80% fit the models.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=0
)

# Standardize the features using statistics learned from the training split only.
scaler_X = StandardScaler().fit(X_train)
X_train = scaler_X.transform(X_train)
X_test = scaler_X.transform(X_test)

# Re-import the module in case the name `metrics` was rebound earlier in the session.
from sklearn import metrics

# Test-set scores for each number of principal components K
accuracy_list = []
precision_list = []
recall_list = []
f1_score_list = []

# Try every K from 1 up to the number of input features
K_iterations = range(1, X_train.shape[1] + 1)

for k in K_iterations:
    # Project onto the first k principal components; PCA is fitted on the
    # training split only and then applied to the test split.
    pca = PCA(n_components=k)
    X_train_pca = pca.fit_transform(X_train)
    X_test_pca = pca.transform(X_test)

    classifier = LogisticRegression(max_iter=1000, random_state=0)
    classifier.fit(X_train_pca, Y_train)

    Y_pred = classifier.predict(X_test_pca)

    accuracy_list.append(metrics.accuracy_score(Y_test, Y_pred))
    precision_list.append(metrics.precision_score(Y_test, Y_pred))
    recall_list.append(metrics.recall_score(Y_test, Y_pred))
    f1_score_list.append(metrics.f1_score(Y_test, Y_pred))

# Pick the K with the highest test accuracy (np.argmax returns the first
# maximum, so ties resolve to the smallest K) and report the scores at that K.
# NOTE(review): accuracy is used as the selection criterion - confirm this is
# the intended one. Selecting K on the test split gives an optimistic estimate.
best_index = int(np.argmax(accuracy_list))
K_value = K_iterations[best_index]
accuracy = accuracy_list[best_index]
precision = precision_list[best_index]
recall = recall_list[best_index]
f1_score = f1_score_list[best_index]

print("K =", K_value)
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1_score)
print()

# One line per metric, plotted against the number of principal components K
plt.figure(figsize=(16, 10))

for scores, caption in (
    (accuracy_list, 'Accuracy'),
    (precision_list, 'Precision'),
    (recall_list, 'Recall'),
    (f1_score_list, 'F1 Score'),
):
    plt.plot(K_iterations, scores, label=caption, marker='o')

plt.xlabel('Number of Principal Components (K)')
plt.ylabel('Score')
plt.title('Evaluation Metrics vs. Number of Principal Components (K)')
plt.grid(True)
plt.xticks(K_iterations)
plt.legend()
plt.show()



####################################################################

# Problem 5

####################################################################

# Hold out 20% of the samples for evaluation; the other 80% fit the models.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=0
)

# Standardize the features using statistics learned from the training split only.
scaler_X = StandardScaler().fit(X_train)
X_train = scaler_X.transform(X_train)
X_test = scaler_X.transform(X_test)

# Test-set scores for each number of principal components K
accuracy_list = []
precision_list = []
recall_list = []
f1_score_list = []

# Try every K from 1 up to the number of input features
K_iterations = range(1, X_train.shape[1] + 1)

for k in K_iterations:
    # Project onto the first k principal components; PCA is fitted on the
    # training split only and then applied to the test split.
    pca = PCA(n_components=k)
    X_train_pca = pca.fit_transform(X_train)
    X_test_pca = pca.transform(X_test)

    classifier = GaussianNB()
    classifier.fit(X_train_pca, Y_train)

    Y_pred = classifier.predict(X_test_pca)

    accuracy_list.append(metrics.accuracy_score(Y_test, Y_pred))
    precision_list.append(metrics.precision_score(Y_test, Y_pred))
    recall_list.append(metrics.recall_score(Y_test, Y_pred))
    f1_score_list.append(metrics.f1_score(Y_test, Y_pred))

# Pick the K with the highest test accuracy (np.argmax returns the first
# maximum, so ties resolve to the smallest K) and report the scores at that K.
# NOTE(review): accuracy is used as the selection criterion - confirm this is
# the intended one. Selecting K on the test split gives an optimistic estimate.
best_index = int(np.argmax(accuracy_list))
K_value = K_iterations[best_index]
accuracy = accuracy_list[best_index]
precision = precision_list[best_index]
recall = recall_list[best_index]
f1_score = f1_score_list[best_index]

print("K =", K_value)
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1_score)
print()

# One line per metric, plotted against the number of principal components K
plt.figure(figsize=(16, 10))

for scores, caption in (
    (accuracy_list, 'Accuracy'),
    (precision_list, 'Precision'),
    (recall_list, 'Recall'),
    (f1_score_list, 'F1 Score'),
):
    plt.plot(K_iterations, scores, label=caption, marker='o')

plt.xlabel('Number of Principal Components (K)')
plt.ylabel('Score')
plt.title('Evaluation Metrics vs. Number of Principal Components (K)')
plt.legend()
plt.grid(True)
plt.xticks(K_iterations)
plt.show()

# Recorded output of the last run: KeyError: "['Outcome'] not found in axis"