In [1]:
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn import linear_model as lm
import numpy as np

In [61]:
# 1
# read breast cancer data, which preducts if a tumor is benign or malignant based
# on 9 tumor features
df = pd.read_csv("breast_cancer.csv")

# drop the first column (0)
df = df.iloc[: , 1:]

# add headers to the columns for more description
df.columns = ['thickness', 'size_uniformity', 'shape_uniformity', 'marg_adh','epithelial_size', 
              'bare_nuclei', 'bland_chromatin', 'norm_nucleoli', 'mitosis', 'outcome']

# drop rows that contain a "?" as data
df = df[df['thickness'] != '?']
df = df[df['size_uniformity'] != '?']
df = df[df['shape_uniformity'] != '?']
df = df[df['marg_adh'] != '?']
df = df[df['epithelial_size'] != '?']
df = df[df['bare_nuclei'] != '?']
df = df[df['bland_chromatin'] != '?']
df = df[df['norm_nucleoli'] != '?']
df = df[df['mitosis'] != '?']
df = df[df['outcome'] != '?']

# for the outcomes column, assign values 2 -> 0 (benign) and 4 -> 1 (malignant)
# if value = 2, then set to 0 if value = 4, set to 1
df['outcome'] = df['outcome'].apply(lambda x:1 if x==2 else 0)

df.head()

Unnamed: 0,thickness,size_uniformity,shape_uniformity,marg_adh,epithelial_size,bare_nuclei,bland_chromatin,norm_nucleoli,mitosis,outcome
0,5,4,4,5,7,10,3,2,1,1
1,3,1,1,1,2,2,3,1,1,1
2,6,8,8,1,3,4,3,7,1,1
3,4,1,1,3,2,1,3,1,1,1
4,8,10,10,8,7,10,9,7,1,0


In [None]:
# drop rows that contain a "?" as data
def drop_missing_val_rows(data_frame):
    n=0
    colname = data_frame.columns[n] #issue is this doesn't return a string
    while n <= len(data_frame.axes[1]):
        data_frame = data_frame[data_frame[colname].str.contains("?")==False ]
        n+=1
    return data_frame

df = drop_missing_val_rows(df)

In [None]:
# 2
# function metrics(y, ypred) that takes in a series of actual y labels, and a series of predicted
# y labels, and returns the model accuracy, sensitivity, specificity, precision, f1-score
def metrics(y, ypred):
    true_pos = 0
    false_pos = 0
    true_neg = 0
    false_neg = 0
    for i in range(len(y)):
        # if actual y=1 and predicted y=1,
        # it is a true positive
        if y[i] == 1:
            if ypred[i] == 1:
                true_pos += 1
                # if actual y=1 but predicted y=0,
                # it is a false negative
                else: 
                    false_neg += 1
                    
        # if actual y=0 and predicted y=0,
        # it is a true negative
        else: 
            if y[i] == 0:
                if ypred[i] == 0:
                    true_neg += 1
                    # if actual y=0 but predicted y=1
                    # it is a false positive
                    else:
                        false_pos += 1
    # accuracy: proportion of correct predictions
    accuracy = (true_neg + true_pos) / len(y)
    
    # sensitivity: proportion of positive cases that were actually identified
    sensitivity = true_pos / (true_pos + false_neg)
    
    # specificity: proportion of negative cases that were actually identified
    specificity = true_neg / (true_neg + false_pos)
    
    # precision: proportion of positive cases that were actually positive
    precision = true_pos / (true_pos + false_pos)
    
    # f1 score: measures the balance between precision and sensitivity
    # 0-1, with 1 being a perfect balance between precision and sensitivity
    f1_score = 2 * (precision * sensitivity) / (precision + sensitivity)
    
    return accuracy, sensitivity, specificity, precision, f1_score

In [None]:
# 3
# function predict(X, w) where X is a 2D array of attributes values and w = {w0, w1,... w9} are
# the bias and coeffiients (weights) of model
# returns a series of predictions (1 or 0) per row in table of attributes
def predict(X, w):
    predictions = []
    for x in X:
        y_hat = w[0] # w0 is the bias
        for i in range(len(x)):
            y_hat += w[i + 1] * x[i]
        if y_hat >= 0:
            predictions.append(1)
        else:
            predictions.append(0)
    return predictions