In [1]:
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
df = pd.read_csv("../data/dataset1.csv")

In [3]:
df = df.drop("id", axis=1)

In [4]:
df["diagnosis"] = df["diagnosis"].map({"M": 1, "B": 0})

In [5]:
mean = np.mean(df, axis=0)
for i in range(1,df.shape[1]):
    df.iloc[:, i].fillna(mean[i-1], inplace=True)

0.37258347978910367
14.116125000000011
19.28964850615117
92.02346830985917
654.8891036906857
0.096360281195079
0.10434098418277686
0.08892480757042255
0.048919145869947236
0.181161862917399
0.06279760984182778
0.4051720562390161
1.2168534270650269
2.8660592267135288
40.33707908611603
0.007040978910369071
0.02546582922535212
0.031893716344463946
0.011796137082601056
0.020542298769771532
0.0037949038664323383
16.269189806678394
25.677223198594014
107.2612126537786
881.4024691358021
0.13236859402460469
0.25426504393673144
0.27245536443661955
0.11460622319859404
0.29007557117750454


In [6]:
df.isnull().sum()

diagnosis                  0
radius_mean                0
texture_mean               0
perimeter_mean             0
area_mean                  0
smoothness_mean            0
compactness_mean           0
concavity_mean             0
concave points_mean        0
symmetry_mean              0
fractal_dimension_mean     0
radius_se                  0
texture_se                 0
perimeter_se               0
area_se                    0
smoothness_se              0
compactness_se             0
concavity_se               0
concave points_se          0
symmetry_se                0
fractal_dimension_se       0
radius_worst               0
texture_worst              0
perimeter_worst            0
area_worst                 0
smoothness_worst           0
compactness_worst          0
concavity_worst            0
concave points_worst       0
symmetry_worst             0
fractal_dimension_worst    0
dtype: int64

In [7]:
y = df["diagnosis"]
X = df.drop(["diagnosis"], axis=1)
mean = np.mean(X, axis=0)
stddev = np.std(X, axis=0)
X = (X - mean) / stddev
df = pd.concat([X, y], axis=1)

In [8]:
y = df["diagnosis"]
X = df.drop(["diagnosis"], axis=1)
split_idx = int(len(df) * 0.67)
X_train, X_test = X[:split_idx], X[split_idx:]
y_train, y_test = y[:split_idx], y[split_idx:]

In [10]:
X_train = X_train.to_numpy()

In [11]:
X_test = X_test.to_numpy()

In [12]:
X_train = X_train.T
X_test = X_test.T

In [13]:
def sigmoid(a):
    return 1 / (1 + np.exp(-a))

In [14]:
def calc_cost(m, Y, A):
    return -(1 / m) * np.sum(Y * (np.log(A)) + (1 - Y) * (np.log(1 - A)))

In [15]:
def calc_weights(w, X):
    A = sigmoid(np.dot(w.T, X))
    return A

In [16]:
def update_weights(m, A, X, Y):
    dw = (1 / m) * np.dot(X, (A - Y).T)
    return dw

In [17]:
def gradient_descent(w, X, Y, num_iterations, learning_rate):
    costs = []
    m = X.shape[1]
    for i in range(num_iterations):
        A = calc_weights(w, X)
        cost = calc_cost(m, Y, A)
        dw = update_weights(m, A, X, Y)
        w = w - learning_rate * dw
        costs.append(cost)
    return w, dw, costs

In [18]:
def prediction(w, X, Y):
    m = X.shape[1]
    print(Y)
    predict = np.zeros((1, m))
    A = np.dot(w.T, X)
    Y_prediction = np.zeros((1, m))

    count = 0
    for i in range(1, m):
        if A[0][i] <= 0.5:
            Y_prediction[0][i] = 0
        else:
            Y_prediction[0][i] = 1

    for i in range(1, m):
        if Y_prediction[0][i] == Y[i]:
            count += 1

    return (count / X.shape[1]) * 100

In [19]:
def logistic_regression(
    X_train, Y_train, X_test, Y_test, learning_rate=0.005, num_iterations=1000
):
    w = np.zeros([X_train.shape[0], 1])
    w, dw, costs = gradient_descent(w, X_train, Y_train, num_iterations, learning_rate)
    train_accuracy = prediction(w, X_train, Y_train)
    test_accuracy = prediction(w, X_test, Y_test)
    return train_accuracy, test_accuracy, costs

In [20]:
X_train

array([[ 1.09564189,  1.82081766,  1.57347089, ..., -0.12141667,
        -0.84659244, -0.79318803],
       [-2.07333501, -0.35363241,  0.45618695, ..., -0.963324  ,
        -0.10696329, -1.4729517 ],
       [ 1.26483041,  1.67819563,  1.55950661, ..., -0.14838751,
        -0.76106843, -0.76679824],
       ...,
       [ 2.29607613,  1.0870843 ,  1.95500035, ..., -0.14017945,
         2.09813036,  0.26180272],
       [ 2.75062224, -0.24388967,  1.152255  , ...,  0.78663658,
         2.02747431,  0.71545423],
       [ 1.93701461,  0.28118999,  0.20139121, ...,  0.6890504 ,
         3.12291311,  0.4546415 ]])

In [21]:
y_train = y_train.to_numpy()

In [22]:
y_test = y_test.to_numpy()

In [23]:
train_accuracy, test_accuracy, costs = logistic_regression(X_train, y_train, X_test, y_test)

[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 0 1 1 1 1 1 1 1 1 0 1 0 0 0 0 0 1 1 0 1 1 0 0 0 0 1 0 1 1 0 0 0 0 1 0 1 1
 0 1 0 1 1 0 0 0 1 1 0 1 1 1 0 0 0 1 0 0 1 1 0 0 0 1 1 0 0 0 0 1 0 0 1 0 0
 0 0 0 0 0 0 1 1 1 0 1 1 0 0 0 1 1 0 1 0 1 1 0 1 1 0 0 1 0 0 1 0 0 0 0 1 0
 0 0 0 0 0 0 0 0 1 0 0 0 0 1 1 0 1 0 0 1 1 0 0 1 1 0 0 0 0 1 0 0 1 1 1 0 1
 0 1 0 0 0 1 0 0 1 1 0 1 1 1 1 0 1 1 1 0 1 0 1 0 0 1 0 1 1 1 1 0 0 1 1 0 0
 0 1 0 0 0 0 0 1 1 0 0 1 0 0 1 1 0 1 0 0 0 0 1 0 0 0 0 0 1 0 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 0 0 0 0 0 0 1 0 1 0 0 1 0 0 1 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0
 0 1 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 1 0 0 0 0 1 1 1 0 0
 0 0 1 0 1 0 1 0 0 0 1 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0 1 1 0 1 1
 1 0 1 1 0 0 0 0 0 1 0]
[0 0 0 0 1 0 0 0 1 0 0 1 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 1
 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 1 0 1 0 0 0 0 0 1 0 0 1 0 1 0 0 1 0 1 0 0 0
 0 0 0 0 0 1 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 1 0 0
 

In [24]:
print("Training Accuracy: " + str(train_accuracy) + "%")

Training Accuracy: 96.58792650918635%


In [25]:
print("Testing Accuracy: " + str(test_accuracy) + "%")

Testing Accuracy: 97.3404255319149%
