In [1]:
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
from sklearn import datasets
from sklearn.metrics import classification_report
from sklearn import preprocessing
from tqdm import tqdm
from time import time

In [2]:
df = pd.read_csv("../data/dataset1.csv")
df=df.drop('id',axis=1)
df['diagnosis']=df['diagnosis'].map({'M': 1, 'B': 0})
mean = np.mean(df, axis=0)
for i in range(1,df.shape[1]):
    df.iloc[:, i].fillna(mean[i-1], inplace=True)

In [3]:
y = df['diagnosis']
X = df.drop(['diagnosis'], axis=1)

In [4]:
X = df.drop(['diagnosis'], axis=1)
split_idx = int(len(df) * 0.67)
X_train, X_test = X[:split_idx], X[split_idx:]
y_train, y_test = y[:split_idx], y[split_idx:]

In [5]:
X_train=X_train.to_numpy()
X_test=X_test.to_numpy()
X_train=X_train.T
X_test=X_test.T

In [6]:
def sigmoid(a):
    return (1/(1+np.exp(-a)))
def calc_cost(m,Y,A):
    return -(1/m)*np.sum(Y*(np.log(A))+(1-Y)*(np.log(1-A)))
def calc_weights(w,X):
    A = sigmoid(np.dot(w.T,X))
    return A
def update_weights(m,A,X,Y):
    dw=(1/m)*np.dot(X,(A-Y).T)
    return dw
def gradient_descent(w, X, Y, num_iterations, learning_rate):
    costs=[]
    m=X.shape[1]
    for i in range(num_iterations):
        A=calc_weights(w,X)
        cost=calc_cost(m,Y,A)
        dw=update_weights(m,A,X,Y)
        w=w-learning_rate*dw
        costs.append(cost)
    return w,dw,costs
def prediction(w, X, Y):
    m=X.shape[1]
    print(Y)
    predict=np.zeros((1,m))
    A=np.dot(w.T,X)
    Y_prediction = np.zeros((1,m))
    
    count=0
    for i in range(1,m):
        if A[0][i]<=0.5:
            Y_prediction[0][i]=0
        else:
            Y_prediction[0][i]=1
            
    for i in range(1,m):
        if Y_prediction[0][i]==Y[i]:
            count+=1
        
    return (count/X.shape[1])*100

In [7]:
def logistic_regression(X_train, Y_train, X_test, Y_test, learning_rate=0.005,num_iterations=1000):
    w = np.zeros([X_train.shape[0],1])
    w, dw, costs = gradient_descent(w,X_train, Y_train, num_iterations, learning_rate)
    train_accuracy = prediction(w, X_train, Y_train)
    test_accuracy = prediction(w, X_test, Y_test)
    return train_accuracy,test_accuracy,costs

In [8]:
y_train=y_train.to_numpy()
y_test=y_test.to_numpy()

In [9]:
train_accuracy,test_accuracy,costs=logistic_regression(X_train, y_train, X_test, y_test)

[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 0 1 1 1 1 1 1 1 1 0 1 0 0 0 0 0 1 1 0 1 1 0 0 0 0 1 0 1 1 0 0 0 0 1 0 1 1
 0 1 0 1 1 0 0 0 1 1 0 1 1 1 0 0 0 1 0 0 1 1 0 0 0 1 1 0 0 0 0 1 0 0 1 0 0
 0 0 0 0 0 0 1 1 1 0 1 1 0 0 0 1 1 0 1 0 1 1 0 1 1 0 0 1 0 0 1 0 0 0 0 1 0
 0 0 0 0 0 0 0 0 1 0 0 0 0 1 1 0 1 0 0 1 1 0 0 1 1 0 0 0 0 1 0 0 1 1 1 0 1
 0 1 0 0 0 1 0 0 1 1 0 1 1 1 1 0 1 1 1 0 1 0 1 0 0 1 0 1 1 1 1 0 0 1 1 0 0
 0 1 0 0 0 0 0 1 1 0 0 1 0 0 1 1 0 1 0 0 0 0 1 0 0 0 0 0 1 0 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 0 0 0 0 0 0 1 0 1 0 0 1 0 0 1 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0
 0 1 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 1 0 0 0 0 1 1 1 0 0
 0 0 1 0 1 0 1 0 0 0 1 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0 1 1 0 1 1
 1 0 1 1 0 0 0 0 0 1 0]
[0 0 0 0 1 0 0 0 1 0 0 1 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 1
 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 1 0 1 0 0 0 0 0 1 0 0 1 0 1 0 0 1 0 1 0 0 0
 0 0 0 0 0 1 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 1 0 0
 

  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  


In [10]:
print('The Accuracy in percentage calculated on the Training Data is '+str(train_accuracy))

The Accuracy in percentage calculated on the Training Data is 91.33858267716536


In [11]:
print('The Accuracy in percentage calculated on the Test Data is '+str(test_accuracy))

The Accuracy in percentage calculated on the Test Data is 90.42553191489363
