## Imports

In [15]:
import pandas as pd
import numpy as np

from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split

## Loading Data

In [16]:
# Load the Wisconsin breast-cancer CSV from the notebook-relative Data folder.
# A forward slash is used as the separator because it is accepted on Windows,
# Linux and macOS alike, whereas a hard-coded backslash only works on Windows.
df=pd.read_csv("Data/breast_cancer_wisconsin_data.csv")
# Preview the first rows as a quick sanity check of the parsed columns
df.head()

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,Unnamed: 32
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,


In [17]:
# Print a summary of the frame (index range, column names, non-null counts and dtypes)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 33 columns):
id                         569 non-null int64
diagnosis                  569 non-null object
radius_mean                569 non-null float64
texture_mean               569 non-null float64
perimeter_mean             569 non-null float64
area_mean                  569 non-null float64
smoothness_mean            569 non-null float64
compactness_mean           569 non-null float64
concavity_mean             569 non-null float64
concave points_mean        569 non-null float64
symmetry_mean              569 non-null float64
fractal_dimension_mean     569 non-null float64
radius_se                  569 non-null float64
texture_se                 569 non-null float64
perimeter_se               569 non-null float64
area_se                    569 non-null float64
smoothness_se              569 non-null float64
compactness_se             569 non-null float64
concavity_se               569 non

## Processing Data

In [18]:
#Dropping the 'id' column and the 'Unnamed: 32' column (blank in the rows shown by df.head()).
#The axis is selected by keyword (columns=...): passing it positionally, as in
#drop("id", 1), is deprecated since pandas 1.3 and rejected by pandas 2.x.
df=df.drop(columns=["id","Unnamed: 32"])

#Mapping M to 1 and B to 0 in the output Label DataFrame
df['diagnosis']=df['diagnosis'].map({'M':1,'B':0})


#Split Data into training and test (66% and 34%); random_state fixes the shuffle
#so the split is reproducible between runs
train, test = train_test_split(df, test_size = 0.34, random_state=1)

#Training Data: every feature column from 'radius_mean' to 'fractal_dimension_worst'
train_x=train.loc[:,'radius_mean' : 'fractal_dimension_worst']
#The label is selected with a list so it stays 2-D (one column)
train_y=train.loc[:,['diagnosis']]

#Testing Data
test_x=test.loc[:,'radius_mean' : 'fractal_dimension_worst']
test_y=test.loc[:,['diagnosis']]

#Converting Training and Test Data to numpy arrays
train_x=np.asarray(train_x)
train_y=np.asarray(train_y)
test_x=np.asarray(test_x)
test_y=np.asarray(test_y)

## Helper Functions

In [19]:
#Function to initialize the weights and bias
def initialize(m):
    """Create the starting parameters for gradient descent.

    Parameters
    ----------
    m : int
        Number of rows of the weight vector (one per feature).

    Returns
    -------
    tuple
        ``(w, b)`` where ``w`` is an ``(m, 1)`` array of zeros and ``b`` is
        the integer ``0``.
    """
    zero_weights = np.zeros(shape=(m, 1))
    zero_bias = 0
    return zero_weights, zero_bias
    
#Function to calculate sigmoid of x
def sigmoid(X):
    """Compute the logistic function 1 / (1 + exp(-X)) element-wise.

    The naive formula evaluates ``exp(-X)``, which overflows (and emits a
    RuntimeWarning) for large negative ``X``. This version only ever
    exponentiates a non-positive number, so it cannot overflow:

    * for ``X >= 0``: ``1 / (1 + exp(-X))``
    * for ``X <  0``: ``exp(X) / (1 + exp(X))``

    Parameters
    ----------
    X : scalar or array-like
        Input value(s).

    Returns
    -------
    numpy scalar or ndarray
        Sigmoid of ``X`` with the same shape as the input.
    """
    X = np.asarray(X, dtype=float)

    # exp(-|X|) lies in (0, 1], so neither branch below can overflow
    decay = np.exp(-np.abs(X))
    result = np.where(X >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))

    # Indexing with () turns a 0-d result back into a scalar and leaves
    # real arrays untouched, matching what the naive formula returned
    return result[()]


#Function for doing forward and back propagation
def propogate(X, Y, w, b):
    """Run one forward/backward pass of logistic regression.

    Parameters
    ----------
    X : ndarray
        Input data with one column per training example (``X.shape[1]`` is
        used as the number of examples).
    Y : ndarray
        Labels, broadcast-compatible with the ``w.T @ X`` activations.
    w : ndarray
        Weight column vector; ``w.T`` is multiplied with ``X``.
    b : scalar
        Bias added to every activation.

    Returns
    -------
    tuple
        ``(grads, cost)`` where ``grads`` is ``{"dw": ..., "db": ...}`` and
        ``cost`` is the mean cross-entropy loss.
    """
    m = X.shape[1] #Number of training examples

    #Forward Propagation, calculating the cost
    Z = np.dot(w.T, X) + b
    A = sigmoid(Z)

    #When the sigmoid saturates to exactly 0 or 1, log(0) = -inf and the
    #product 0 * -inf turns the cost into nan. Clip the probabilities for the
    #log terms only; the gradients below keep using the unclipped activations.
    eps = np.finfo(float).eps
    A_safe = np.clip(A, eps, 1 - eps)
    cost= -(1/m) * np.sum(Y * np.log(A_safe) + (1-Y) * np.log(1-A_safe))

    #Back Propagation, calculating the gradients
    dw = (1/m)* np.dot(X, (A-Y).T)
    db = (1/m)* np.sum(A-Y)

    grads= {"dw" : dw, "db" : db}

    return grads, cost
    
    


#Function for performing Gradient Descent
def optimize(X, Y, w, b, num_of_iterations, alpha, print_cost=True):
    """Fit ``w`` and ``b`` with batch gradient descent.

    Parameters
    ----------
    X, Y : ndarray
        Training inputs and labels, passed unchanged to ``propogate``.
    w, b : ndarray, scalar
        Initial weights and bias.
    num_of_iterations : int
        Number of gradient-descent steps to take.
    alpha : float
        Learning rate (step size).
    print_cost : bool, optional
        When True (the default, matching the previous behaviour) the cost is
        printed every 10 iterations. Pass False to record it silently.

    Returns
    -------
    tuple
        ``(parameters, grads, costs)``: the final ``{"w", "b"}``, the last
        computed ``{"dw", "db"}`` and the cost sampled every 10 iterations.
        If ``num_of_iterations`` is 0 no gradient is ever computed, so
        ``dw`` and ``db`` are ``None`` and ``costs`` is empty.
    """
    costs=[]

    #Pre-assign so that zero iterations returns cleanly instead of raising
    #UnboundLocalError when building the grads dictionary below
    dw = None
    db = None

    for i in range(num_of_iterations):

        grads, cost = propogate(X, Y, w, b)

        dw = grads["dw"]
        db = grads["db"]

        #Step against the gradient
        w = w - alpha * dw
        b = b - alpha * db

        #Storing the cost at interval of every 10 iterations
        if i% 10 == 0:
            costs.append(cost)
            if print_cost:
                print("cost after %i iteration : %f" % (i, cost))


    parameters = {"w":w, "b":b}
    grads = {"dw":dw, "db":db}


    return parameters, grads, costs


#Function for doing the predictions on the data set (mapping probabilities to 0 or 1)
def predict(X, w, b):
    """Classify every column of ``X`` as 0 or 1 using the fitted parameters.

    Parameters
    ----------
    X : ndarray
        Input data with one column per example.
    w : ndarray
        Weights; reshaped to a ``(X.shape[0], 1)`` column vector.
    b : scalar
        Bias term.

    Returns
    -------
    ndarray
        Float array of shape ``(1, X.shape[1])`` holding 0.0 where the
        predicted probability is below 0.5 and 1.0 otherwise.
    """
    n_features = X.shape[0]
    n_examples = X.shape[1]

    weights = w.reshape(n_features, 1)
    probabilities = sigmoid(np.dot(weights.T, X) + b)

    #Anything that is not strictly below 0.5 is labelled 1, which is why the
    #test is written as "< 0.5 -> 0" rather than ">= 0.5 -> 1". Only the first
    #row of activations is thresholded.
    labels = np.where(probabilities[:1] < 0.5, 0.0, 1.0)

    return labels.reshape(1, n_examples)

## Logistic Regression Model

In [20]:
#Function for calculating the Logistic Regression Model
def model(Xtrain, Ytrain, num_of_iterations, alpha):
    """Train a logistic-regression model from zero-initialised parameters.

    Parameters
    ----------
    Xtrain : ndarray
        Training inputs; ``Xtrain.shape[0]`` is taken as the feature count.
    Ytrain : ndarray
        Training labels, forwarded to ``optimize``.
    num_of_iterations : int
        Number of gradient-descent steps.
    alpha : float
        Learning rate.

    Returns
    -------
    dict
        ``{"w": weights, "b": bias, "costs": recorded costs}``.
    """
    n_features = Xtrain.shape[0]

    start_w, start_b = initialize(n_features)

    #The final gradients returned by optimize are not needed here
    fitted, _, cost_history = optimize(
        Xtrain, Ytrain, start_w, start_b, num_of_iterations, alpha
    )

    return {"w": fitted["w"], "b": fitted["b"], "costs": cost_history}

## Accuracy

In [21]:
def accuracy(num_of_iterations=5000, alpha=0.000001):
    """Train on the training split and print train/test accuracy.

    Reads the module-level arrays ``train_x``, ``train_y``, ``test_x`` and
    ``test_y``. They are transposed before use so that each column is one
    example.

    Parameters
    ----------
    num_of_iterations : int, optional
        Gradient-descent steps; defaults to the previously hard-coded 5000.
    alpha : float, optional
        Learning rate; defaults to the previously hard-coded 0.000001.

    Returns
    -------
    None
        The accuracies are printed, not returned.
    """
    #Calling the model function to train a Logistic Regression Model on Training Data
    d= model(train_x.T, train_y.T, num_of_iterations=num_of_iterations, alpha=alpha)

    w=d["w"]
    b=d["b"]

    #Now, calculating the accuracy on Training and Test Data
    Y_prediction_train = predict(train_x.T, w, b)
    Y_prediction_test = predict(test_x.T, w, b)

    #Predictions and labels are both 0/1, so the mean absolute difference is
    #the misclassification rate
    print("\nTrain accuracy: {} %".format(100 - np.mean(np.abs(Y_prediction_train - train_y.T)) * 100))

    print("\nTest accuracy: {} %".format(100 - np.mean(np.abs(Y_prediction_test - test_y.T)) * 100))

#Calling the accuracy function: trains the model and prints the train and test accuracy
accuracy()

cost after 0 iteration : 0.693147
cost after 10 iteration : 0.667932
cost after 20 iteration : 0.661586
cost after 30 iteration : 0.655525
cost after 40 iteration : 0.649726
cost after 50 iteration : 0.644170
cost after 60 iteration : 0.638839
cost after 70 iteration : 0.633717
cost after 80 iteration : 0.628790
cost after 90 iteration : 0.624044
cost after 100 iteration : 0.619467
cost after 110 iteration : 0.615048
cost after 120 iteration : 0.610776
cost after 130 iteration : 0.606644
cost after 140 iteration : 0.602642
cost after 150 iteration : 0.598763
cost after 160 iteration : 0.595000
cost after 170 iteration : 0.591346
cost after 180 iteration : 0.587796
cost after 190 iteration : 0.584344
cost after 200 iteration : 0.580985
cost after 210 iteration : 0.577714
cost after 220 iteration : 0.574528
cost after 230 iteration : 0.571422
cost after 240 iteration : 0.568392
cost after 250 iteration : 0.565436
cost after 260 iteration : 0.562549
cost after 270 iteration : 0.559729
cos

cost after 2430 iteration : 0.358080
cost after 2440 iteration : 0.357705
cost after 2450 iteration : 0.357332
cost after 2460 iteration : 0.356960
cost after 2470 iteration : 0.356591
cost after 2480 iteration : 0.356223
cost after 2490 iteration : 0.355857
cost after 2500 iteration : 0.355492
cost after 2510 iteration : 0.355130
cost after 2520 iteration : 0.354769
cost after 2530 iteration : 0.354410
cost after 2540 iteration : 0.354052
cost after 2550 iteration : 0.353696
cost after 2560 iteration : 0.353342
cost after 2570 iteration : 0.352990
cost after 2580 iteration : 0.352639
cost after 2590 iteration : 0.352289
cost after 2600 iteration : 0.351942
cost after 2610 iteration : 0.351596
cost after 2620 iteration : 0.351251
cost after 2630 iteration : 0.350908
cost after 2640 iteration : 0.350567
cost after 2650 iteration : 0.350227
cost after 2660 iteration : 0.349889
cost after 2670 iteration : 0.349552
cost after 2680 iteration : 0.349217
cost after 2690 iteration : 0.348884
c


Train accuracy: 91.2 %

Test accuracy: 92.26804123711341 %
