<a href="https://colab.research.google.com/github/agarwal-peeush/Learning/blob/PadhAI_learning/Learning-Python/padhAI/MPNeuron.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Loading dataset

In [0]:
import sklearn.datasets
import numpy as np

In [0]:
# Load scikit-learn's bundled breast-cancer dataset object.
breast_cancer = sklearn.datasets.load_breast_cancer()

In [0]:
# Pull the feature matrix and the label vector out of the dataset object.
X, Y = breast_cancer.data, breast_cancer.target

In [0]:
# Dump the raw features and labels for a quick visual check.
print(X)
print(Y)

In [0]:
# Print the dimensions of the features and of the labels.
print(X.shape, Y.shape)

In [0]:
import pandas as pd

In [0]:
# Wrap the features in a DataFrame with one named column per feature.
data = pd.DataFrame(breast_cancer.data, columns=breast_cancer.feature_names)

In [0]:
# Append the target labels as a 'class' column so features and labels share one table.
data['class'] = breast_cancer.target

In [0]:
# Preview the first few rows.
data.head()

In [0]:
# Per-column summary statistics.
data.describe()

In [0]:
# Count how many samples carry each class label.
data['class'].value_counts()

In [0]:
# Show the human-readable names of the target classes.
print(breast_cancer.target_names)

In [0]:
# Mean of every feature within each class.
data.groupby('class').mean()

# Train test split

In [0]:
from sklearn.model_selection import train_test_split

In [0]:
# Separate the label column from the feature columns.
Y = data['class']
X = data.drop(columns='class')

In [0]:
# Inspect what kind of object X is after dropping the label column.
type(X)

In [0]:
# Inspect what kind of object Y is after selecting the 'class' column.
type(Y)

In [0]:
# Split into train and test sets using train_test_split's default settings.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y)

In [0]:
# Compare the dimensions of the full, train and test sets (features, then labels).
print(X.shape, X_train.shape, X_test.shape)
print(Y.shape, Y_train.shape, Y_test.shape)

In [0]:
# Redo the split, holding out 10% of the rows for testing.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.1)

In [0]:
# Compare the dimensions of the full, train and test sets (features, then labels).
print(X.shape, X_train.shape, X_test.shape)
print(Y.shape, Y_train.shape, Y_test.shape)

In [0]:
# Mean of the 0/1 labels (i.e. the share of class 1) in the full, train and
# test sets; without stratification these need not match.
print(Y.mean(), Y_train.mean(), Y_test.mean())

In [0]:
# Stratify on Y so the train and test sets keep the same share of class 1
# as the full data set.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.1, stratify=Y)

In [0]:
# Re-check the share of class 1 in each set after the stratified split.
print(Y.mean(), Y_train.mean(), Y_test.mean())

In [0]:
# Compare the per-feature means of the full, train and test sets.
print(X.mean(), X_train.mean(), X_test.mean())

Every call to `train_test_split` shuffles the rows differently, so re-running it gives a different split each time. To get the same split on every run, pass a fixed `random_state`.

In [0]:
# Fix random_state so the stratified 10% hold-out split is identical on every run.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.1, stratify=Y, random_state = 1)

In [0]:
# Compare the per-feature means of the full, train and test sets for the seeded split.
print(X.mean(), X_train.mean(), X_test.mean())

# Binarisation of input

In [0]:
import matplotlib.pyplot as plt

In [0]:
# Plot the raw training data transposed, so the features run along the x-axis;
# rotate the tick labels so the feature names stay legible.
plt.plot(X_train.T, '*')
plt.xticks(rotation='vertical')
plt.show()

In [0]:
# Binarise the 'mean area' feature with a hand-picked cut-off:
# values below 1000 become 0, everything else becomes 1.
X_binarised_3_train = X_train['mean area'].apply(lambda area: int(not area < 1000))

In [0]:
# Plot the binarised 'mean area' values of the training rows.
plt.plot(X_binarised_3_train, '*')

In [0]:
# Let pandas pick the thresholds instead of hand-coding them: pd.cut splits
# each column's value range into two equal-width bins.
# The labels are deliberately flipped (lower bin -> 1, upper bin -> 0) because
# the per-class feature means are higher for class 0 than for class 1.
X_binarised_train = X_train.apply(
  lambda column: pd.cut(column, bins=2, labels=[1, 0])
)

In [0]:
# Plot the binarised training data transposed, so the features run along the
# x-axis; rotate the tick labels so the feature names stay legible.
plt.plot(X_binarised_train.T, '*')
plt.xticks(rotation='vertical')
plt.show()

In [0]:
# Preview the first few binarised training rows.
X_binarised_train.head()

In [0]:
# Binarise X_test with the thresholds learned from X_train, not from X_test.
# pd.cut(bins=2) on a training column splits at the midpoint of that column's
# [min, max] range with right-inclusive bins, so "value <= midpoint" falls in
# the lower bin. The lower bin is labelled 1 and the upper bin 0 (flipped
# labels, same as for the training data). Re-binning X_test on its own range
# would compare train and test rows against different cut-offs.
train_midpoints = (X_train.min() + X_train.max()) / 2
X_binarised_test = X_test.le(train_midpoints, axis='columns').astype(int)

In [0]:
# Inspect the container type before converting to NumPy.
type(X_binarised_test)

In [0]:
# Drop the pandas wrappers and keep only the underlying arrays, so rows can
# be indexed by position from here on.
X_binarised_train, X_binarised_test = (
  X_binarised_train.values,
  X_binarised_test.values,
)

In [0]:
# Check the container type after the conversion.
type(X_binarised_train)

# MP Neuron model

In [0]:
from random import randint

In [0]:
# Try the MP neuron on one randomly chosen training row.
# The neuron fires (predicts class 1) when at least b binarised inputs are 1.
b = 3

# random.randint includes BOTH endpoints, so the upper bound has to be the
# last valid position, not len(Y_train).
i = randint(0, len(Y_train) - 1)

print('For row ', i)
# In this dataset class 0 is 'malignant' and class 1 is 'benign', so a firing
# neuron (prediction 1) means benign.
if np.sum(X_binarised_train[i]) >= b:
  print('MP Neuron inference is benign')
else:
  print('MP Neuron inference is malignant')

# Y_train is a pandas Series that still carries the shuffled original row
# labels; use positional access so it lines up with X_binarised_train[i].
if Y_train.iloc[i] == 1:
  print('Ground truth is benign')
else:
  print('Ground truth is malignant')


In [0]:
# Score the MP neuron with threshold b on the whole training set.
b = 3

# The neuron fires (True) when at least b of a row's binarised inputs are on.
Y_pred_train = [np.sum(row) >= b for row in X_binarised_train]
# Count the rows where the prediction agrees with the label.
accurate_rows = sum(
  truth == guess for truth, guess in zip(Y_train, Y_pred_train)
)

print('Accurate rows: ', accurate_rows)
print('Accuracy: ', accurate_rows/len(Y_train))

In [0]:
# Run for all possible values of b (0 .. number of features) and report the
# training accuracy of each.
# The number of active inputs per row does not depend on b, so compute it
# once instead of once per threshold.
row_sums = [np.sum(x) for x in X_binarised_train]

for b in range(X_binarised_train.shape[1]+1):
  # The neuron fires when at least b inputs are on.
  Y_pred_train = [row_sum >= b for row_sum in row_sums]
  accurate_rows = sum(y == y_pred for y, y_pred in zip(Y_train, Y_pred_train))

  print('b: {0}, Accuracy: {1}'.format(b, accurate_rows/len(Y_train)))

After flipping the labels in the binarisation step, b=28 gives a high training accuracy of ~85%.

In [0]:
from sklearn.metrics import accuracy_score

In [0]:
# Evaluate the threshold chosen on the training data against the held-out test set.
b = 28

# The neuron fires (True) when at least b of a row's binarised inputs are on.
Y_pred_test = [np.sum(x) >= b for x in X_binarised_test]

# accuracy_score takes the ground truth first and the predictions second.
accuracy = accuracy_score(Y_test, Y_pred_test)
print('b: {0}, Accuracy: {1}'.format(b, accuracy))

On the training data the accuracy was ~85%, but on the test data it is ~79%.