# Classification prediction using Machine Learning

Python code that uses different Machine Learning algorithms to predict Malignant or Benign classification in the Wisconsin Breast Cancer dataset.

# Load breast cancer Wisconsin dataset

In [None]:
import pandas as pd
import numpy as np

from google.colab import drive
# Mount Google Drive inside the Colab runtime so the CSV stored there can be read.
drive.mount('/content/gdrive')

# Load the sample data.
# NOTE(review): the path is relative, so it presumably relies on the working
# directory being /content (where the drive was mounted) - confirm.
csv_path = 'gdrive/My Drive/breast_cancer_wisconsin.csv'
df = pd.read_csv(csv_path)

# Number of non-null values in each column (shown as the cell output).
df.count()

# Clean the data to remove any null values

In [None]:
# Discard two columns that carry no predictive information:
#   - 'Unnamed: 32' is the last column and does not hold any values
#   - 'id' is only a record identifier and is not relevant to prediction
# Passing the labels through `columns=` is equivalent to drop(..., axis=1).
df = df.drop(columns=['Unnamed: 32', 'id'])

# Preview the first rows of the cleaned frame.
df.head()


We will now check the datatypes of all columns.

We see that all columns are of type float64 except diagnosis. Diagnosis is our result variable, based on all the other float64 parameters. We will convert the object value of diagnosis into 1 and 0.


In [None]:
# Show the dtype of every remaining column (rendered as the cell output).
df.dtypes

In [None]:
# Encode the diagnosis label numerically: 'M' (malignant) -> 1, 'B' (benign) -> 0.
# Any value that is not a key of the mapping becomes NaN, which includes the
# already-encoded values if this cell is executed a second time.
diagnosis_codes = {'M': 1, 'B': 0}
df['diagnosis'] = df['diagnosis'].map(diagnosis_codes)

# Separate result variable from input data

In [None]:
# The encoded diagnosis is the value we want to predict; keep it as the target.
Y = df['diagnosis']

# Remove the target from the feature frame so that only the input
# parameters remain in df.
df = df.drop(columns='diagnosis')

We see all columns are of type float64 except diagnosis. Diagnosis is our result variable, based on all the other float64 parameters.



# Correlation

Correlation is a statistical measure that expresses the extent to which two variables are linearly related (meaning they change together at a constant rate). https://www.jmp.com/en_us/statistics-knowledge-portal/what-is-correlation.html

If two variables are very strongly correlated (values close to either 1 or -1), the second one conveys almost no extra information, so one of the pair should be removed from the dataset.

We can find the correlation between different variables by creating a correlation matrix with all variables.

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
# Use a wide canvas so the annotated cells of the matrix stay legible.
plt.figure(figsize=(20, 10))

# Pairwise correlation of the numeric columns, rounded to two decimals so the
# annotations inside the heatmap cells remain short.
corr_matrix = df.corr(numeric_only=True).round(2)
sns.heatmap(corr_matrix, annot=True)

# Understanding the data and correlation matrix

From the above, we see a few things -

Firstly, the following properties are present in the data in three different views - mean, standard error and worst values-
1.   radius
2.   texture
3.   perimeter
4.   area
5.   smoothness
6.   compactness
7.   concavity
8.   concave_points
9.   symmetry
10.  fractal_dimension
We will remove the columns that store worst and standard error.


Secondly, the radius_mean is also closely correlated to -
*   perimeter_mean
*   area_mean
So we will remove perimeter and area from our columns

In [None]:
# Eliminate the redundant / highly correlated columns from our dataset.

# Each of these measurements is stored in three views (mean, standard error
# and worst). Only the "_mean" view is kept: the "_worst" and "_se" variants
# of every measurement are dropped.
measurements = [
    'radius',
    'texture',
    'perimeter',
    'area',
    'smoothness',
    'compactness',
    'concavity',
    'concave points',
    'symmetry',
    'fractal_dimension',
]
columns = [
    f'{measurement}_{suffix}'
    for suffix in ('worst', 'se')
    for measurement in measurements
]
df = df.drop(columns, axis=1)

# perimeter_mean and area_mean are closely correlated with radius_mean,
# so they are dropped as well.
columns = ['perimeter_mean', 'area_mean']
df = df.drop(columns, axis=1)

# Note from an earlier experiment: additionally dropping the concavity and
# concave points columns gave worse results, so those columns are kept.

# Verify the remaining columns (shown as the cell output).
df.columns


In [None]:
# build the correlation matrix again
import matplotlib.pyplot as plt
import seaborn as sns
# Same visualisation as before, now restricted to the columns that survived
# the feature elimination step.
plt.figure(figsize=(20, 10))
reduced_corr = df.corr(numeric_only=True)
sns.heatmap(reduced_corr.round(2), annot=True)

# Split the data into train and test
Our data is now ready for training. We will split the data set into two pieces — a training set and a testing set.
This consists of randomly sampling about 75 percent of the rows (can be varied) and putting them into a training set. The remaining 25 percent is put into a test set.

In [None]:
from sklearn.model_selection import train_test_split

# Split the data into training and testing sets.
# X holds the remaining feature columns; Y (the encoded diagnosis) was
# separated from the frame earlier.
X = df

# 25% of the rows are held out for testing; the fixed random_state keeps the
# split identical between runs.
x_train, x_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.25, random_state=40
)

# Let us check the dimensions of our split dataset.
for subset in (x_train, x_test, y_train, y_test):
    print(subset.shape)

x_train.head()

# Define a method to print results in desired format

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix


def printResults(y_test, y_predict):
  """Print the evaluation summary for one classifier.

  Prints the classification report (three decimal digits), the raw
  confusion matrix and then its four cells labelled individually.

  Args:
    y_test: true labels of the test set.
    y_predict: labels predicted by the model for the same rows.
  """
  # classification_report is imported in this cell (see above) so that the
  # function does not depend on a later cell having been executed first.
  report = classification_report(y_test, y_predict, digits=3)
  conf_matrix = confusion_matrix(y_test, y_predict)

  print('Classification Report : \n', report, '\n')
  print('----------------------------------------------------------------\n')
  print('confusion matrix : \n', conf_matrix, '\n')
  print('----------------------------------------------------------------\n')

  # Rows of the matrix are the true classes and columns the predicted ones,
  # so with labels 0/1 the cells read [[TN, FP], [FN, TP]].
  print('True Negative:', conf_matrix[0][0])
  print('False Positive:', conf_matrix[0][1])
  print('False Negative:', conf_matrix[1][0])
  print('True Positive:', conf_matrix[1][1])

# Scale the data

In [None]:
# Scaling the data (feature scaling).
# This can help to balance the impact of all variables on the distance
# calculation and can help to improve the performance of the algorithm.
# In particular, several ML techniques, such as neural networks, require
# the input data to be normalized to work well.
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()

# Learn the per-feature mean and standard deviation from the training data only.
x_train = sc.fit_transform(x_train)

# Apply the statistics learned from the training set to the test set.
# Calling fit_transform here would refit the scaler on the test data, so the
# two sets would be scaled differently and test information would leak into
# the preprocessing step.
x_test = sc.transform(x_test)

# 1. Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

# Build the Logistic Regression model and fit it in one step
# (fit returns the fitted estimator itself).
lr = LogisticRegression(random_state=16).fit(x_train, y_train)

# Evaluate on the held-out test set.
y_predict = lr.predict(x_test)
printResults(y_test, y_predict)
print('\n-----------------------------------------------------\n')
test_accuracy = lr.score(x_test, y_test)
print('Accuracy of classifier on test set: {:.2f}'.format(test_accuracy))

# 2. Logistic Regression with stochastic gradient descent

In [None]:
# Logistic regression trained with stochastic gradient descent.
from sklearn.linear_model import SGDClassifier

# SGDClassifier defaults to hinge loss, which fits a linear SVM. Selecting the
# logistic loss is what makes this model a logistic regression.
# NOTE: the name 'log_loss' requires scikit-learn >= 1.1 (older releases
# spell it 'log').
# random_state fixes the data shuffling so results are reproducible, in line
# with the seeded train/test split and LogisticRegression cells.
sgd = SGDClassifier(loss='log_loss', random_state=16)
sgd.fit(x_train, y_train)

#### Now print results
y_predict = sgd.predict(x_test)
printResults(y_test, y_predict)
print('\n-----------------------------------------------------\n')
print('Accuracy of classifier on test set: {:.2f}'.format(sgd.score(x_test, y_test)))

# 3. Support Vector Machine

In [None]:
from sklearn.svm import LinearSVC

# Linear support vector classifier. C is the inverse regularization strength,
# so the small value used here regularizes strongly.
svm = LinearSVC(C=0.01).fit(x_train, y_train)

#### Now print results
y_predict = svm.predict(x_test)
printResults(y_test, y_predict)
print('\n-----------------------------------------------------\n')
svm_accuracy = svm.score(x_test, y_test)
print('Accuracy of classifier on test set: {:.2f}'.format(svm_accuracy))

# 4. Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# n_estimators (the number of decision trees) is left at the library default.
# random_state fixes the bootstrap sampling and feature selection so that the
# reported metrics are reproducible between runs, in line with the seeded
# train/test split and LogisticRegression cells.
rf = RandomForestClassifier(random_state=16)
rf.fit(x_train, y_train)

#### Now print results
y_predict = rf.predict(x_test)
printResults(y_test, y_predict)

print('\n-----------------------------------------------------\n')
print('Accuracy of classifier on test set: {:.2f}'.format(rf.score(x_test, y_test)))

# 5. Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Single decision tree with default hyper-parameters.
# scikit-learn permutes the candidate features at every split, so equally good
# splits can be resolved differently from run to run; a fixed random_state
# makes the fitted tree (and the metrics below) reproducible.
clf = DecisionTreeClassifier(random_state=16)
clf.fit(x_train, y_train)

#### Now print results
y_predict = clf.predict(x_test)
printResults(y_test, y_predict)

print('\n-----------------------------------------------------\n')
print('Accuracy of classifier on test set: {:.2f}'.format(clf.score(x_test, y_test)))

# 6. Boosted Decision Tree

In [None]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

# AdaBoost ensemble using the library's default base estimator.
# random_state seeds the estimator so the reported metrics are reproducible,
# in line with the seeded train/test split and LogisticRegression cells.
adb = AdaBoostClassifier(random_state=16)
adb.fit(x_train, y_train)

#### Now print results
y_predict = adb.predict(x_test)
printResults(y_test, y_predict)

print('\n-----------------------------------------------------\n')
print('Accuracy of classifier on test set: {:.2f}'.format(adb.score(x_test, y_test)))

# 7. Neural Network

In [None]:
import keras
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LeakyReLU,PReLU,ELU
from keras.layers import Dropout

# Create the model: a small fully connected network for binary classification.
classifier = Sequential()

# Derive the input width from the training data instead of hard-coding it, so
# the network keeps working if the feature selection above is changed.
n_features = x_train.shape[1]

# The network consists of:
#   - an input of n_features values
#   - two hidden layers of 9 ReLU units each
#   - a single sigmoid output unit

# First hidden layer (also declares the input size).
classifier.add(Dense(units=9, kernel_initializer='he_uniform', activation='relu', input_dim=n_features))
# Second hidden layer.
classifier.add(Dense(units=9, kernel_initializer='he_uniform', activation='relu'))
# Output layer: one sigmoid unit producing a score between 0 and 1.
classifier.add(Dense(units=1, kernel_initializer='glorot_uniform', activation='sigmoid'))

# Print a summary of the layers.
classifier.summary()

# Compile the ANN.
classifier.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Fit the ANN to the training set. Note that fit() returns the training
# history object, which is what `model` holds afterwards.
model = classifier.fit(x_train, y_train, batch_size=100, epochs=100)

# Now predict on the test data.
y_predict = classifier.predict(x_test)

# Threshold the sigmoid scores at 0.5 to obtain 0/1 labels, and flatten the
# (n_samples, 1) prediction array to 1-D so it has the same shape as y_test.
y_predict = np.where(y_predict > 0.5, 1, 0).ravel()

printResults(y_test, y_predict)