In [11]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.utils import resample

# LOAD THE DATA SET FROM THE CSV FILE SAVED ON THE LOCAL SYSTEM
csv_path = '~/Desktop/bc.csv'
data_frame = pd.read_csv(csv_path)

# DROP THE FIRST TWO COLUMNS (BY POSITION) IN ONE CALL; THEY ARE NOT USED AS FEATURES
first_col, second_col = data_frame.columns[0], data_frame.columns[1]
data_frame = data_frame.drop(columns=[first_col, second_col])

# ENCODE THE LABEL: 1 WHERE Class IS "malignant", 0 FOR EVERY OTHER VALUE (BENIGN)
data_frame['Class'] = data_frame['Class'].eq("malignant").astype(int)

# SEPARATE FEATURES FROM THE LABEL.
# X HOLDS THE NINE MEASUREMENT COLUMNS, y HOLDS THE ENCODED Class COLUMN
feature_columns = [
    'Cl.thickness',
    'Cell.size',
    'Cell.shape',
    'Marg.adhesion',
    'Epith.c.size',
    'Bare.nuclei',
    'Bl.cromatin',
    'Normal.nucleoli',
    'Mitoses',
]
X = data_frame[feature_columns]
y = data_frame['Class']

# SPLIT INTO A 70% TRAINING SET AND A 30% TEST SET; random_state MAKES THE SHUFFLE REPEATABLE
split_parts = train_test_split(X, y, train_size=0.7, test_size=0.3, random_state=0)
train_X, test_X, train_y, test_y = split_parts

# RE-ATTACH THE Class COLUMN TO EACH FEATURE FRAME (ROWS ARE MATCHED ON THE SHARED INDEX)
training_data = train_X.join(train_y)
test_data = test_X.join(test_y)

# REPORT THE SIZE OF EACH PARTITION
train_rows, test_rows = len(train_X), len(test_X)
print("Length of training set ", train_rows)
print("Length of test set ", test_rows)

# THE TRAINING SET HAS MORE BENIGN (Class == 0) ROWS THAN MALIGNANT (Class == 1) ROWS.
# SEPARATE THE TWO CLASSES AND PRINT HOW MANY ROWS EACH HAS
data_frame_majority = training_data[training_data.Class == 0]
data_frame_minority = training_data[training_data.Class == 1]
print("Training data benign", len(data_frame_majority))
print("Training data malignant", len(data_frame_minority))

# DOWNSAMPLING: DRAW (WITHOUT REPLACEMENT) AS MANY BENIGN ROWS AS THERE ARE MALIGNANT ROWS.
# THE TARGET SIZE IS DERIVED FROM THE DATA INSTEAD OF BEING HARD-CODED, SO IT STAYS CORRECT
# IF THE CSV OR THE SPLIT CHANGES. random_state MAKES THE DRAW (AND THEREFORE THE MODEL AND
# ITS ACCURACY) REPRODUCIBLE ACROSS RUNS.
# NOTE: resample RAISES ValueError IF THE MINORITY CLASS IS LARGER THAN THE MAJORITY CLASS.
df_majority_downsampled = resample(
    data_frame_majority,
    replace=False,
    n_samples=len(data_frame_minority),
    random_state=0,
)
df_downsampled = pd.concat([df_majority_downsampled, data_frame_minority])

# CONFIRM THAT BOTH CLASSES NOW HAVE THE SAME NUMBER OF ROWS
df_downsampled_benign = df_downsampled[df_downsampled.Class == 0]
df_downsampled_malignant = df_downsampled[df_downsampled.Class == 1]
print("df_downsampled_benign", len(df_downsampled_benign))
print("df_downsampled_malignant", len(df_downsampled_malignant))

# FIT A GAUSSIAN NAIVE BAYES CLASSIFIER ON THE DOWNSAMPLED TRAINING SET
from sklearn import metrics
from sklearn.naive_bayes import GaussianNB

balanced_features = df_downsampled.drop(columns='Class')
balanced_labels = df_downsampled['Class']
gnb = GaussianNB().fit(balanced_features, balanced_labels)

# PREDICT THE CLASS OF EVERY ROW IN THE HELD-OUT TEST SET
y_pred = gnb.predict(test_X)

# CALCULATE THE ACCURACY ON THE TEST SET AND PRINT IT AS A PERCENTAGE
accuracy_percent = metrics.accuracy_score(test_y, y_pred) * 100
print("Naive Bayes model accuracy", accuracy_percent)



Length of training set  478
Length of test set  205
Training data benign 314
Training data malignant 164
df_downsampled_benign 164
df_downsampled_malignant 164
Naive Bayes model accuracy 94.6341463414634
