In [None]:
# import warnings filter
from warnings import simplefilter
# ignore all future warnings
simplefilter(action='ignore', category=FutureWarning)

In [None]:
import pandas as pd

In [None]:
# Load the dataset from a CSV file into a DataFrame.
# The path is relative, so it is resolved against the kernel's working directory.
data = pd.read_csv("taysachs.csv")

In [None]:
# Preview the top of the table; head() returns the first 5 rows by default.
data.head()

In [None]:
# Dimensions of the dataset as a (number of rows, number of columns) tuple
data.shape

In [None]:
# Statistical description of the data. With default arguments, describe()
# reports count, mean, std, min, quartiles and max for numeric columns.
data.describe()

In [None]:
# Count the missing values in each column of the dataset.
# isna() and isnull() are aliases of each other, and a notebook cell only
# displays its last expression, so a single call is sufficient.
data.isna().sum()

In [None]:
# Understand the target variable further: number of rows per "Diagnosis" value
data["Diagnosis"].value_counts()

# Out of 569 patients, 357 had Benign and 212 had Malignant tumor
# NOTE(review): the figures above are hard-coded and speak of tumours, while the
# file loaded is taysachs.csv -- confirm them against the value_counts() output.

In [None]:
# Visualize the dataset with histograms.
# A histogram lets you discover, and show, the underlying frequency
# distribution of a variable; DataFrame.hist() draws one subplot per column
# it can plot.

import matplotlib.pyplot as plt

num_bins = 10  # number of bins used in every subplot
data.hist(figsize=(20, 15), bins=num_bins)
plt.show()

In [None]:
# Assign independent variables (input variables) to X.
# iloc's stop index is exclusive, so 2:31 picks column positions 2 through 30.
# NOTE(review): confirm against data.shape that position 30 really is the last
# feature column and that the slice is not leaving a feature out.
X = data.iloc[:, 2:31].values

# Assign the target variable to Y. The target is the column at position 1
# (presumably "Diagnosis", the column inspected earlier -- TODO confirm).
Y = data.iloc[:, 1].values

In [None]:
# Convert the target labels in Y to integer class codes.
# Y is overwritten here, so running this cell a second time would refit the
# encoder on the already-encoded integers rather than on the original labels.
from sklearn.preprocessing import LabelEncoder

labelencoder_Y = LabelEncoder()
labelencoder_Y.fit(Y)            # learn the set of distinct labels
Y = labelencoder_Y.transform(Y)  # replace every label with its integer code
Y

In [None]:
# Hold out 25% of the samples for testing and train on the remaining 75%.
# random_state pins the shuffle so the split is the same on every run.

from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.25,
    random_state=0,
)

In [None]:
# Standardize the features using StandardScaler.
# The scaler is fitted on the training split only and then applied to both
# splits, so no statistics from the test data are used while fitting.

from sklearn.preprocessing import StandardScaler

sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
import seaborn as sn

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression: build the estimator and train it on the training split.
classifier_1 = LogisticRegression(random_state=0).fit(X_train, Y_train)

# Predict labels for the held-out test split.
Y_pred1 = classifier_1.predict(X_test)

In [None]:
# Model evaluation: logistic regression (predictions in Y_pred1)

# Fraction of test samples whose predicted label equals the true label.
print(accuracy_score(Y_test, Y_pred1))

# Cross-tabulate the labels: rows are the actual classes, columns the predicted ones.
confusion_matrix = pd.crosstab(Y_test, Y_pred1, rownames=['Actual'], colnames=['Predicted'])

# fmt='d' prints the counts as plain integers; seaborn's default '.2g' switches
# to scientific notation (e.g. 1e+02) once a count reaches three digits.
# The title tells this heatmap apart from the other models' heatmaps, and the
# trailing ';' keeps the notebook from echoing the returned object's repr.
sn.heatmap(confusion_matrix, annot=True, fmt='d').set_title('Logistic Regression - confusion matrix');

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest neighbours with k = 5; the Minkowski metric with p = 2 is the
# Euclidean distance.
classifier_2 = KNeighborsClassifier(
    n_neighbors=5,
    metric='minkowski',
    p=2,
).fit(X_train, Y_train)

# Predict labels for the held-out test split.
Y_pred2 = classifier_2.predict(X_test)

In [None]:
# Model evaluation: k-nearest neighbours (predictions in Y_pred2)

# Fraction of test samples whose predicted label equals the true label.
print(accuracy_score(Y_test, Y_pred2))

# Cross-tabulate the labels: rows are the actual classes, columns the predicted ones.
confusion_matrix = pd.crosstab(Y_test, Y_pred2, rownames=['Actual'], colnames=['Predicted'])

# fmt='d' prints the counts as plain integers; seaborn's default '.2g' switches
# to scientific notation (e.g. 1e+02) once a count reaches three digits.
# The title tells this heatmap apart from the other models' heatmaps, and the
# trailing ';' keeps the notebook from echoing the returned object's repr.
sn.heatmap(confusion_matrix, annot=True, fmt='d').set_title('K-Nearest Neighbors - confusion matrix');

In [None]:
from sklearn.svm import SVC

# Support vector classifier with a linear kernel, trained on the training split.
classifier_3 = SVC(kernel='linear', random_state=0).fit(X_train, Y_train)

# Predict labels for the held-out test split.
Y_pred3 = classifier_3.predict(X_test)

In [None]:
# Model evaluation: SVM with linear kernel (predictions in Y_pred3)

# Fraction of test samples whose predicted label equals the true label.
print(accuracy_score(Y_test, Y_pred3))

# Cross-tabulate the labels: rows are the actual classes, columns the predicted ones.
confusion_matrix = pd.crosstab(Y_test, Y_pred3, rownames=['Actual'], colnames=['Predicted'])

# fmt='d' prints the counts as plain integers; seaborn's default '.2g' switches
# to scientific notation (e.g. 1e+02) once a count reaches three digits.
# The title tells this heatmap apart from the other models' heatmaps, and the
# trailing ';' keeps the notebook from echoing the returned object's repr.
sn.heatmap(confusion_matrix, annot=True, fmt='d').set_title('SVM (linear kernel) - confusion matrix');

In [None]:
from sklearn.svm import SVC

# Support vector classifier with an RBF kernel, trained on the training split.
classifier_4 = SVC(kernel='rbf', random_state=0).fit(X_train, Y_train)

# Predict labels for the held-out test split.
Y_pred4 = classifier_4.predict(X_test)

In [None]:
# Model evaluation: SVM with RBF kernel (predictions in Y_pred4)

# Fraction of test samples whose predicted label equals the true label.
print(accuracy_score(Y_test, Y_pred4))

# Cross-tabulate the labels: rows are the actual classes, columns the predicted ones.
confusion_matrix = pd.crosstab(Y_test, Y_pred4, rownames=['Actual'], colnames=['Predicted'])

# fmt='d' prints the counts as plain integers; seaborn's default '.2g' switches
# to scientific notation (e.g. 1e+02) once a count reaches three digits.
# The title tells this heatmap apart from the other models' heatmaps, and the
# trailing ';' keeps the notebook from echoing the returned object's repr.
sn.heatmap(confusion_matrix, annot=True, fmt='d').set_title('SVM (RBF kernel) - confusion matrix');

In [None]:
from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes with default settings, trained on the training split.
classifier_5 = GaussianNB().fit(X_train, Y_train)

# Predict labels for the held-out test split.
Y_pred5 = classifier_5.predict(X_test)

In [None]:
# Model evaluation: Gaussian naive Bayes (predictions in Y_pred5)

# Fraction of test samples whose predicted label equals the true label.
print(accuracy_score(Y_test, Y_pred5))

# Cross-tabulate the labels: rows are the actual classes, columns the predicted ones.
confusion_matrix = pd.crosstab(Y_test, Y_pred5, rownames=['Actual'], colnames=['Predicted'])

# fmt='d' prints the counts as plain integers; seaborn's default '.2g' switches
# to scientific notation (e.g. 1e+02) once a count reaches three digits.
# The title tells this heatmap apart from the other models' heatmaps, and the
# trailing ';' keeps the notebook from echoing the returned object's repr.
sn.heatmap(confusion_matrix, annot=True, fmt='d').set_title('Gaussian Naive Bayes - confusion matrix');

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Decision tree that chooses splits by the entropy criterion; random_state
# fixes the estimator's random number generator.
classifier_6 = DecisionTreeClassifier(
    criterion='entropy',
    random_state=0,
).fit(X_train, Y_train)

# Predict labels for the held-out test split.
Y_pred6 = classifier_6.predict(X_test)

In [None]:
# Model evaluation: decision tree (predictions in Y_pred6)

# Fraction of test samples whose predicted label equals the true label.
print(accuracy_score(Y_test, Y_pred6))

# Cross-tabulate the labels: rows are the actual classes, columns the predicted ones.
confusion_matrix = pd.crosstab(Y_test, Y_pred6, rownames=['Actual'], colnames=['Predicted'])

# fmt='d' prints the counts as plain integers; seaborn's default '.2g' switches
# to scientific notation (e.g. 1e+02) once a count reaches three digits.
# The title tells this heatmap apart from the other models' heatmaps, and the
# trailing ';' keeps the notebook from echoing the returned object's repr.
sn.heatmap(confusion_matrix, annot=True, fmt='d').set_title('Decision Tree - confusion matrix');

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest of 10 trees using the entropy criterion; random_state fixes
# the estimator's random number generator.
classifier_7 = RandomForestClassifier(
    n_estimators=10,
    criterion='entropy',
    random_state=0,
).fit(X_train, Y_train)

# Predict labels for the held-out test split.
Y_pred7 = classifier_7.predict(X_test)

In [None]:
# Model evaluation: random forest (predictions in Y_pred7)

# Fraction of test samples whose predicted label equals the true label.
print(accuracy_score(Y_test, Y_pred7))

# Cross-tabulate the labels: rows are the actual classes, columns the predicted ones.
confusion_matrix = pd.crosstab(Y_test, Y_pred7, rownames=['Actual'], colnames=['Predicted'])

# fmt='d' prints the counts as plain integers; seaborn's default '.2g' switches
# to scientific notation (e.g. 1e+02) once a count reaches three digits.
# The title tells this heatmap apart from the other models' heatmaps, and the
# trailing ';' keeps the notebook from echoing the returned object's repr.
sn.heatmap(confusion_matrix, annot=True, fmt='d').set_title('Random Forest - confusion matrix');