In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from collections import Counter

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier

from sklearn.metrics import accuracy_score

import torch

In [None]:
# Load the raw dataset. The path is the absolute Kaggle input mount, so this
# cell only works as-is inside a Kaggle kernel with that dataset attached.
df = pd.read_csv("/kaggle/input/breast-cancer-wisconsin-diagnostic-dataset/brca.csv")

# Examining dataset

In [None]:
# Preview the first rows to check the column layout.
df.head()

In [None]:
# Drop the first column by position.
# NOTE(review): presumably an ID / saved-index column — confirm against df.head().
# This cell overwrites `df`, so re-running it removes one more column each time.
df = df.iloc[:, 1:]

In [None]:
# Count missing values per column.
df.isna().sum()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) per column.
df.describe()

In [None]:
def histplots(df, colored=False):
    """Draw one histogram (with KDE overlay) per feature column on a shared grid.

    Every column except the last is plotted. The last column is treated as the
    class label and is only used to split the bars when ``colored`` is true.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature columns followed by the label column in the last position.
    colored : bool, default False
        If true, colour each histogram by the last column (``hue``); otherwise
        draw a single green histogram per feature.
    """
    cols = df.columns[:-1]
    if len(cols) == 0:
        return  # no feature columns, nothing to draw

    # Size the grid from the number of features rather than assuming exactly 30;
    # with 30 features this is still the 5 x 6 grid on a 15 x 10 figure.
    m = min(6, len(cols))
    n = (len(cols) + m - 1) // m
    fig, axes = plt.subplots(nrows=n, ncols=m, figsize=(15, 2 * n), squeeze=False)
    flat_axes = axes.ravel()

    for ax, col in zip(flat_axes, cols):
        if colored:
            sns.histplot(data=df, x=col, ax=ax, kde=True, hue=df.columns[-1])
        else:
            sns.histplot(data=df, x=col, ax=ax, kde=True, color='g')

    # Hide grid cells left over when the feature count is not a multiple of m.
    for ax in flat_axes[len(cols):]:
        ax.set_visible(False)

    plt.tight_layout()
    plt.show()

In [None]:
def boxplots(df):
    """Draw one horizontal boxplot per feature column on a shared grid.

    Every column except the last is plotted; the last column is treated as the
    class label and is skipped.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature columns followed by the label column in the last position.
    """
    cols = df.columns[:-1]
    if len(cols) == 0:
        return  # no feature columns, nothing to draw

    # Size the grid from the number of features rather than assuming exactly 30;
    # with 30 features this is still the 5 x 6 grid on a 15 x 10 figure.
    m = min(6, len(cols))
    n = (len(cols) + m - 1) // m
    fig, axes = plt.subplots(nrows=n, ncols=m, figsize=(15, 2 * n), squeeze=False)
    flat_axes = axes.ravel()

    for ax, col in zip(flat_axes, cols):
        sns.boxplot(data=df, x=col, ax=ax)

    # Hide grid cells left over when the feature count is not a multiple of m.
    for ax in flat_axes[len(cols):]:
        ax.set_visible(False)

    plt.tight_layout()
    plt.show()

# Detecting outliers on boxplots and histograms

In [None]:
# Box plots first, then histograms without and with the class split.
boxplots(df)
for use_hue in (False, True):
    histplots(df, use_hue)

In [None]:
def outliers_removal(df, i):
    """Clip column ``i`` of ``df`` to its Tukey fences (Q1 - 1.5*IQR, Q3 + 1.5*IQR).

    Despite the name, no rows are removed: values outside the fences are
    replaced by the nearest fence. Missing values are ignored when the
    quartiles are computed and are left as missing in the result.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column; it is modified in place.
    i : hashable
        Label of the numeric column to clip.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for convenient reassignment by the caller.
    """
    values = df[i].to_numpy(dtype=float, na_value=np.nan)
    if values.size == 0 or np.isnan(values).all():
        return df  # no observed values, so no fences can be derived

    # NaN-aware quartiles: plain np.percentile returns NaN as soon as the column
    # contains one missing value, which would silently disable the clipping.
    q1, q3 = np.nanpercentile(values, [25, 75])
    iqr = q3 - q1
    _min = q1 - 1.5 * iqr
    _max = q3 + 1.5 * iqr

    df[i] = df[i].clip(lower=_min, upper=_max)
    return df

In [None]:
# Clip every feature column (all but the last, which holds the label).
# NOTE(review): the fences are computed on the full dataset, before the
# train/test split further down, so test rows influence the thresholds.
for column in df.columns[:-1]:
    df = outliers_removal(df, column)

# Boxplots and histograms after removing outliers

In [None]:
# Same three views as before, now on the clipped data.
boxplots(df)
for use_hue in (False, True):
    histplots(df, use_hue)

# Slight class imbalance

In [None]:
# Share of each class in the label (last) column.
tumors = df[df.columns[-1]].value_counts()
plt.pie(tumors,
        labels=tumors.index,
        autopct='%0.2f%%')
plt.title("Class distribution")
# Render the figure explicitly so the cell does not echo plt.pie's return value.
plt.show()

In [None]:
# Replace the label (last) column with integer codes. `le` is kept at notebook
# level so the codes can be mapped back later with `le.inverse_transform`.
target = df.columns[-1]
le = LabelEncoder()
df[target] = le.fit_transform(df[target])

In [None]:
# Features are every column but the last; the last column is the encoded label.
x = df.iloc[:, :-1].values
y = df.iloc[:, -1].values

# Hold out 20% for testing. The classes are imbalanced, so stratify on the label
# to keep the same class ratio in both splits.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42, stratify=y
)

# Applying SMOTE technique

In [None]:
# Oversample the minority class in the training split only; the test split is
# left untouched. Seeded (same value as the train/test split) so the synthetic
# samples, and therefore the benchmarks below, are reproducible across runs.
sm = SMOTE(random_state=42)
print("Before ", Counter(y_train))
x_train, y_train = sm.fit_resample(x_train, y_train)
print("After ", Counter(y_train))

# Training and evaluation pipeline

In [None]:
def evaluate(model, name, _r=2):
    """Print and return a fitted model's accuracy on the held-out test split.

    Reads the notebook-level ``x_test`` and ``y_test`` arrays.

    Parameters
    ----------
    model : estimator
        Fitted object exposing ``predict``.
    name : str
        Label printed in front of the score.
    _r : int, default 2
        Number of decimals the percentage is rounded to.

    Returns
    -------
    float
        Test accuracy as a percentage, rounded to ``_r`` decimals.
    """
    y_pred = model.predict(x_test)
    # accuracy_score's signature is (y_true, y_pred); keep the documented order.
    acc = round(accuracy_score(y_test, y_pred) * 100, _r)
    print("{} : {}%".format(name, acc))
    print()
    return acc

In [None]:
def training(model, name):
    """Fit ``model`` on the training split, report its test accuracy, return it.

    Uses the notebook-level ``x_train`` / ``y_train`` arrays for fitting and
    delegates scoring and printing to ``evaluate``.

    Parameters
    ----------
    model : estimator
        Unfitted object exposing ``fit`` and ``predict``.
    name : str
        Label forwarded to ``evaluate`` for the printed score.

    Returns
    -------
    estimator
        The same ``model`` object, now fitted.
    """
    features, labels = x_train, y_train
    model.fit(features, labels)
    evaluate(model=model, name=name)
    return model

In [None]:
# Candidate classifiers for the benchmark. The estimators that draw random
# numbers while fitting are seeded (same value as the train/test split) so the
# reported accuracies are reproducible across runs.
knn = KNeighborsClassifier(n_neighbors=10)
svc = SVC(C=0.5, kernel='linear')
abc = AdaBoostClassifier(learning_rate=0.1, random_state=42)
rfc = RandomForestClassifier(max_depth=70, criterion='gini', random_state=42)
lgr = LogisticRegression(max_iter=1000)
gnb = GaussianNB()
dtc = DecisionTreeClassifier(criterion='gini', max_depth=50, random_state=42)
xgb = XGBClassifier(random_state=42)

# Parallel lists: names[k] is the display label for models[k].
models = [knn, svc, abc, rfc, lgr, gnb, dtc, xgb]
names = ['KNN', 'SVC', 'Ada Boost', 'Random Forest',
         'Logistic Regression', 'Naive Bayes',
         'Decision Tree', 'XGBoost']

# ML Algorithms benchmarks

In [None]:
# Fit and score each classifier in turn, keeping the fitted estimators.
trained = []
for model, label in zip(models, names):
    fitted = training(model, label)
    trained.append(fitted)