In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
import random
from sklearn.model_selection import train_test_split as tts
import matplotlib.pyplot as plt
import seaborn as sns

sns.set()

In [3]:
data = pd.read_csv("/kaggle/input/breast-cancer-dataset/breast-cancer.csv")
data.head(2)

#### Data has 32 columns. id, diagnosis, 10 variables of 3 types (Mean, se, worst) 

# Diagnosis are M = Malignent(Cancer) and B = Benign(No Cancer)

In [4]:
data.diagnosis.unique()

#### Observe no NaN in that column, which means we can use all the data with respect to Diagnosis column

# Assigning M to 1 and B to 0

In [5]:
data.diagnosis = (data.diagnosis == "M") * 1
data.head(2)

In [6]:
data.diagnosis.sum()

# General One Person Data 

In [7]:
print(len(data.iloc[2,:]),len(data))
data.iloc[2,:]

#### Total 569 people went to Diagnosis

In [8]:
sns.histplot(x=data.diagnosis, color = "lime")

# Heat map

In [9]:
plt.figure(figsize = (30,30))
sns.heatmap(data.corr(), cmap = sns.cubehelix_palette(8),annot = True)

#### We can see that radius_mean has stong corrilation with perinmeter_mean/worst, area_mean/worst, radius_worst. So we can remove these 5 variablis.

#### radius_se has strong corrilation with perimeter_se and area_se. So removing those 2

#### Concavity_mean has stron corrilation with concavepoints_mean/worst and conpactness_ mean, concavity_worst. So we remove these 4 variables.

#### Texture_worst and texture_mean are in corrilation. So we can remove one.

In [10]:
data = data.drop(["perimeter_worst", "perimeter_mean", "area_mean", "area_worst", "radius_worst", 
               "compactness_mean", "concave points_mean", "concave points_worst", "concavity_worst", 
               "radius_se", "perimeter_se","texture_worst"], axis = 1)

plt.figure(figsize = (20,20))
sns.heatmap(data.corr(), cmap = sns.cubehelix_palette(8),annot = True)

In [11]:
fig, ax = plt.subplots(nrows = 6, ncols = 3, figsize = (20, 40))
a = 2
for i in range(6):
    for j in range(3):
        sns.histplot(data, x=data.columns[a], element = "poly", ax = ax[i,j], hue = "diagnosis" )
        a = a+1

In [12]:
fig, ax = plt.subplots(nrows = 6, ncols = 3, figsize = (20, 40))
a = 2
for i in range(6):
    for j in range(3):
        sns.scatterplot(x='radius_mean', y=data.columns[a], data=data, hue='diagnosis', ax = ax[i,j])
        a = a+1

In [13]:
for p in data.columns[2:]:
    fig, ax = plt.subplots(nrows = 6, ncols = 3, figsize = (20,40))
    a = 2
    for i in range(6):
        for j in range(3):
            sns.scatterplot(x=data[p], y=data.columns[a], data=data, hue='diagnosis', ax = ax[i,j])
            a = a+1

#### There are certainly some patterns to recognize Breast Cancer, based on the above 2 plots.

# Splitting Data into Train and Test data

In [14]:
Y = data.diagnosis
X = data.drop("diagnosis", axis = 1)

X_train, X_test, Y_train, Y_test = tts(X, Y, random_state = 143, test_size = 0.4)

# Scaling 

In [15]:
from sklearn.preprocessing import StandardScaler
scalar = StandardScaler()
scalar.fit_transform(X_train)
scalar.transform(X_test)

# ConfusionMatrix

In [16]:
from sklearn.metrics import mean_absolute_error as mae

# Linear Model

In [17]:
from sklearn.linear_model import LinearRegression
lr = LinearRegression()
lr.fit(X_train,Y_train)
pred = lr.predict(X_test)

In [18]:
pred = [int(round(i,0)) for i in pred]
mae(Y_test,pred) * 100

# SVM Model

In [19]:
from sklearn.svm import LinearSVC
svc1 = LinearSVC(random_state =2)
svc1.fit(X_train,Y_train)
pred = svc1.predict(X_test)

In [20]:
pred = [int(round(i,0)) for i in pred]
mae(Y_test,pred) * 100

# SVM model 2

In [21]:
from sklearn.svm import SVC
svc2 = SVC(kernel = "rbf", random_state = 2)
svc2.fit(X_train,Y_train)
pred = svc2.predict(X_test)

In [22]:
pred = [int(round(i,0)) for i in pred]
mae(Y_test,pred) * 100

# Naive Bayes Model

In [23]:
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
gnb.fit(X_train,Y_train)
pred = gnb.predict(X_test)

In [24]:
pred = [int(round(i,0)) for i in pred]
mae(Y_test,pred) * 100

# Decision Tree Model 1

In [25]:
from sklearn.tree import DecisionTreeClassifier
dtc1 = DecisionTreeClassifier(random_state = 2)
dtc1.fit(X_train,Y_train)
pred = dtc1.predict(X_test)

In [26]:
pred = [int(round(i,0)) for i in pred]
mae(Y_test,pred) * 100

# Decision Tree Model 2

In [27]:
from sklearn.tree import DecisionTreeClassifier
dtc2 = DecisionTreeClassifier(random_state = 2,criterion = "entropy")
dtc2.fit(X_train,Y_train)
pred = dtc2.predict(X_test)

In [28]:
pred = [int(round(i,0)) for i in pred]
mae(Y_test,pred) * 100

# Random Forest Model 1

In [29]:
from sklearn.ensemble import RandomForestClassifier
rfc1 = RandomForestClassifier(random_state = 2, n_estimators = 10, criterion = "gini")
rfc1.fit(X_train,Y_train)
pred = rfc1.predict(X_test)

In [30]:
pred = [int(round(i,0)) for i in pred]
mae(Y_test,pred) * 100

# Random Forest Model 2

In [31]:
from sklearn.ensemble import RandomForestClassifier
rfc2 = RandomForestClassifier(random_state = 2, n_estimators = 10, criterion = "entropy")
rfc2.fit(X_train,Y_train)
pred = rfc2.predict(X_test)

In [32]:
pred = [int(round(i,0)) for i in pred]
mae(Y_test,pred) * 100

# Random Forest Model 3

In [33]:
from sklearn.ensemble import RandomForestClassifier
rfc3 = RandomForestClassifier(random_state = 2, n_estimators = 400, criterion = "entropy")
rfc3.fit(X_train,Y_train)
pred = rfc3.predict(X_test)

In [34]:
pred = [int(round(i,0)) for i in pred]
mae(Y_test,pred) * 100

# KNN Model 1

In [35]:
from sklearn.neighbors import KNeighborsClassifier
knn1 = KNeighborsClassifier(p = 2, n_neighbors = 5)
knn1.fit(X_train,Y_train)
pred = knn1.predict(X_test)

In [36]:
pred = [int(round(i,0)) for i in pred]
mae(Y_test,pred) * 100

# KNN Model 2

In [37]:
from sklearn.neighbors import KNeighborsClassifier
knn2 = KNeighborsClassifier(p = 2, n_neighbors = 15)
knn2.fit(X_train,Y_train)
pred = knn2.predict(X_test)

In [38]:
pred = [int(round(i,0)) for i in pred]
mae(Y_test,pred) * 100

### The best prediction accuracy was obtainded by Random Forest model with n_estimators = 400(all data), criterion = "entropy". 

##### The accuracy best model on Test set of containg 40% of total data is 95.61%. 