In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings(action='ignore')
plt.style.use(['seaborn-bright','dark_background'])

In [3]:
data = pd.read_csv('Breast Cancer Prediction.csv')
data.head()

Unnamed: 0,Sample code number,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2


In [28]:
data.nunique()

Clump Thickness                10
Uniformity of Cell Size        10
Uniformity of Cell Shape       10
Marginal Adhesion              10
Single Epithelial Cell Size    10
Bare Nuclei                    10
Bland Chromatin                10
Normal Nucleoli                10
Mitoses                         9
Class                           2
dtype: int64

In [5]:
data.drop(columns=['Sample code number'], inplace=True)

In [6]:
data.shape

(683, 10)

In [7]:
data['Class'].value_counts()

2    444
4    239
Name: Class, dtype: int64

In [9]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 683 entries, 0 to 682
Data columns (total 10 columns):
 #   Column                       Non-Null Count  Dtype
---  ------                       --------------  -----
 0   Clump Thickness              683 non-null    int64
 1   Uniformity of Cell Size      683 non-null    int64
 2   Uniformity of Cell Shape     683 non-null    int64
 3   Marginal Adhesion            683 non-null    int64
 4   Single Epithelial Cell Size  683 non-null    int64
 5   Bare Nuclei                  683 non-null    int64
 6   Bland Chromatin              683 non-null    int64
 7   Normal Nucleoli              683 non-null    int64
 8   Mitoses                      683 non-null    int64
 9   Class                        683 non-null    int64
dtypes: int64(10)
memory usage: 53.5 KB


In [10]:
X = data.drop(columns=['Class'])
y = data['Class']

In [8]:
from imblearn.over_sampling import SMOTE
smote = SMOTE()

In [11]:
X_sample, y_sample = smote.fit_resample(X, y)

In [12]:
y_sample.value_counts()

2    444
4    444
Name: Class, dtype: int64

In [14]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_sample, y_sample, test_size=0.2, random_state=101)

In [15]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble  import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

In [16]:
models = []
models.append(("LogisticRegression",LogisticRegression()))
models.append(("DescisionTree",DecisionTreeClassifier()))
models.append(("RandomForest",RandomForestClassifier()))
models.append(("SupportVector",SVC()))
models.append(("KNeighbors",KNeighborsClassifier()))

In [17]:
for name,model in models:
    model.fit(X_train,y_train)
    train_score = model.score(X_train,y_train)
    test_score = model.score(X_test,y_test)
    print(name,"train score =",train_score)
    print(name,"test score =",test_score)
    print()

LogisticRegression train score = 0.9774647887323944
LogisticRegression test score = 0.9775280898876404

DescisionTree train score = 1.0
DescisionTree test score = 0.9831460674157303

RandomForest train score = 1.0
RandomForest test score = 0.9887640449438202

SupportVector train score = 0.9774647887323944
SupportVector test score = 0.9775280898876404

KNeighbors train score = 0.9774647887323944
KNeighbors test score = 0.9943820224719101



In [22]:
model = RandomForestClassifier()
model.fit(X_train,y_train)
prediction = model.predict(X_test)
model.score(X_test,y_test)

0.9943820224719101

In [23]:
from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_test,prediction))

[[88  0]
 [ 1 89]]


In [24]:
from sklearn.metrics import classification_report
print(classification_report(y_test,prediction))

              precision    recall  f1-score   support

           2       0.99      1.00      0.99        88
           4       1.00      0.99      0.99        90

    accuracy                           0.99       178
   macro avg       0.99      0.99      0.99       178
weighted avg       0.99      0.99      0.99       178



In [25]:
import pickle

In [26]:
pickle_out = open("breast_cancer.pkl","wb")
pickle.dump(model,pickle_out)
loaded_model = pickle.load(open("breast_cancer.pkl","rb"))
result = loaded_model.score(X_test,y_test)
print(result)

0.9943820224719101


In [27]:
X_sample.shape

(888, 9)