In [1]:
import pandas as pd
import numpy as np
import plotly.express as px

In [2]:
columns = ["Id", "clump_thickness", "uniformity_of_cellsize", "uniformity_of_cellshape", "marginal_adhesion", "single_epithelial_cs", "bare_nuclei", "bland_chromatin", "normal_nucleili", "mmitoses", "class"]
df = pd.read_csv("breast-cancer-wisconsin.data", names=columns, index_col="Id")

In [3]:
df

Unnamed: 0_level_0,clump_thickness,uniformity_of_cellsize,uniformity_of_cellshape,marginal_adhesion,single_epithelial_cs,bare_nuclei,bland_chromatin,normal_nucleili,mmitoses,class
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1000025,5,1,1,1,2,1,3,1,1,2
1002945,5,4,4,5,7,10,3,2,1,2
1015425,3,1,1,1,2,2,3,1,1,2
1016277,6,8,8,1,3,4,3,7,1,2
1017023,4,1,1,3,2,1,3,1,1,2
...,...,...,...,...,...,...,...,...,...,...
776715,3,1,1,1,3,2,1,1,1,2
841769,2,1,1,1,2,1,1,1,1,2
888820,5,10,10,3,7,3,8,10,2,4
897471,4,8,6,4,3,4,10,6,1,4


In [20]:
df.dtypes

clump_thickness             int64
uniformity_of_cellsize      int64
uniformity_of_cellshape     int64
marginal_adhesion           int64
single_epithelial_cs        int64
bare_nuclei                object
bland_chromatin             int64
normal_nucleili             int64
mmitoses                    int64
class                       int64
dtype: object

In [19]:
df.isnull().any()

clump_thickness            False
uniformity_of_cellsize     False
uniformity_of_cellshape    False
marginal_adhesion          False
single_epithelial_cs       False
bare_nuclei                 True
bland_chromatin            False
normal_nucleili            False
mmitoses                   False
class                      False
dtype: bool

# Exploratory Data analysis

In [6]:
df["class"].value_counts()

2    458
4    241
Name: class, dtype: int64

In [15]:
df["bare_nuclei"].replace("?", np.nan, inplace=True)

In [16]:
df["bare_nuclei"].value_counts()

1     402
10    132
2      30
5      30
3      28
8      21
4      19
9       9
7       8
6       4
Name: bare_nuclei, dtype: int64

In [17]:
corr = df.corr()
px.imshow(corr)

In [18]:
print(corr["class"].sort_values(ascending=False))

class                      1.000000
uniformity_of_cellshape    0.818934
uniformity_of_cellsize     0.817904
bland_chromatin            0.756616
clump_thickness            0.716001
normal_nucleili            0.712244
marginal_adhesion          0.696800
single_epithelial_cs       0.682785
mmitoses                   0.423170
Name: class, dtype: float64


In [12]:
ucshape_by_outcome = df.groupby("class")["uniformity_of_cellshape"].mean().reset_index()
px.bar(ucshape_by_outcome, x="class", y="uniformity_of_cellshape", title="cell shape uniformity by outcome")

In [13]:
ucshape_by_outcome = df.groupby("class")["uniformity_of_cellsize"].mean().reset_index()
px.bar(ucshape_by_outcome, x="class", y="uniformity_of_cellsize", title="cell size uniformity by outcome")

In [14]:
ucshape_by_outcome = df.groupby("class")["bland_chromatin"].mean().reset_index()
px.bar(ucshape_by_outcome, x="class", y="bland_chromatin", title="bland chromatin by outcome")

In [21]:
ucshape_by_outcome = df.groupby("class")["clump_thickness"].mean().reset_index()
px.bar(ucshape_by_outcome, x="class", y="clump_thickness", title="clump thickess by outcome")

In [22]:
ucshape_by_outcome = df.groupby("class")["normal_nucleili"].mean().reset_index()
px.bar(ucshape_by_outcome, x="class", y="normal_nucleili", title="normal_nucleili by outcome")

# Data Imputation

In [23]:
df["bare_nuclei"].fillna(method="bfill", inplace=True)

In [24]:
df["bare_nuclei"].isnull().sum()

0

# Outlier removal

In [25]:
Q1 = df.quantile(0.1)

In [26]:
Q3 = df.quantile(0.9)

In [27]:
IQR = Q3 - Q1

In [28]:
df = df[~((df < (Q1 - 1.5 * IQR)) |(df > (Q3 + 1.5 * IQR))).any(axis=1)]


In [29]:
df.shape

(668, 10)

# spllitting Data

In [30]:
target = df["class"]
del df["class"]

In [31]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(df, target, test_size=0.2, random_state=50) 

In [32]:
print(f"x_train:{x_train.shape}")
print(f"y_train:{y_train.shape}")
print(f"x_test:{x_test.shape}")
print(f"y_test:{y_test.shape}")

x_train:(534, 9)
y_train:(534,)
x_test:(134, 9)
y_test:(134,)


# modelling

In [33]:
from sklearn.ensemble import RandomForestClassifier

In [34]:
model = RandomForestClassifier(n_estimators=200)
model.fit(x_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=200,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [35]:
y_pred = model.predict(x_test)

In [36]:
y_pred

array([2, 2, 2, 2, 4, 2, 2, 4, 4, 4, 2, 2, 4, 4, 2, 2, 2, 2, 2, 4, 2, 4,
       2, 2, 2, 2, 2, 4, 2, 2, 2, 2, 4, 2, 2, 2, 2, 2, 2, 2, 4, 4, 2, 2,
       2, 2, 2, 2, 4, 2, 4, 2, 2, 2, 2, 4, 2, 4, 4, 2, 2, 4, 2, 2, 2, 2,
       2, 4, 4, 2, 2, 2, 2, 4, 2, 2, 4, 2, 4, 4, 2, 4, 4, 4, 4, 2, 2, 4,
       4, 4, 4, 4, 2, 2, 4, 2, 4, 2, 4, 2, 2, 4, 4, 2, 4, 4, 4, 2, 4, 2,
       2, 2, 2, 4, 2, 4, 2, 2, 2, 4, 2, 4, 2, 4, 2, 4, 2, 2, 2, 4, 2, 4,
       2, 4], dtype=int64)

In [38]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           2       0.99      0.96      0.98        85
           4       0.94      0.98      0.96        49

    accuracy                           0.97       134
   macro avg       0.96      0.97      0.97       134
weighted avg       0.97      0.97      0.97       134



In [39]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_pred)


array([[82,  3],
       [ 1, 48]], dtype=int64)