In [13]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler

In [14]:
file_path = r"C:\Users\12489\Documents\Data\cancer.csv"

data = pd.read_csv(file_path)
data.head()

Unnamed: 0,Sample code number,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Cell Size,Bare Nuclei,Bland Chromatin,Normal Nuclei,Mitoses,Class
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2


In [15]:
# since this is just an id number, we will drop it

data.drop(['Sample code number'], axis=1, inplace=True)

In [16]:
# make sure it worked

data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 699 entries, 0 to 698
Data columns (total 10 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Clump Thickness           699 non-null    int64 
 1   Uniformity of Cell Size   699 non-null    int64 
 2   Uniformity of Cell Shape  699 non-null    int64 
 3   Marginal Adhesion         699 non-null    int64 
 4   Single Cell Size          699 non-null    int64 
 5   Bare Nuclei               699 non-null    object
 6   Bland Chromatin           699 non-null    int64 
 7   Normal Nuclei             699 non-null    int64 
 8   Mitoses                   699 non-null    int64 
 9   Class                     699 non-null    int64 
dtypes: int64(9), object(1)
memory usage: 54.7+ KB


In [32]:
data["Class"].unique()

array([2, 4], dtype=int64)

----------------------------

## wrangle the data

In [17]:
# since this is the only column with non-numeric values,
# let's take a closer look...

data["Bare Nuclei"].unique()

array(['1', '10', '2', '4', '3', '9', '7', '?', '5', '8', '6'],
      dtype=object)

In [18]:
# before we go any further, let's just be SURE that
# there are not any null values...

data.isna().any()

Clump Thickness             False
Uniformity of Cell Size     False
Uniformity of Cell Shape    False
Marginal Adhesion           False
Single Cell Size            False
Bare Nuclei                 False
Bland Chromatin             False
Normal Nuclei               False
Mitoses                     False
Class                       False
dtype: bool

In [19]:
# great - back to what we were doing...

data.replace('?', np.nan, inplace=True)

# make sure it worked...
data["Bare Nuclei"].unique()

array(['1', '10', '2', '4', '3', '9', '7', nan, '5', '8', '6'],
      dtype=object)

--------------------------------------

## convert to array & use the imputer

In [20]:
imputer = SimpleImputer(strategy='mean')

values = data.values

In [21]:
imputedData = imputer.fit_transform(values)

# great - make sure it worked...
print(imputedData)

--------------------------------

## scale the data

In [23]:
scaler = MinMaxScaler(feature_range=(0, 1))
normalizedData = scaler.fit_transform(imputedData)

In [24]:
print(normalizedData)

[[0.44444444 0.         0.         ... 0.         0.         0.        ]
 [0.44444444 0.33333333 0.33333333 ... 0.11111111 0.         0.        ]
 [0.22222222 0.         0.         ... 0.         0.         0.        ]
 ...
 [0.44444444 1.         1.         ... 1.         0.11111111 1.        ]
 [0.33333333 0.77777778 0.55555556 ... 0.55555556 0.         1.        ]
 [0.33333333 0.77777778 0.77777778 ... 0.33333333 0.         1.        ]]


---------------------------------------

## classify

In [25]:
from sklearn import model_selection
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

In [26]:
# Segregate the features from the labels

X = normalizedData[:,0:9]#input
Y = normalizedData[:,9]#output

In [27]:
kfold = model_selection.KFold(n_splits=10)

cart = DecisionTreeClassifier()

num_trees = 100

model = BaggingClassifier(base_estimator=cart, n_estimators=num_trees)#bagging

results = model_selection.cross_val_score(model, X, Y, cv=kfold)

print(results.mean())

0.962857142857143


In [31]:
# predicting the "class of cancer"...

model.fit(X, Y)

row = [[5, 1, 1, 1, 2, 1, 3, 1, 1]]

yhat = model.predict(row)
yhat

array([1.])

In [28]:
from sklearn.ensemble import AdaBoostClassifier #boosting

num_trees = 100

kfold = model_selection.KFold(n_splits=10)

model = AdaBoostClassifier(n_estimators=num_trees)

results = model_selection.cross_val_score(model, X, Y, cv=kfold)
print(results.mean())


0.9585714285714285
