In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split as split

#for selecting K best features
from sklearn.feature_selection import SelectKBest,chi2

#for selecting optimal hyperparameyters - hyperparameter tuning
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV


from sklearn.tree import DecisionTreeClassifier as decision_tree
from sklearn.ensemble import RandomForestClassifier as random_forest
from sklearn.ensemble import AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier as xgb


import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

%matplotlib inline 

In [2]:
cancer = pd.read_csv('cancer.csv')
cancer.head(10)

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678
5,843786,M,12.45,15.7,82.57,477.1,0.1278,0.17,0.1578,0.08089,...,15.47,23.75,103.4,741.6,0.1791,0.5249,0.5355,0.1741,0.3985,0.1244
6,844359,M,18.25,19.98,119.6,1040.0,0.09463,0.109,0.1127,0.074,...,22.88,27.66,153.2,1606.0,0.1442,0.2576,0.3784,0.1932,0.3063,0.08368
7,84458202,M,13.71,20.83,90.2,577.9,0.1189,0.1645,0.09366,0.05985,...,17.06,28.14,110.6,897.0,0.1654,0.3682,0.2678,0.1556,0.3196,0.1151
8,844981,M,13.0,21.82,87.5,519.8,0.1273,0.1932,0.1859,0.09353,...,15.49,30.73,106.2,739.3,0.1703,0.5401,0.539,0.206,0.4378,0.1072
9,84501001,M,12.46,24.04,83.97,475.9,0.1186,0.2396,0.2273,0.08543,...,15.09,40.68,97.65,711.4,0.1853,1.058,1.105,0.221,0.4366,0.2075


In [3]:
cancer.isnull().sum()

id                         0
diagnosis                  0
radius_mean                0
texture_mean               0
perimeter_mean             0
area_mean                  0
smoothness_mean            0
compactness_mean           0
concavity_mean             0
concave points_mean        0
symmetry_mean              0
fractal_dimension_mean     0
radius_se                  0
texture_se                 0
perimeter_se               0
area_se                    0
smoothness_se              0
compactness_se             0
concavity_se               0
concave points_se          0
symmetry_se                0
fractal_dimension_se       0
radius_worst               0
texture_worst              0
perimeter_worst            0
area_worst                 0
smoothness_worst           0
compactness_worst          0
concavity_worst            0
concave points_worst       0
symmetry_worst             0
fractal_dimension_worst    0
dtype: int64

In [4]:
for i in range(len(cancer["diagnosis"])):
    if cancer.iloc[i,1] == 'M':
        cancer.iloc[i,1] = 1
    else:
        cancer.iloc[i,1] = 0        

In [5]:
cancer['diagnosis'].unique()

array([1, 0], dtype=object)

In [6]:
cancer["diagnosis"].describe()

count     569
unique      2
top         0
freq      357
Name: diagnosis, dtype: int64

In [7]:
x = cancer.iloc[:,2:33]
y = cancer.iloc[:,1]

In [8]:
y = y.astype(int)

In [9]:
y

0      1
1      1
2      1
3      1
4      1
      ..
564    1
565    1
566    1
567    1
568    0
Name: diagnosis, Length: 569, dtype: int32

In [10]:
bestfeatures = SelectKBest(score_func=chi2,k=24)
final = bestfeatures.fit(x,y)

scores = pd.DataFrame(final.scores_)
columns = pd.DataFrame(x.columns)

featurescores = pd.concat([columns,scores],axis=1)
featurescores.columns = ["Features","scores"]
featurescores.nlargest(24,"scores")

Unnamed: 0,Features,scores
23,area_worst,112598.431564
3,area_mean,53991.655924
13,area_se,8758.504705
22,perimeter_worst,3665.035416
2,perimeter_mean,2011.102864
20,radius_worst,491.689157
0,radius_mean,266.104917
12,perimeter_se,250.571896
21,texture_worst,174.4494
1,texture_mean,93.897508


In [11]:
#we will use only 24 features
imp_col = featurescores.nlargest(24,"scores")["Features"].unique()
x_new = x[imp_col]
x_new.head()

Unnamed: 0,area_worst,area_mean,area_se,perimeter_worst,perimeter_mean,radius_worst,radius_mean,perimeter_se,texture_worst,texture_mean,...,concave points_worst,concave points_mean,compactness_mean,symmetry_worst,concavity_se,compactness_se,smoothness_worst,concave points_se,symmetry_mean,fractal_dimension_worst
0,2019.0,1001.0,153.4,184.6,122.8,25.38,17.99,8.589,17.33,10.38,...,0.2654,0.1471,0.2776,0.4601,0.05373,0.04904,0.1622,0.01587,0.2419,0.1189
1,1956.0,1326.0,74.08,158.8,132.9,24.99,20.57,3.398,23.41,17.77,...,0.186,0.07017,0.07864,0.275,0.0186,0.01308,0.1238,0.0134,0.1812,0.08902
2,1709.0,1203.0,94.03,152.5,130.0,23.57,19.69,4.585,25.53,21.25,...,0.243,0.1279,0.1599,0.3613,0.03832,0.04006,0.1444,0.02058,0.2069,0.08758
3,567.7,386.1,27.23,98.87,77.58,14.91,11.42,3.445,26.5,20.38,...,0.2575,0.1052,0.2839,0.6638,0.05661,0.07458,0.2098,0.01867,0.2597,0.173
4,1575.0,1297.0,94.44,152.2,135.1,22.54,20.29,5.438,16.67,14.34,...,0.1625,0.1043,0.1328,0.2364,0.05688,0.02461,0.1374,0.01885,0.1809,0.07678


In [12]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
x_new[imp_col] = scaler.fit_transform(x_new[imp_col]) 
x_new.head().T

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.loc._setitem_with_indexer((slice(None), indexer), value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_array(key, value)


Unnamed: 0,0,1,2,3,4
area_worst,2.001237,1.890489,1.456285,-0.550021,1.220724
area_mean,0.984375,1.908708,1.558884,-0.764464,1.826229
area_se,2.487578,0.742402,1.181336,-0.288378,1.190357
perimeter_worst,2.303601,1.535126,1.347475,-0.249939,1.338539
perimeter_mean,1.269934,1.685955,1.566503,-0.592687,1.776573
radius_worst,1.88669,1.805927,1.51187,-0.281464,1.298575
radius_mean,1.097064,1.829821,1.579888,-0.768909,1.750297
perimeter_se,2.833031,0.263327,0.850928,0.286593,1.273189
texture_worst,-1.359293,-0.369203,-0.023974,0.133984,-1.46677
texture_mean,-2.073335,-0.353632,0.456187,0.253732,-1.151816


In [13]:
X_train,X_test,y_train,y_test = split(x_new,y,test_size=0.2)

X_train = X_train.reset_index(drop = True)
y_train = y_train.reset_index(drop = True)

X_test = X_test.reset_index(drop = True)
y_test = y_test.reset_index(drop = True)

## Decision Tree

In [14]:
#hyper parameter search 
from scipy.stats import randint

param_dt = {"max_depth": [3, None],
              "max_features": randint(1, 9),
              "min_samples_leaf": randint(1, 9),
              "criterion": ["gini", "entropy"]}

tree = decision_tree()
tree_cv = RandomizedSearchCV(estimator = tree, param_distributions = param_dt, n_iter = 100, cv = 9, verbose=2, random_state=42, n_jobs = -1)
tree_cv = tree_cv.fit(X_train,y_train)

print(tree_cv.best_estimator_)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Fitting 9 folds for each of 100 candidates, totalling 900 fits


[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    3.6s
[Parallel(n_jobs=-1)]: Done 560 tasks      | elapsed:    5.3s


DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='entropy',
                       max_depth=None, max_features=7, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=6, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')


[Parallel(n_jobs=-1)]: Done 900 out of 900 | elapsed:    5.9s finished


In [24]:
tree = decision_tree(ccp_alpha=0.0, class_weight=None, criterion='entropy',
                       max_depth=None, max_features=7, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=6, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

tree = tree.fit(X_train,y_train)
pred = tree.predict(X_test)

accuracy = accuracy_score(y_test,pred)
print(accuracy)

conf_mat = confusion_matrix(y_test,pred)
print(conf_mat)

0.9473684210526315
[[70  2]
 [ 4 38]]


##  Random Forest

In [16]:
#hyperparameter search
from scipy.stats import randint

max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)

param_rf = {'n_estimators': [int(x) for x in np.linspace(start = 100, stop = 2000, num = 10)],
               'max_features': ['auto', 'sqrt'],
               'max_depth': max_depth,
               'min_samples_split': randint(1,10),
               'min_samples_leaf': randint(1,5),
               'bootstrap': [True, False],
               'criterion':['gini','entropy']}

rf = random_forest()
rf_cv = RandomizedSearchCV(estimator = rf, param_distributions = param_rf, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
rf_cv = rf_cv.fit(X_train,y_train)

print(rf_cv.best_estimator_)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:   13.6s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:  3.5min finished


RandomForestClassifier(bootstrap=False, ccp_alpha=0.0, class_weight=None,
                       criterion='entropy', max_depth=80, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=2, min_samples_split=5,
                       min_weight_fraction_leaf=0.0, n_estimators=311,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)


In [25]:
rf = random_forest(bootstrap=False, ccp_alpha=0.0, class_weight=None,
                       criterion='entropy', max_depth=80, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=2, min_samples_split=5,
                       min_weight_fraction_leaf=0.0, n_estimators=311,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

rf.fit(X_train,y_train)
pred = rf.predict(X_test)

accuracy = accuracy_score(y_test,pred)
print(accuracy)

conf_mat = confusion_matrix(y_test,pred)
print(conf_mat)

0.9824561403508771
[[71  1]
 [ 1 41]]


## Naive Bayes

In [18]:
nb = GaussianNB()
nb = nb.fit(X_train,y_train)

pred = nb.predict(X_test)

accuracy = accuracy_score(y_test,pred)
print(accuracy)

conf_mat = confusion_matrix(y_test,pred)
print(conf_mat)

0.956140350877193
[[70  2]
 [ 3 39]]


## AdaBoost

In [19]:
#hyperparameter search

param_ada = {'n_estimators' : [int(x) for x in np.linspace(start = 100, stop = 2000, num = 10)],
             'learning_rate' : [0.04,0.045,0.05,0.055,0.06]}

ada = AdaBoostClassifier()
ada_cv = RandomizedSearchCV(estimator = ada, param_distributions = param_ada, n_iter = 50, cv = 3, verbose=2, random_state=42, n_jobs = -1)
ada_cv = ada_cv.fit(X_train,y_train)

print(ada_cv.best_estimator_)


Fitting 3 folds for each of 50 candidates, totalling 150 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:   23.9s
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:  2.5min finished


AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None, learning_rate=0.04,
                   n_estimators=1155, random_state=None)


In [26]:
ada = AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None, learning_rate=0.04,
                   n_estimators=1155, random_state=None)

ada = ada.fit(X_train,y_train)

pred = ada.predict(X_test)

accuracy = accuracy_score(y_test,pred)
print(accuracy)

conf_mat = confusion_matrix(y_test,pred)
print(conf_mat)

0.9736842105263158
[[71  1]
 [ 2 40]]


## XG Boost 

In [21]:
from scipy import stats
param_xgb = {'n_estimators': stats.randint(150, 1000),
              'learning_rate': stats.uniform(0.01, 0.6),
              'subsample': stats.uniform(0.3, 0.9),
              'max_depth': [3, 4, 5, 6, 7, 8, 9],
              'colsample_bytree': stats.uniform(0.5, 0.9),
              'min_child_weight': [1, 2, 3, 4]}

XGB = xgb()
XGB_cv = RandomizedSearchCV(estimator = XGB, param_distributions = param_xgb, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
XGB_cv = XGB_cv.fit(X_train,y_train)

print(XGB_cv.best_estimator_)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    2.5s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:    6.4s
[Parallel(n_jobs=-1)]: Done 285 out of 300 | elapsed:    9.6s remaining:    0.4s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:    9.7s finished


XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.7123864277740601, gamma=0,
              gpu_id=-1, importance_type='gain', interaction_constraints='',
              learning_rate=0.16364099365679438, max_delta_step=0, max_depth=5,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=356, n_jobs=0, num_parallel_tree=1,
              objective='binary:logistic', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=1, subsample=0.6847869165226947,
              tree_method='exact', validate_parameters=1, verbosity=None)


In [28]:
XGB = xgb(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.7123864277740601, gamma=0,
              gpu_id=-1, importance_type='gain', interaction_constraints='',
              learning_rate=0.16364099365679438, max_delta_step=0, max_depth=5,
              min_child_weight=1, missing=None, monotone_constraints='()',
              n_estimators=356, n_jobs=0, num_parallel_tree=1,
              objective='binary:logistic', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=1, subsample=0.6847869165226947,
              tree_method='exact', validate_parameters=1, verbosity=None)

XGB = XGB.fit(X_train,y_train)
pred = XGB.predict(X_test)

accuracy = accuracy_score(y_test,pred)
print(accuracy)

conf_mat = confusion_matrix(y_test,pred)
print(conf_mat)

0.9824561403508771
[[70  2]
 [ 0 42]]


In [29]:
import pickle as p
p.dump(rf,open('cancer.pickle','wb'))