In [2]:

import numpy as np
import pandas as pd
import seaborn as s
from sklearn import model_selection
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score
from sklearn import svm
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler,LabelEncoder
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from xgboost import XGBClassifier
import pickle
import imblearn
# from sklearn.ensemble import forest
plt.rcParams["figure.figsize"]=(12,8)
import plotnine as p9

In [3]:
data = pd.read_csv("data.csv")

In [4]:
data.shape
data.columns

Index(['id', 'diagnosis', 'radius_mean', 'texture_mean', 'perimeter_mean',
       'area_mean', 'smoothness_mean', 'compactness_mean', 'concavity_mean',
       'concave points_mean', 'symmetry_mean', 'fractal_dimension_mean',
       'radius_se', 'texture_se', 'perimeter_se', 'area_se', 'smoothness_se',
       'compactness_se', 'concavity_se', 'concave points_se', 'symmetry_se',
       'fractal_dimension_se', 'radius_worst', 'texture_worst',
       'perimeter_worst', 'area_worst', 'smoothness_worst',
       'compactness_worst', 'concavity_worst', 'concave points_worst',
       'symmetry_worst', 'fractal_dimension_worst', 'Unnamed: 32'],
      dtype='object')

In [5]:
df = data.drop('Unnamed: 32', axis=1) # specify the axis that the name applies to

In [6]:
df.diagnosis.value_counts()

B    357
M    212
Name: diagnosis, dtype: int64

In [7]:
df.diagnosis = df.diagnosis.astype("category")

In [8]:
X = df.drop("diagnosis", axis=1) # just saying which axis again
Y = df["diagnosis"] # this is just a series now

In [9]:
col = X.columns # if we do type(col), it's an Index

### Feature engineering

In [None]:
X.isnull().sum() # this covers every column in the df. 

In [11]:
def rangenorm(x):
    return (x - x.mean())/(x.max() - x.min())

In [12]:
df_norm = X.apply(rangenorm) # this worked.  Apply goes column-wise by default.

In [13]:
df_norm = pd.concat([df_norm, Y], axis = 1) # need to pass dfs as a list, and specify the axis.

### Visualization

In [14]:
### skip for now
X_norm = df_norm.drop("diagnosis", axis=1)
Y_norm = df_norm["diagnosis"]
col = X_norm.columns

In [15]:
le = LabelEncoder()

In [16]:
le.fit(Y_norm)

LabelEncoder()

In [17]:
Y_norm = le.transform(Y_norm) # converts to a numpy nd array of ones and zeroes

In [18]:
Y_norm = pd.DataFrame(Y_norm)

In [64]:
def FitModel(X, Y, algo_name, algorithm, gridSearchParams, cv):
    """Split, take a dict of gridsearch params,"""
    np.random.seed(10)
    x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size = 0.2)

    train = pd.concat([y_train, x_train], axis=1)
    train.to_csv("./train.csv", index=False, header=False)
    y_train.to_csv("./Y-train.csv", index=False, header=False)
    
    test = pd.concat([y_test, x_test], axis=1)
    test.to_csv("./test.csv", index=False, header=False)
    y_test.to_csv("./Y-test.csv", index=False, header=False)

    grid = GridSearchCV(
        estimator=algorithm,
        param_grid=gridSearchParams,
        cv=cv,
        scoring='accuracy',
        verbose=1,
        n_jobs=-1,
    )

    grid_result=grid.fit(x_train, y_train)
    best_params=grid_result.best_params_
    pred = grid_result.predict(x_test)
    cm = confusion_matrix(y_test, pred)

    print(pred)
    pickle.dump(grid_result, open(algo_name+".pkl", "wb"))

    print('Best Params :',best_params)
    print('Classification Report :',classification_report(y_test,pred))
    print('Accuracy Score : ' + str(accuracy_score(y_test,pred)))
    print('Confusion Matrix : \n', cm)


### SVM classifier

In [55]:
# For SVM, put in four values of possible C and 7 for gamma.

In [65]:
param ={
            'C': [0.1, 1, 100, 1000],
            'gamma': [0.0001, 0.001, 0.005, 0.1, 1, 3, 5]
        }
FitModel(X_norm,Y_norm,'SVC_norm',SVC(),param,cv=5)

Fitting 5 folds for each of 28 candidates, totalling 140 fits
[1 0 0 1 0 1 0 0 1 1 0 0 1 0 1 1 0 0 0 0 1 1 0 0 1 1 0 1 0 0 0 0 1 0 0 0 0
 0 0 0 0 0 0 1 0 1 1 1 1 1 0 0 1 0 0 1 1 0 0 0 1 0 1 1 0 0 0 0 1 1 0 0 0 0
 0 1 0 0 1 1 0 0 1 0 0 0 0 1 0 0 0 0 1 0 0 0 1 0 1 0 1 1 0 0 0 0 1 1 0 0 0
 0 1 1]
Best Params : {'C': 1, 'gamma': 1}
Classification Report :               precision    recall  f1-score   support

           0       1.00      0.96      0.98        75
           1       0.93      1.00      0.96        39

    accuracy                           0.97       114
   macro avg       0.96      0.98      0.97       114
weighted avg       0.98      0.97      0.97       114

Accuracy Score : 0.9736842105263158
Confusion Matrix : 
 [[72  3]
 [ 0 39]]




In [66]:
from imblearn.over_sampling import SMOTE

In [67]:
sm = SMOTE(random_state=42)
X_res , Y_res = sm.fit_resample(X_norm,Y_norm)

In [68]:
X_res.shape

(714, 31)

In [69]:
Y_res.value_counts()
# Y_norm.value_counts()

0    357
1    357
dtype: int64

In [70]:
X_norm.shape

(569, 31)

In [71]:
param ={
            'n_estimators': [100, 500, 1000, 2000],
           
        }
FitModel(X_norm,Y_norm,'XGBoost_norm',XGBClassifier(),param,cv=5)

Fitting 5 folds for each of 4 candidates, totalling 20 fits




[1 0 0 1 0 0 0 0 1 1 0 0 1 0 1 1 0 0 0 1 1 1 0 0 1 1 0 1 0 0 0 0 1 0 0 0 0
 0 0 0 0 0 0 0 0 1 1 1 1 0 0 0 1 0 0 1 1 0 0 0 1 0 1 1 0 0 0 0 1 1 0 0 0 0
 0 1 0 0 1 1 0 0 1 0 0 0 0 1 0 0 0 0 1 0 0 0 1 0 1 0 1 1 0 0 0 0 1 1 0 0 0
 0 1 1]
Best Params : {'n_estimators': 500}
Classification Report :               precision    recall  f1-score   support

           0       0.99      0.97      0.98        75
           1       0.95      0.97      0.96        39

    accuracy                           0.97       114
   macro avg       0.97      0.97      0.97       114
weighted avg       0.97      0.97      0.97       114

Accuracy Score : 0.9736842105263158
Confusion Matrix : 
 [[73  2]
 [ 1 38]]


### Load pickled model in

In [None]:
# from sklearn.externals import joblib

In [72]:
loaded_model = pickle.load(open("XGBoost_norm.pkl","rb"))

In [73]:
pred1 = loaded_model.predict(X_res)

In [74]:
sum([x==y for x,y in zip(pred1, Y_res.squeeze())])/len(pred1)


0.9901960784313726

In [75]:
np.random.seed(10)
x_train,x_test,y_train,y_test = train_test_split(X,Y, test_size = 0.2)

In [76]:
xgbc = XGBClassifier(n_estimators=100)
fit = xgbc.fit(x_train, y_train)





In [77]:
accuracy = fit.score(x_test, y_test)
predict = fit.predict(x_test)
cmatrix = confusion_matrix(y_test, predict)

In [78]:
cmatrix

array([[73,  2],
       [ 1, 38]])

In [79]:
accuracy

0.9736842105263158

In [80]:
importances = xgbc.feature_importances_
indices = np.argsort(importances)[::-1]

In [81]:
for f in range(X.shape[1]):
    print("feature {} ({:.3f})".format(list(X)[f], importances[indices[f]]))

feature id (0.593)
feature radius_mean (0.118)
feature texture_mean (0.067)
feature perimeter_mean (0.037)
feature area_mean (0.023)
feature smoothness_mean (0.019)
feature compactness_mean (0.017)
feature concavity_mean (0.014)
feature concave points_mean (0.013)
feature symmetry_mean (0.010)
feature fractal_dimension_mean (0.010)
feature radius_se (0.009)
feature texture_se (0.008)
feature perimeter_se (0.008)
feature area_se (0.007)
feature smoothness_se (0.007)
feature compactness_se (0.006)
feature concavity_se (0.005)
feature concave points_se (0.004)
feature symmetry_se (0.004)
feature fractal_dimension_se (0.004)
feature radius_worst (0.004)
feature texture_worst (0.003)
feature perimeter_worst (0.003)
feature area_worst (0.002)
feature smoothness_worst (0.002)
feature compactness_worst (0.001)
feature concavity_worst (0.000)
feature concave points_worst (0.000)
feature symmetry_worst (0.000)
feature fractal_dimension_worst (0.000)


In [82]:
feat_imp = pd.DataFrame({"Feature":list(X),
                        "GiniImportance":importances[indices]})

In [83]:
feat_imp.index = feat_imp.Feature

In [84]:
feat_to_keep = feat_imp.iloc[1:15].index # we just want the names

In [85]:
type(feat_to_keep), feat_to_keep

(pandas.core.indexes.base.Index,
 Index(['radius_mean', 'texture_mean', 'perimeter_mean', 'area_mean',
        'smoothness_mean', 'compactness_mean', 'concavity_mean',
        'concave points_mean', 'symmetry_mean', 'fractal_dimension_mean',
        'radius_se', 'texture_se', 'perimeter_se', 'area_se'],
       dtype='object', name='Feature'))

In [86]:
X_res= pd.DataFrame(X_res)
Y_res = pd.DataFrame(Y_res)
X_res.columns = X_norm.columns

In [87]:
param ={
            'n_estimators': [100, 500, 1000, 2000],
           
        }
FitModel(X_res[feat_to_keep],Y_res,'XGBoost',XGBClassifier(),param,cv=5)

Fitting 5 folds for each of 4 candidates, totalling 20 fits
[1 1 0 1 0 1 0 0 1 0 0 1 0 1 0 1 1 1 1 1 0 0 0 0 1 0 1 1 0 0 0 1 1 1 1 1 1
 1 0 0 0 0 1 0 1 1 1 1 0 1 1 0 1 1 0 1 0 1 1 0 1 1 1 1 1 0 1 1 0 1 0 1 0 0
 1 1 1 0 0 0 1 0 1 1 1 1 1 1 1 1 1 0 1 0 1 1 1 1 1 0 1 1 0 0 0 0 0 0 0 0 1
 0 1 0 0 1 0 1 1 1 0 0 1 1 0 0 0 0 0 0 1 0 1 1 0 0 1 0 0 0 0 0 0]
Best Params : {'n_estimators': 100}
Classification Report :               precision    recall  f1-score   support

           0       0.99      0.97      0.98        68
           1       0.97      0.99      0.98        75

    accuracy                           0.98       143
   macro avg       0.98      0.98      0.98       143
weighted avg       0.98      0.98      0.98       143

Accuracy Score : 0.9790209790209791
Confusion Matrix : 
 [[66  2]
 [ 1 74]]




In [88]:
loaded_model2 = pickle.load(open("XGBoost.pkl","rb"))
pred2 = loaded_model2.predict(X_res[feat_to_keep])
sum([x==y for x,y in zip(pred2, Y_res.squeeze())])/len(pred2)

0.9957983193277311