# Breast tumor classifier model building 

In [2]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 

## Preprocessing 

In [4]:
# Load the dataset from the CSV file and preview the first rows to get a sense of the data
tumors_data = pd.read_csv('breast_cancer.csv') 
tumors_data.head() 

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,Unnamed: 32
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,


In [5]:
# Per-column summary: non-null count and dtype of every column
tumors_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 33 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   id                       569 non-null    int64  
 1   diagnosis                569 non-null    object 
 2   radius_mean              569 non-null    float64
 3   texture_mean             569 non-null    float64
 4   perimeter_mean           569 non-null    float64
 5   area_mean                569 non-null    float64
 6   smoothness_mean          569 non-null    float64
 7   compactness_mean         569 non-null    float64
 8   concavity_mean           569 non-null    float64
 9   concave points_mean      569 non-null    float64
 10  symmetry_mean            569 non-null    float64
 11  fractal_dimension_mean   569 non-null    float64
 12  radius_se                569 non-null    float64
 13  texture_se               569 non-null    float64
 14  perimeter_se             5

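No null values show up in the columns listed above. Note that the trailing `Unnamed: 32` column is blank in the preview, so it is left out when the features are selected later on. The only preprocessing step needed is to encode the `diagnosis` column as a binary indicator (one-hot encoding with the first category dropped).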

In [7]:
# Encode the categorical `diagnosis` label as a single 0/1 indicator column
# (`diagnosis_M`); drop_first=True drops the indicator of the other category.
# The guard keeps this cell idempotent: once `diagnosis` has been encoded the
# column no longer exists, so re-running the cell unguarded would raise a KeyError.
if 'diagnosis' in tumors_data.columns:
    tumors_data = pd.get_dummies(
        data=tumors_data,
        columns=['diagnosis'],
        drop_first=True,
        dtype=int,
    )
tumors_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 33 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   id                       569 non-null    int64  
 1   radius_mean              569 non-null    float64
 2   texture_mean             569 non-null    float64
 3   perimeter_mean           569 non-null    float64
 4   area_mean                569 non-null    float64
 5   smoothness_mean          569 non-null    float64
 6   compactness_mean         569 non-null    float64
 7   concavity_mean           569 non-null    float64
 8   concave points_mean      569 non-null    float64
 9   symmetry_mean            569 non-null    float64
 10  fractal_dimension_mean   569 non-null    float64
 11  radius_se                569 non-null    float64
 12  texture_se               569 non-null    float64
 13  perimeter_se             569 non-null    float64
 14  area_se                  5

In [8]:
# Share of each class as a percentage of all rows
# (diagnosis_M == 1 marks the rows whose original diagnosis was 'M')
tumors_data['diagnosis_M'].value_counts().div(len(tumors_data)).mul(100)

diagnosis_M
0    62.741652
1    37.258348
Name: count, dtype: float64

The benign class (`diagnosis_M = 0`) has almost twice as many samples as the malignant class (about 63% vs. 37%).

In [10]:
# Identify features and label by column name instead of by position.
# Features are every column except the `id` identifier, the `Unnamed: 32`
# column (blank in the preview) and the encoded target. With the current
# column layout this selects exactly the columns the positional slice
# `iloc[:, 1:-2]` selected, but it raises a KeyError instead of silently
# picking the wrong columns if the layout of the CSV ever changes.
X = tumors_data.drop(columns=['id', 'Unnamed: 32', 'diagnosis_M'])
y = tumors_data['diagnosis_M']

**Note!**  
Since the dataset is small, we won't hold out a separate validation set; cross-validation on the training split is used for model comparison instead.

In [12]:
# Hold out 20% of the samples as the test split; the fixed random_state
# makes the split reproducible across runs.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
print(f"train:{len(X_train)}, test:{len(X_test)}")

train:455, test:114


In [13]:
# Standardize the features. The scaler is fitted on the training split only
# and then applied to both splits, so no test-set statistics are used in fitting.
from sklearn.preprocessing import StandardScaler

scalor = StandardScaler().fit(X_train)
X_train = scalor.transform(X_train)
X_test = scalor.transform(X_test)

## Model building 

In [65]:
from sklearn.linear_model import LogisticRegression 
from sklearn.ensemble import RandomForestClassifier 
from sklearn.ensemble import GradientBoostingClassifier 
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

Here we use grid search to get some insight into a sensible range for each model's hyperparameters, plus a general indication of performance from cross-validation. We aren't fitting a final model with the parameters found by the grid search yet.

In [135]:
# Exhaustive 5-fold grid search over logistic-regression hyperparameters.
# refit=False: only the best parameter combination and its CV score are
# wanted here, so no final model is refitted on the whole training split.
logistisc = LogisticRegression()
logistis_param_grid = dict(
    C=[0.0001, 0.001, 0.01, 0.1, 1, 10, 100],
    penalty=['l1', 'l2'],
    solver=['liblinear', 'saga'],
    class_weight=[None, 'balanced'],
)

logistic_search = GridSearchCV(
    logistisc,
    logistis_param_grid,
    cv=5,
    n_jobs=-1,  # use all available cores
    refit=False,
)
logistic_search.fit(X_train, y_train)

print(logistic_search.best_params_)
print(logistic_search.best_score_)

{'C': 0.1, 'class_weight': None, 'penalty': 'l2', 'solver': 'liblinear'}
0.9780219780219781


Now that we have a sense of the range of the hyperparameters, we tweak them a little more by hand to achieve the highest score.

In [212]:
# 5-fold cross-validated accuracy of the logistic regression on the training split
log = LogisticRegression(
    C=0.1,
    class_weight=None,
    penalty='l2',
    solver='liblinear',
)
scores = cross_val_score(log, X_train, y_train, cv=5, scoring='accuracy')

print("Mean accuracy:", scores.mean())

Mean accuracy: 0.9758241758241759


In [236]:
# 5-fold cross-validated accuracy of the random forest on the training split
rf = RandomForestClassifier(
    n_estimators=200,
    max_depth=20,
    max_features='sqrt',
    min_samples_split=5,
    random_state=42,  # fixed seed so the forest is reproducible
)
scores = cross_val_score(rf, X_train, y_train, cv=5, scoring='accuracy')

print("Mean accuracy:", scores.mean())

Mean accuracy: 0.9582417582417582


In [280]:
# 5-fold cross-validated accuracy of the gradient-boosting model on the training split
gb = GradientBoostingClassifier(
    n_estimators=400,
    learning_rate=0.05,
    max_depth=5,
    max_features='sqrt',
    min_samples_split=6,
    subsample=0.6,  # each tree is fitted on a 60% subsample of the rows
    random_state=42,
)
scores = cross_val_score(gb, X_train, y_train, cv=5, scoring='accuracy')

print("Mean accuracy:", scores.mean())

Mean accuracy: 0.9758241758241759


The scores above are the best I could get. Now let's see the results on the test data:

In [284]:
# Fit the tuned logistic regression on the training split and score it on the test split
logistic_tuned = LogisticRegression(
    C=0.1,
    class_weight=None,
    penalty='l2',
    solver='liblinear',
).fit(X_train, y_train)
yhat_log = logistic_tuned.predict(X_test)

print("accuracy:", accuracy_score(y_test, yhat_log))
# Rows are the true classes, columns the predicted classes
print("confusion matrix:\n", confusion_matrix(y_test, yhat_log))

accuracy: 0.9912280701754386
confusion matrix:
 [[71  0]
 [ 1 42]]


In [286]:
# Fit the tuned random forest on the training split and score it on the test split
rf_tuned = RandomForestClassifier(
    n_estimators=200,
    max_depth=20,
    max_features='sqrt',
    min_samples_split=5,
    random_state=42,
).fit(X_train, y_train)
yhat_rf = rf_tuned.predict(X_test)

print("accuracy:", accuracy_score(y_test, yhat_rf))
# Rows are the true classes, columns the predicted classes
print("confusion matrix:\n", confusion_matrix(y_test, yhat_rf))

accuracy: 0.9649122807017544
confusion matrix:
 [[70  1]
 [ 3 40]]


In [288]:
# Fit the tuned gradient-boosting model on the training split and score it on the test split
gb_tuned = GradientBoostingClassifier(
    n_estimators=400,
    learning_rate=0.05,
    max_depth=5,
    max_features='sqrt',
    min_samples_split=6,
    subsample=0.6,
    random_state=42,
).fit(X_train, y_train)
yhat_gb = gb_tuned.predict(X_test)

print("accuracy:", accuracy_score(y_test, yhat_gb))
# Rows are the true classes, columns the predicted classes
print("confusion matrix:\n", confusion_matrix(y_test, yhat_gb))

accuracy: 0.9649122807017544
confusion matrix:
 [[70  1]
 [ 3 40]]


In [290]:
# Keep the tuned logistic regression as the final model: of the three fitted
# candidates it reached the highest accuracy on the held-out test split.
# NOTE(review): selecting the model on test-set results makes that test score an
# optimistic estimate of generalization performance.
model = logistic_tuned