# 1. Data exploration and Preparation

In [None]:
from pandas import read_csv, get_dummies, Series, DataFrame
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

from sklearn import metrics
from sklearn import ensemble

from sklearn.ensemble import RandomForestClassifier
from imblearn.pipeline import Pipeline    #method 3

from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV   #method 3

In [None]:
#Reading the Dataset
data = read_csv('/content/drive/MyDrive/ML_Stats/Breast_Cancer.csv')

data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4024 entries, 0 to 4023
Data columns (total 16 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   Age                     4024 non-null   int64 
 1   Race                    4024 non-null   object
 2   Marital Status          4024 non-null   object
 3   T Stage                 4024 non-null   object
 4   N Stage                 4024 non-null   object
 5   6th Stage               4024 non-null   object
 6   differentiate           4024 non-null   object
 7   Grade                   4024 non-null   object
 8   A Stage                 4024 non-null   object
 9   Tumor Size              4024 non-null   int64 
 10  Estrogen Status         4024 non-null   object
 11  Progesterone Status     4024 non-null   object
 12  Regional Node Examined  4024 non-null   int64 
 13  Reginol Node Positive   4024 non-null   int64 
 14  Survival Months         4024 non-null   int64 
 15  Stat

In [None]:
#Encoding
# a. Binary Encoding
data['A Stage'] = data['A Stage'].map({'Regional':1, 'Distant':0})
data['Estrogen Status'] = data['Estrogen Status'].map({'Positive':1, 'Negative':0})
data['Progesterone Status'] = data['Progesterone Status'].map({'Positive':1, 'Negative':0})
data['Status'] = data['Status'].map({'Alive':1, 'Dead':0})

# b. One-hot encoding - Dimentionality reduction
#to avoid overfitting - Curse of Dimentionality
data['T Stage '] = data['T Stage '].map({'T1':1,'T2':2,'T3':3,'T4':4})
data['N Stage'] = data['N Stage'].map({'N1':1,'N2':2,'N3':3})
data['6th Stage'] = data['6th Stage'].map({'IIA':1,'IIIA':2,'IIB':3,'IIIB':4,'IIIC':5})
data['differentiate'] = data['differentiate'].map({'Undifferentiated':1,'Poorly differentiated':2,'Moderately differentiated':3,'Well differentiated':4})
data['Grade'] = data['Grade'].map({'1':1,'2':2,'3':3,' anaplastic; Grade IV':4})
data.info()

#get dummies
#data2 = get_dummies(data, columns = ['Race', 'Marital Status', 'T Stage ', 'N Stage', '6th Stage', 'differentiate', 'Grade'])
data2 = get_dummies(data, columns = ['Race', 'Marital Status'], dtype=int)
data2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4024 entries, 0 to 4023
Data columns (total 22 columns):
 #   Column                    Non-Null Count  Dtype
---  ------                    --------------  -----
 0   Age                       4024 non-null   int64
 1   T Stage                   4024 non-null   int64
 2   N Stage                   4024 non-null   int64
 3   6th Stage                 4024 non-null   int64
 4   differentiate             4024 non-null   int64
 5   Grade                     4024 non-null   int64
 6   A Stage                   4024 non-null   int64
 7   Tumor Size                4024 non-null   int64
 8   Estrogen Status           4024 non-null   int64
 9   Progesterone Status       4024 non-null   int64
 10  Regional Node Examined    4024 non-null   int64
 11  Reginol Node Positive     4024 non-null   int64
 12  Survival Months           4024 non-null   int64
 13  Status                    4024 non-null   int64
 14  Race_Black                4024 non-null 

In [None]:
#X & Y - dropping the output
X = data2.drop('Status', axis = 1)
Y = data2['Status']

#Scaling
X_scaled = StandardScaler().fit_transform(X)

#Spliting data
X_train, X_test, Y_train, Y_test = train_test_split(X_scaled, Y, test_size=0.3, random_state=100)

#balancing the data - SMOTE
X_train, Y_train = SMOTE (random_state = 100).fit_resample(X_train,Y_train)


# 2. Random Forest Classifier

In [None]:
# Random Forest Classifier (method 2) - to get the best value for Hpyprparameter - n_estimators
RF_classifier2 = RandomForestClassifier(criterion='entropy', max_features='sqrt', random_state=1)  #building model/classifier
no_trees = {'n_estimators': [50, 100, 200, 250, 300, 350, 400, 450]}
RF_grid_search = GridSearchCV(estimator=RF_classifier2, param_grid=no_trees, scoring='recall', cv=5)
RF_grid_search.fit(X_scaled, Y)   #training, testing, evaluating & ranking

RF_best_parameters = RF_grid_search.best_params_
print(RF_best_parameters)
RF_best_result = RF_grid_search.best_score_
print(RF_best_result)

{'n_estimators': 400}
0.9806352569319744


In [None]:
#n_etimators found - go ahead with Method 1
RF_classifier = RandomForestClassifier(n_estimators=400, criterion='entropy', max_features='sqrt', random_state=1)  #building model/classifier
RF_classifier.fit(X_train, Y_train)        #training
Y_pred_RF = RF_classifier.predict(X_test)  #testing

#evaluation  &  confusion matrix
RF_Accuracy = metrics.accuracy_score(Y_test, Y_pred_RF)
print('Accuracy of Random Forest is :', round(RF_Accuracy,4))

RF_conf_matrix = metrics.confusion_matrix(Y_test, Y_pred_RF)
print("Confusion MAtrix of Random Forest is :")
print(RF_conf_matrix)

RF_recall = metrics.recall_score(Y_test, Y_pred_RF)
print('Recall value for Random Forest is :', round(RF_recall,4))

RF_precision = metrics.precision_score(Y_test, Y_pred_RF)
print('Precisoin value for Random Forest is :', round(RF_precision,4))

RF_f1 = metrics.f1_score(Y_test, Y_pred_RF)
print('f1 score of Random forest is :', round(RF_f1,4))

#important features
RF_imp_features = Series(RF_classifier.feature_importances_, index=list(X)).sort_values(ascending=False)
print('')
print(RF_imp_features)


Accuracy of Random Forest is : 0.8965
Confusion MAtrix of Random Forest is :
[[107  63]
 [ 62 976]]
Recall value for Random Forest is : 0.9403
Precisoin value for Random Forest is : 0.9394
f1 score of Random forest is : 0.9398

Survival Months             0.318714
Reginol Node Positive       0.119434
Age                         0.086549
Tumor Size                  0.082207
Regional Node Examined      0.080173
N Stage                     0.051963
T Stage                     0.045768
6th Stage                   0.045636
Grade                       0.035883
differentiate               0.035553
Progesterone Status         0.021243
Marital Status_Married      0.015999
Marital Status_Single       0.010615
Marital Status_Divorced     0.009711
Race_White                  0.008527
Estrogen Status             0.006949
Race_Other                  0.006844
Race_Black                  0.005941
Marital Status_Widowed      0.005411
Marital Status_Separated    0.004141
A Stage                     0.00

# 3. Support Vector Classification (SVM)

In [None]:
#Initiating with Method 3 - to get values of Hyperparameters: c & kernel
SV_classifier2 = Pipeline([('balancing', SMOTE(random_state = 11)), ('classification', SVC())])  #building model/classifier
kernels_c = {'classification__kernel': ['linear','poly','rbf','sigmoid'], 'classification__C': [.001,.01,.1,1,10,100]}

SV_grid_search = GridSearchCV(estimator = SV_classifier2, param_grid=kernels_c, scoring='recall', cv=5)
SV_grid_search.fit(X_scaled, Y)

SV_best_parameters = SV_grid_search.best_params_
print(SV_best_parameters)
SV_best_result = SV_grid_search.best_score_
print(SV_best_result)

{'classification__C': 0.001, 'classification__kernel': 'poly'}
0.9823947877237632


In [None]:
#building SVM using method 1
#from sklearn.svm import SVC - importing library
SV_classifier = SVC(kernel = 'poly')        # 1.building model/classifier
SV_classifier.fit(X_train, Y_train)       # 2.training
Y_pred_SV = SV_classifier.predict(X_test)  # 3.testing

# 4.evaluation          & Confusion Matrix
Accuracy_SV = metrics.accuracy_score(Y_test, Y_pred_SV)      #Calculating Accuracy
print('Support Vector Accuracy is: ', round(Accuracy_SV,4))

conf_matrix_SV = metrics.confusion_matrix(Y_test, Y_pred_SV)   #Calculating Confusion Matrix
print('Support Vector Confusion Matrix is: ')
print(conf_matrix_SV)

recall_SV = metrics.recall_score(Y_test, Y_pred_SV)         #Calculating Recall
print('Support Vector Recall is: ', round(recall_SV,4))

precision_SV = metrics.precision_score(Y_test, Y_pred_SV)    #Calculating Precision
print('Support Vector Precision is: ', round(precision_SV,4))

f1_SV = metrics.f1_score(Y_test, Y_pred_SV)               #Calculating f1
print('Support Vector f1 is: ', round(f1_SV,4))

Support Vector Accuracy is:  0.8336
Support Vector Confusion Matrix is: 
[[102  68]
 [133 905]]
Support Vector Recall is:  0.8719
Support Vector Precision is:  0.9301
Support Vector f1 is:  0.9
