In [1]:
import numpy as np 
import pandas as pd 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam
from sklearn.metrics import accuracy_score, classification_report
import joblib

# ***Exploratory***

In [2]:
data = pd.read_csv('/kaggle/input/breast-cancer-csv/breast-cancer_csv.csv')

In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 286 entries, 0 to 285
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   age          286 non-null    object
 1   menopause    286 non-null    object
 2   tumor-size   286 non-null    object
 3   inv-nodes    286 non-null    object
 4   node-caps    278 non-null    object
 5   deg-malig    286 non-null    int64 
 6   breast       286 non-null    object
 7   breast-quad  285 non-null    object
 8   irradiat     286 non-null    object
 9   Class        286 non-null    object
dtypes: int64(1), object(9)
memory usage: 22.5+ KB


# **Data overview**
1. **Age**: Age of the patient at the time of diagnosis.
2. **Menopause status**: Whether the patient has undergone menopause or not.
3. **Tumor size**: The size of the tumor in millimeters.
4. **Number of involved lymph nodes**: The number of lymph nodes that contain cancer cells.
5. **Node caps status**: Whether the cancer cells have spread to the lymph node capsule or not.
6. **Degree of malignancy**: The degree to which the cancer cells differ from normal cells.
7. **Breast quadrant location**: The quadrant of the breast where the tumor is located.
8. **Radiation therapy**: Whether or not the patient received radiation therapy after surgery.

In [4]:
data.describe()

Unnamed: 0,deg-malig
count,286.0
mean,2.048951
std,0.738217
min,1.0
25%,2.0
50%,2.0
75%,3.0
max,3.0


In [5]:
data.head()

Unnamed: 0,age,menopause,tumor-size,inv-nodes,node-caps,deg-malig,breast,breast-quad,irradiat,Class
0,40-49,premeno,15-19,0-2,yes,3,right,left_up,no,recurrence-events
1,50-59,ge40,15-19,0-2,no,1,right,central,no,no-recurrence-events
2,50-59,ge40,35-39,0-2,no,2,left,left_low,no,recurrence-events
3,40-49,premeno,35-39,0-2,yes,3,right,left_low,yes,no-recurrence-events
4,40-49,premeno,30-34,3-5,yes,2,left,right_up,no,recurrence-events


# ***Preprocessing***

In [6]:
# Removing rows consist of nulls
data = data.dropna()

In [7]:
# mapping 
n_i_mapping = {"yes": 1, "no": 0}
tumor_size_mapping = {'0-4': 2, '5-9': 7, '10-14': 12,'15-19': 17, '20-24': 22,'25-29':27, '30-34':32, '35-39': 37,'40-44': 42,'45-49': 47 ,'50-54': 52}
inv_nodes_mapping = {'0-2': 1, '3-5': 4, '6-8': 7, '9-11': 10, '12-14': 13,'15-17': 16, '24-26': 22}
menopause_mapping = {'ge40':0, 'lt40':1, 'premeno':2}
breast_mapping = {'right':0 , 'left':1}
breast_quad_mapping = {'central':0, 'left_low':1, 'left_up':2, 'right_low':3, 'right_up':4}
age_mapping  = {'20-29': 25, '30-39':35, '40-49':45, '50-59':55, '60-69':65, '70-79':75}

In [8]:
data['tumor-size'] = data['tumor-size'].map(tumor_size_mapping)
data['inv-nodes'] = data['inv-nodes'].map(inv_nodes_mapping)
data['menopause']= data['menopause'].map(menopause_mapping)
data['breast'] = data['breast'].map(breast_mapping)
data['breast-quad'] = data['breast-quad'].map(breast_quad_mapping)
data['age'] = data['age'].map(age_mapping)
data['node-caps'] = data['node-caps'].map(n_i_mapping)
data['irradiat'] = data['irradiat'].map(n_i_mapping)

In [9]:
# mapping classes
Class_mapping = {'no-recurrence-events': 0, 'recurrence-events': 1}
data['Class'] = data['Class'].map(Class_mapping)

In [10]:
# Split the data into features (X) and the target (y)
X = data.drop(columns=['Class'])
y = data['Class']

# Split the data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# ***Training***

In [11]:
# Create a Random Forest classifier
clf = RandomForestClassifier(n_estimators=50, random_state=20)
# Train the model
clf.fit(X_train, y_train)

In [12]:
# Logistic Regression
# Train the model
logistic_model = LogisticRegression()
logistic_model.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [13]:
# SVM
# Train the model
svm_model = SVC(kernel='linear', random_state=42)
svm_model.fit(X_train, y_train)

In [14]:
# Keras model
model = Sequential()
model.add(Dense(units=64, activation='relu', input_dim=X_train.shape[1]))
model.add(Dense(units=64, activation='relu'))
model.add(Dense(units=32, activation='relu'))
model.add(Dense(units=32, activation='relu'))
model.add(Dense(units=16, activation='relu'))
model.add(Dense(units=16, activation='relu'))
model.add(Dense(units=8, activation='relu'))
model.add(Dense(units=1, activation='sigmoid'))

# Compile the model
model.compile(loss='binary_crossentropy', optimizer=Adam(learning_rate=0.001), metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, epochs=20, batch_size=32, validation_split=0.2)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7d9c48186c50>

# ***Evaluation***

In [15]:
# Random Forest classifier
# Model Evaluation
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")
print("Classification Report:")
print(report)

Accuracy: 87.50%
Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.95      0.92        42
           1       0.82      0.64      0.72        14

    accuracy                           0.88        56
   macro avg       0.85      0.80      0.82        56
weighted avg       0.87      0.88      0.87        56



In [16]:
# Logistic Regression
# Model Evaluation
logistic_y_pred = logistic_model.predict(X_test)
logistic_accuracy = accuracy_score(y_test, logistic_y_pred)
print("Logistic Regression:")
print(f"Accuracy: {logistic_accuracy * 100:.2f}")
logistic_report = classification_report(y_test, logistic_y_pred)
print("Classification Report:\n", logistic_report)

Logistic Regression:
Accuracy: 71.43
Classification Report:
               precision    recall  f1-score   support

           0       0.81      0.81      0.81        42
           1       0.43      0.43      0.43        14

    accuracy                           0.71        56
   macro avg       0.62      0.62      0.62        56
weighted avg       0.71      0.71      0.71        56



In [17]:
# SVM
# Model Evaluation
svm_y_pred = svm_model.predict(X_test)
svm_accuracy = accuracy_score(y_test, svm_y_pred)
print("SVM:")
print(f"Accuracy: {svm_accuracy * 100:.2f}")
svm_report = classification_report(y_test, svm_y_pred)
print("Classification Report:\n", svm_report)

SVM:
Accuracy: 67.86
Classification Report:
               precision    recall  f1-score   support

           0       0.80      0.76      0.78        42
           1       0.38      0.43      0.40        14

    accuracy                           0.68        56
   macro avg       0.59      0.60      0.59        56
weighted avg       0.69      0.68      0.69        56



In [18]:
# Keras model
# Model Evaluation
y_pred = model.predict(X_test)
y_pred_binary = (y_pred > 0.5).astype(int)
accuracy = accuracy_score(y_test, y_pred_binary)
print(f"Test Accuracy: {accuracy * 100:.2f}%")
report = classification_report(y_test, y_pred_binary)
print("Classification Report:\n", report)

Test Accuracy: 71.43%
Classification Report:
               precision    recall  f1-score   support

           0       0.77      0.88      0.82        42
           1       0.38      0.21      0.27        14

    accuracy                           0.71        56
   macro avg       0.57      0.55      0.55        56
weighted avg       0.67      0.71      0.68        56



# ***Saving Best Model***

In [19]:
# Saving model
joblib.dump(clf, 'Random_Forest.pkl')

['Random_Forest.pkl']

# ***Summary***
> # *| Model | | Accuracy |*

>> **1. | SVM    67.86% |**

>> **2. | Logistics Regression 71.43%|**

>> **3. | Keras Model  75.00% |**

>> **4. | Random Forest 87.50% |**

> # Best Model : Random Forest with accuracy = 87.5%