In [112]:
import pandas as pd
import numpy as np
import plotly.express as px
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

Dataset contains over 400 breast cancer patients who have undergone surgery for treatment.

source: Kaggle

##### Dataset Column Descriptions
1. Patient_ID: ID of patient
2. Age: Age of patient
3. Gender: Gender of patient
4. Protein1, Protein2, Protein3, Protein4: expression levels
5. Tumor_Stage: Breast cancer stage
6. Histology: Infiltrating Ductal Carcinoma, Infiltration Lobular Carcinoma, Mucinous Carcinoma
7. ER status: Positive or Negative
8. PR status: Positive or Negative
9. HER2 status: Positive or Negative
10. Surgery_type: Lumpectomy, Simple Mastectomy, Modified Radical Mastectomy, Other
11. DateofSurgery: Date of Surgery
12. DateofLast_Visit: Date of last visit
13. Patient_Status: Alive or Dead

#### Goal: Predict the survival of breast cancer patient, so Alive or Dead

In [113]:
data = pd.read_csv("BRCA.csv")

In [114]:
print(data.head())

     Patient_ID   Age  Gender  Protein1  Protein2  Protein3  Protein4  \
0  TCGA-D8-A1XD  36.0  FEMALE  0.080353   0.42638   0.54715  0.273680   
1  TCGA-EW-A1OX  43.0  FEMALE -0.420320   0.57807   0.61447 -0.031505   
2  TCGA-A8-A079  69.0  FEMALE  0.213980   1.31140  -0.32747 -0.234260   
3  TCGA-D8-A1XR  56.0  FEMALE  0.345090  -0.21147  -0.19304  0.124270   
4  TCGA-BH-A0BF  56.0  FEMALE  0.221550   1.90680   0.52045 -0.311990   

  Tumour_Stage                      Histology ER status PR status HER2 status  \
0          III  Infiltrating Ductal Carcinoma  Positive  Positive    Negative   
1           II             Mucinous Carcinoma  Positive  Positive    Negative   
2          III  Infiltrating Ductal Carcinoma  Positive  Positive    Negative   
3           II  Infiltrating Ductal Carcinoma  Positive  Positive    Negative   
4           II  Infiltrating Ductal Carcinoma  Positive  Positive    Negative   

                  Surgery_type Date_of_Surgery Date_of_Last_Visit  \
0  Mo

In [115]:
# looking at if there are any null values
print(data.isnull().sum())

Patient_ID             7
Age                    7
Gender                 7
Protein1               7
Protein2               7
Protein3               7
Protein4               7
Tumour_Stage           7
Histology              7
ER status              7
PR status              7
HER2 status            7
Surgery_type           7
Date_of_Surgery        7
Date_of_Last_Visit    24
Patient_Status        20
dtype: int64


In [116]:
# since the dataset appears to have null values, remove them
data = data.dropna()

In [117]:
# column information 
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 317 entries, 0 to 333
Data columns (total 16 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Patient_ID          317 non-null    object 
 1   Age                 317 non-null    float64
 2   Gender              317 non-null    object 
 3   Protein1            317 non-null    float64
 4   Protein2            317 non-null    float64
 5   Protein3            317 non-null    float64
 6   Protein4            317 non-null    float64
 7   Tumour_Stage        317 non-null    object 
 8   Histology           317 non-null    object 
 9   ER status           317 non-null    object 
 10  PR status           317 non-null    object 
 11  HER2 status         317 non-null    object 
 12  Surgery_type        317 non-null    object 
 13  Date_of_Surgery     317 non-null    object 
 14  Date_of_Last_Visit  317 non-null    object 
 15  Patient_Status      317 non-null    object 
dtypes: float64(5)

In [118]:
# number of males and females in the dataset
print(data.Gender.value_counts())

Gender
FEMALE    313
MALE        4
Name: count, dtype: int64


In [119]:
# Find the tumor stage of the patients 
stage = data["Tumour_Stage"].value_counts()
transactions = stage.index
quantity = stage.values

# creating a pie chart
figure = px.pie(data, 
             values=quantity, 
             names=transactions,hole = 0.5, 
             title="Tumour Stages of Patients")
figure.show()



we see that most of the patients are in Stage 2 

Note: Histology is a descripition of a tumor based on how abnormal the cancer cells and tissue look under a microscope and how quickly the cancer will grow and spread

In [120]:
# checking the histology of breast cancer patients  
histology = data["Histology"].value_counts()
transactions = histology.index
quantity = histology.values
figure = px.pie(data,
                values = quantity,
                names = transactions,
                hole = 0.5,
                title = "Histology of Patients")
figure.show()

In [121]:
# checking ER status 
print(data["ER status"].value_counts())

# checking PR status
print(data["PR status"].value_counts())

# checking HER2 status
print(data["HER2 status"].value_counts())


ER status
Positive    317
Name: count, dtype: int64
PR status
Positive    317
Name: count, dtype: int64
HER2 status
Negative    288
Positive     29
Name: count, dtype: int64


In [122]:
# checking to see the type of surgeries the patients have gone through
surgery = data["Surgery_type"].value_counts()
transactions = surgery.index
quantity = surgery.values

# pie chart
figure = px.pie(data,
                values = quantity,
                names = transactions,
                hole = 0.5,
                title = "Type of Surgery of Patients")
figure.show()

we seem to have a lot categoricla features. so we will transform the values of these features 

In [123]:
data["Tumour_Stage"] = data["Tumour_Stage"].map({"I": 1, "II": 2, "III": 3})
data["Histology"] = data["Histology"].map({"Infiltrating Ductal Carcinoma": 1, 
                                           "Infiltrating Lobular Carcinoma": 2, "Mucinous Carcinoma": 3})
data["ER status"] = data["ER status"].map({"Positive": 1})
data["PR status"] = data["PR status"].map({"Positive": 1})
data["HER2 status"] = data["HER2 status"].map({"Positive": 1, "Negative": 2})
data["Gender"] = data["Gender"].map({"MALE": 0, "FEMALE": 0})
data["Surgery_type"] = data["Surgery_type"].map({"Other": 1, "Modified Radical Mastectomy": 2, 
                                                 "Lumpectomy": 3, "Simple Mastectomy": 4})
print(data.head())

     Patient_ID   Age  Gender  Protein1  Protein2  Protein3  Protein4  \
0  TCGA-D8-A1XD  36.0       0  0.080353   0.42638   0.54715  0.273680   
1  TCGA-EW-A1OX  43.0       0 -0.420320   0.57807   0.61447 -0.031505   
2  TCGA-A8-A079  69.0       0  0.213980   1.31140  -0.32747 -0.234260   
3  TCGA-D8-A1XR  56.0       0  0.345090  -0.21147  -0.19304  0.124270   
4  TCGA-BH-A0BF  56.0       0  0.221550   1.90680   0.52045 -0.311990   

   Tumour_Stage  Histology  ER status  PR status  HER2 status  Surgery_type  \
0             3          1          1          1            2             2   
1             2          3          1          1            2             3   
2             3          1          1          1            2             1   
3             2          1          1          1            2             2   
4             2          1          1          1            2             1   

  Date_of_Surgery Date_of_Last_Visit Patient_Status  
0       15-Jan-17          19-Ju

#### Prediction Model

In [124]:

'''
x = np.array(data[['Age', 'Gender', 'Protein1', 'Protein2', 'Protein3','Protein4', 
                   'Tumour_Stage', 'Histology', 'ER status', 'PR status', 
                   'HER2 status', 'Surgery_type']])
y = np.array(data[['Patient_Status']])
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.10, random_state=42)
'''

x = np.array(data[['Age', 'Gender', 'Protein1', 'Protein2', 'Protein3','Protein4', 
                   'Tumour_Stage', 'Histology', 'ER status', 'PR status', 
                   'HER2 status', 'Surgery_type']])
y = np.array(data['Patient_Status'])  # Ensure y is a 1D array
y = y.ravel()  # Reshape y to a 1D array using ravel()

xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.10, random_state=42)


In [125]:
# creating an instance of the support vector classification model 
model = SVC()

# train the SVC model using training data 
model.fit(xtrain, ytrain)

The model has now learned the patterns and relationships within the training data. It has create a decision boundary that best seperates the different classes in the data based on the specific kernel and hyperparameters. The model can now be used to make predictions on new, unseen data.


features = [['Age', 'Gender', 'Protein1', 'Protein2', 'Protein3','Protein4', 'Tumour_Stage', 'Histology', 'ER status', 'PR status', 'HER2 status', 'Surgery_type']]

In [126]:
# hyperparameter tuning and evaluaion 
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix


parameters = {'C': [0.1, 1, 10], 'kernel': ['linear', 'poly', 'rbf', 'sigmoid']}
grid_search = GridSearchCV(SVC(), parameters)
grid_search.fit(xtrain, ytrain)

# best parameters
best_params = grid_search.best_params_
print("Best parameters: ", best_params)

# best model
best_model = grid_search.best_estimator_

# evaluation
ypred = best_model.predict(xtest)
print("Classification Report:\n", classification_report(ytest, ypred, zero_division=1)) 
print("Confusion Matrix:\n", confusion_matrix(ytest, ypred))


Best parameters:  {'C': 0.1, 'kernel': 'linear'}
Classification Report:
               precision    recall  f1-score   support

       Alive       0.81      1.00      0.90        26
        Dead       1.00      0.00      0.00         6

    accuracy                           0.81        32
   macro avg       0.91      0.50      0.45        32
weighted avg       0.85      0.81      0.73        32

Confusion Matrix:
 [[26  0]
 [ 6  0]]


In [127]:
# prediciton: will print a result of Alive or Dead
features = np.array([[79.0, 0, -0.036229, 0.79551, -0.013525, 1.6299, 2, 3, 1, 1, 1, 2]])
print(model.predict(features))

['Alive']
