In [3]:

# importing relevant libraries

import pandas as pd
import numpy as np
import plotly.express as px
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

In [10]:
# loading the dataset

data = pd.read_csv("BRCA.csv")
print(data.head())

     Patient_ID   Age  Gender  Protein1  Protein2  Protein3  Protein4  \
0  TCGA-D8-A1XD  36.0  FEMALE  0.080353   0.42638   0.54715  0.273680   
1  TCGA-EW-A1OX  43.0  FEMALE -0.420320   0.57807   0.61447 -0.031505   
2  TCGA-A8-A079  69.0  FEMALE  0.213980   1.31140  -0.32747 -0.234260   
3  TCGA-D8-A1XR  56.0  FEMALE  0.345090  -0.21147  -0.19304  0.124270   
4  TCGA-BH-A0BF  56.0  FEMALE  0.221550   1.90680   0.52045 -0.311990   

  Tumour_Stage                      Histology ER status PR status HER2 status  \
0          III  Infiltrating Ductal Carcinoma  Positive  Positive    Negative   
1           II             Mucinous Carcinoma  Positive  Positive    Negative   
2          III  Infiltrating Ductal Carcinoma  Positive  Positive    Negative   
3           II  Infiltrating Ductal Carcinoma  Positive  Positive    Negative   
4           II  Infiltrating Ductal Carcinoma  Positive  Positive    Negative   

                  Surgery_type Date_of_Surgery Date_of_Last_Visit  \
0  Mo

In [None]:
# checking whether the columns of this dataset contains any null values or not:

print(data.isnull().sum())

Patient_ID             7
Age                    7
Gender                 7
Protein1               7
Protein2               7
Protein3               7
Protein4               7
Tumour_Stage           7
Histology              7
ER status              7
PR status              7
HER2 status            7
Surgery_type           7
Date_of_Surgery        7
Date_of_Last_Visit    24
Patient_Status        20
dtype: int64


In [None]:
#  we drop the null values

data = data.dropna()
print(data.isnull().sum())

Patient_ID            0
Age                   0
Gender                0
Protein1              0
Protein2              0
Protein3              0
Protein4              0
Tumour_Stage          0
Histology             0
ER status             0
PR status             0
HER2 status           0
Surgery_type          0
Date_of_Surgery       0
Date_of_Last_Visit    0
Patient_Status        0
dtype: int64


In [None]:
# insights / information about the data

data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 317 entries, 0 to 333
Data columns (total 16 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Patient_ID          317 non-null    object 
 1   Age                 317 non-null    float64
 2   Gender              317 non-null    object 
 3   Protein1            317 non-null    float64
 4   Protein2            317 non-null    float64
 5   Protein3            317 non-null    float64
 6   Protein4            317 non-null    float64
 7   Tumour_Stage        317 non-null    object 
 8   Histology           317 non-null    object 
 9   ER status           317 non-null    object 
 10  PR status           317 non-null    object 
 11  HER2 status         317 non-null    object 
 12  Surgery_type        317 non-null    object 
 13  Date_of_Surgery     317 non-null    object 
 14  Date_of_Last_Visit  317 non-null    object 
 15  Patient_Status      317 non-null    object 
dtypes: float64(5)

In [None]:
# lets explore the statistics of the data

data.describe()

Unnamed: 0,Age,Protein1,Protein2,Protein3,Protein4
count,317.0,317.0,317.0,317.0,317.0
mean,58.725552,-0.027232,0.949557,-0.095104,0.006713
std,12.827374,0.543858,0.906153,0.589027,0.625965
min,29.0,-2.1446,-0.97873,-1.6274,-2.0255
25%,49.0,-0.3506,0.36884,-0.53136,-0.38224
50%,58.0,0.005649,0.99713,-0.19304,0.038522
75%,67.0,0.33626,1.612,0.25121,0.43625
max,90.0,1.5936,3.4022,2.1934,1.6299


In [None]:
# Breast cancer is mostly found in females, so let’s have a look at the Gender column
# to see how many females and males are there:

print(data.Gender.value_counts())

# as expected , more females sufer from breast cancer than males

Gender
FEMALE    313
MALE        4
Name: count, dtype: int64


In [None]:
#  let’s have a look at the stage of tumour of the patients:

stage = data["Tumour_Stage"].value_counts()
transactions = stage.index
quantity = stage.values

figure = px.pie(data,
             values=quantity,
             names=transactions,hole = 0.2,
             title="Tumour Stages of Patients diagnosed")
figure.show()

# many women/gals suffer from stage  II of cancer diagnosis followed by stage III(late diagnosis)

In [None]:
# Histology
# def -> Histology is a description of a tumour based on how abnormal the cancer cells and tissue look
#  under a microscope and how quickly cancer can grow and spread):

histology = data["Histology"].value_counts()
transactions = histology.index
quantity = histology.values
figure = px.pie(data,
             values=quantity,
             names=transactions,hole = 0.3,
             title="Histology of Patients")
figure.show()

# mny patients fal under histology I (infiltrating ductal carcinoma followed by labular carcinoma)

In [None]:
# lets explore tha  ER status, PR status, and HER2
# ostrogen receptor/ progesterone receptor /Human Epidermal Growth Factor Receptor 2

# ER status
print(data["ER status"].value_counts())
# PR status
print(data["PR status"].value_counts())
# HER2 status
print(data["HER2 status"].value_counts())

# we now get the number of positive patients per diagnosis
# number of negative = total - positive

ER status
Positive    317
Name: count, dtype: int64
PR status
Positive    317
Name: count, dtype: int64
HER2 status
Negative    288
Positive     29
Name: count, dtype: int64


In [None]:
# lets now explore the type of surgeries to be performed to each patient as per the cancer tumur effect

# Surgery_type
surgery = data["Surgery_type"].value_counts()
transactions = surgery.index
quantity = surgery.values
figure = px.pie(data,
             values=quantity,
             names=transactions,hole = 0.4,
             title="Type of Surgery of to be administered Patients")
figure.show()

In [None]:
# we now explore the cattegorical values to split our dataset into train and tetx percentage
# treanform data into cattegorical values
data["Tumour_Stage"] = data["Tumour_Stage"].map({"I": 1, "II": 2, "III": 3})
data["Histology"] = data["Histology"].map({"Infiltrating Ductal Carcinoma": 1,
                                           "Infiltrating Lobular Carcinoma": 2, "Mucinous Carcinoma": 3})
data["ER status"] = data["ER status"].map({"Positive": 1})
data["PR status"] = data["PR status"].map({"Positive": 1})
data["HER2 status"] = data["HER2 status"].map({"Positive": 1, "Negative": 2})
data["Gender"] = data["Gender"].map({"MALE": 0, "FEMALE": 1})
data["Surgery_type"] = data["Surgery_type"].map({"Other": 1, "Modified Radical Mastectomy": 2,
                                                 "Lumpectomy": 3, "Simple Mastectomy": 4})
print(data.head())

     Patient_ID   Age  Gender  Protein1  Protein2  Protein3  Protein4  \
0  TCGA-D8-A1XD  36.0     NaN  0.080353   0.42638   0.54715  0.273680   
1  TCGA-EW-A1OX  43.0     NaN -0.420320   0.57807   0.61447 -0.031505   
2  TCGA-A8-A079  69.0     NaN  0.213980   1.31140  -0.32747 -0.234260   
3  TCGA-D8-A1XR  56.0     NaN  0.345090  -0.21147  -0.19304  0.124270   
4  TCGA-BH-A0BF  56.0     NaN  0.221550   1.90680   0.52045 -0.311990   

   Tumour_Stage  Histology  ER status  PR status  HER2 status  Surgery_type  \
0           NaN        NaN        NaN        NaN          NaN           NaN   
1           NaN        NaN        NaN        NaN          NaN           NaN   
2           NaN        NaN        NaN        NaN          NaN           NaN   
3           NaN        NaN        NaN        NaN          NaN           NaN   
4           NaN        NaN        NaN        NaN          NaN           NaN   

  Date_of_Surgery Date_of_Last_Visit Patient_Status  
0       15-Jan-17          19-Ju

In [None]:

# Splitting data
x = np.array(data[['Age', 'Gender', 'Protein1', 'Protein2', 'Protein3','Protein4','Tumour_Stage', 'Histology', 'ER status', 'PR status','HER2 status', 'Surgery_type']])
y = np.array(data[['Patient_Status']])
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.10, random_state=42)

In [13]:
# RANDOM FOREST ALGORITHM
# libraries
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report


In [15]:
# loading
from sklearn.datasets import load_iris


In [16]:

iris = load_iris()
X = iris.data  # Features
y = iris.target  # Target labels


In [23]:
# Splitting data
x = np.array(data[['Age', 'Gender', 'Protein1', 'Protein2', 'Protein3','Protein4','Tumour_Stage', 'Histology', 'ER status', 'PR status','HER2 status', 'Surgery_type']])
y = np.array(data[['Patient_Status']])
Xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.10, random_state=42)

# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


In [20]:
# Initialize the Random Forest Classifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)



In [24]:
# Train the classifier on the training data
rf_classifier.fit(Xtrain, ytrain)

ValueError: could not convert string to float: 'MALE'