In [2]:
import pandas as pd
df = pd.read_csv("data/wdbc.data")
df.head()

Unnamed: 0,ID,Diagnosis,radius1,texture1,perimeter1,area1,smoothness1,compactness1,concavity1,concave_points1,...,radius3,texture3,perimeter3,area3,smoothness3,compactness3,concavity3,concave_points3,symmetry3,fractal_dimension3
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [22]:
#holistic summary statistics of the dataset
summary_stats = df.describe().drop(index = 'count', columns='ID')
summary_stats.to_csv('results/summary_stats.csv')


#Summary Statistics: Key comparison between Benign and Malignant

rmeanM1 = df[df['Diagnosis']=='M']['radius1'].mean()
rmeanM2 = df[df['Diagnosis']=='M']['radius2'].mean()
rmeanM3 = df[df['Diagnosis']=='M']['radius3'].mean()
rmeanB1 = df[df['Diagnosis']=='B']['radius1'].mean()
rmeanB2 = df[df['Diagnosis']=='B']['radius2'].mean()
rmeanB3 = df[df['Diagnosis']=='B']['radius3'].mean()

Bcell_mean_radius = (rmeanB1+rmeanB2+rmeanB3)/3
Mcell_mean_radius = (rmeanM1+rmeanM2+rmeanM3)/3
print('Average Radius of a Benign Tumor Cell: ',Bcell_mean_radius)
print('Average Radius of a Malignant Tumor Cell: ',Mcell_mean_radius, '\n')

cmeanM1 = df[df['Diagnosis']=='M']['compactness1'].mean()
cmeanM2 = df[df['Diagnosis']=='M']['compactness2'].mean()
cmeanM3 = df[df['Diagnosis']=='M']['compactness3'].mean()
cmeanB1 = df[df['Diagnosis']=='B']['compactness1'].mean()
cmeanB2 = df[df['Diagnosis']=='B']['compactness2'].mean()
cmeanB3 = df[df['Diagnosis']=='B']['compactness3'].mean()

Bcell_mean_compactness = (cmeanB1+cmeanB2+cmeanB3)/3
Mcell_mean_compactness = (cmeanM1+cmeanM2+cmeanM3)/3
print('Average Compactness of a Benign Tumor Cell: ',Bcell_mean_compactness)
print('Average Compactness of a Malignant Tumor Cell: ',Mcell_mean_compactness,'\n')

cvmeanM1 = df[df['Diagnosis']=='M']['concavity1'].mean()
cvmeanM2 = df[df['Diagnosis']=='M']['concavity2'].mean()
cvmeanM3 = df[df['Diagnosis']=='M']['concavity3'].mean()
cvmeanB1 = df[df['Diagnosis']=='B']['concavity1'].mean()
cvmeanB2 = df[df['Diagnosis']=='B']['concavity2'].mean()
cvmeanB3 = df[df['Diagnosis']=='B']['concavity3'].mean()

Bcell_mean_concavity = (cvmeanB1+cvmeanB2+cvmeanB3)/3
Mcell_mean_concavity = (cvmeanM1+cvmeanM2+cvmeanM3)/3
print('Average Concavity of a Benign Tumor Cell: ',Bcell_mean_concavity)
print('Average Concavity of a Malignant Tumor Cell: ',Mcell_mean_concavity)

Average Radius of a Benign Tumor Cell:  8.603469094304389
Average Radius of a Malignant Tumor Cell:  13.068908018867925 

Average Compactness of a Benign Tumor Cell:  0.09473180578898227
Average Compactness of a Malignant Tumor Cell:  0.18409768396226414 

Average Concavity of a Benign Tumor Cell:  0.07943069309056956
Average Concavity of a Malignant Tumor Cell:  0.2177347641509434


In [4]:
#Classification report
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

X = df.drop(['ID','Diagnosis'], axis=1)
y = df['Diagnosis'].map({'M':1, 'B':0})

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

classifier = RandomForestClassifier(n_estimators=100, random_state=42)
classifier.fit(X_train, y_train)

y_pred = classifier.predict(X_test)
classification_report = classification_report(y_test,y_pred)
print(classification_report)

              precision    recall  f1-score   support

           0       0.94      0.94      0.94        72
           1       0.90      0.90      0.90        42

    accuracy                           0.93       114
   macro avg       0.92      0.92      0.92       114
weighted avg       0.93      0.93      0.93       114



In [5]:
#Key Classification Features for Breast Cancer Diagnosis
f_importance = pd.Series(classifier.feature_importances_,index = X.columns)

In [43]:
#plots

import matplotlib.pyplot as plt
f_importance.nlargest(10).plot(kind='barh')
plt.title('Main Predictor Features of Breast Cancer')
plt.xlabel('Relative Importance')
plt.tight_layout()
plt.savefig('results/feature_importance.png')
plt.close()

statistic_compare_plt = pd.DataFrame({
    "Benign":[8.603469094304389,0.09473180578898227,0.07943069309056956],
    "Malignant":[13.068908018867925, 0.18409768396226414, 0.2177347641509434]},
    index=["Radius", "Compactness", "Concavity"])

statistic_compare_plt.plot(kind="bar",figsize=(10, 5))
plt.title("Comparison between Tumor Type and Corresponding Features")
plt.xlabel("Tumor Cell Features")
plt.ylabel("Relative Units for each feature")
plt.tight_layout()
plt.savefig('results/BM_comparison.png')
plt.close()
