In [13]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [14]:
# Read the training CSV into a frame for a first look at its structure.
data = pd.read_csv('breastcancer_train.csv')

In [15]:
# Column names, non-null counts and dtypes of the loaded frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 509 entries, 0 to 508
Data columns (total 32 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   id              509 non-null    int64  
 1   diagnosis       509 non-null    object 
 2   radius_m        509 non-null    float64
 3   texture_m       509 non-null    float64
 4   perimeter_m     509 non-null    float64
 5   area_m          509 non-null    float64
 6   smoothness_m    509 non-null    float64
 7   compactness_m   509 non-null    float64
 8   concavity_m     509 non-null    float64
 9   concavepts_m    509 non-null    float64
 10  symmetry_m      509 non-null    float64
 11  fractaldim_m    509 non-null    float64
 12  radius_se       509 non-null    float64
 13  texture_se      509 non-null    float64
 14  perimeter_se    509 non-null    float64
 15  area_se         509 non-null    float64
 16  smoothness_se   509 non-null    float64
 17  compactness_se  509 non-null    flo

In [16]:
# Summary statistics (count, mean, std, quartiles, min/max) per column.
data.describe()

Unnamed: 0,id,radius_m,texture_m,perimeter_m,area_m,smoothness_m,compactness_m,concavity_m,concavepts_m,symmetry_m,...,radius_w,texture_w,perimeter_w,area_w,smoothness_w,compactness_w,concavity_w,concavepts_w,symmetry_w,fractaldim_w
count,509.0,509.0,509.0,509.0,509.0,509.0,509.0,509.0,509.0,509.0,...,509.0,509.0,509.0,509.0,509.0,509.0,509.0,509.0,509.0,509.0
mean,32427960.0,14.081481,19.371218,91.689823,650.564833,0.096454,0.105161,0.089034,0.048862,0.181682,...,16.20656,25.714715,106.895403,874.445776,0.132412,0.255503,0.272176,0.114368,0.29022,0.084254
std,131709200.0,3.531773,4.429189,24.37682,347.811936,0.014078,0.05345,0.080323,0.039058,0.02812,...,4.868071,6.274501,33.899624,569.430461,0.022981,0.155693,0.208138,0.065803,0.061672,0.017591
min,8670.0,6.981,9.71,43.79,143.5,0.05263,0.02344,0.0,0.0,0.106,...,7.93,12.02,50.41,185.2,0.07117,0.02729,0.0,0.0,0.1565,0.05504
25%,869218.0,11.6,16.17,74.33,409.7,0.08641,0.06602,0.02987,0.02031,0.162,...,12.84,21.08,83.51,507.2,0.1166,0.1472,0.1164,0.06493,0.2504,0.07191
50%,905686.0,13.3,18.91,86.18,546.3,0.09594,0.09263,0.06126,0.03275,0.1792,...,14.91,25.41,97.19,683.4,0.1312,0.2167,0.2249,0.09804,0.2827,0.08009
75%,8813129.0,15.78,21.88,104.7,788.5,0.1054,0.1305,0.1307,0.07364,0.1964,...,18.81,30.15,125.9,1088.0,0.1465,0.3416,0.3853,0.1607,0.3179,0.09221
max,911320500.0,27.42,39.28,186.9,2501.0,0.1634,0.3454,0.4268,0.2012,0.304,...,36.04,49.54,251.2,4254.0,0.2226,0.9379,1.252,0.291,0.6638,0.173


In [37]:
from sklearn.preprocessing import LabelEncoder

def loadpreprocess(file='breastcancer_train.csv'):
    """Read a CSV file and split it into features and an encoded target.

    Parameters
    ----------
    file : str
        Path of the CSV file to read. It must contain a 'diagnosis' column.

    Returns
    -------
    (DataFrame, ndarray)
        The frame with 'diagnosis' removed (every other column is kept),
        and the 'diagnosis' values passed through a LabelEncoder that is
        fitted on this file only.
    """
    frame = pd.read_csv(file)

    # pop() removes the target column from the frame and hands it back
    labels = frame.pop('diagnosis')

    # a fresh encoder is fitted on every call
    encoded = LabelEncoder().fit_transform(labels)

    return frame, encoded

### Baseline Model

In [77]:
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier

# Baseline: every column returned by loadpreprocess is used as a feature.
Xin, yin = loadpreprocess(file='breastcancer_train.csv')

score = []
# split into folds, preserving diagnosis proportions in splits
skf = StratifiedKFold(n_splits=10)
for trainidx, validx in skf.split(Xin, yin):
    # split() yields positional row numbers, so the frame is sliced with
    # .iloc; .loc is label-based and only matched by accident while the
    # index is the default 0..n-1 range.
    X = Xin.iloc[trainidx, :].copy()
    y = yin[trainidx].copy()

    xval = Xin.iloc[validx, :].copy()
    yval = yin[validx].copy()

    # fit a fresh classifier on this fold and record its validation accuracy
    rfc = RandomForestClassifier(random_state=123)
    rfc.fit(X, y)
    score.append(rfc.score(xval, yval))

# after the loop, rfc is the classifier fitted on the last fold
print(f'Accuracy of each fold:\n {score}')
print(f'Baseline mean accuracy:\n {np.mean(score)}')
print(f'Baseline accuracy variance:\n {np.var(score)}')


Accuracy of each fold:
 [0.9803921568627451, 0.8627450980392157, 0.9215686274509803, 0.9607843137254902, 0.9803921568627451, 0.9607843137254902, 0.9607843137254902, 0.9803921568627451, 0.9411764705882353, 1.0]
Baseline mean accuracy:
 0.9549019607843137
Baseline accuracy variance:
 0.00138792772010765


In [78]:
# feature importance

# rfc is read from the notebook namespace, i.e. whichever classifier the
# previously executed cell left behind.
feat = rfc.feature_names_in_
impor = rfc.feature_importances_
featimpor = pd.Series(impor, index=feat).sort_values(ascending=False)

# Walk the (feature name, importance) pairs directly. Looking a value up with
# featimpor[<int>] on a name-indexed Series depends on the deprecated
# positional fallback of Series.__getitem__.
for name, value in featimpor.items():
    print(f'{name}: {value}')

perimeter_w: 0.145831012148896
radius_w: 0.14016035176979266
concavepts_w: 0.10473167216933098
concavepts_m: 0.10388063052706795
area_w: 0.08075849881765362
perimeter_m: 0.05893855553009969
concavity_w: 0.044334582814905606
radius_m: 0.04118812896065432
area_m: 0.040237804041240874
area_se: 0.03668641163160762
concavity_m: 0.031566641097172186
radius_se: 0.017169205782818362
texture_w: 0.017038622767037576
texture_m: 0.01661031786335454
compactness_w: 0.015628069730014266
compactness_m: 0.01389550639141883
smoothness_w: 0.011696356039187757
symmetry_w: 0.011511074310718471
perimeter_se: 0.00885172846298659
fractaldim_w: 0.007490266929382887
smoothness_m: 0.007003978467488062
id: 0.006805973215668072
concavity_se: 0.006463493859837418
symmetry_se: 0.005834062088570754
fractaldim_se: 0.004383692985012545
compactness_se: 0.004085481412301531
symmetry_m: 0.003847689933669552
fractaldim_m: 0.0037637514818282397
concavepts_se: 0.003385516669807094
texture_se: 0.003259055689094634
smoothness_

In [90]:
# drop unimportant features

def dropfeat(df, importances=None, reference='id'):
    """Return a copy of ``df`` without the low-importance columns.

    A column counts as unimportant when its importance is less than or
    equal to the importance of the ``reference`` feature; the reference
    feature itself is therefore dropped as well.

    Parameters
    ----------
    df : DataFrame
        Feature frame. It must contain every column that ends up selected
        for removal, otherwise ``DataFrame.drop`` raises ``KeyError``.
    importances : Series, optional
        Importance values indexed by feature name. When omitted, the
        notebook-level ``featimpor`` Series is used, which keeps existing
        ``dropfeat(df)`` calls working unchanged.
    reference : str, optional
        Name of the feature whose importance is the cut-off (default 'id').

    Returns
    -------
    DataFrame
        A new frame; ``df`` itself is not modified.
    """
    if importances is None:
        # fall back to the Series built in the feature-importance cell
        importances = featimpor

    cutoff = importances[reference]
    unimportant = importances[importances <= cutoff].index
    return df.drop(columns=unimportant)

# show which feature names fall at or below the importance of 'id'
featimpor.loc[featimpor.le(featimpor['id'])].index

Index(['id', 'concavity_se', 'symmetry_se', 'fractaldim_se', 'compactness_se',
       'symmetry_m', 'fractaldim_m', 'concavepts_se', 'texture_se',
       'smoothness_se'],
      dtype='object')

In [88]:
# create new features

def addfeat(df):
    """Build the engineered feature columns from an existing feature frame.

    Reads 'perimeter_w', 'perimeter_m' and 'concavepts_m' from ``df`` and
    returns a new frame holding two derived columns:

    * 'perim_wxconpts_m' -- perimeter_w multiplied by concavepts_m
    * 'perim_w-m'        -- perimeter_w minus perimeter_m

    ``df`` is left untouched; the caller joins the result back on.
    """
    perimeter_w = df['perimeter_w']

    derived = {
        'perim_wxconpts_m': perimeter_w * df['concavepts_m'],
        'perim_w-m': perimeter_w - df['perimeter_m'],
    }

    # candidate features that are currently disabled:
    #   'radius_wxconpts_w' = df['radius_w']*df['concavepts_w']
    #   'radius_w-m'        = df['radius_w']-df['radius_m']
    #   'concavepts_w-m'    = df['concavepts_w']-df['concavepts_m']
    #   'area_w-m'          = df['area_w']-df['area_m']
    #   'perimxradius_w'    = df['radius_w']*df['perimeter_w']
    #   'textxradius_w'     = df['radius_w']*df['texture_w']

    # assign() adds the columns one by one, in dict order, to an empty frame
    return pd.DataFrame().assign(**derived)


In [89]:
# test new features

Xin, yin = loadpreprocess(file='breastcancer_train.csv')

# remove features
Xin = dropfeat(Xin)
# add new features
Xin = Xin.join(addfeat(Xin))

score = []
# split into folds, preserving diagnosis proportions in splits
skf = StratifiedKFold(n_splits=10)
for trainidx, validx in skf.split(Xin, yin):
    # split() yields positional row numbers, so the frame is sliced with
    # .iloc; .loc is label-based and only matched by accident while the
    # index is the default 0..n-1 range.
    X = Xin.iloc[trainidx, :].copy()
    y = yin[trainidx].copy()

    xval = Xin.iloc[validx, :].copy()
    yval = yin[validx].copy()

    # fit classifier and score
    rfc = RandomForestClassifier(random_state=123)
    rfc.fit(X, y)
    score.append(rfc.score(xval, yval))

print(f'Accuracy of each fold:\n {score}')
print(f'Mean accuracy:\n {np.mean(score)}')
print(f'Accuracy variance:\n {np.var(score)}')


Accuracy of each fold:
 [0.9803921568627451, 0.9019607843137255, 0.9411764705882353, 0.9803921568627451, 1.0, 0.9803921568627451, 0.9803921568627451, 0.9803921568627451, 0.9803921568627451, 1.0]
Mean accuracy:
 0.972549019607843
Accuracy variance:
 0.0007843137254901955


In [81]:
# optimize model

from sklearn.model_selection import GridSearchCV

# rebuild the engineered feature matrix from the training file
Xin, yin = loadpreprocess(file='breastcancer_train.csv')
Xin = dropfeat(Xin)             # remove features
Xin = Xin.join(addfeat(Xin))    # add new features

# candidate values for each hyper-parameter; every combination is evaluated
paramgrid = {
    'n_estimators': [75, 100, 150],
    'max_depth': [4, 6, 8, 10],
    'criterion': ['gini', 'entropy'],
    'max_features': ['sqrt', 'log2'],
}

rfc = RandomForestClassifier(random_state=123)
skf = StratifiedKFold(n_splits=10)

# fit() returns the fitted search object itself
opt = GridSearchCV(estimator=rfc, param_grid=paramgrid, cv=skf).fit(Xin, yin)

opt.best_params_


{'criterion': 'gini',
 'max_depth': 8,
 'max_features': 'sqrt',
 'n_estimators': 100}

### Final Model

In [74]:
Xin, yin = loadpreprocess(file='breastcancer_train.csv')

# remove features
Xin = dropfeat(Xin)
# add new features
Xin = Xin.join(addfeat(Xin))

score = []
# split into folds, preserving diagnosis proportions in splits
skf = StratifiedKFold(n_splits=10)
for trainidx, validx in skf.split(Xin, yin):
    # split() yields positional row numbers, so the frame is sliced with
    # .iloc; .loc is label-based and only matched by accident while the
    # index is the default 0..n-1 range.
    X = Xin.iloc[trainidx, :].copy()
    y = yin[trainidx].copy()

    xval = Xin.iloc[validx, :].copy()
    yval = yin[validx].copy()

    # fit classifier (hyper-parameters fixed to the values used for the
    # final model) and score
    rfc = RandomForestClassifier(random_state=123,
                                 criterion='gini',
                                 max_depth=8,
                                 max_features='sqrt',
                                 n_estimators=100)
    rfc.fit(X, y)
    score.append(rfc.score(xval, yval))

print(f'Accuracy of each fold:\n {score}')
print(f'Final model mean accuracy:\n {np.mean(score)}')
print(f'Final model accuracy variance:\n {np.var(score)}')


Accuracy of each fold:
 [0.9803921568627451, 0.9019607843137255, 0.9411764705882353, 0.9803921568627451, 1.0, 0.9803921568627451, 0.9803921568627451, 0.9803921568627451, 0.9803921568627451, 1.0]
Final model mean accuracy:
 0.972549019607843
Final model accuracy variance:
 0.0007843137254901955


### Test set performance

In [80]:
# fit on training set
Xin, yin = loadpreprocess(file='breastcancer_train.csv')
Xin = dropfeat(Xin)             # remove features
Xin = Xin.join(addfeat(Xin))    # add new features

# fit classifier on the full training data; fit() returns the estimator
rfc = RandomForestClassifier(
    n_estimators=100,
    criterion='gini',
    max_depth=8,
    max_features='sqrt',
    random_state=123,
).fit(Xin, yin)

# transform and predict on test set (same two feature steps as above)
Xtest, ytest = loadpreprocess(file='breastcancer_test.csv')
Xtest = dropfeat(Xtest)               # remove features
Xtest = Xtest.join(addfeat(Xtest))    # add new features

# score on test set
score = rfc.score(Xtest, ytest)

print(f'Final model accuracy on test set:\n {score}')

Final model accuracy on test set:
 0.9833333333333333
