### Import & Getting Dataset

In [1]:
import lime
import lime.lime_tabular
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from autokeras import StructuredDataClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix, plot_confusion_matrix

In [5]:
# Location of the wisc_bc_data.csv file read by this notebook.
DATA_URL = 'https://raw.githubusercontent.com/dataspelunking/MLwR/master/Machine%20Learning%20with%20R%20(2nd%20Ed.)/Chapter%2003/wisc_bc_data.csv'

# Download the CSV into a DataFrame and preview the first rows.
df = pd.read_csv(DATA_URL)
df.head()

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,points_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,points_worst,symmetry_worst,dimension_worst
0,87139402,B,12.32,12.39,78.85,464.1,0.1028,0.06981,0.03987,0.037,...,13.5,15.64,86.97,549.1,0.1385,0.1266,0.1242,0.09391,0.2827,0.06771
1,8910251,B,10.6,18.95,69.28,346.4,0.09688,0.1147,0.06387,0.02642,...,11.88,22.94,78.28,424.8,0.1213,0.2515,0.1916,0.07926,0.294,0.07587
2,905520,B,11.04,16.83,70.92,373.2,0.1077,0.07804,0.03046,0.0248,...,12.41,26.44,79.93,471.4,0.1369,0.1482,0.1067,0.07431,0.2998,0.07881
3,868871,B,11.28,13.39,73.0,384.8,0.1164,0.1136,0.04635,0.04796,...,11.92,15.77,76.53,434.0,0.1367,0.1822,0.08669,0.08611,0.2102,0.06784
4,9012568,B,15.19,13.21,97.65,711.8,0.07963,0.06934,0.03393,0.02657,...,16.2,15.73,104.5,819.1,0.1126,0.1737,0.1362,0.08178,0.2487,0.06766


### EDA & Feature Engineering

#### Drop NA values

In [6]:
# Discard every row that contains at least one missing value.
df = df.dropna()

#### Drop Constant Columns

In [30]:
# A column whose values collapse to a single distinct element is constant;
# gather all such columns first, then remove them in one call.
constant_cols = [name for name in df.columns if len(set(df[name])) == 1]
df = df.drop(columns=constant_cols)

In [32]:
# Preview the first rows after the constant-column check.
df.head()

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,points_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,points_worst,symmetry_worst,dimension_worst
0,87139402,B,12.32,12.39,78.85,464.1,0.1028,0.06981,0.03987,0.037,...,13.5,15.64,86.97,549.1,0.1385,0.1266,0.1242,0.09391,0.2827,0.06771
1,8910251,B,10.6,18.95,69.28,346.4,0.09688,0.1147,0.06387,0.02642,...,11.88,22.94,78.28,424.8,0.1213,0.2515,0.1916,0.07926,0.294,0.07587
2,905520,B,11.04,16.83,70.92,373.2,0.1077,0.07804,0.03046,0.0248,...,12.41,26.44,79.93,471.4,0.1369,0.1482,0.1067,0.07431,0.2998,0.07881
3,868871,B,11.28,13.39,73.0,384.8,0.1164,0.1136,0.04635,0.04796,...,11.92,15.77,76.53,434.0,0.1367,0.1822,0.08669,0.08611,0.2102,0.06784
4,9012568,B,15.19,13.21,97.65,711.8,0.07963,0.06934,0.03393,0.02657,...,16.2,15.73,104.5,819.1,0.1126,0.1737,0.1362,0.08178,0.2487,0.06766


#### Label Encoding of Object Columns

In [49]:
# Replace each object-dtype column with integer codes from a fresh LabelEncoder.
object_cols = [name for name in df.columns if df[name].dtype == 'O']
for name in object_cols:
    encoder = preprocessing.LabelEncoder()
    df[name] = encoder.fit_transform(df[name])

In [50]:
# Preview the first rows after encoding the object columns.
df.head()

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,points_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,points_worst,symmetry_worst,dimension_worst
0,87139402,0,12.32,12.39,78.85,464.1,0.1028,0.06981,0.03987,0.037,...,13.5,15.64,86.97,549.1,0.1385,0.1266,0.1242,0.09391,0.2827,0.06771
1,8910251,0,10.6,18.95,69.28,346.4,0.09688,0.1147,0.06387,0.02642,...,11.88,22.94,78.28,424.8,0.1213,0.2515,0.1916,0.07926,0.294,0.07587
2,905520,0,11.04,16.83,70.92,373.2,0.1077,0.07804,0.03046,0.0248,...,12.41,26.44,79.93,471.4,0.1369,0.1482,0.1067,0.07431,0.2998,0.07881
3,868871,0,11.28,13.39,73.0,384.8,0.1164,0.1136,0.04635,0.04796,...,11.92,15.77,76.53,434.0,0.1367,0.1822,0.08669,0.08611,0.2102,0.06784
4,9012568,0,15.19,13.21,97.65,711.8,0.07963,0.06934,0.03393,0.02657,...,16.2,15.73,104.5,819.1,0.1126,0.1737,0.1362,0.08178,0.2487,0.06766


#### Drop Correlated Columns

Source: https://towardsdatascience.com/are-you-dropping-too-many-correlated-features-d1c96654abe6

In [35]:
def calcDrop(res):
    """Decide which variables to drop from a table of correlated pairs.

    Parameters
    ----------
    res : pandas.DataFrame
        One row per correlated pair. Only the columns ``'v1'`` and ``'v2'``
        (the two variables of the pair) and ``'drop'`` (the variable of the
        pair nominated for removal) are read.

    Returns
    -------
    list
        Names of the variables to drop. The list is assembled from sets, so
        the order of its elements is not meaningful.
    """
    pair_cols = ['v1', 'v2']

    # Every variable that appears in at least one pair.
    flagged = set(res['v1'].tolist()).union(res['v2'].tolist())

    # Every variable nominated for removal by at least one pair.
    candidates = set(res['drop'].tolist())

    # Variables that were never nominated are kept.
    keep = list(flagged - candidates)

    # The partner of a kept variable is dropped outright.
    touches_keep = res['v1'].isin(keep) | res['v2'].isin(keep)
    partners = res.loc[touches_keep, pair_cols]
    partner_names = set(partners['v1'].tolist() + partners['v2'].tolist())
    drop = list(partner_names.difference(keep))

    # Nominated variables that the previous step has not already dropped.
    undecided = list(candidates.difference(drop))

    # Pairs that involve at least one still-undecided variable.
    touches_undecided = res['v1'].isin(undecided) | res['v2'].isin(undecided)
    remaining = res.loc[touches_undecided, ['v1', 'v2', 'drop']]

    # Among those pairs, the ones with no member already dropped contribute
    # their own nominated variable to the result.
    unresolved = ~(remaining['v1'].isin(drop) | remaining['v2'].isin(drop))
    drop.extend(set(remaining.loc[unresolved, 'drop'].tolist()))

    return drop

In [37]:
def corrX(df, cut=0.9):
    """Return the names of columns to drop because of high pairwise correlation.

    The absolute correlation matrix of ``df`` is computed with
    ``DataFrame.corr``. For every pair of columns above the diagonal whose
    absolute correlation is strictly greater than ``cut``, the column with the
    larger mean absolute correlation (row mean of the full matrix, diagonal
    included) is nominated for removal; on a tie the second column of the pair
    is nominated. The resulting pair table is handed to ``calcDrop``, which
    makes the final selection.

    Parameters
    ----------
    df : pandas.DataFrame
        Data whose columns are compared.
    cut : float, default 0.9
        Absolute-correlation threshold above which a pair is recorded.

    Returns
    -------
    list
        Column names as returned by ``calcDrop``.
    """
    # Absolute correlations, their per-variable mean, and the strict upper triangle.
    corr_mtx = df.corr().abs()
    avg_corr = corr_mtx.mean(axis=1)
    up = corr_mtx.where(np.triu(np.ones(corr_mtx.shape), k=1).astype(bool))

    res_columns = ['v1', 'v2', 'v1.target', 'v2.target', 'corr', 'drop']

    # Collect the pair rows in a plain list and build the frame once:
    # DataFrame.append no longer exists in pandas >= 2.0 and growing a frame
    # row by row is quadratic.
    records = []
    n_vars = len(up)
    for row in range(n_vars - 1):
        for col in range(row + 1, n_vars):
            if corr_mtx.iloc[row, col] > cut:
                # .iloc keeps the lookup positional; avg_corr is indexed by
                # column label, so plain avg_corr[row] relies on a deprecated
                # integer fallback.
                row_avg = avg_corr.iloc[row]
                col_avg = avg_corr.iloc[col]
                if row_avg > col_avg:
                    drop = corr_mtx.columns[row]
                else:
                    drop = corr_mtx.columns[col]

                records.append((
                    corr_mtx.index[row],
                    up.columns[col],
                    row_avg,
                    col_avg,
                    up.iloc[row, col],
                    drop,
                ))

    # An empty record list still yields a frame with the expected columns.
    res = pd.DataFrame(records, columns=res_columns)

    return calcDrop(res)

In [52]:
# Remove the columns that corrX selects at an absolute-correlation cutoff of 0.8.
# NOTE(review): df still contains 'id' and the 'diagnosis' target at this point,
# so both take part in the correlation ranking — confirm this is intended.
CORR_CUTOFF = 0.8
cor_col = corrX(df, cut=CORR_CUTOFF)
df = df.drop(columns=cor_col)
df.head()

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,symmetry_mean,dimension_mean,texture_se,area_se,smoothness_se,concavity_se,points_se,symmetry_se,dimension_se,smoothness_worst,concavity_worst,symmetry_worst,dimension_worst
0,87139402,0,12.32,12.39,0.1959,0.05955,0.6656,17.43,0.008045,0.01683,0.01241,0.01924,0.002248,0.1385,0.1242,0.2827,0.06771
1,8910251,0,10.6,18.95,0.1922,0.06491,1.197,27.1,0.00747,0.03354,0.01365,0.03504,0.003318,0.1213,0.1916,0.294,0.07587
2,905520,0,11.04,16.83,0.1714,0.0634,1.387,13.54,0.005158,0.01056,0.007483,0.01718,0.002198,0.1369,0.1067,0.2998,0.07881
3,868871,0,11.28,13.39,0.1771,0.06072,1.343,26.33,0.01127,0.02187,0.01965,0.0158,0.003442,0.1367,0.08669,0.2102,0.06784
4,9012568,0,15.19,13.21,0.1721,0.05544,0.4125,17.72,0.005012,0.01551,0.009155,0.01647,0.001767,0.1126,0.1362,0.2487,0.06766


#### Drop Unnecessary Columns (Sl No., ID, Date, etc.)

In [55]:
# Remove the 'id' column, then preview the result.
df = df.drop(columns=['id'])
df.head()

Unnamed: 0,diagnosis,radius_mean,texture_mean,symmetry_mean,dimension_mean,texture_se,area_se,smoothness_se,concavity_se,points_se,symmetry_se,dimension_se,smoothness_worst,concavity_worst,symmetry_worst,dimension_worst
0,0,12.32,12.39,0.1959,0.05955,0.6656,17.43,0.008045,0.01683,0.01241,0.01924,0.002248,0.1385,0.1242,0.2827,0.06771
1,0,10.6,18.95,0.1922,0.06491,1.197,27.1,0.00747,0.03354,0.01365,0.03504,0.003318,0.1213,0.1916,0.294,0.07587
2,0,11.04,16.83,0.1714,0.0634,1.387,13.54,0.005158,0.01056,0.007483,0.01718,0.002198,0.1369,0.1067,0.2998,0.07881
3,0,11.28,13.39,0.1771,0.06072,1.343,26.33,0.01127,0.02187,0.01965,0.0158,0.003442,0.1367,0.08669,0.2102,0.06784
4,0,15.19,13.21,0.1721,0.05544,0.4125,17.72,0.005012,0.01551,0.009155,0.01647,0.001767,0.1126,0.1362,0.2487,0.06766


In [57]:
# Separate the label column (Y) from the remaining feature columns (X).
target = 'diagnosis'
X = df.drop(columns=[target])
Y = df[target]
X.head()

Unnamed: 0,radius_mean,texture_mean,symmetry_mean,dimension_mean,texture_se,area_se,smoothness_se,concavity_se,points_se,symmetry_se,dimension_se,smoothness_worst,concavity_worst,symmetry_worst,dimension_worst
0,12.32,12.39,0.1959,0.05955,0.6656,17.43,0.008045,0.01683,0.01241,0.01924,0.002248,0.1385,0.1242,0.2827,0.06771
1,10.6,18.95,0.1922,0.06491,1.197,27.1,0.00747,0.03354,0.01365,0.03504,0.003318,0.1213,0.1916,0.294,0.07587
2,11.04,16.83,0.1714,0.0634,1.387,13.54,0.005158,0.01056,0.007483,0.01718,0.002198,0.1369,0.1067,0.2998,0.07881
3,11.28,13.39,0.1771,0.06072,1.343,26.33,0.01127,0.02187,0.01965,0.0158,0.003442,0.1367,0.08669,0.2102,0.06784
4,15.19,13.21,0.1721,0.05544,0.4125,17.72,0.005012,0.01551,0.009155,0.01647,0.001767,0.1126,0.1362,0.2487,0.06766
