In [1]:
# simple function to print version string for a library
def v(lib):
    try:
        version = lib.__version__
    except:
        version = "---"
    name = lib.__name__
    print("  {:20} ({})".format(name, version))
    
#import all the libraries we will need... 
print("importing libraries...")
import pandas as pd
v(pd)
import os
v(os)
import numpy as np
v(np)
import sweetviz as sv
v(sv)
import imblearn
v(imblearn)
print('done')


importing libraries...
  pandas               (1.0.3)
  os                   (---)
  numpy                (1.17.2)
  sweetviz             (1.0beta6)
  imblearn             (0.7.0)
done


In [2]:
def setup_environment_variables():
    # set csw and home_dir variables
    cwd = os.getcwd()
    print('cwd: {}'.format(cwd))
    home_dir = os.path.expanduser("~")
    print('home_dir: {}'.format(home_dir))

    repo_name = 'breast-cancer'
    repo_dir = os.path.join(home_dir, repo_name)
    if os.path.isdir(repo_dir):
        print('repo_dir: {}'.format(repo_dir))
    else: 
        print('oops! repository is not under home_dir ("{}"") or is not named "{}"'.format(home_dir, repo_name))
        
    data_dir = os.path.join(repo_dir, 'data')
    if os.path.isdir(data_dir):
        print('data_dir: {}'.format(data_dir))
    else: 
        print('oops! directory named "data" not found under "{}"'.format(data_dir, repo_name))
        
    return cwd, home_dir, repo_dir, data_dir


In [3]:
def create_initial_cancer_dataset():
    # open the  cancer data file
    cancer_df = pd.read_csv(os.path.join(home_dir, data_dir, "cancer_data.csv"))
    
    # convert 'diagnosis' column to a categorical
    cancer_df['diagnosis'] = pd.Categorical(cancer_df['diagnosis'], cancer_categories, ordered=True).codes
    cancer_df = cancer_df.drop(columns=['id'])
    
    return cancer_df

In [4]:
# initalize global environment variables ... 
cwd, home_dir, repo_dir, data_dir = setup_environment_variables()

# setup 'cancer_categories' to be used to convert 'B' and 'M' into categorical (numeric) values
cancer_categories = ['B', 'M']
# remeber the indices for B and M (for use in other functions, etc)
B = cancer_categories.index('B')
M = cancer_categories.index('M')

#initialize cancer_df from the raw data file
cancer_df = create_initial_cancer_dataset()

# print(M, B, cancer_categories)

cwd: /Users/smcclellan/breast-cancer/notebooks
home_dir: /Users/smcclellan
repo_dir: /Users/smcclellan/breast-cancer
data_dir: /Users/smcclellan/breast-cancer/data


In [5]:
cancer_df.dtypes

diagnosis                     int8
radius_mean                float64
texture_mean               float64
perimeter_mean             float64
area_mean                  float64
smoothness_mean            float64
compactness_mean           float64
concavity_mean             float64
concave points_mean        float64
symmetry_mean              float64
fractal_dimension_mean     float64
radius_se                  float64
texture_se                 float64
perimeter_se               float64
area_se                    float64
smoothness_se              float64
compactness_se             float64
concavity_se               float64
concave points_se          float64
symmetry_se                float64
fractal_dimension_se       float64
radius_worst               float64
texture_worst              float64
perimeter_worst            float64
area_worst                 float64
smoothness_worst           float64
compactness_worst          float64
concavity_worst            float64
concave points_worst

In [6]:
cancer_df.describe()

Unnamed: 0,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
count,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,...,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0
mean,0.372583,14.127292,19.289649,91.969033,654.889104,0.09636,0.104341,0.088799,0.048919,0.181162,...,16.26919,25.677223,107.261213,880.583128,0.132369,0.254265,0.272188,0.114606,0.290076,0.083946
std,0.483918,3.524049,4.301036,24.298981,351.914129,0.014064,0.052813,0.07972,0.038803,0.027414,...,4.833242,6.146258,33.602542,569.356993,0.022832,0.157336,0.208624,0.065732,0.061867,0.018061
min,0.0,6.981,9.71,43.79,143.5,0.05263,0.01938,0.0,0.0,0.106,...,7.93,12.02,50.41,185.2,0.07117,0.02729,0.0,0.0,0.1565,0.05504
25%,0.0,11.7,16.17,75.17,420.3,0.08637,0.06492,0.02956,0.02031,0.1619,...,13.01,21.08,84.11,515.3,0.1166,0.1472,0.1145,0.06493,0.2504,0.07146
50%,0.0,13.37,18.84,86.24,551.1,0.09587,0.09263,0.06154,0.0335,0.1792,...,14.97,25.41,97.66,686.5,0.1313,0.2119,0.2267,0.09993,0.2822,0.08004
75%,1.0,15.78,21.8,104.1,782.7,0.1053,0.1304,0.1307,0.074,0.1957,...,18.79,29.72,125.4,1084.0,0.146,0.3391,0.3829,0.1614,0.3179,0.09208
max,1.0,28.11,39.28,188.5,2501.0,0.1634,0.3454,0.4268,0.2012,0.304,...,36.04,49.54,251.2,4254.0,0.2226,1.058,1.252,0.291,0.6638,0.2075


In [7]:
cancer_df.head()

Unnamed: 0,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,1,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,1,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,1,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,1,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [8]:
def create_imbalanced_dataset(df, over_balance_on, N=100, verbose=False):
    
    # replicate the starting datafram (df) N times into df2
    if verbose:
        print('replicating base dataframe {} times'.format(N))
    df2 = pd.concat([df for ii in range(N)])

    if verbose:
        print('original dataframe: {} rows, new/temp dataframe: {} rows\n'.format(len(df), len(df2)))
          
    # assuming (for now) that we are blancing relative to a 'diagnosis' (that is binary classification: 0 or 1)
    # validate the the 'over_balance_on' parm .. 
    if over_balance_on == 0:
        minority = 1
    elif over_balance_on == 1: 
        minority = 0
    else:
        print("ERROR: over_balance_on has to be 0 or 1 (binary classificaion only)!")
        return
    # print("valid 'over_balance_on' parameter specifed... ")
          
    majority = over_balance_on
    # minority_st = cancer_categories[minority]
    over_balance_on_st = cancer_categories[over_balance_on]

    print('creating a new dataframe imbalanced on ''diagnosis=="{}"'' ({})'.format(over_balance_on_st, over_balance_on))
          
    # create a new dataframe 'majority_df' by selecting rowes where 'diagnosis==majority' from the 
    # temporary dataframe (which was replicated Nx from the base_df)
    majority_df  = df2.query('diagnosis=={}'.format(majority)) 
    majority_rows = len(majority_df)   
    # print('... {} rows - contianing ''diagnosis=="{}"'' only'.format(majority_rows, over_balance_on_st))
    # print('... added to {} total rose - containg a mix of ''diagnosis''\n'.format(len(df)))
    
    imbalanced_df = df.append(majority_df)
    return imbalanced_df

In [9]:
def print_balance_stats(a):
    b_rows = len(a.query('diagnosis=={}'.format(B)))
    m_rows = len(a.query('diagnosis=={}'.format(M)))
    t_rows = len(a)
    if m_rows > b_rows:
        print("dataframe is over balanced toward '{}' ({}%)".format(cancer_categories[M], m_rows/t_rows))
    elif b_rows > m_rows:
        print("dataframe is over balanced toward '{}' ({}%)".format(cancer_categories[M], b_rows/t_rows))
    else:
        print("the datafram is balanced!")
    
    print("B: {}, B: {}, total: {}  ({})".format(b_rows, m_rows, t_rows, (m_rows+b_rows)==t_rows))
    return b_rows, m_rows, t_rows

In [14]:
def balance_dataset(df, verbose=False):
    
    # pass 'balance_dataset' a dataframe that should ideally be imbalanced and 'balance_dataset'
    # will apply Synthetic Minority Over-sampling Technique (aka: SMOTE) to reblance the data 
    # 
    # the reblancing technique involves breaking the dataframe into
    #     y    a 'target_vector' which is essentially the 'diagnosis' colum from 'df'
    #     X    the features matrix which is essentally all the remaining columns in the matrix

    if verbose:
        print("initial balance statistics (before reblancing)")
        print_balance_state(df)
        
    # separate the feature matrix (X) from the 'target vector' (y)
    # WARNING: code below assumes that the 'diagnosis', it the first column () in the datfram
    # should re-write it to work regardless of column order... 
    y = df.iloc[:,0].values
    X = df.iloc[:,1:].values
    
    # apply Synthetic Minority Over-sampling Technique (aka: SMOTE) to reblance the data 
    # (creating a 50/50 ratio of malignant and benign cases)
    
    # note: SMOTE will return "resampled" versions of X and y that have additional entries created
    # to achieve balance
    oversample = imblearn.over_sampling.SMOTE()
    X_resamp, y_resamp = oversample.fit_resample(X, y)
     
    # reassemble the dataframe into 'rebalanced_df' (which will be returned from the function)
    
    # build a list of column names 
    column_names = list(cancer_df.columns)
    if verbose:
        print(column_names)

    # reassemble the dataframe from X_reasmp and y_resamp
    rebalanced_df = pd.DataFrame(np.insert(X_resamp, 0, y_resamp, axis=1), columns=column_names)

    if verbose:
        print("There sould be an equal number of 'benign' and 'malignant' cases after rebalancing... ")
        print("benign:", len(rebalanced_df.query("diagnosis=={}".format(B))))
        print("malignant:", len(rebalanced_df.query("diagnosis=={}".format(M))))
    
    return rebalanced_df

In [15]:
malignant_imbalanced = create_imbalanced_dataset(cancer_df, M)
print('malignant_imbalanced: should have M >> B')
_, _, _ = print_balance_stats(malignant_imbalanced)

print('\nrebalanced_df: should have M == B')
rebalanced_df = balance_dataset(malignant_imbalanced)
_, _, _ = print_balance_stats(rebalanced_df)

benign_imbalanced = create_imbalanced_dataset(cancer_df, M)
print('\nmalignant_imbalanced: should have B >> M')
_, _, _ = print_balance_stats(benign_imbalanced)

print('\nrebalanced_df: should have B == M')
rebalanced_df = balance_dataset(malignant_imbalanced)
_, _, _ = print_balance_stats(rebalanced_df)

creating a new dataframe imbalanced on diagnosis=="M" (1)
malignant_imbalanced: should have M >> B
dataframe is over balanced toward 'M' (0.9836005328678395%)
B: 357, B: 21412, total: 21769  (True)

rebalanced_df: should have M == B
the datafram is balanced!
B: 21412, B: 21412, total: 42824  (True)
creating a new dataframe imbalanced on diagnosis=="M" (1)

malignant_imbalanced: should have B >> M
dataframe is over balanced toward 'M' (0.9836005328678395%)
B: 357, B: 21412, total: 21769  (True)

rebalanced_df: should have B == M
the datafram is balanced!
B: 21412, B: 21412, total: 42824  (True)


In [None]:
# replicate cancer data N times into cancer_df2
N = 100 
print('replicating cancer data ("cancer_df") {} times into {}'.format(N, 'cancer_df2'))
cancer_df2 = pd.concat([cancer_df for ii in range(N)])

print('cancer_df: {} rows'.format(len(cancer_df)))
print('cancer_df2: {} rows'.format(len(cancer_df2)))

In [None]:
# create two new dataframes 'cancer_benign' and 'cancer_malignant' split on 'diagnosis'
# Old method (produces a warning): cancer_benign_df = cancer_df2[cancer_df['diagnosis']==B].copy()
cancer_benign_df = cancer_df2.query('diagnosis=={}'.format(B)) 
b_rows = len(cancer_benign_df)
print('cancer_benign_df: {} rows'.format(b_rows))

# Old methos (produces a warning): cancer_malignant_df = cancer_df2[cancer_df['diagnosis']==M].copy()
cancer_malignant_df = cancer_df2.query('diagnosis=={}'.format(M)) 
m_rows = len(cancer_malignant_df)
print('cancer_malignant_df: {} rows'.format(m_rows))

t_rows = len(cancer_df2)
print('total rows after split is correct:', t_rows == (b_rows+m_rows))

print()
print('benign %: {}'.format((b_rows/t_rows)*100))
print('malignant %: {}'.format((m_rows/t_rows)*100))
print('ratio (malignant/benign): {}'.format(m_rows/b_rows))

In [None]:
# create a (severely) imbalanced datasetwhere there are many more (xN) benign cases than malignant cases 
# ... this is done by appending (xN) benign cases (cancer_benign) on the original data (cancer_df)
cancer_imbalanced_benign_df = cancer_df.append(cancer_benign_df)

In [None]:
cancer_imbalanced_benign_df.shape

In [None]:
print("benign:", len(cancer_imbalanced_benign_df.query("diagnosis=={}".format(B))))
print("malignant:", len(cancer_imbalanced_benign_df.query("diagnosis=={}".format(M))))

In [None]:
# separate the feature matrix (X) from the 'target vector' (y)
# WARNING: code below assumes that the 'diagnosis', it the first column () in the datfram
# should re-write it to work regardless of column order... 
y = cancer_imbalance_benign_df.iloc[:,0].values
X = cancer_imbalance_benign_df.iloc[:,1:].values

In [None]:
# apply Synthetic Minority Over-sampling Technique (aka: SMOTE) to reblance the data (creating a 50/50 ratio of 
# malignant and benign cases

oversample = imblearn.over_sampling.SMOTE()
X_resamp, y_resamp = oversample.fit_resample(X, y)

In [None]:
# build a list of column names 
column_names = list(cancer_imbalance_benign_df.columns)
# print(column_names)

# reassemble the dataframe from X_reasmp and y_resamp
rebalanced_df = pd.DataFrame(X_resamp,columns=column_names[1:])
rebalanced_df['diagnosis'] = y_resamp

print("Note: 'diagnosis column has now the last column in the re-balanced dataframe.'")
print("\nThere sould be an equal number of 'benign' and 'malignant' cases after rebalancing... ")
print("benign:", len(rebalanced_df.query("diagnosis=={}".format(B))))
print("malignant:", len(rebalanced_df.query("diagnosis=={}".format(M))))

In [None]:
# create an EDA report for the original datafram (scaled by N)
data_report = sv.analyze(cancer_df2)
data_report.show_html('cancer_df2.html')

In [None]:
cancer_df2['diagnosis'].value_counts()

In [None]:
# create an EDA report for the severely imbalanced dataframe (swith N*benign sample appended to the original df)
data_report = sv.analyze(cancer_imbalance_benign_df)
data_report.show_html('cancer_imbalance_benign_df.html')


In [None]:
data_report = sv.analyze(balanced_pd)
data_report.show_html('balanced_pd.html')

In [None]:
malignant_only_df = balanced_pd.query('diagnosis==1')