In [None]:
def v(lib):
    """Print the name and version string of an imported library.

    Args:
        lib: An imported module object (anything exposing ``__name__``).

    Modules that do not define ``__version__`` (``os``, for example) are
    reported with the placeholder ``---``.
    """
    # getattr() with a default replaces the former bare ``except:``, which
    # would also have swallowed unrelated errors such as KeyboardInterrupt.
    version = getattr(lib, "__version__", "---")
    name = lib.__name__
    print("  {:20} ({})".format(name, version))
    
# Import every library the notebook needs. Each import is followed by a call
# to v(), which prints the library's name and version string.
print("importing libraries...")
import pandas as pd
v(pd)
import os
# os defines no __version__, so v() prints the '---' placeholder for it
v(os)
import numpy as np
v(np)
import sweetviz as sv
v(sv)
import imblearn
v(imblearn)
print('done')


In [None]:
def setup_environment_variables():
    """Locate the repository's working directories, creating output dirs.

    The repository is expected at ``~/breast-cancer``. The ``EDA`` and
    ``part-files`` directories are created under it when absent. A missing
    ``data`` directory is only reported, because its contents cannot be
    generated here.

    Returns:
        tuple: ``(cwd, home_dir, repo_dir, data_dir, eda_dir, part_dir)``,
        all path strings.

    Raises:
        FileNotFoundError: If the repository directory does not exist. The
            function used to print the message and return ``None``, which
            made the caller's 6-way tuple unpacking fail with an unrelated
            ``TypeError``.
    """
    # set cwd and home_dir variables
    cwd = os.getcwd()
    print('cwd: {}'.format(cwd))
    home_dir = os.path.expanduser("~")
    print('home_dir: {}'.format(home_dir))

    repo_name = 'breast-cancer'
    repo_dir = os.path.join(home_dir, repo_name)
    if not os.path.isdir(repo_dir):
        raise FileNotFoundError(
            'ERROR: repository is not under home_dir ("{}") or is not named "{}"'.format(home_dir, repo_name)
        )
    print('repo_dir: {}'.format(repo_dir))

    # input data: report when missing, but keep going so the path is still returned
    data_dir = os.path.join(repo_dir, 'data')
    if os.path.isdir(data_dir):
        print('data_dir: {}'.format(data_dir))
    else:
        print('oops! directory named "data" not found under "{}"'.format(repo_dir))

    # output directories: created on demand
    output_dirs = {}
    for label, dir_name in (('eda_dir', 'EDA'), ('part_dir', 'part-files')):
        path = os.path.join(repo_dir, dir_name)
        if os.path.isdir(path):
            print('{}: {}'.format(label, path))
        else:
            print('directory named "{}" not found under "{}"'.format(dir_name, repo_dir))
            print('creating "{}" dir... "{}"'.format(dir_name, path))
            os.makedirs(path, exist_ok=True)
        output_dirs[label] = path

    return cwd, home_dir, repo_dir, data_dir, output_dirs['eda_dir'], output_dirs['part_dir']


In [None]:
def now():
    """Return the current local time formatted as ``DDMMYYYY-HH:MM:SS``."""
    import datetime as _dt

    timestamp = _dt.datetime.now()
    return timestamp.strftime("%d%m%Y-%H:%M:%S")
    
def name_df(df, name, desc=""):
    """Attach a timestamped label to ``df`` as its ``name`` attribute.

    The label is ``<name>-<now()>``, with ``-(<desc>)`` appended when a
    description is given.

    Args:
        df: Object to label (a DataFrame in this notebook); ``df.name`` is
            set in place.
        name: Base label.
        desc: Optional description appended in parentheses.

    Returns:
        The ``name`` argument, unchanged (not the generated label).
    """
    # The unused ``from datetime import date`` was removed; the timestamp
    # comes from now().
    parts = [name, "-", now()]
    if desc != "":
        parts.extend(["-(", desc, ")"])
    df.name = "".join(parts)
    return name

In [None]:
# Quick check of name_df(): label an empty DataFrame and print the generated name.
# print(now())
zebra_df = pd.DataFrame()
name_df(zebra_df, 'zebra_df')
print(zebra_df.name)

In [None]:
def create_initial_cancer_dataset():
    """Load ``cancer_data.csv`` and return it as the labelled base dataframe.

    Reads the module-level ``data_dir`` and ``cancer_categories``, which must
    be assigned before this function is called.

    Returns:
        pandas.DataFrame: The CSV contents with ``diagnosis`` replaced by its
        integer category code (its position in ``cancer_categories``) and the
        ``id`` column dropped. The frame is labelled through ``name_df``.
    """
    # data_dir is built from home_dir in setup_environment_variables(), so it
    # is joined with the file name only. The former leading home_dir component
    # was silently discarded by os.path.join whenever data_dir was absolute.
    cancer_df = pd.read_csv(os.path.join(data_dir, "cancer_data.csv"))

    # convert the 'diagnosis' column to integer category codes
    # NOTE: labels that are not listed in cancer_categories are encoded as -1
    cancer_df['diagnosis'] = pd.Categorical(cancer_df['diagnosis'], cancer_categories, ordered=True).codes
    cancer_df = cancer_df.drop(columns=['id'])

    name_df(cancer_df, 'cancer_df', 'Original Cancer Data')

    return cancer_df

In [None]:
# initialize global environment variables ...
cwd, home_dir, repo_dir, data_dir, eda_dir, part_dir = setup_environment_variables()

# set up 'cancer_categories', used to convert 'B' and 'M' into categorical (numeric) values
cancer_categories = ['B', 'M']
# remember the indices for B and M (for use in other functions, etc.)
B = cancer_categories.index('B')
M = cancer_categories.index('M')

# initialize cancer_df from the raw data file
cancer_df = create_initial_cancer_dataset()
print('cancer_df.name: "{}"'.format(cancer_df.name))

# print(M, B, cancer_categories)

In [None]:
# show the dtype of every column in the base dataframe
cancer_df.dtypes

In [None]:
# summary statistics for the base dataframe
cancer_df.describe()

In [None]:
# preview the first rows of the base dataframe
cancer_df.head()

In [None]:
def create_imbalanced_dataset(df, over_balance_on, N=100, verbose=False):
    """Return ``df`` plus ``N`` extra copies of the rows of one diagnosis class.

    Reads the module-level ``cancer_categories`` to label the progress message.

    Args:
        df: Dataframe with a binary ``diagnosis`` column coded 0/1.
        over_balance_on: Class code (0 or 1) to over-represent.
        N: Number of extra copies of that class's rows to add.
        verbose: When true, print the replication row counts.

    Returns:
        pandas.DataFrame: The original rows followed by ``N`` copies of the
        rows where ``diagnosis == over_balance_on`` (original index labels are
        kept). ``None`` if ``over_balance_on`` is not 0 or 1, after printing
        an error message.
    """
    if verbose:
        print('replicating base dataframe {} times'.format(N))
        # row count the fully replicated frame would have
        print('original dataframe: {} rows, new/temp dataframe: {} rows\n'.format(len(df), len(df) * N))

    # assuming (for now) that we are balancing relative to a binary 'diagnosis' (0 or 1)
    if over_balance_on not in (0, 1):
        print("ERROR: over_balance_on has to be 0 or 1 (binary classificaion only)!")
        return

    majority = over_balance_on
    over_balance_on_st = cancer_categories[over_balance_on]

    print('creating a new dataframe imbalanced on diagnosis=="{}" ({})'.format(over_balance_on_st, over_balance_on))

    # Select the majority-class rows once and repeat only those. This gives the
    # same rows, in the same order, as replicating the whole frame N times and
    # filtering afterwards, without building the large temporary frame.
    majority_once = df.loc[df['diagnosis'] == majority]

    # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement
    imbalanced_df = pd.concat([df] + [majority_once] * N)
    return imbalanced_df

In [None]:
def print_balance_stats(a):
    """Print and return the benign/malignant row counts of a dataframe.

    Reads the module-level ``B``, ``M`` and ``cancer_categories``.

    Args:
        a: Dataframe with a ``diagnosis`` column holding the ``B``/``M`` codes.

    Returns:
        tuple: ``(b_rows, m_rows, t_rows)`` - benign rows, malignant rows and
        total rows.
    """
    b_rows = len(a.query('diagnosis=={}'.format(B)))
    m_rows = len(a.query('diagnosis=={}'.format(M)))
    t_rows = len(a)
    if m_rows == b_rows:
        print("the dataframe is balanced!")
    else:
        # Report the class that actually dominates; the benign-majority branch
        # used to print the malignant label.
        if m_rows > b_rows:
            dominant, dominant_rows = M, m_rows
        else:
            dominant, dominant_rows = B, b_rows
        print("dataframe is over balanced toward '{}' ({:.2F}%)".format(cancer_categories[dominant], (dominant_rows/t_rows)*100))

    # the trailing flag shows whether every row was counted as either B or M
    print("B: {}, M: {}, total: {}  ({})".format(b_rows, m_rows, t_rows, (m_rows+b_rows)==t_rows))
    return b_rows, m_rows, t_rows

In [None]:
def balance_dataset(df, verbose=False, random_state=None):
    """Rebalance ``df`` with SMOTE so both diagnosis classes are equally sized.

    Synthetic Minority Over-sampling Technique (SMOTE) adds synthetic rows for
    the minority class. The frame is split into

        y    the target vector: the 'diagnosis' column of ``df``
        X    the feature matrix: every remaining column of ``df``

    and reassembled afterwards with 'diagnosis' as the first column.

    Args:
        df: Dataframe containing a 'diagnosis' column (in any position) plus
            numeric feature columns; ideally imbalanced.
        verbose: When true, print balance statistics before and after.
        random_state: Optional seed forwarded to SMOTE for reproducible
            output; ``None`` keeps the previous unseeded behaviour.

    Returns:
        pandas.DataFrame: The resampled data, 'diagnosis' first, followed by
        the feature columns in their original order. All values, including
        'diagnosis', are stored in the feature matrix's dtype.
    """
    if verbose:
        print("initial balance statistics (before reblancing)")
        # was print_balance_state(), an undefined name
        print_balance_stats(df)

    # Separate the feature matrix (X) from the target vector (y) by column
    # name, so the result no longer depends on 'diagnosis' being the first
    # column or on the global cancer_df having the same columns as df.
    target = 'diagnosis'
    feature_names = [col for col in df.columns if col != target]
    y = df[target].values
    X = df[feature_names].values

    # SMOTE returns "resampled" versions of X and y with additional entries
    # created to achieve a 50/50 ratio of malignant and benign cases
    oversample = imblearn.over_sampling.SMOTE(random_state=random_state)
    X_resamp, y_resamp = oversample.fit_resample(X, y)

    # build the list of column names for the reassembled dataframe
    column_names = [target] + feature_names
    if verbose:
        print(column_names)

    # reassemble the dataframe from X_resamp and y_resamp (target goes back in column 0)
    rebalanced_df = pd.DataFrame(np.insert(X_resamp, 0, y_resamp, axis=1), columns=column_names)

    if verbose:
        print("There sould be an equal number of 'benign' and 'malignant' cases after rebalancing... ")
        print("benign:", len(rebalanced_df.query("diagnosis=={}".format(B))))
        print("malignant:", len(rebalanced_df.query("diagnosis=={}".format(M))))

    return rebalanced_df

In [None]:
def gen_new_data(N, P):
    """Write ``P`` part files of rebalanced data to ``part_dir``.

    Each iteration skews the module-level ``cancer_df`` toward malignant,
    rebalances it and keeps the resulting benign rows, then skews toward
    benign, rebalances and keeps the resulting malignant rows. The two sets
    are concatenated and written to ``part_dir`` as ``foo-<iiiii>.csv``.

    Args:
        N: Number of extra majority-class copies used to create each
            imbalanced dataset.
        P: Number of part files to write.
    """
    for i in range(P):
        # malignant-heavy data -> rebalancing generates the benign rows
        malignant_imbalanced = create_imbalanced_dataset(cancer_df, M, N)
        print('malignant_imbalanced: should have M >> B')
        print_balance_stats(malignant_imbalanced)

        print('\nrebalanced_df: should have M == B')
        rebalanced_df = balance_dataset(malignant_imbalanced)
        print_balance_stats(rebalanced_df)
        benign_rows = rebalanced_df.query('diagnosis=={}'.format(B))

        # benign-heavy data -> rebalancing generates the malignant rows
        benign_imbalanced = create_imbalanced_dataset(cancer_df, B, N)
        print('\nbenign_imbalanced: should have B >> M')
        print_balance_stats(benign_imbalanced)

        # This header used to sit behind an undefined `verbose` name
        # (NameError); it is now printed unconditionally, like the first one.
        print('\nrebalanced_df: should have B == M')
        # rebalance the benign-heavy frame (previously malignant_imbalanced was
        # rebalanced a second time, so no malignant rows were ever synthesised)
        rebalanced_df = balance_dataset(benign_imbalanced)
        print_balance_stats(rebalanced_df)
        malignant_rows = rebalanced_df.query('diagnosis=={}'.format(M))

        # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement
        new_df = pd.concat([benign_rows, malignant_rows])

        pf_name = os.path.join(part_dir, 'foo-{}.csv'.format(str(i).zfill(5)))
        print('\n*** new artfile: {}\n'.format(pf_name))
        new_df.to_csv(pf_name, index=False)
    
    

In [None]:
# Scratch cell: build a zero-padded part-file name and write new_df to it.
# NOTE(review): in the cells visible here new_df is only assigned inside
# gen_new_data(), so this cell presumably raises NameError on a fresh kernel -
# confirm, then define new_df first or remove the cell.
i = 1
n = 'foo-{}.csv'.format(str(i).zfill(5))
print(i, n)
new_df.to_csv(n, index=False)

In [None]:
def sweetviz(df):
    """Build a Sweetviz EDA report for ``df`` and save it under ``eda_dir``.

    The report file is named after ``df.name`` (as set by ``name_df``). When
    ``df`` has no ``name`` attribute, a timestamped placeholder name is used
    instead. Reads the module-level ``eda_dir``.

    Args:
        df: Dataframe to analyse.
    """
    # create the EDA report for the dataframe
    data_report = sv.analyze(df)
    try:
        df_name = df.name
    except AttributeError:
        # Only a missing attribute means "unnamed". The former bare ``except:``
        # would also have hidden unrelated errors.
        df_name = "unamed{}()".format(now())

    print(df_name)

    data_report.show_html(os.path.join(eda_dir, df_name+'.html'))
    # data_report.show_html(df_name)
    
    # if os.path.isfile(df_name):
    #    os.rename(df_name, os.path.join(eda_dir, df_name+'-'+now()))
        

In [None]:
# Scratch cell: print two large integers with thousands separators.
print("{:,}".format(int('1782992542')))
print("{:,}".format(int('178303155')))

In [None]:
# MISC testing... 

In [None]:
# Compare the class balance of the base dataframe before and after rebalancing.
print('before')
_, _, _ = print_balance_stats(cancer_df)
rebalanced_cancer_df = balance_dataset(cancer_df)
print('\nafter')
_, _, _ = print_balance_stats(rebalanced_cancer_df)

In [None]:
# EDA... 

In [None]:
# Class balance of the base dataframe; as the last expression in the cell, the
# returned (b_rows, m_rows, t_rows) tuple is also displayed.
print_balance_stats(cancer_df)

In [None]:
# Generate the EDA report for the base dataframe; sweetviz() saves it under
# eda_dir using the dataframe's name.
# print(now())
print(cancer_df.name)
sweetviz(cancer_df)

In [None]:
# create an EDA report for the original dataframe
# (passes the relative file name 'cancer_df.html' rather than a path under eda_dir)
data_report = sv.analyze(cancer_df)
data_report.show_html('cancer_df.html')

In [None]:
# create an EDA report for the severely imbalanced dataframe (with N*benign samples appended to the original df)
# NOTE(review): cancer_imbalance_benign_df is not assigned in any cell visible
# here - presumably left over from an earlier session; confirm it is defined
# before this cell runs.
data_report = sv.analyze(cancer_imbalance_benign_df)
data_report.show_html('cancer_imbalance_benign_df.html')


In [None]:
# create an EDA report for balanced_pd
# NOTE(review): balanced_pd is not assigned in any cell visible here - confirm
# it is defined before this cell runs.
data_report = sv.analyze(balanced_pd)
data_report.show_html('balanced_pd.html')

In [None]:
# keep only the rows with diagnosis code 1 (the index of 'M' in cancer_categories)
# NOTE(review): balanced_pd is not assigned in any cell visible here - confirm
# it is defined before this cell runs.
malignant_only_df = balanced_pd.query('diagnosis==1')