This notebook calculates the score values and the accuracy of each individual, processing the data in chunks.

In [None]:
##################################################
########## Loading Necessary Libraries  ##########
##################################################

import pandas as pd
import numpy as np
import multiprocessing as mp
p = mp.Pool(mp.cpu_count())
%matplotlib inline

import time
import os

from Calculator import Calculator
from Calculator import convert_str_to_list
from Calculator import convert_list_to_str

In [None]:
##################################################
########### Loading and Cleaning Data ############
##################################################

# Load the raw dataset.
data = pd.read_csv('data.csv')

##### Preprocessing the data #####
# Feature matrix: every column except the identifier, the target and the
# 'Unnamed: 32' column.
X = data.drop(['id', 'diagnosis', 'Unnamed: 32'], axis=1)

# --- Step 1: resolve pairs of highly correlated features -------------------
# Flatten the absolute correlation matrix into a Series indexed by
# (column_a, column_b) and sort it in ascending order.
corr = X.corr()
corr_t = corr.abs().unstack()
corr_t = corr_t.sort_values(kind="quicksort")

# Collect every pair of *distinct* columns whose absolute correlation is
# above 0.9.  The entries are walked by position (``.iloc`` / ``.items()``)
# instead of ``corr_t[i]``: integer keys on a label-indexed Series rely on a
# positional fallback that is deprecated in pandas 2.x.
# NOTE: position 0 (the smallest value after the ascending sort) is skipped,
# exactly as the original ``range(1, len(corr_t))`` loop did.
col_to_drop = []
for (col_a, col_b), pair_corr in corr_t.iloc[1:].items():
    if pair_corr > 0.9 and col_a != col_b:
        col_to_drop.append([col_a, col_b])

# Total absolute correlation of each column with all columns.
total_corrs = corr.abs().sum(axis=1).sort_values(ascending=False)

# For each flagged pair, keep the column with the larger total correlation
# and drop the other one (on a tie the first column of the pair is dropped).
# Pairs that involve an already dropped column are skipped.
droped_cols = []
for col_a, col_b in col_to_drop:
    if col_a in droped_cols or col_b in droped_cols:
        continue
    loser = col_b if total_corrs[col_a] > total_corrs[col_b] else col_a
    X.drop(loser, axis=1, inplace=True)
    droped_cols.append(loser)

# --- Step 2: encode the target ---------------------------------------------
# The target is kept for the final accuracy calculation.
# 'M' is encoded as 1, every other value as 2; y ends up as a one-column
# DataFrame.
y = data['diagnosis']
df_Y = pd.DataFrame(y)
df_Y['diagnosis'] = df_Y['diagnosis'].apply(lambda label: 1 if label == 'M' else 2)
y = df_Y

# --- Step 3: cap the total correlation of the remaining features -----------
# Repeatedly drop the column with the highest total absolute correlation
# until that highest total is no longer above ``max_correlation``.
max_correlation = 5
corr = X.corr()
# One-element Series holding the column with the highest total correlation.
highest_corr_col = corr.abs().sum(axis=1).sort_values(ascending=False).head(1)
# ``.iloc[0]`` reads the single value by position (the index holds the
# column name, so an integer key would be a deprecated positional fallback).
while highest_corr_col.iloc[0] > max_correlation:
    X = X.drop(highest_corr_col.index, axis=1)
    corr = X.corr()
    highest_corr_col = corr.abs().sum(axis=1).sort_values(ascending=False).head(1)

# Target as a NumPy array (built from the one-column DataFrame) ...
np_y = np.array(y)

# ... and its number of rows.
np_y_n = len(np_y)

In [None]:
def process_df(calculator, df, filename):
    """Score every individual in ``df`` and return the augmented frame.

    Three columns are written into ``df`` (in place), each obtained by
    mapping a ``calculator`` method over ``df['individual']`` with the
    module-level pool ``p``:

    * ``'mcc'``      <- ``calculator.calculate_R2_score_apply``
    * ``'total_mi'`` <- ``calculator.calculate_total_mi``
    * ``'accuracy'`` <- ``calculator.calculate_accuracy``

    Progress messages with the elapsed time of each step are printed; all
    but the last end with a carriage return so the next message overwrites
    them on the same console line.

    Args:
        calculator: Object providing the three scoring methods listed above.
        df: DataFrame with an ``'individual'`` column.
        filename: Name used only in the progress messages.

    Returns:
        The same ``df`` object, with the three new columns.
    """
    started = time.time()

    # Step 1: R2 score (stored in the 'mcc' column).
    print("Calculating R2 Score of " + filename + " ...         ", end="\r")
    df['mcc'] = p.map(calculator.calculate_R2_score_apply, df['individual'])
    after_r2 = time.time()
    r2_secs = round(after_r2 - started, 2)

    # Step 2: total MI.
    print("Calculating Total MI of " + filename + " , R2 time: ", r2_secs, "s ...", end="\r")
    df['total_mi'] = p.map(calculator.calculate_total_mi, df['individual'])
    after_mi = time.time()
    mi_secs = round(after_mi - after_r2, 2)

    # Step 3: accuracy.
    print("Calculating Accuracy of " + filename + " , R2 time: ", r2_secs, "s, Total MI time: ", mi_secs, "s ...", end="\r")
    df['accuracy'] = p.map(calculator.calculate_accuracy, df['individual'])
    after_acc = time.time()
    acc_secs = round(after_acc - after_mi, 2)

    # Final summary line (terminated with a normal newline).
    print("Calculating Accuracy of " + filename + " , R2 time: ", r2_secs, "s, Total MI time: ", mi_secs, "s, Accuracy time: ", acc_secs, "s ...")
    return df


In [None]:
# Build the calculator once; it is reused for every chunk.
print("Creating Calculator Object ... ")
calculator = Calculator(X, y, np_y, np_y_n)

# Process chunk files 0..71; chunks whose input file is missing are skipped.
for chunk_no in range(72):
    chunk_name = "data_chunked-1_" + str(chunk_no) + ".csv"
    processed_name = "data_processed_chunked-1_" + str(chunk_no) + ".csv"
    chunk_path = 'chunks/' + chunk_name

    # Guard clause: nothing to do when the input chunk is not there.
    if not os.path.exists(chunk_path):
        print("File: " + chunk_name + " does not exist, Continuing ... ")
        continue

    print("Loading File: " + chunk_name + " ... ", end="\r")
    chunk_df = pd.read_csv(chunk_path)

    # Individuals are stored as strings in the CSV; convert them in parallel.
    print("Converting String to List ...                    ", end="\r")
    chunk_df['individual'] = p.map(convert_str_to_list, chunk_df['individual'])

    print("Processing Data ... ", end="\r")
    chunk_df = process_df(calculator, chunk_df, chunk_name)

    # Back to the string form before writing to disk.
    print("Converting List to String ...                    ", end="\r")
    chunk_df['individual'] = chunk_df['individual'].apply(convert_list_to_str)

    # Full result, including the 'individual' column.
    print("Saving File: data_processed_chunked_" + str(chunk_no) + ".csv ... ", end="\r")
    chunk_df.to_csv('processed_chucks/' + processed_name, index=False)

    # Result without the 'individual' column.
    print("Removing Unnecessary Columns ...                 ", end="\r")
    chunk_df = chunk_df.drop(['individual'], axis=1)
    print("Saving File: " + chunk_name + " ... ", end="\r")
    chunk_df.to_csv('chunks_result/' + processed_name, index=False)

    # The input chunk is deleted once both outputs have been written.
    print("Removing File: " + chunk_name + " ... ", end="\r")
    os.remove(chunk_path)
    del chunk_df
