# <span style="color:white; font-family:PT Sans Narrow; font-size:1.1em"> Import Packages </span>

In [7]:
# Notebook setup: compute `data_dir` (the parent of the current working
# directory) and append it to `sys.path` before the local import below.
# NOTE(review): presumably `load_data` lives in `data_dir`, which is why the
# path is extended before `import load_data` — confirm.
import os, sys 
data_dir = os.path.dirname(os.path.realpath('.'))
sys.path.append(data_dir)


from crowdkit.datasets import load_dataset
import pandas as pd
import ipywidgets

# Local module; the widget callback in this notebook calls
# `load_data.aim1_3_read_download_UCI_database`.
import load_data 



# Show the resolved directory so the run records where data is looked up.
print('data_dir:', data_dir)

data_dir: /home/u29/mohammadsmajdi/projects/chest_xray/crowd-kit


In [4]:
    
def aim1_3_read_download_UCI_database(WHICH_DATASET=5, mode='read'):
    """Read (or download) one of the UCI benchmark datasets.

    Files are looked up under ``data_dir + '/data_mine/UCI_<dataset>/'``,
    where ``data_dir`` is a module-level name.

    Args:
        WHICH_DATASET: Dataset selector, either the number or the alias:
            1 'kr-vs-kp', 2 'mushroom', 3 'sick', 4 'spambase',
            5 'tic-tac-toe', 8 'waveform', 9 'biodeg', 10 'horse-colic',
            11 'ionosphere'. 6 'splice', 7 'thyroid' and 12 'vote' are
            recognised but have no configuration yet.
        mode: A string containing 'download' delegates to ``download_data``.
            'read' loads the data file with generated column names and
            post-processes it; 'read_raw' loads the file without names and
            without post-processing.

    Returns:
        In the read modes, ``(data, feature_columns)``: ``data`` is a dict
        with 'train' and 'test' DataFrames (random 80/20 split of the rows)
        and ``feature_columns`` lists the feature column names (an empty list
        for 'read_raw'). In download mode, whatever ``download_data`` returns.

    Raises:
        NotImplementedError: The selector names a dataset that is recognised
            but not configured (splice, thyroid, vote).
        ValueError: The selector or the mode is not recognised.
    """

    local_parent_path = data_dir + '/data_mine'

    def read_raw_names_files(WHICH_DATASET=1):
        """Return ``(dataset, names, files, url)`` for the selected dataset."""

        main_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/'

        # Each entry: (aliases, dataset file stem, number of attributes,
        #              label-is-first-column, file name templates, url suffix)
        # NOTE(review): main_url ends with '/' and every suffix starts with
        # '/', so the composed urls contain '//'. Kept as in the original.
        # NOTE(review): '/mwaveform/' looks like a typo for '/waveform/' —
        # confirm before relying on it for downloads.
        # NOTE(review): horse-colic reuses the 41-attribute layout of biodeg —
        # confirm against the actual file.
        catalog = (
            ((1, 'kr-vs-kp'), 'kr-vs-kp', 36, False,
             ('Index', '{}.data', '{}.names'),
             '/chess/king-rook-vs-king-pawn/'),
            ((2, 'mushroom'), 'agaricus-lepiota', 22, True,
             ('Index', '{}.data', '{}.names'),
             '/mushroom/'),
            ((3, 'sick'), 'sick', 29, False,
             ('{}.data', '{}.names', '{}.test'),
             '/thyroid-disease/'),
            ((4, 'spambase'), 'spambase', 57, False,
             ('{}.DOCUMENTATION', '{}.data', '{}.names', '{}.zip'),
             '/spambase/'),
            ((5, 'tic-tac-toe'), 'tic-tac-toe', 9, False,
             ('{}.data', '{}.names'),
             '/tic-tac-toe/'),
            ((8, 'waveform'), 'waveform', 21, False,
             ('Index', '{}-+noise.c', '{}-+noise.data.Z', '{}-+noise.names',
              '{}.c', '{}.data.Z', '{}.names'),
             '/mwaveform/'),
            ((9, 'biodeg'), 'biodeg', 41, False,
             ('{}.csv',),
             '/00254/'),
            ((10, 'horse-colic'), 'horse-colic', 41, False,
             ('{}.data', '{}.names', '{}.names.original', '{}.test'),
             '/horse-colic/'),
            ((11, 'ionosphere'), 'ionosphere', 34, False,
             ('Index', '{}.data', '{}.names'),
             '/ionosphere/'),
        )

        # Selectors that are known but were never given a configuration.
        not_configured = ((6, 'splice'), (7, 'thyroid'), (12, 'vote'))

        for aliases in not_configured:
            if WHICH_DATASET in aliases:
                raise NotImplementedError(
                    f'dataset {WHICH_DATASET!r} is recognised but not configured yet')

        for aliases, dataset, n_attributes, label_first, templates, suffix in catalog:
            if WHICH_DATASET in aliases:
                attributes = [f'a{i}' for i in range(n_attributes)]
                names = ['true'] + attributes if label_first else attributes + ['true']
                files = [template.format(dataset) for template in templates]
                return dataset, names, files, main_url + suffix

        raise ValueError(f'unknown dataset selector: {WHICH_DATASET!r}')

    def read_data(local_parent_path='', WHICH_DATASET=0):
        """Load the selected dataset from disk and split it into train/test."""

        def postprocess(data_raw=None, names=None, WHICH_DATASET=0):
            """Encode labels/features as integers where the dataset needs it."""

            def replacing_classes_char_to_int(data_raw=None, feature_columns=None):
                """Replace every distinct feature value by an integer code (1..k)."""
                feature_columns = list(feature_columns or [])

                # finding the unique classes
                labels = set()
                for column in feature_columns:
                    labels.update(data_raw[column].unique())

                # Sort before numbering: set iteration order of strings varies
                # between interpreter runs, which made the encoding
                # non-reproducible.
                for code, label in enumerate(sorted(labels, key=str), start=1):
                    data_raw[feature_columns] = data_raw[feature_columns].replace(label, code)

                return data_raw

            def binary_labels(series, positive, negative):
                """Map `positive` to 1 and `negative` to 0; other values are kept."""
                return series.replace(positive, 1).replace(negative, 0)

            feature_columns = list(names or [])
            feature_columns.remove('true')

            if WHICH_DATASET in (1, 'kr-vs-kp'):
                data_raw['true'] = binary_labels(data_raw['true'], 'won', 'nowin')
                data_raw = replacing_classes_char_to_int(data_raw, feature_columns)

            elif WHICH_DATASET in (2, 'mushroom'):
                data_raw['true'] = binary_labels(data_raw['true'], 'e', 'p')

                # feature a10 has missing data
                data_raw.drop(columns=['a10'], inplace=True)
                feature_columns.remove('a10')

                data_raw = replacing_classes_char_to_int(data_raw, feature_columns)

            elif WHICH_DATASET in (3, 'sick'):
                # Labels carry a trailing '.<suffix>'; keep only the class name.
                class_names = data_raw['true'].map(lambda x: x.split('.')[0])
                data_raw['true'] = binary_labels(class_names, 'sick', 'negative')

                # float('nan') avoids depending on numpy being imported here.
                data_raw = data_raw.replace('?', float('nan')).drop(columns=['a27'])
                # a27 no longer exists in the frame, so it must not be
                # advertised as a feature (same handling as a10 for mushroom).
                feature_columns.remove('a27')

            elif WHICH_DATASET in (5, 'tic-tac-toe'):
                # labels: 'negative' -> 0, 'positive' -> 1
                data_raw['true'] = binary_labels(data_raw['true'], 'positive', 'negative')
                # board cells: 'x' -> 1, 'o' -> 2, 'b' -> 0
                data_raw[feature_columns] = (
                    data_raw[feature_columns].replace('x', 1).replace('o', 2).replace('b', 0))

            elif WHICH_DATASET in (8, 'waveform'):
                # extracting only classes "1" and "2" to correspond to Tao et al paper
                class_0 = data_raw[data_raw['true'] == 0].index
                data_raw.drop(class_0, inplace=True)
                # Order matters: 1 -> 0 first, then 2 -> 1.
                data_raw['true'] = data_raw['true'].replace(1, 0).replace(2, 1)

            elif WHICH_DATASET in (9, 'biodeg'):
                data_raw['true'] = binary_labels(data_raw['true'], 'RB', 'NRB')

            elif WHICH_DATASET in (11, 'ionosphere'):
                data_raw['true'] = binary_labels(data_raw['true'], 'g', 'b')

            # spambase (4) and horse-colic (10) need no post-processing;
            # 6, 7 and 12 never reach this point because they are rejected
            # by read_raw_names_files.

            return data_raw, feature_columns

        def separate_train_test(data_raw=None, train_frac=0.8):
            """Randomly split the rows into 'train' (train_frac) and 'test' (rest)."""
            train = data_raw.sample(frac=train_frac).sort_index()
            return {'train': train, 'test': data_raw.drop(train.index)}

        # `mode` comes from the enclosing function. Reject e.g. 'read_x' here
        # instead of failing later on an unbound `data_raw`.
        if mode not in ('read', 'read_raw'):
            raise ValueError(f"unsupported read mode: {mode!r} (expected 'read' or 'read_raw')")

        dataset, names, _, _ = read_raw_names_files(WHICH_DATASET=WHICH_DATASET)

        if dataset == 'biodeg':
            command = {'filepath_or_buffer': local_parent_path + f'/UCI_{dataset}/{dataset}.csv', 'delimiter': ';'}

        elif dataset == 'horse-colic':
            command = {'filepath_or_buffer': local_parent_path + f'/UCI_{dataset}/{dataset}.data', 'delimiter': ' ', 'index_col': None}

        else:
            command = {'filepath_or_buffer': local_parent_path + f'/UCI_{dataset}/{dataset}.data'}

        if mode == 'read':
            data_raw = pd.read_csv(**command, names=names)
            data_raw, feature_columns = postprocess(data_raw=data_raw, names=names, WHICH_DATASET=WHICH_DATASET)
        else:  # mode == 'read_raw'
            data_raw, feature_columns = pd.read_csv(**command), []

        data = separate_train_test(data_raw=data_raw, train_frac=0.8)

        return data, feature_columns

    if 'download' in mode:
        # NOTE(review): `download_data` is not defined in this function; it is
        # expected to exist at module level — confirm.
        return download_data(local_parent_path=local_parent_path)

    if 'read' in mode:
        return read_data(local_parent_path=local_parent_path, WHICH_DATASET=WHICH_DATASET)

    raise ValueError(f"unsupported mode: {mode!r} (expected one containing 'download' or 'read')")


# <span style="color:white; font-family:PT Sans Narrow; font-size:1.2em"> 1. Dataset </span>

## <span style="color:Green; font-family:PT Sans Narrow; font-size:1.1em"> 1.1 Selecting the Dataset </span>

In [8]:
# Dropdown for picking a UCI dataset: each option is shown as
# '<number>. <alias>' and yields the alias string as its value.
# 'vote' is preselected.
dataset = ipywidgets.Dropdown(
    options=[
        (f'{number}. {alias}', alias)
        for number, alias in (
            (1, 'kr-vs-kp'),
            (2, 'mushroom'),
            (3, 'sick'),
            (4, 'spambase'),
            (5, 'tic-tac-toe'),
            (6, 'splice'),
            (8, 'waveform'),
            (9, 'biodeg'),
            (10, 'horse-colic'),
            (11, 'ionosphere'),
            (12, 'vote'),
        )
    ],
    value='vote',
)


@ipywidgets.interact(WHICH_DATASET=dataset)
def read_data(WHICH_DATASET):
    """Widget callback: load the chosen dataset and preview its training split.

    Prints 'dataset does not exist' for the selections listed below;
    otherwise prints the first three training rows and the training shape.
    """
    unavailable = ('sick', 'splice', 'biodeg', 'vote', 'horse-colic')

    if WHICH_DATASET in unavailable:
        print('dataset does not exist')
        return

    data, _ = load_data.aim1_3_read_download_UCI_database(WHICH_DATASET=WHICH_DATASET, mode='read')
    train_split = data['train']
    print(train_split.head(3))
    print('train shape:', train_split.shape)


# WHICH_DATASET = 'ionosphere'
# data, feature_columns = aim1_3_read_download_UCI_database(WHICH_DATASET=WHICH_DATASET, mode='read')
# print(data['train'].head(3))
# print('train shape:',data['train'].shape)

interactive(children=(Dropdown(description='WHICH_DATASET', index=10, options=(('1. kr-vs-kp', 'kr-vs-kp'), ('…

## <span style="color:Green; font-family:PT Sans Narrow; font-size:1.3em"> 1.2 Preprocessing the data to adapt to this package </span>

In [9]:
# Configuration values; neither is used within the lines visible here.
datasets_list = ['TlkAgg-Categorical']
data_mode = 'synthetic' # 'synthetic' or 'real'

# Load the 'relevance-2' dataset through crowd-kit's loader.
# NOTE(review): presumably `df` holds the crowd annotations and `df_gt` the
# ground-truth labels — confirm against the crowd-kit documentation.
df, df_gt = load_dataset('relevance-2')