In [1]:
# Enable IPython's autoreload extension so that edits to imported project
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
from pathlib import Path
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
import os

In [3]:
# Resolve the project root from the current working directory. When the
# kernel was started in a directory whose path ends with 'final-nbs', the
# parent directory is used instead and becomes the working directory
# (presumably so project-relative imports and paths resolve -- confirm).
PROJECT_DIR = os.path.abspath(os.curdir)
if PROJECT_DIR.endswith('final-nbs'):
    PROJECT_DIR = os.path.dirname(PROJECT_DIR)
    os.chdir(PROJECT_DIR)

In [4]:
import cfg
from src.data import get_features_path_from_metadata, join_dataframe_columns
from src import util
from src.data import setup_directories

# Project helper from `src.util`; presumably configures logging output for
# this notebook -- confirm against its implementation.
util.setup_logging()

# `dirs` is indexed further down by the keys 'raw', 'train' and 'valid'.
# NOTE(review): `create_dirs=True` presumably creates any missing directories
# under cfg.DATA_DIR -- confirm.
dirs = setup_directories(cfg.DATA_DIR, create_dirs=True)

In [5]:
def build_multiclassifcation_target(data: pd.DataFrame, target_name: str, targets=None) -> np.ndarray:
    """Encode one binary target column as a 4-class label.

    The class of each row combines the value of ``target_name`` with the
    row-wise sum of all target columns (i.e. the number of active targets
    when the columns hold 0/1 flags):

    * ``0`` -- ``target_name`` is 0 and no target is active
    * ``1`` -- ``target_name`` is 0 but at least one other target is active
    * ``2`` -- ``target_name`` is 1 and it is the only active target
    * ``3`` -- ``target_name`` is 1 together with at least one other target

    Parameters
    ----------
    data:
        Frame with one column per target.
    target_name:
        Column to encode; expected to be one of ``targets``.
    targets:
        Optional iterable of the column names that count as targets.
        Defaults to ``cfg.TARGETS`` when omitted.

    Returns
    -------
    numpy.ndarray
        ``int64`` array with one class label per row of ``data``, in row order.

    Raises
    ------
    AssertionError
        If some row matches none of the four classes (for example because
        ``target_name`` is missing/NaN or holds a value other than 0 or 1).
    """
    if targets is None:
        targets = cfg.TARGETS

    # list(...) so tuples / Index objects select columns instead of being
    # interpreted as a single (multi-level) key.
    n_active = data[list(targets)].sum(axis=1)
    is_target = data[target_name] == 1
    not_target = data[target_name] == 0

    # The four masks are mutually exclusive by construction, so the order in
    # which they are applied does not matter.
    class_masks = (
        (0, not_target & (n_active == 0)),
        (1, not_target & (n_active > 0)),
        (2, is_target & (n_active == 1)),
        (3, is_target & (n_active > 1)),
    )

    # -1 marks "not classified yet"; using an integer sentinel avoids the
    # float NaN buffer and the final float -> int cast.
    multiclass_target = np.full(len(data), -1, dtype=np.int64)
    for label, mask in class_masks:
        multiclass_target[mask.to_numpy()] = label

    n_unassigned = int((multiclass_target < 0).sum())
    assert n_unassigned == 0, (
        f"{n_unassigned} row(s) of '{target_name}' could not be assigned to any class"
    )

    return multiclass_target

In [6]:
# Directory from which the label CSVs are read in the following cell.
raw_dir = Path(dirs['raw'])

In [7]:
# Load the train and validation label tables, both indexed by `sample_id`.
pd_train_data, pd_valid_data = (
    pd.read_csv(raw_dir / labels_file, index_col='sample_id')
    for labels_file in ('train_labels.csv', 'val_labels.csv')
)

In [8]:
# Preview the label table: one 0/1 column per target, indexed by sample_id.
pd_train_data.head()

Unnamed: 0_level_0,basalt,carbonate,chloride,iron_oxide,oxalate,oxychlorine,phyllosilicate,silicate,sulfate,sulfide
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
S0000,0,0,0,0,0,0,0,0,1,0
S0001,0,1,0,0,0,0,0,0,0,0
S0002,0,0,0,0,0,1,0,0,0,0
S0003,0,1,0,1,0,0,0,0,1,0
S0004,0,0,0,1,0,1,1,0,0,0


In [9]:
# Build every train multiclass column and apply the '_multiclass' suffix in a
# single cell: the cell always starts from a fresh frame, so re-running it is
# idempotent and can never stack suffixes or leave unsuffixed columns behind.
multiclass_targets = (
    pd.DataFrame(index=pd_train_data.index, columns=pd_train_data.columns)
    .assign(**{
        target_name: build_multiclassifcation_target(pd_train_data, target_name)
        for target_name in cfg.TARGETS
    })
    .add_suffix('_multiclass')
)

In [12]:
# Validation counterpart of the train multiclass frame: start from an empty
# frame shaped like the validation labels, fill one encoded column per target
# and suffix every column with '_multiclass'.
valid_multiclass_targets = (
    pd.DataFrame(index=pd_valid_data.index, columns=pd_valid_data.columns)
    .assign(**{
        target_name: build_multiclassifcation_target(pd_valid_data, target_name)
        for target_name in cfg.TARGETS
    })
    .add_suffix('_multiclass')
)

In [13]:
# Sanity check on the train split: each column can hold at most the four
# classes 0-3 produced by build_multiclassifcation_target.
multiclass_targets.nunique()

basalt_multiclass            4
carbonate_multiclass         4
chloride_multiclass          4
iron_oxide_multiclass        4
oxalate_multiclass           3
oxychlorine_multiclass       4
phyllosilicate_multiclass    4
silicate_multiclass          4
sulfate_multiclass           4
sulfide_multiclass           4
dtype: int64

In [16]:
# Same sanity check on the validation split: at most four classes (0-3) per column.
valid_multiclass_targets.nunique()

basalt_multiclass            4
carbonate_multiclass         4
chloride_multiclass          4
iron_oxide_multiclass        4
oxalate_multiclass           3
oxychlorine_multiclass       4
phyllosilicate_multiclass    4
silicate_multiclass          4
sulfate_multiclass           4
sulfide_multiclass           4
dtype: int64

In [17]:
# Output directory for the train-split multiclass targets.
train_dir = Path(dirs['train'])

In [18]:
# Write the train multiclass targets; index=True keeps the sample_id index in the file.
multiclass_targets.to_csv(train_dir / 'multiclass.csv', index=True)

In [19]:
# Output directory for the validation-split multiclass targets.
valid_dir = Path(dirs['valid'])

# Write the validation multiclass targets; index=True keeps the sample_id index in the file.
valid_multiclass_targets.to_csv(valid_dir / 'multiclass.csv', index=True)