In [1]:
import os
import json
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt

import seaborn as sns
import torch
import torch.nn as nn
from torch.utils.data import DataLoader

from sklearn.metrics import roc_auc_score, log_loss, roc_curve, auc

import sys
sys.path.append('../')

from melanoma.utils import model_utils
from melanoma.utils import generic_utils as utils

TEXT_COLOR = 'w'
FACECOLOR = '#373d4b'
mpl.rcParams['axes.facecolor'] = '#373d4b'
mpl.rcParams['text.color'] = TEXT_COLOR
mpl.rcParams['axes.labelcolor'] = TEXT_COLOR
mpl.rcParams['xtick.color'] = TEXT_COLOR
mpl.rcParams['ytick.color'] = TEXT_COLOR

import albumentations
from melanoma.data.dataset import MelanomaDataset, TileDataset
from melanoma.data.augmentation import MelanomaAugmentor
from melanoma.utils import data_utils
from melanoma.utils.data_utils import load_data
from skimage.measure import regionprops
from melanoma.data.preprocess import get_crop_coords


In [2]:
IMG_SIZE = 224
image_map={
    'ISIC_2020': f'../data/ISIC_2020/{IMG_SIZE}x{IMG_SIZE}_jpg_100_4/',
    'ISIC_2019': f'../data/ISIC_2019/{IMG_SIZE}x{IMG_SIZE}_jpg_100_4/'
}
df_train = data_utils.load_data(
    '../data/train.csv', 
    duplicate_path='../data/sus/2020_Challenge_duplicates.csv', 
    external_filepaths=['../data/external_data/isic_2019.csv'],
    image_map=image_map
)

df_test = data_utils.load_data(
    '../data/test.csv', 
    duplicate_path='../data/sus/2020_Challenge_duplicates.csv', 
#     external_filepaths=['../data/external_data/isic_2019.csv'],
    image_map=image_map
)
df_test['target'] = None
df_test['fold'] = 'test'

cv_folds = data_utils.load_cv_folds('../data/cv_folds/cv_folds_20200802_0048/cv_folds.p')
df_train['fold'] = data_utils.get_fold_col(df_train, cv_folds)

df_mela = pd.concat((df_train, df_test), ignore_index=True)
root = f'../data/ISIC_2020/{IMG_SIZE}x{IMG_SIZE}_jpg_100_4/'
df_mela['image_dir'] = df_mela['source'].map(image_map)

fold = 0
step = None
# model_dir = '../attn_models_careaga/attn_exp_1/resnet34_bce_smth_{IMG_SIZE}_128_max_adam_one_cycle_20200812_0202/'
# ckpt_dir = os.path.join(model_dir, f'fold_{fold}')
# model = model_utils.load_model(ckpt_dir, step=step)

# df_train = df_mela.loc[~(df_mela['fold'].isin([fold, 'holdout']))].reset_index(drop=True)
# df_val = df_mela.loc[df_mela['fold'] == fold].reset_index(drop=True)

img_stats = data_utils.load_img_stats(f'../data/cv_folds/cv_folds_20200802_0048/{IMG_SIZE}x{IMG_SIZE}_jpg_100_4/', fold)
mean = np.array(img_stats['source']['ISIC_2020']['mean'], np.float32)
std = np.array(img_stats['source']['ISIC_2020']['std'], np.float32)

df_mela['image_dir'] = [
    os.path.join(row.image_dir, 'test') if row.fold == 'test' else os.path.join(row.image_dir, 'train') 
    for row 
    in df_mela.itertuples() 
]
# config = utils.load_config_from_yaml(os.path.join(model_dir, 'config.json'))

Adding external data from ../data/external_data/isic_2019.csv
Loading img stats for fold 0 from ../data/cv_folds/cv_folds_20200802_0048/224x224_jpg_100_4/fold_0/img_stats.json


In [3]:
from sklearn.preprocessing import StandardScaler, LabelEncoder
import pickle

In [30]:
def preprocess_meta(df_train, meta_cols, eval_dfs=None, cat_cols=None, output_dir=None):
    if eval_dfs is None:
        eval_dfs = []
    elif not isinstance(eval_dfs, list):
        eval_dfs = [eval_dfs]
    if cat_cols is None:
        cat_cols = [c for c in meta_cols if df_train[c].nunique() < 10]
    print(cat_cols)
    for col in meta_cols:
        if col in cat_cols:
            print(f'Preprocessing categorical col `{col}`')
            fill_value = df_train[col].mode().iloc[0]
            print(f'Filling most frequent value : {fill_value}')
            missing_idx = (
                (df_train[col].isnull()) | 
                (df_train[col] == 'unknown') |
                (df_train[col] == 'missing')
            )
            df_train.loc[missing_idx, col] = fill_value
            print('Applying label encoder.')
            le = LabelEncoder()
            df_train[col] = le.fit_transform(df_train[col])
            for df in eval_dfs:
                missing_idx = (
                    (df[col].isnull()) | 
                    (df[col] == 'unknown') |
                    (df[col] == 'missing')
                )
                df.loc[missing_idx, col] = fill_value
                df[col] = le.transform(df[col])
            # save encoder to disk
            if output_dir is not None:
                with open(os.path.join(output_dir, f'{col}_encoder.p'), 'wb') as f:
                    pickle.dump(le, f)
        else:
            print(f'Preprocessing continuous col `{col}`')
            print(df_train[col].head())
            print(df_train[col].dtype)
            fill_value = df_train[col].mean()
            print(f'Filling mean value : {fill_value}')
            missing_idx = (
                (df_train[col].isnull()) | 
                (df_train[col] == 'unknown') |
                (df_train[col] == 'missing')
            )
            df_train.loc[missing_idx, col] = fill_value
            print('Applying standardized scalar.')
            scl = StandardScaler()
            df_train[col] = scl.fit_transform(df_train[col].values.reshape(-1, 1)).ravel()
            for df in eval_dfs:
                missing_idx = (
                    (df[col].isnull()) | 
                    (df[col] == 'unknown') |
                    (df[col] == 'missing')
                )
                df.loc[missing_idx, col] = fill_value
                df[col] = scl.transform(df[col].values.reshape(-1, 1)).ravel()
            # save scalar to disk
            if output_dir is not None:
                with open(os.path.join(output_dir, f'{col}_scl.p'), 'wb') as f:
                    pickle.dump(scl, f)
                    
    return (df_train, *eval_dfs)

In [31]:
a,b = preprocess_meta(df_a.copy(), meta_cols=['sex', 'anatom_site_general_challenge', 'age_approx'], eval_dfs=df_b.copy())

['sex', 'anatom_site_general_challenge']
Preprocessing categorical col `sex`
Filling most frequent value : male
Applying label encoder.
Preprocessing categorical col `anatom_site_general_challenge`
Filling most frequent value : torso
Applying label encoder.
Preprocessing continuous col `age_approx`
0    45.0
1    45.0
2    50.0
3    45.0
4    55.0
Name: age_approx, dtype: float64
float64
Filling mean value : 51.06957307431669
Applying standardized scalar.


  return op(a, b)
  return op(a, b)
  return op(a, b)
  return op(a, b)


In [29]:
a

Unnamed: 0,image_name,patient_id,sex,age_approx,anatom_site_general_challenge,source,target,image_dir,fold
0,ISIC_2637011,IP_7279968,1,45,0,ISIC_2020,0,../data/ISIC_2020/224x224_jpg_100_4/,0
1,ISIC_0015719,IP_3075186,0,45,6,ISIC_2020,0,../data/ISIC_2020/224x224_jpg_100_4/,2
2,ISIC_0052212,IP_2842074,0,50,1,ISIC_2020,0,../data/ISIC_2020/224x224_jpg_100_4/,4
3,ISIC_0068279,IP_6890425,0,45,0,ISIC_2020,0,../data/ISIC_2020/224x224_jpg_100_4/,holdout
4,ISIC_0074268,IP_8723313,0,55,6,ISIC_2020,0,../data/ISIC_2020/224x224_jpg_100_4/,6
...,...,...,...,...,...,...,...,...,...
58027,ISIC_0073247,08ae4f28-3040-42bd-b346-9ee2c99331f2,0,85,0,ISIC_2019,0,../data/ISIC_2019/224x224_jpg_100_4/,
58028,ISIC_0073248,65cc058b-4357-4f8a-af72-7ae10cd5cb82,1,65,5,ISIC_2019,0,../data/ISIC_2019/224x224_jpg_100_4/,
58029,ISIC_0073249,769482d7-6a00-4d8f-8785-4f9370de2e4e,1,70,1,ISIC_2019,1,../data/ISIC_2019/224x224_jpg_100_4/,
58030,ISIC_0073251,ea845694-45ac-45cd-9b43-3c96f87de45d,0,55,4,ISIC_2019,0,../data/ISIC_2019/224x224_jpg_100_4/,


In [5]:
df_train['age_approx'].dtype

dtype('float64')

In [6]:
df_a = df_train.copy()
df_b = df_test.copy()
df_holdout = df_train.loc[df_train['fold'] == 'holdout']
df_c = df_holdout.copy()

In [7]:
df_train['age_approx']

dtype('float64')

In [86]:
a,b = preprocess_meta(df_a.copy(), meta_cols=meta_cols, eval_dfs=df_b)

['sex', 'anatom_site_general_challenge']
Preprocessing categorical col `sex`
Filling most frequent value : male
Applying label encoder.
Preprocessing continuous col `age_approx`
0    45
1    45
2    50
3    45
4    55
Name: age_approx, dtype: object
object


TypeError: unsupported operand type(s) for +: 'float' and 'str'

In [70]:
df_holdout = df_train.loc[df_train['fold'] == 'holdout']

In [75]:
df_train['age_approx'].mean()

51.06957307431669

In [None]:
df =pd.read_csv('../input/fe-using-only-competition-data-melanoma/melanoma_folds.csv')

df.drop(['image_id','stratify_group','center','diagnosis','benign_malignant'],axis=1,inplace=True)
target = 'target'
unused_feat = ['patient_id','fold']
features = [ col for col in df.columns if col not in unused_feat+[target]] 

categorical_columns = []

for col in df.columns[df.dtypes == object]:
    
    if col not in unused_feat:
        print(col, df[col].nunique())
        
        l_enc = LabelEncoder()
        df[col] = l_enc.fit_transform(df[col].values)
        
        #SAVING LABEL _ ENC
        output = open(f'{col}_encoder.pkl', 'wb')
        pickle.dump(l_enc, output)
        output.close()
        
        categorical_columns.append(col)