In [1]:
import os
import numpy as np
import pandas as pd
import os.path as op
import pickle as pkl
from tqdm import tqdm
from sklearn.model_selection import train_test_split

In [2]:
np.random.seed(1010)

# MNIST-like

In [3]:
def prepare_targets(base_dir, funs):
    r = {
        "path": [op.join(base_dir, b) 
     for b in [a for a in os.walk(base_dir)][0][2]]
    }
    r["original_label"] = [
        int(op.splitext(op.split(a)[-1])[0].split("_")[1]) 
                           for a in r["path"]
    ]
    df = pd.DataFrame(r)
    for a in funs:
        df[a[0]] = df["original_label"].apply(a[1])
    return(df)

def relabel(n=0, inliers=True, supervised=True, n_classes=10):
    if supervised:
        w = (0, 1) if inliers else (1, 0)
        return(
            lambda x: w[1] if x == n else w[0]
        )
    else:
        arrangement = np.arange(n_classes)
        substitute = {
            a: -1 if a == n else a-1 if a > n else a for a in arrangement
        }
        return(
            lambda x: substitute[x]
        )

def disbalance_data(labels, percentage):
    normal_indices = np.arange(labels.shape[0])[labels == 0]
    anomal_indices = np.arange(labels.shape[0])[labels == 1]
    what_to_take = int(np.round(normal_indices.shape[0]/(100-percentage)*percentage))
    if what_to_take < anomal_indices.shape[0]:
        _, fraction, _, _ = train_test_split(
            anomal_indices, labels[labels == 1], test_size=what_to_take
        )
    else:
        fraction = anomal_indices
    return(np.concatenate([normal_indices, fraction]))

def train_test(df, percentages):
    r = {}
    for p in percentages:
        for a in tqdm(df.columns[2:]):
            if -1 not in df[a].values:
                r[a+"_"+str(p)] = disbalance_data(
                    df[a].values, p
                )
            else:
                r[a] = np.arange(df.shape[0])[df[a].values != -1]
    return(r)

In [4]:
di_funs = [("SDI_"+str(a), relabel(a)) for a in np.arange(10)]
do_funs = [("SDO_"+str(a), relabel(a, False)) for a in np.arange(10)]
udi_funs = [("UDI_"+str(a), relabel(a, False, False)) for a in np.arange(10)]
udo_funs = [("UDO", lambda x: x) for a in np.arange(1)]

In [5]:
trains = [
    "../data/CIFAR10_train",
    "../data/FMNIST_train",
    "../data/KMNIST_train",
    "../data/MNIST_train",
]

In [6]:
tests = [
    "../data/CIFAR10_test",
    "../data/FMNIST_test",
    "../data/KMNIST_test",
    "../data/MNIST_test",
]

In [7]:
trains_dfs = [
    prepare_targets(
        a, di_funs+do_funs+udi_funs+udo_funs
    ) for a in trains
]
tests_dfs = [
    prepare_targets(
        a, di_funs+do_funs+udi_funs+udo_funs
    ) for a in tests
]

In [8]:
for a,b in list(zip(trains, trains_dfs)):
    print(a)
    b.to_csv(a+".csv")
    with open(a+".pkl", "wb") as oh:
        tt = train_test(b, [10, 1, 0.1])
        pkl.dump(tt, oh)

../data/CIFAR10_train


100%|██████████| 31/31 [00:00<00:00, 1502.76it/s]
100%|██████████| 31/31 [00:00<00:00, 1330.57it/s]
100%|██████████| 31/31 [00:00<00:00, 1342.04it/s]


../data/FMNIST_train


100%|██████████| 31/31 [00:00<00:00, 1229.38it/s]
100%|██████████| 31/31 [00:00<00:00, 1190.57it/s]
100%|██████████| 31/31 [00:00<00:00, 1198.84it/s]


../data/KMNIST_train


100%|██████████| 31/31 [00:00<00:00, 1133.80it/s]
100%|██████████| 31/31 [00:00<00:00, 1106.38it/s]
100%|██████████| 31/31 [00:00<00:00, 1190.96it/s]


../data/MNIST_train


100%|██████████| 31/31 [00:00<00:00, 1376.26it/s]
100%|██████████| 31/31 [00:00<00:00, 1105.26it/s]
100%|██████████| 31/31 [00:00<00:00, 1173.13it/s]


In [9]:
for a,b in tqdm(list(zip(tests, tests_dfs))):
    b.to_csv(a+".csv")

100%|██████████| 4/4 [00:00<00:00, 28.28it/s]


In [10]:
b.columns

Index(['path', 'original_label', 'SDI_0', 'SDI_1', 'SDI_2', 'SDI_3', 'SDI_4',
       'SDI_5', 'SDI_6', 'SDI_7', 'SDI_8', 'SDI_9', 'SDO_0', 'SDO_1', 'SDO_2',
       'SDO_3', 'SDO_4', 'SDO_5', 'SDO_6', 'SDO_7', 'SDO_8', 'SDO_9', 'UDI_0',
       'UDI_1', 'UDI_2', 'UDI_3', 'UDI_4', 'UDI_5', 'UDI_6', 'UDI_7', 'UDI_8',
       'UDI_9', 'UDO'],
      dtype='object')

# Peng et al

In [11]:
PENGPATH = "../raw/PENG/bty558-suppl_data/Supplementary file 1.xlsx"
low = pd.read_excel(PENGPATH, sheet_name=0, engine="openpyxl")
high = pd.read_excel(PENGPATH, sheet_name=1, engine="openpyxl")
neg = pd.read_excel(PENGPATH, sheet_name=2, engine="openpyxl")
low_strs = [a+","+b for a,b in zip(low["on-target site"], low["off-target site"])]
high_strs = [a+","+b for a,b in zip(high["on-target site"], high["off-target site"])]
neg_strs = [a+","+b for a,b in zip(neg["on-target site"], neg["no editing site"])]
low_high = list(set(low_strs+high_strs))
final_neg = neg_strs
pairs = np.array(low_high+final_neg)
labels = np.array([1]*len(low_high)+[0]*len(final_neg))
N_MISMATCHES = 6
n_mms = lambda x,y: np.sum([int(a != b) for a,b in zip(x,y)])
worthy = np.array([n_mms(*a.split(",")) for a in pairs]) < N_MISMATCHES
pairs = pairs[worthy]
labels = labels[worthy]

In [12]:
PENG = pd.DataFrame(
    {
        "path": pairs, "original_label": labels
    }
)

In [13]:
np.random.seed(1010)

In [14]:
train_ix, test_ix = train_test_split(
    np.arange(PENG.shape[0]), stratify=PENG["original_label"].values
)

In [15]:
PENG

Unnamed: 0,path,original_label
0,"GGGTGGGGGGAGTTTGCTCCAGG,GGGAGGGTGGAGTTTGCTCCTGG",1
1,"GTCACCTCCAATGACTAGGGTGG,GTCACTTCCAAGGACTAGAGAAG",1
2,"GAACACAAAGCATAGACTGCGGG,GAATTCAAAGCATAGATTGCAGG",1
3,"GACCCCCTCCACCCCGCCTCCGG,GTCCCCTCCCACCCCGCCTCCAG",1
4,"GGCACTGCGGCTGGAGGTGGGGG,AGCACGGCAGCTGGAGGAGGGGG",1
...,...,...
26648,"TGGATGGAGGAATGAGGAGTTGG,TTTATGGAGGAATGAGGAGATGG",0
26649,"TGGATGGAGGAATGAGGAGTTGG,TTTATGGAGGATTGAGAAGATGG",0
26650,"TGGATGGAGGAATGAGGAGTTGG,TTTATGGAGGGATAAGGAGTGGG",0
26651,"TGGATGGAGGAATGAGGAGTTGG,TTTATTGATGACTGAGGAGTTGG",0


In [16]:
PENG_train = PENG.iloc[train_ix]

In [17]:
PENG_train["random_label"] = PENG_train["original_label"].apply(
    lambda x: np.random.choice([0,1])
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  PENG_train["random_label"] = PENG_train["original_label"].apply(


In [18]:
PENG_test = PENG.iloc[test_ix]
PENG_test["random_label"] = PENG_test["original_label"]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  PENG_test["random_label"] = PENG_test["original_label"]


In [19]:
PENG_train.to_csv("../data/PENG_train.csv")
PENG_test.to_csv("../data/PENG_test.csv")

# HAM10000

In [20]:
df = pd.read_csv("../raw/HAM10000.csv", index_col=0)

In [21]:
labels = list(df["dx"].unique())

In [22]:
labels

['bkl', 'nv', 'df', 'mel', 'vasc', 'bcc', 'akiec']

In [23]:
malignant = ["mel", "bcc", "akiec"]
benign = ["nv", "df", "vasc", "bkl"]

In [24]:
if not op.exists("../data/HAM10000"):
    os.makedirs("../data/HAM10000")

In [25]:
newpath = []
for a,b,i in tqdm(zip(df["path"].values, df["dx"].values, np.arange(df.shape[0]))):
    npp = op.join("../data/HAM10000/", str(i)+"_"+str(labels.index(b))+".jpg")
    os.system(
        "cp "+a+" "+npp
    )
    newpath.append(npp)

10015it [00:25, 386.44it/s]


In [26]:
df["path"] = newpath

In [27]:
unique_lesions = df["lesion_id"].unique()

In [28]:
train_ix, test_ix = train_test_split(np.arange(unique_lesions.shape[0]))

In [29]:
train_lesions = unique_lesions[train_ix]
test_lesions = unique_lesions[test_ix]

In [30]:
train_lesions_b = df["lesion_id"].apply(lambda x: x in train_lesions).values

In [31]:
df["original_label"] = df["dx"].apply(lambda x: labels.index(x))

In [32]:
df["malignant"] = df["dx"].apply(lambda x: 1 if x in malignant else 0)
df["benign"] = df["dx"].apply(lambda x: 1 if x in benign else 0)

In [33]:
df["A"] = df["malignant"]
df["B"] = df["dx"].apply(lambda x: 1 if x == "mel" else 0 if x in benign else -1)
df["C"] = df["dx"].apply(lambda x: 1 if x in malignant else 0 if x == "nv" else -1)
df["D"] = df["dx"].apply(lambda x: 1 if x == "mel" else 0 if x == "nv" else -1)

In [34]:
train_df = df.loc[train_lesions_b]
train_df.to_csv("../data/HAM10000_train.csv")

In [35]:
test_df = df.loc[np.logical_not(train_lesions_b)]
test_df.to_csv("../data/HAM10000_test.csv")

In [36]:
train_indices = {
    "A": np.arange(train_df.shape[0]),
    "B": np.arange(train_df.shape[0]),
    "C": np.arange(train_df.shape[0])[train_df["C"].values != -1],
    "D": np.arange(train_df.shape[0])[train_df["D"].values != -1],
}

In [37]:
train_indices

{'A': array([   0,    1,    2, ..., 7502, 7503, 7504]),
 'B': array([   0,    1,    2, ..., 7502, 7503, 7504]),
 'C': array([  43,  873,  874, ..., 7502, 7503, 7504]),
 'D': array([  43,  873,  874, ..., 7252, 7253, 7504])}

In [38]:
with open("../data/HAM10000_train.pkl", "wb") as oh:
    pkl.dump(train_indices, oh)