In [None]:
import pandas as pd
from tqdm import tqdm_notebook
import numpy as np
from sklearn import preprocessing as pp
import os
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
import sklearn
import pickle

In [None]:
def get_class_label(df):
    """Encode the ``interaction`` column of *df* as integer class labels.

    Every distinct interaction value is given an integer id in order of
    first appearance.  As a side effect the value -> id mapping is pickled
    to ``../Data/class_dic.pickle`` (the directory must already exist).

    Args:
        df: DataFrame with an ``interaction`` column.

    Returns:
        A list holding one integer label per row of *df*, in row order.
    """
    interactions = df["interaction"]
    class_dic = {name: label for label, name in enumerate(interactions.unique())}

    # Walk the column values directly instead of looking rows up by the
    # labels 0..n-1, so a frame that was filtered or otherwise lacks a
    # default RangeIndex is still encoded correctly.
    class_label = [class_dic[name] for name in interactions]

    with open('../Data/class_dic.pickle', 'wb') as fw:
        pickle.dump(class_dic, fw)

    return class_label

def get_embedded_matrix(embedding_dic_path, MAX_LENGTH):
    """Load pickled drug embeddings and pack them into one padded 3-D array.

    The pickle is expected to hold a dict mapping a drug name to a list of
    embedding vectors (each vector must support ``.squeeze().tolist()``).
    Drug names are lower-cased.  Drugs with more than *MAX_LENGTH* vectors
    are dropped (and their names printed); every remaining drug is padded
    up to exactly *MAX_LENGTH* vectors with the last vector of the first
    remaining drug.

    Args:
        embedding_dic_path: Path of the pickled ``{drug name: [vector, ...]}``
            dictionary.
        MAX_LENGTH: Maximum (and padded) number of vectors per drug.

    Returns:
        A tuple ``(embedded_matrix, remove, drug_index)`` where
        ``embedded_matrix`` has shape ``(n_drugs, MAX_LENGTH, embedding_dim)``,
        ``remove`` lists the lower-cased names of the dropped drugs and
        ``drug_index`` maps each kept lower-cased name to its row in
        ``embedded_matrix``.

    Raises:
        ValueError: If no drug is left after dropping the over-long ones.
    """
    # NOTE(review): pickle.load can execute arbitrary code; only point this
    # at embedding files that come from a trusted source.
    with open(embedding_dic_path, 'rb') as fr:
        embedded_dic_upper = pickle.load(fr)

    embedded_dic = {key.lower(): vectors for key, vectors in embedded_dic_upper.items()}
    del embedded_dic_upper

    remove = []
    for name, vectors in embedded_dic.items():
        if len(vectors) > MAX_LENGTH:
            print(name)
            remove.append(str(name))
    for name in remove:
        del embedded_dic[name]

    if not embedded_dic:
        raise ValueError(
            "no drug embedding has at most MAX_LENGTH=%r vectors" % (MAX_LENGTH,)
        )

    # Rows are numbered only AFTER the over-long drugs are gone, so that
    # drug_index[name] always points at that drug's row in the matrix.
    drug_index = {name: row for row, name in enumerate(embedded_dic)}

    sequences = list(embedded_dic.values())
    pad = np.atleast_1d(np.array(sequences[0][-1].squeeze().tolist(), dtype=float))

    # Size the matrix from the data (number of kept drugs, embedding width)
    # and pre-fill every slot with the pad vector; real vectors then
    # overwrite the leading slots of each row.
    embedded_matrix = np.empty(shape=(len(sequences), MAX_LENGTH, pad.shape[-1]))
    embedded_matrix[...] = pad
    for row, vectors in enumerate(sequences):
        if len(vectors) > 0:
            embedded_matrix[row, :len(vectors)] = np.array(
                [vector.squeeze().tolist() for vector in vectors]
            )

    return embedded_matrix, remove, drug_index

def clean_df(df_use, remove):
    """Drop every drug pair that involves a drug listed in *remove*.

    Args:
        df_use: DataFrame with ``DrugA`` and ``DrugB`` columns.
        remove: Drug names to exclude.

    Returns:
        A new DataFrame without the rows whose ``DrugA`` or ``DrugB`` is in
        *remove*, re-indexed from 0.  *df_use* itself is left untouched.
    """
    excluded = list(remove)
    # One vectorised membership test per column instead of one full-column
    # scan per removed drug.
    involves_removed = df_use['DrugA'].isin(excluded) | df_use['DrugB'].isin(excluded)
    df_cleaned = df_use[~involves_removed].reset_index(drop=True)

    return df_cleaned

def get_embedded_df(df_cleaned, embedded_matrix, drug_index):
    """Replace the drug names in *df_cleaned* with their embedding arrays.

    The ``DrugA`` and ``DrugB`` columns are overwritten in place: each name
    is swapped for ``embedded_matrix[drug_index[name]]``.

    Args:
        df_cleaned: DataFrame with ``DrugA`` and ``DrugB`` name columns.
        embedded_matrix: Array indexed by row number along its first axis.
        drug_index: Mapping from drug name to row number in
            *embedded_matrix*.

    Returns:
        The same *df_cleaned* object, now holding arrays in both columns.

    Raises:
        KeyError: If a drug name is missing from *drug_index*.
    """
    for column in ("DrugA", "DrugB"):
        # Iterate over the cell values rather than looking rows up by the
        # labels 0..n-1, so the frame does not need a default RangeIndex.
        names = df_cleaned[column].tolist()
        df_cleaned[column] = [
            embedded_matrix[drug_index[name]] for name in tqdm_notebook(names)
        ]

    return df_cleaned

def cut_sample(count_sample_df, cut_thshold, embedded_df):
    """Drop under-represented classes from *embedded_df* and shuffle the rest.

    Args:
        count_sample_df: DataFrame indexed by class label that holds the
            number of rows per class, either in a ``count`` column or, when
            no such column exists, in its first column.
        cut_thshold: Classes with strictly fewer rows than this are removed.
        embedded_df: DataFrame with a ``class_label`` column.

    Returns:
        The rows of *embedded_df* whose class survived the cut, shuffled
        with the module-level ``random_seed``.
    """
    # pd.DataFrame(series.value_counts()) names its column 'count' only on
    # pandas >= 2.0; older versions reuse the series name, so fall back to
    # the first column when 'count' is absent.
    if 'count' in count_sample_df.columns:
        counts = count_sample_df['count']
    else:
        counts = count_sample_df.iloc[:, 0]

    under_idx = counts[counts < cut_thshold].index
    over_cut = embedded_df[~embedded_df['class_label'].isin(under_idx)]
    over_cut = sklearn.utils.shuffle(over_cut, random_state=random_seed)

    return over_cut

def _stack_embeddings(column, MAX_LENGTH):
    """Copy the per-row embedding arrays of *column* into one 3-D array."""
    cells = column.tolist()
    # Take the embedding width from the data instead of hard-coding it.
    embedding_dim = np.shape(cells[0])[-1] if cells else 0
    stacked = np.zeros(shape=(len(cells), MAX_LENGTH, embedding_dim))
    for row, cell in enumerate(tqdm_notebook(cells)):
        stacked[row] = cell
    return stacked


def _save_split(prefix, message, path, x_split, MAX_LENGTH):
    """Stack the DrugA/DrugB embeddings of one split and write them to *path*."""
    drug_a = _stack_embeddings(x_split['DrugA'], MAX_LENGTH)
    drug_b = _stack_embeddings(x_split['DrugB'], MAX_LENGTH)
    print(message)
    np.savez(path, **{prefix + '_A': drug_a, prefix + '_B': drug_b})
    # drug_a / drug_b go out of scope here, so only one split's arrays are
    # alive at any time.


def data_generate_save(over_thshold, MAX_LENGTH, test_size = 0.3, val_size = 0.3):
    """Split the data into train/validation/test sets and write them to disk.

    The frame is first split into train+validation vs. test, then the
    train+validation part is split again into train vs. validation.  Both
    splits use the module-level ``random_seed``.

    Files written (the ``../Generated_data`` directory must exist):
        * ``train_set.npz`` / ``test_set.npz`` / ``val_set.npz`` holding the
          arrays ``<split>_A`` and ``<split>_B``.
        * ``one_hot_train.csv`` / ``one_hot_test.csv`` / ``one_hot_val.csv``
          holding the one-hot encoded class labels, indexed like the rows of
          *over_thshold*.

    Args:
        over_thshold: DataFrame with ``DrugA`` and ``DrugB`` columns holding
            one embedding array per cell, and a ``class_label`` column.
        MAX_LENGTH: Size of the second axis of the saved arrays.
        test_size: Fraction of all rows held out as the test set.
        val_size: Fraction of the remaining rows held out for validation.

    Returns:
        None.
    """
    x = over_thshold[["DrugA", "DrugB"]]
    y = over_thshold['class_label']

    # Honour the test_size / val_size arguments (defaults keep the 0.3 / 0.3
    # split that used to be hard-coded).
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=test_size, random_state=random_seed)
    x_train, x_val, y_train, y_val = train_test_split(
        x_train, y_train, test_size=val_size, random_state=random_seed)

    _save_split('train', 'Saving training set...',
                '../Generated_data/train_set.npz', x_train, MAX_LENGTH)
    _save_split('test', 'Saving test set...',
                '../Generated_data/test_set.npz', x_test, MAX_LENGTH)
    _save_split('val', 'Saving validation set...',
                '../Generated_data/val_set.npz', x_val, MAX_LENGTH)

    # Encode every split against the full set of classes so the three CSVs
    # always share the same columns, even if a class is missing from a split.
    class_dtype = pd.CategoricalDtype(categories=sorted(y.unique()))
    one_hot_train = pd.get_dummies(y_train.astype(class_dtype))
    one_hot_test = pd.get_dummies(y_test.astype(class_dtype))
    one_hot_val = pd.get_dummies(y_val.astype(class_dtype))

    print('Saving one hot data...')
    one_hot_test.to_csv('../Generated_data/one_hot_test.csv', header = True, index = True)
    one_hot_val.to_csv('../Generated_data/one_hot_val.csv', header = True, index = True)
    one_hot_train.to_csv('../Generated_data/one_hot_train.csv', header = True, index = True)
    
def generate_data(max_sample_num, input_sequence_length, df):
    """Run the whole preprocessing pipeline on *df* and write the results to disk.

    Steps, in order: encode the interaction labels, load and pad the drug
    embeddings (path taken from the module-level ``embedding_dic_path``),
    lower-case the drug names, drop pairs that use a removed drug, save the
    cleaned name table to ``../Generated_data/base_df.csv``, swap names for
    embeddings, drop classes with fewer than *max_sample_num* rows, and
    finally split and save the data sets.

    Args:
        max_sample_num: Row-count threshold handed to ``cut_sample``.
        input_sequence_length: Maximum number of vectors per drug.
        df: DataFrame whose first two columns are ``DrugA`` and ``DrugB``
            and which has an ``interaction`` column.
    """
    labels = pd.DataFrame(get_class_label(df), columns=["class_label"])
    drug_pairs = df.iloc[:, :2]
    labelled_pairs = pd.concat([drug_pairs, labels], axis=1)

    matrix, dropped_drugs, row_of_drug = get_embedded_matrix(
        embedding_dic_path, input_sequence_length
    )

    # Embedding keys are lower-cased, so the names must be as well.
    for column in ('DrugA', 'DrugB'):
        labelled_pairs[column] = labelled_pairs[column].apply(lambda name: name.lower())

    cleaned = clean_df(labelled_pairs, dropped_drugs)
    # Saved before the name columns are overwritten with embeddings below.
    cleaned.to_csv('../Generated_data/base_df.csv')

    embedded = get_embedded_df(cleaned, matrix, row_of_drug)
    rows_per_class = pd.DataFrame(embedded['class_label'].value_counts())
    kept_rows = cut_sample(rows_per_class, max_sample_num, embedded)
    data_generate_save(kept_rows, input_sequence_length)
    
# Seed read as a global by cut_sample() and data_generate_save().
random_seed = 0
# DataFrame whose class labels have already been grouped.
df = pd.read_csv('../Data/DDI.csv')
# Passed on as MAX_LENGTH: drugs with more embedding vectors than this are dropped.
input_sequence_length = 128
# Forwarded to cut_sample() as its threshold: classes with fewer rows than
# this are removed (so despite the name it acts as a minimum).
max_sample_num = 10000
# Pickled drug-embedding dictionary; read as a global inside generate_data().
embedding_dic_path = '../Data/drug_vec_dic_BioSentVec_sent.pickle'

# Run the full pipeline; all outputs are written to disk.
generate_data(max_sample_num, input_sequence_length, df)