In [3]:
import numpy as np
import pandas as pd
import pickle as pkl
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split

In [4]:
# Load the training features, the test features and the scored training
# targets from CSV; every frame uses the `sig_id` column as its index.
data, test_data, targets = (
    pd.read_csv(csv_path, index_col="sig_id")
    for csv_path in (
        "data/train_features.csv",
        "data/test_features.csv",
        "data/train_targets_scored.csv",
    )
)

In [5]:
# Show the label (column) names of the targets frame.
targets.columns

Index(['5-alpha_reductase_inhibitor', '11-beta-hsd1_inhibitor',
       'acat_inhibitor', 'acetylcholine_receptor_agonist',
       'acetylcholine_receptor_antagonist', 'acetylcholinesterase_inhibitor',
       'adenosine_receptor_agonist', 'adenosine_receptor_antagonist',
       'adenylyl_cyclase_activator', 'adrenergic_receptor_agonist',
       ...
       'tropomyosin_receptor_kinase_inhibitor', 'trpv_agonist',
       'trpv_antagonist', 'tubulin_inhibitor', 'tyrosine_kinase_inhibitor',
       'ubiquitin_specific_protease_inhibitor', 'vegfr_inhibitor', 'vitamin_b',
       'vitamin_d_receptor_agonist', 'wnt_inhibitor'],
      dtype='object', length=206)

In [6]:
# (rows, columns) of the training feature frame.
data.shape

(23814, 875)

In [7]:
# (rows, columns) of the scored-targets frame.
targets.shape

(23814, 206)

In [8]:
# (rows, columns) of the test feature frame.
test_data.shape

(3982, 875)

In [9]:
# Distinct values present in the `cp_type` column of the training features.
data["cp_type"].unique()

array(['trt_cp', 'ctl_vehicle'], dtype=object)

In [10]:
# Distinct values present in the `cp_dose` column of the training features.
data["cp_dose"].unique()

array(['D1', 'D2'], dtype=object)

In [11]:
# Columns that are one-hot encoded instead of being used as raw values.
cat_f = ["cp_type", "cp_time", "cp_dose"]

# Fit the encoder on the training frame only, then encode those same rows.
# The encoder output is converted to a dense array for later concatenation.
oh = OneHotEncoder().fit(data[cat_f].values)
cats = oh.transform(data[cat_f].values).toarray()

In [12]:
# Every feature column that is not one-hot encoded. Derived from `cat_f`
# rather than a positional slice so the selection stays correct even if the
# CSV column order changes; the remaining columns keep their original order.
non_cats = data.columns.drop(cat_f)

In [13]:
# Training design matrix: the non-categorical columns followed by the
# one-hot columns, joined along the column axis.
train_X = np.concatenate(
    (data.loc[:, non_cats].values, cats),
    axis=1,
)

In [14]:
# Sanity check: shape of the assembled training matrix.
train_X.shape

(23814, 879)

In [15]:
# Encode the test rows with the encoder that was fitted on the training
# frame (transform only, no refit) and densify the result.
test_cats = oh.transform(test_data.loc[:, cat_f].values).toarray()

In [16]:
# Test design matrix, laid out like the training one: the non-categorical
# columns followed by the one-hot columns.
test_X = np.concatenate(
    (test_data.loc[:, non_cats].values, test_cats),
    axis=1,
)

In [17]:
# Sanity check: shape of the assembled test matrix.
test_X.shape

(3982, 879)

In [18]:
# Serialize the fitted one-hot encoder to `oh.pkl` in the working directory.
with open("oh.pkl", "wb") as encoder_file:
    pkl.dump(oh, encoder_file)

In [19]:
# Split row positions (not the arrays themselves) so features and targets can
# be sliced with the same indices. No test_size is given, so scikit-learn's
# default hold-out fraction applies. The fixed random_state makes the split,
# and therefore the saved .npy files, identical on every Restart & Run All.
train_ind, test_ind = train_test_split(
    np.arange(train_X.shape[0]), random_state=42
)

In [20]:
# Number of row indices assigned to the training split.
train_ind.shape

(17860,)

In [21]:
# Number of row indices in the held-out split.
test_ind.shape

(5954,)

In [22]:
# Features and labels are paired purely by row position below, so make sure
# both frames list the same sig_ids in the same order before slicing;
# otherwise the saved X/Y files would be silently mismatched.
assert data.index.equals(targets.index), (
    "train features and targets are not row-aligned on sig_id"
)

# Training split: features and matching target rows.
np.save("train_X.npy", train_X[train_ind])
np.save("train_Y.npy", targets.values[train_ind])

# Held-out split, written under the "val" names.
np.save("val_X.npy", train_X[test_ind])
np.save("val_Y.npy", targets.values[test_ind])

# Full test matrix (no labels are saved for it).
np.save("testing_X.npy", test_X)