This notebook converts the CellLine dataset used for NCMF into a format that can be used by DCMF.

In [1]:
import pandas as pd
import numpy as np

In [2]:
data_folder = "../../datasets/NCMF/"
dataset_name = "CellLine"
sample_no = 3
target_link_type = 3

In [3]:
node_file = f'sampled{sample_no}_node.dat'
link_file = f'sampled{sample_no}_link.dat'
link_test_file = f'sampled{sample_no}_link.dat.test'
label_file = f'sampled{sample_no}_label.dat'
label_test_file = f'sampled{sample_no}_label.dat.test'
meta_file = f'sampled{sample_no}_meta.dat'
info_file = f'sampled{sample_no}_info.dat'
record_file = f'sampled{sample_no}_record.dat'

In [4]:
cellline_count = 692
genes_count = 638
drugs_count = 396

In [5]:
# Creating matrix from link.dat file 

In [6]:
link_train_df = pd.read_csv(data_folder + dataset_name + "/" + link_file, sep = "\t", header = None)
link_train_df.columns = ["left", "right", "link_type", "link_value"]
link_train_df.head()

Unnamed: 0,left,right,link_type,link_value
0,0,722,0,1.0
1,0,835,0,1.0
2,0,974,0,1.0
3,0,1066,0,1.0
4,0,1068,0,1.0


In [7]:
link_train_df.tail()

Unnamed: 0,left,right,link_type,link_value
872365,691,1719,3,0.169045
872366,691,1720,3,0.043135
872367,691,1721,3,0.196274
872368,691,1722,3,0.024832
872369,691,1724,3,0.024583


In [8]:
link_train_df0 = link_train_df[link_train_df["link_type"] == 0]
link_train_df1 = link_train_df[link_train_df["link_type"] == 1]
link_train_df2 = link_train_df[link_train_df["link_type"] == 2]
link_train_df3 = link_train_df[link_train_df["link_type"] == 3]

In [9]:
link_train_df0.head()

Unnamed: 0,left,right,link_type,link_value
0,0,722,0,1.0
1,0,835,0,1.0
2,0,974,0,1.0
3,0,1066,0,1.0
4,0,1068,0,1.0


In [10]:
link_train_df0.tail()

Unnamed: 0,left,right,link_type,link_value
19977,691,1128,0,1.0
19978,691,1267,0,1.0
19979,691,1275,0,1.0
19980,691,1282,0,1.0
19981,691,1299,0,1.0


In [11]:
mut_np = np.zeros((cellline_count, genes_count))
mut_np.shape

(692, 638)

In [12]:
for idx, v in link_train_df0.iterrows():
    mut_np[int(v["left"])][int(v["right"]) - cellline_count] = v["link_value"]

In [13]:
cnv_np = np.zeros((cellline_count, genes_count))
print(cnv_np[1])

[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.

In [14]:
link_train_df1.head()

Unnamed: 0,left,right,link_type,link_value
19982,0,692,1,1.0
19983,0,693,1,1.0
19984,0,694,1,1.0
19985,0,697,1,-1.0
19986,0,698,1,1.0


In [15]:
for idx, v in link_train_df1.iterrows():
    cnv_np[int(v["left"])][int(v["right"]) - cellline_count] = v["link_value"]
print(cnv_np[1])

[ 0.  0.  0.  0. -1.  0.  0.  0.  0.  0.  0.  0.  0. -1.  0. -1. -1.  0.
  0.  0.  0. -1.  0.  0.  0.  0.  0.  0.  1.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0. -1.  1.  0.  0.  0.  0.  0.  1.  0.  0.  0.
  0.  1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0. -1. -1.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  1.  0.  0.  0. -1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0. -1.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  1.  0.  0.  0.  0.  0.  0.  0. -1.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0. -1.  0.  0.  0.  0.  0. -1.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0. -1.  0.  0.  0.  0.  1.  0.  0.
 -1.  1.  0.  0.  0.  0.  0.  0.  0. -1.  0.  0.  0

In [16]:
gene_exp_np = np.zeros((cellline_count, genes_count))
print(gene_exp_np[0])

[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.

In [17]:
link_train_df2.head()

Unnamed: 0,left,right,link_type,link_value
232761,0,692,2,5.140779
232762,0,693,2,2.998196
232763,0,694,2,11.418643
232764,0,695,2,3.434295
232765,0,696,2,4.670161


In [18]:
for idx, v in link_train_df2.iterrows():
    gene_exp_np[int(v["left"])][int(v["right"]) - cellline_count] = v["link_value"]
print(gene_exp_np[0])

[ 5.14077866  2.9981955  11.41864325  3.43429462  4.67016051  2.91647664
  4.51349075  5.92908084  6.50239426  8.65710404  2.42760617  1.40599236
  2.7548875   0.37851162  2.49825087  5.61382629  4.07895134  0.
  2.8738132   4.62819033  4.3305584   3.45285896  5.09423607  3.90400232
  3.64385619  3.06867081  2.45154083  0.12432814  2.7441611   4.10601324
  2.69376571  3.91073266  5.86319511  3.5360529   4.86542398  3.6123525
  6.75234719  6.08193608  5.17072628  5.30487605  5.25436728  4.30378075
  6.75368476  5.9590746   3.63923216  1.62293035  1.28095631  9.82875533
  5.77689357  1.01435529  5.26903315  2.592158    2.22342255  3.31469653
  0.          0.83187724  7.11779892  3.36597243  3.09423607  4.91456452
  3.51222689  3.93545975  4.53106949  0.48542683  2.64385619  4.76022095
  3.05311134  3.88655015  2.50334874  4.30669971  3.31614574  3.20320116
  0.31034012  9.47664496  0.40053793  4.15055968  4.95559179  5.29351763
  3.32192809  1.22650853  7.44683518  0.05658353  4.33842441

In [19]:
link_train_df3

Unnamed: 0,left,right,link_type,link_value
653144,0,1330,3,0.221568
653145,0,1331,3,0.048679
653146,0,1332,3,0.159713
653147,0,1333,3,0.351311
653148,0,1335,3,0.063590
...,...,...,...,...
872365,691,1719,3,0.169045
872366,691,1720,3,0.043135
872367,691,1721,3,0.196274
872368,691,1722,3,0.024832


In [20]:
audrc_np = np.zeros((cellline_count, drugs_count))
for idx, v in link_train_df3.iterrows():
    audrc_np[int(v["left"])][int(v["right"]) - cellline_count - genes_count] = v["link_value"]
print(audrc_np[0])

[0.221568  0.048679  0.159713  0.3513112 0.        0.06359   0.
 0.597878  0.3404    0.164097  0.051614  0.        0.        0.015288
 0.015264  0.016689  0.020047  0.50838   0.052735  0.015267  0.019031
 0.492478  0.        0.075912  0.        0.015264  0.        0.12433
 0.028404  0.244407  0.018518  0.        0.015264  0.        0.
 0.        0.0744356 0.078227  0.051878  0.031134  0.025371  0.
 0.045952  0.225517  0.        0.281124  0.168014  0.164253  0.05078
 0.027791  0.082758  0.132223  0.        0.        0.489789  0.020327
 0.020155  0.016595  0.217188  0.491202  0.        0.099333  0.117592
 0.11653   0.250651  0.026004  0.027015  0.029272  0.036893  0.247583
 0.        0.101137  0.023246  0.040416  0.015324  0.044405  0.
 0.047098  0.016411  0.017802  0.        0.018969  0.087263  0.234846
 0.17954   0.02762   0.0738166 0.        0.        0.        0.0921958
 0.        0.05995   0.218343  0.009264  0.023891  0.015264  0.0185304
 0.038635  0.037053  0.016356  0.        0.5

In [21]:
audrc_test_np = np.zeros((cellline_count, drugs_count))
link_test_df = pd.read_csv(data_folder + dataset_name + "/" + link_test_file, sep = "\t", header = None)
link_test_df.columns = ["left", "right", "link_value"]
link_test_df.head()

Unnamed: 0,left,right,link_value
0,598,1375,0.380364
1,454,1371,0.312764
2,248,1572,0.209578
3,0,1651,0.115166
4,468,1508,0.005335


In [22]:
for idx, v in link_test_df.iterrows():
    audrc_test_np[int(v["left"])][int(v["right"]) - cellline_count - genes_count] = v["link_value"]

In [23]:
# Rdoublets creation
Rdoublets = pd.DataFrame(columns = ["left", "right"])
Rdoublets["left"] = link_train_df3["left"].copy()
Rdoublets["left"] = Rdoublets["left"]
Rdoublets["right"] = link_train_df3["right"].copy()
Rdoublets["right"] = Rdoublets["right"] - cellline_count - genes_count

In [24]:
Rdoublets.head()

Unnamed: 0,left,right
653144,0,0
653145,0,1
653146,0,2
653147,0,3
653148,0,5


In [25]:
# getting traing and test idx
link_train_df3["indices"] = link_train_df3.apply(lambda row: int((row["left"]) * drugs_count + (row["right"] - cellline_count - genes_count)), axis = 1)
link_train_df3.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,left,right,link_type,link_value,indices
653144,0,1330,3,0.221568,0
653145,0,1331,3,0.048679,1
653146,0,1332,3,0.159713,2
653147,0,1333,3,0.351311,3
653148,0,1335,3,0.06359,5


In [26]:
# getting traing and test idx
link_test_df["indices"] = link_test_df.apply(lambda row: int((row["left"]) * drugs_count + (row["right"] - cellline_count - genes_count)), axis = 1)
link_test_df.head()

Unnamed: 0,left,right,link_value,indices
0,598,1375,0.380364,236853
1,454,1371,0.312764,179825
2,248,1572,0.209578,98450
3,0,1651,0.115166,321
4,468,1508,0.005335,185506


In [27]:
link_test_df.tail()

Unnamed: 0,left,right,link_value,indices
54801,321,1700,0.013843,127486
54802,168,1473,0.773214,66671
54803,349,1717,0.419108,138591
54804,3,1445,0.102049,1303
54805,300,1676,0.159679,119146


In [28]:
# Saving matrices
import pickle

In [29]:
with open(f"../data/{dataset_name}/sample{sample_no}/" + "X_13_train_fold_1.pkl", "wb") as f:
    pickle.dump(audrc_np, f)

In [30]:
with open(f"../data/{dataset_name}/sample{sample_no}/" + "X_13_test_fold_1.pkl", "wb") as f:
    pickle.dump(audrc_test_np, f)

In [31]:
with open(f"../data/{dataset_name}/sample{sample_no}/" + "X_13_train_idx_1.pkl", "wb") as f:
    pickle.dump(link_train_df3["indices"].values, f)

In [32]:
with open(f"../data/{dataset_name}/sample{sample_no}/" + "X_13_test_idx_1.pkl", "wb") as f:
    pickle.dump(link_test_df["indices"].values, f)

In [33]:
with open(f"../data/{dataset_name}/sample{sample_no}/" + "X_12_mut.pkl", "wb") as f:
    pickle.dump(mut_np, f)

In [34]:
with open(f"../data/{dataset_name}/sample{sample_no}/" + "X_12_cnv.pkl", "wb") as f:
    pickle.dump(cnv_np, f)

In [35]:
with open(f"../data/{dataset_name}/sample{sample_no}/" + "X_12_gene_exp.pkl", "wb") as f:
    pickle.dump(gene_exp_np, f)

In [36]:
with open(f"../data/{dataset_name}/sample{sample_no}/" + "R_doublets_1.pkl", "wb") as f:
    pickle.dump(Rdoublets.values, f)