In [1]:
# NOTE(review): no other import of pd / np / os / tqdm / pickle and no definition of
# root / gid2uid / name2id is visible in this notebook, so they presumably all come
# from this star import — confirm against utils.
from utils import *
import warnings
# Suppress every warning raised after this point.
warnings.filterwarnings('ignore')

**UniProt Search**

In [17]:
# Write the stringified keys of gid2uid as one comma-separated line
# (presumably to be pasted into the UniProt search named in the heading above).
entrez_ids = list(map(str, gid2uid.keys()))
with open(f'{root}/mapping/gid_text.txt', 'w') as id_file:
    id_file.write(','.join(entrez_ids))

In [27]:
# Load the raw tab-separated mapping and keep only the first row for each value of
# the 'From' column, then store the de-duplicated table as CSV.
gid2protein_raw = pd.read_csv(f'{root}/mapping/gid2protein_raw.tsv', sep='\t')
is_repeat = gid2protein_raw.duplicated(subset='From', keep='first')
gid2protein = gid2protein_raw[~is_repeat]
gid2protein.to_csv(f'{root}/mapping/gid2protein.csv', index=False)

In [5]:
# Reload the de-duplicated mapping and collect the distinct values of its 'From' column.
gid2protein = pd.read_csv(f'{root}/mapping/gid2protein.csv')
protein_coding_gids = set(gid2protein.loc[:, 'From'])

**Cell-line Embeddings**

In [9]:
import re

# Load the stored embedding object; it is indexed by DepMap ID further down.
cell_path = '/home/qingyuyang/SynergyX/data/0_cell_data/4079g/985_cellGraphs_exp_mut_cn_eff_dep_met_4079_genes_norm.npy'
cell_embedding = np.load(cell_path, allow_pickle=True).item()

# Normalise cell-line names (strip spaces and hyphens, upper-case) and pair each
# normalised name with its DepMap ID. Later rows win if two names normalise identically.
cell_info = pd.read_csv('/home/qingyuyang/SynergyX/data/raw_data/cell_info.csv')
depmap_id = cell_info['depmap_id'].tolist()
cell_line_names = [re.sub(r'[ -]', '', raw_name).upper() for raw_name in cell_info['cell_line_name'].tolist()]
clname2id = dict(zip(cell_line_names, depmap_id))

# Re-key the embeddings by normalised cell-line name and cache the result on disk.
clname2embed = {name: cell_embedding[dep_id] for name, dep_id in clname2id.items()}
np.save(f'{root}/mapping/clname2embed.npy', clname2embed, allow_pickle=True)

In [3]:
# Reload the name -> embedding dict written by the cell above; np.save wrapped the
# dict in an object array, so .item() is needed to get the dict back.
clname2embed = np.load(f'{root}/mapping/clname2embed.npy', allow_pickle=True).item()

**Preprocess SLKB**

In [2]:
sl_table_fp = os.path.join(root, 'SLKB/SLKB_original_scores.csv')

# Keep the four columns of interest, encode the textual label as 0/1, and give the
# columns short names.
df = (
    pd.read_csv(sl_table_fp)[['cell_line_origin', 'gene_1', 'gene_2', 'SL_or_not']]
    .replace({"SL_or_not": {'Not SL': 0, 'SL': 1}})
)
df.columns = ['cell_line', 'g1', 'g2', 'label']

# Number of rows, and number of distinct genes over both gene columns.
print(len(df), len(set(df['g1']).union(df['g2'])) )
df.head()

280483 6125


Unnamed: 0,cell_line,g1,g2,label
0,RPE1,AKT1,AMBRA1,0
1,RPE1,AKT3,AMBRA1,0
2,RPE1,ARF6,AMBRA1,0
3,RPE1,ATF4,AMBRA1,0
4,RPE1,ATG10,AMBRA1,0


In [3]:
# format_df and df_gene2id are defined outside this notebook; judging by the recorded
# output they leave columns named '0'..'3' with numeric gene IDs in '0' and '1'
# — confirm in utils.
df = df_gene2id(format_df(df))

# Number of rows, and number of distinct gene IDs over both gene columns.
print(len(df), len(set(df['0']).union(df['1'])))
df.head()

253811 5906


Unnamed: 0,0,1,2,3
0,207,55626,0,RPE1
1,10000,55626,0,RPE1
2,382,55626,0,RPE1
3,468,55626,0,RPE1
4,83734,55626,0,RPE1


In [6]:
# How many IDs appear both in protein_coding_gids and among the keys of gid2uid.
print("Gene num after alignment", len(set(protein_coding_gids) & set(gid2uid.keys())))

# 1st alignment: keep pairs whose two gene IDs are both keys of gid2uid.
df = df.loc[df['0'].isin(gid2uid.keys()) & df['1'].isin(gid2uid.keys())]
print("After 1st alignment, SLKB has", len(df), "items and", len(set(df['0']).union(df['1'])), "unique genes")

# 2nd alignment: keep pairs whose two gene IDs are both in protein_coding_gids.
df = df.loc[df['0'].isin(protein_coding_gids) & df['1'].isin(protein_coding_gids)]
print("After 2nd alignment, SLKB has", len(df), "items and", len(set(df['0']).union(df['1'])), "unique genes")

# NOTE: Remove those with contradicting labels!
df = remove_rows_with_condition(df)
print("After all alignment, SLKB has", len(df), "items and", len(set(df['0']).union(df['1'])), "unique genes")

Gene num after alignment 9807
After 1st alignment, SLKB has 211688 items and 3024 unique genes
After 2nd alignment, SLKB has 211640 items and 3016 unique genes
After all alignment, SLKB has 211602 items and 3016 unique genes


In [22]:
# Persist the aligned, de-conflicted SLKB table (without the index) for later sessions.
df.to_csv(f'{root}/SLKB/SLKB_processed.csv', index=False)

In [7]:
# Reload the processed table saved by the previous cell (lets later cells run
# without repeating the preprocessing above).
df = pd.read_csv(f'{root}/SLKB/SLKB_processed.csv')

**Count Statistics**

In [2]:
# Output locations under the SLKB folder: one for per-cell-line tables and one for
# the gene-frequency figures.
root_path = f'{root}/SLKB'
table_save_path = f'{root_path}/cell_line/table'
fig_save_path = f'{root_path}/cell_line/gene_count'

os.makedirs(table_save_path, exist_ok=True)
os.makedirs(fig_save_path, exist_ok=True)

In [6]:
# Distinct cell-line names in order of first appearance. A plain list(set(...)) has an
# order that can change between kernel sessions (string hashing is randomised), which
# would make every per-cell-line loop below run in a different order on each run.
cell_lines = df['3'].unique().tolist()
print('In total', len(cell_lines), 'cell lines')

In total 22 cell lines


In [7]:
column_names = ['cell_line', '# pairs', '# pos', '# neg', 'n/p ratio', '# genes', '# unique genes', 'cell_embed']

# Collect one record per cell line and build the frame once at the end. Writing
# string/float rows into a frame pre-filled with integer zeros relies on silent
# dtype upcasting, which pandas has deprecated.
stat_rows = []
for cl in cell_lines:
    # Save this cell line's subset of the SLKB table next to the statistics.
    cl_df = select_cl_from_slkb(df, cl)
    cl_df.to_csv(f'{table_save_path}/{cl}.csv', index=False)

    total, pos, neg, ratio = count_pn_ratio(cl_df)
    _, gene_set, gene_count = count_gene_freq(cl_df)
    # Number of genes whose recorded count is exactly 1.
    unique_gene_num = list(gene_count.values()).count(1)

    # 1 if an embedding is available for this cell line, else 0.
    if_cell_embed = 1 if cl in clname2embed else 0
    stat_rows.append([cl, total, pos, neg, ratio, len(gene_set), unique_gene_num, if_cell_embed])

    visualize_gene_freq(gene_count, save_path=f'{fig_save_path}', title=cl)

stat_df = pd.DataFrame(stat_rows, columns=column_names)
stat_df = stat_df.sort_values(by=["# unique genes", "# genes"], ascending=False, ignore_index=True)
stat_df.to_csv(f'{table_save_path}/overall_statistics.csv', index=False)
stat_df

Unnamed: 0,cell_line,# pairs,# pos,# neg,n/p ratio,# genes,# unique genes,cell_embed
0,MEWO,2538,279,2259,8.0968,2139,1138,1
1,MEL202,1952,372,1580,4.2473,1587,779,0
2,IPC298,1952,315,1637,5.1968,1587,779,1
3,PK1,1952,372,1580,4.2473,1587,779,0
4,GI1,1952,238,1714,7.2017,1587,779,0
5,HS936T,1952,338,1614,4.7751,1587,779,0
6,HS944T,1952,278,1674,6.0216,1587,779,0
7,PATU8988S,1952,331,1621,4.8973,1587,779,0
8,HSC5,1952,370,1582,4.2757,1587,779,0
9,MELJUSO,2230,279,1951,6.9928,1593,776,0


From the table above, we found that:
1. HT29 and 786O are unsuitable for SL prediction because they have too few positive samples;
2. Cell lines 15-19 have no 'Tail' scenes;
3. Cell lines 1-8 share the same genes, which makes them suitable for transfer scenes;
4. PC9's genes are all unique, so its C1, C2, and Tail scenes are equivalent.

**Cut C1/C2/Tail scenes**

In [5]:
# Root folder for the per-cell-line C1 / C2 / Tail splits written by the next cell.
specific_save_path = f'{root_path}/specific'
os.makedirs(specific_save_path, exist_ok=True)

In [5]:
# Reload the statistics table and take every cell line except the last two rows.
# NOTE(review): new_cell_lines is not used anywhere in the visible notebook; the loop
# below only processes 'RPE1' — confirm whether it should iterate new_cell_lines.
stat_df = pd.read_csv(f'{table_save_path}/overall_statistics.csv')
new_cell_lines = stat_df['cell_line'][:-2]

# split_cv, C2 and tail are defined outside this notebook; each receives the
# cell-line table and an output directory. Presumably they write the
# sl_train_{fold}.csv files that a later cell reads from these folders — confirm in utils.
for cl in tqdm(['RPE1']):
    cl_df = pd.read_csv(f'{table_save_path}/{cl}.csv')

    # C1 scene
    split_cv(cl_df, f'{specific_save_path}/{cl}/C1')

    # C2 scene
    C2(cl_df, f'{specific_save_path}/{cl}/C2')

    # Tail scene
    tail(cl_df, f'{specific_save_path}/{cl}/Tail')

100%|██████████| 1/1 [00:00<00:00,  1.29it/s]


In [6]:
# Summarise the split files under specific_save_path. The helper is defined outside
# this notebook; the recorded output shows one row per cell line / scene / fold /
# train-or-test file.
count_specific_statistics(specific_save_path)

60it [00:27,  2.18it/s]


Unnamed: 0,cell_line,scene,fold,train(1)/test(0),# pairs,# pos,# neg,n/p ratio,# genes,# unique genes
0,MEWO,C1,0,1,2030,232,1798,7.7500,1898,1077
1,MEWO,C1,0,0,508,47,461,9.8085,734,559
2,MEWO,C1,1,1,2030,222,1808,8.1441,1896,1102
3,MEWO,C1,1,0,508,57,451,7.9123,750,585
4,MEWO,C1,2,1,2030,210,1820,8.6667,1924,1117
...,...,...,...,...,...,...,...,...,...,...
525,22RV1,C2,2,0,315,82,233,2.8415,44,0
526,22RV1,C2,3,1,595,104,491,4.7212,35,0
527,22RV1,C2,3,0,315,88,227,2.5795,44,0
528,22RV1,C2,4,1,630,168,462,2.7500,36,0


**Split Negative/Positive Ratio (Optional)**

In [2]:
import random

def cut_pnratio_data(df: pd.DataFrame, factor: int, seed: 'int | None' = None) -> pd.DataFrame:
    """
    Down-sample the negative rows of ``df`` to a target negative/positive ratio.

    Rows with ``df['2'] == 1`` are positives and are all kept; rows with
    ``df['2'] == 0`` are negatives, of which ``factor * n_pos`` are drawn at random
    without replacement.

    df: table whose column '2' holds the 0/1 label.
    factor: the target ratio of negative samples over positive samples (e.g. 1, 5 or 20).
        If fewer than ``factor * n_pos`` negatives exist, every negative is kept
        instead of raising ``ValueError``.
    seed: optional seed for a private random generator, giving reproducible sampling;
        when None the global ``random`` state is used, as before.

    Returns the selected rows, positives first, then the sampled negatives.
    """
    pos = df[df['2'] == 1].index.tolist()
    neg = df[df['2'] == 0].index.tolist()

    # random.sample refuses to draw more items than the population holds, so cap
    # the request at the number of negatives that are actually available.
    n_neg_wanted = min(factor * len(pos), len(neg))
    rng = random if seed is None else random.Random(seed)
    selected_neg = rng.sample(neg, n_neg_wanted)

    return df.loc[pos + selected_neg]

In [5]:
# Imported here because the only other `import itertools` sits in a later cell, so
# this cell would fail on a fresh top-to-bottom run without it.
import itertools

cell_lines_to_cut = ['K562', 'JURKAT']  # previously also: 'A549', 'A375'
scenes = ['C1', 'C2']  # 'Tail' is currently excluded

# For every (cell line, scene) pair, write 1:1 and 1:5 down-sampled versions of each
# training fold into sub-folders next to the original split files.
for cl, scene in itertools.product(cell_lines_to_cut, scenes):
    data_path = f'{root_path}/specific/{cl}/{scene}'
    os.makedirs(f'{data_path}/1:1', exist_ok=True)
    os.makedirs(f'{data_path}/1:5', exist_ok=True)

    for fold in range(5):
        original_df = pd.read_csv(f'{data_path}/sl_train_{fold}.csv')
        new_df_1 = cut_pnratio_data(original_df, 1)
        new_df_5 = cut_pnratio_data(original_df, 5)

        new_df_1.to_csv(f'{data_path}/1:1/sl_train_{fold}.csv', index=False)
        new_df_5.to_csv(f'{data_path}/1:5/sl_train_{fold}.csv', index=False)

**Form Transfer scenes**

In [3]:
import itertools

transfer_save_path = f'{root_path}/transfer'
os.makedirs(transfer_save_path, exist_ok=True)

# Every ordered (source, target) pair of the three cell lines: the unordered pairs
# first, followed by the same pairs reversed.
cell_names = ['A549', 'GI1', 'MEL202']
combinations = list(itertools.combinations(cell_names, 2))
new_comb = [(second, first) for first, second in combinations]
whole_comb = combinations + new_comb

# The source cell line's full table becomes the training file and the target's
# full table becomes the test file of the transfer scene.
for source, target in tqdm(whole_comb):
    output_dir = f'{transfer_save_path}/{source}_{target}'
    os.makedirs(output_dir, exist_ok=True)

    train_df = pd.read_csv(f'{table_save_path}/{source}.csv')
    test_df = pd.read_csv(f'{table_save_path}/{target}.csv')

    train_df.to_csv(f'{output_dir}/sl_train_0.csv', index=False)
    test_df.to_csv(f'{output_dir}/sl_test_0.csv', index=False)

100%|██████████| 6/6 [00:00<00:00, 45.95it/s]


**ESM4SL: can only deal with genes with protein sequence**

In [15]:
# All distinct gene IDs over both gene columns of the processed table. The list order
# comes from set iteration, so it is not meaningful.
all_gene_set = list(set(df['0']) | set(df['1']))
# form_id_seq_list is defined outside this notebook; presumably it pairs each gene ID
# with its protein sequence from gid2protein — confirm in utils.
all_id_seq = form_id_seq_list(all_gene_set, gid2protein)  # 3016 genes

# Pickle the result under the SLKB folder.
with open(f'{root_path}/all_id_seq.pkl', 'wb') as f:
    pickle.dump(all_id_seq, f)

**SLGNNCT: change all genes back to name, to match its KG database**

In [21]:
# Invert name2id into an ID -> name lookup with integer keys. If several names share
# the same ID, the one iterated last wins.
id2name = {int(idx): name for name, idx in name2id.items()}

In [None]:
slgnnct_specific_save_path = f'{root_path}/slgnnct/specific'
os.makedirs(slgnnct_specific_save_path, exist_ok=True)

# Rewrite every split file with gene names instead of gene IDs.
# The splits are read from specific_save_path, where the C1 / C2 / Tail cell of this
# notebook writes them; the table folder only holds one flat CSV per cell line.
# NOTE(review): the scene list follows the folders this notebook produces
# (C1, C2, Tail); confirm that SLGNNCT does not expect the third scene to be named 'C3'.
for cl in cell_lines:
    for scene in ['C1', 'C2', 'Tail']:
        raw_path = f'{specific_save_path}/{cl}/{scene}'
        if not os.path.isdir(raw_path):
            # Not every cell line has every scene; report it rather than crash halfway.
            print(f'Skipping {cl}/{scene}: {raw_path} does not exist')
            continue

        save_path = f'{slgnnct_specific_save_path}/{cl}/{scene}'
        os.makedirs(save_path, exist_ok=True)

        for fn in sorted(os.listdir(raw_path)):
            # Skip anything that is not a split file, e.g. the '1:1' and '1:5'
            # sub-folders created by the ratio-cutting cell.
            if not fn.endswith('.csv'):
                continue

            sl = pd.read_csv(f'{raw_path}/{fn}')
            old_len = len(sl)
            sl['0'] = sl['0'].map(id2name)
            sl['1'] = sl['1'].map(id2name)
            # IDs missing from id2name become NaN; no row may be lost to that.
            sl = sl.dropna()
            assert len(sl) == old_len, f'{raw_path}/{fn}: {old_len - len(sl)} rows contain missing values after mapping IDs to names'
            sl.to_csv(f'{save_path}/{fn}', index=False)