In [2]:
import numpy as np 
import pandas as pd 

In [3]:
import sys
sys.path.insert(0, '/csc/epitkane/projects/multimodal/src/')
from annotate_mutations import read_codes

In [4]:
import torch

## Creating the dictionary file for one-hot tokenisation

In [5]:
# Load the mutation-code table; header=None means no header row is consumed and
# the columns are labelled 0, 1, 2, ...
# NOTE(review): the file has a .tsv extension but is parsed with a single-space
# separator — confirm this matches the file's actual layout.
file = pd.read_csv('/csc/epitkane/projects/multimodal/extfiles/mutation_codes_sv.tsv', sep=' ', header=None)    

In [6]:
# Symbol vocabulary: the four plain bases first, followed by the codes stored in
# column 2 of the first 31 rows of the mutation-code table.
mut_list = ['A', 'C', 'T', 'G'] + file.iloc[0:31, 2].values.tolist()


In [7]:
# De-duplicate while keeping first-occurrence order. dict keys preserve insertion
# order, so this replaces the quadratic "x not in list" scan with a single pass.
unique_list = list(dict.fromkeys(mut_list))

In [8]:
# Display the de-duplicated symbol list.
unique_list

['A',
 'C',
 'T',
 'G',
 '!',
 '@',
 '#',
 'N',
 '1',
 '$',
 '%',
 '^',
 '2',
 '&',
 '*',
 '~',
 '3',
 ':',
 ';',
 '?',
 '4',
 '5',
 '6',
 '7',
 '8']

In [9]:
# Pair every symbol with a consecutive integer token (0 .. len(unique_list) - 1),
# following the order of unique_list.
muts = pd.DataFrame({'mutation': unique_list, 'token': range(0,len(unique_list))})

In [10]:
# Preview the symbol -> token table.
muts

Unnamed: 0,mutation,token
0,A,0
1,C,1
2,T,2
3,G,3
4,!,4
5,@,5
6,#,6
7,N,7
8,1,8
9,$,9


In [None]:
# Persist the symbol -> token table as a tab-separated file without the index column.
# NOTE(review): this writes to the current working directory, whereas the cell that
# later loads one_hot_mutationdict.tsv reads it from the extfiles directory —
# confirm the file ends up where it is read from.
muts.to_csv('one_hot_mutationdict.tsv', sep='\t', index=False)

## One-hot encoding

In [11]:
def one_hot(arr, token_size):
    """One-hot encode a sequence of integer token ids.

    Parameters
    ----------
    arr : array-like
        Token ids. The input is flattened, so the result always has one row per
        element. Integer-valued floats (e.g. a float column produced by a pandas
        merge) are accepted and converted to integers.
    token_size : int
        Size of the vocabulary, i.e. the number of columns of the result.

    Returns
    -------
    numpy.ndarray
        Integer array of shape ``(arr.size, token_size)`` in which row ``i`` has a
        single 1 in column ``arr[i]``.

    Raises
    ------
    ValueError
        If a token is NaN/infinite or not integer-valued.
    IndexError
        If a token lies outside ``[0, token_size)``. Negative ids are rejected
        explicitly because NumPy indexing would otherwise wrap them silently to
        the last columns.
    """
    tokens = np.asarray(arr).ravel()

    if not np.issubdtype(tokens.dtype, np.integer):
        # Non-integer dtype: only accept values that are exactly integers.
        as_float = tokens.astype(float)
        if not np.all(np.isfinite(as_float)) or np.any(as_float != np.round(as_float)):
            raise ValueError("one_hot: tokens must be finite integers (got NaN or a non-integer value)")
        tokens = as_float.astype(np.int64)

    if tokens.size and (tokens.min() < 0 or tokens.max() >= token_size):
        raise IndexError(
            "one_hot: tokens must lie in [0, {}), got range [{}, {}]".format(
                token_size, tokens.min(), tokens.max()
            )
        )

    encoded_arr = np.zeros((tokens.size, token_size), dtype=int)
    # Fancy indexing: row i, column tokens[i] -> 1.
    encoded_arr[np.arange(tokens.size), tokens] = 1
    return encoded_arr

In [13]:
# Small worked example: four token ids encoded against a 25-symbol vocabulary.
arr = np.asarray([4, 7, 2, 9])
one = one_hot(arr, token_size=25)

In [14]:
# Show the encoded example: one row per token id, with a single 1 in that id's column.
one

array([[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0],
       [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0]])

In [15]:
# Load one sample's indel table (gzip-compressed, tab-separated).
# Note: this rebinds `file`, which until now held the mutation-code table.
file = pd.read_csv('/csc/epitkane/projects/multimodal/data/temp/muat_orig/Bone-Osteosarc/f82d213f-9ba5-7b6b-e040-11ac0c486882/indel_f82d213f-9ba5-7b6b-e040-11ac0c486882.tsv.gz', compression='gzip', sep='\t')

In [16]:
# Display the loaded indel table.
file

Unnamed: 0,chrom,pos,ref,alt,sample,seq,ref_seq,gc1kb,genic,exonic,strand,histology
0,6,12690441,CA,-,f82d213f-9ba5-7b6b-e040-11ac0c486882,C43,CTG,,0.0,0.0,=,Bone-Osteosarc
1,3,84137221,-,A,f82d213f-9ba5-7b6b-e040-11ac0c486882,G5A,G-A,,0.0,0.0,=,Bone-Osteosarc
2,3,157943529,A,-,f82d213f-9ba5-7b6b-e040-11ac0c486882,T1A,TAA,,1.0,0.0,-,Bone-Osteosarc
3,10,127942076,TCT,-,f82d213f-9ba5-7b6b-e040-11ac0c486882,424,TCT,,1.0,0.0,-,Bone-Osteosarc
4,16,65765482,AG,-,f82d213f-9ba5-7b6b-e040-11ac0c486882,T24,TCT,,0.0,0.0,=,Bone-Osteosarc
...,...,...,...,...,...,...,...,...,...,...,...,...
355,5,116369616,TTTACTA,-,f82d213f-9ba5-7b6b-e040-11ac0c486882,412,TAC,,0.0,0.0,=,Bone-Osteosarc
356,15,31814672,CA,-,f82d213f-9ba5-7b6b-e040-11ac0c486882,C43,CTG,,1.0,0.0,+,Bone-Osteosarc
357,5,37180473,TCTT,-,f82d213f-9ba5-7b6b-e040-11ac0c486882,244,CTT,,1.0,0.0,-,Bone-Osteosarc
358,18,5144293,T,-,f82d213f-9ba5-7b6b-e040-11ac0c486882,A4C,ATC,,0.0,0.0,=,Bone-Osteosarc


In [69]:
# Motif strings to be encoded: the 'seq' column of the indel table.
seqs = file['seq']

In [74]:
# Number of motif strings to encode.
len(seqs)

360

In [82]:
# Load the symbol -> token table; the first line of the file is used as the header.
mutation_tokens = pd.read_csv('/csc/epitkane/projects/multimodal/extfiles/one_hot_mutationdict.tsv', sep='\t', header=0)

In [83]:
# Preview the loaded symbol -> token table.
mutation_tokens

Unnamed: 0,mutation,token
0,A,0
1,C,1
2,T,2
3,G,3
4,!,4
5,@,5
6,#,6
7,N,7
8,1,8
9,$,9


In [84]:
motif_len = 3
max_token = 25

# Build the symbol -> token lookup once, instead of merging a DataFrame per row.
token_lookup = dict(zip(mutation_tokens['mutation'].astype(str),
                        mutation_tokens['token'].astype(int)))

# One (motif_len x max_token) one-hot matrix per motif string.
encoded = np.zeros((len(seqs), motif_len, max_token))
for i, s in enumerate(seqs.values.tolist()):
    symbols = list(str(s))
    # A motif of the wrong length would either fail to broadcast or, for a single
    # symbol, be silently repeated across all positions — reject it explicitly.
    if len(symbols) != motif_len:
        raise ValueError(f"row {i}: motif {s!r} has {len(symbols)} symbols, expected {motif_len}")
    # Unknown symbols used to surface as NaN tokens and an obscure indexing error.
    unknown = [c for c in symbols if c not in token_lookup]
    if unknown:
        raise ValueError(f"row {i}: motif {s!r} contains symbols missing from mutation_tokens: {unknown}")
    encoded[i, :, :] = one_hot(np.array([token_lookup[c] for c in symbols]), max_token)


In [85]:
# Inspect the encoded array (one motif_len x max_token matrix per motif).
encoded

array([[[0., 1., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]],

       [[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.]],

       [[0., 0., 1., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.]],

       ...,

       [[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]],

       [[1., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 1., 0., ..., 0., 0., 0.]],

       [[1., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.]]])

In [65]:
# Sanity check: first motif string and the shape of the encoded array.
# (`ready` was never defined in this notebook; the encoded array is `encoded`.)
print(seqs[0])
print(encoded.shape)

C43
(360, 3, 25)


## Embedding with linear layer

In [7]:
import torch

In [8]:
# Read the 'motif' array and close the archive handle as soon as it has been loaded.
with np.load("/csc/epitkane/projects/multimodal/data/train/onehot3_tar/Bone-Osteosarc/f82d213f-9843-28eb-e040-11ac0d483e48/SNV_f82d213f-9843-28eb-e040-11ac0d483e48.npz") as data:
    SNV_motif = data["motif"]

# Draw up to 10 distinct rows. The seeded generator makes the subsample identical on
# every run, and min() keeps the draw valid for samples with fewer than 10 rows.
data_sample_snv = np.random.default_rng(0).choice(
    len(SNV_motif), size=min(10, len(SNV_motif)), replace=False
)
SNV_motif = SNV_motif[data_sample_snv, :, :]

In [9]:
# Convert the sampled NumPy array to a torch FloatTensor.
# Note: this rebinds SNV_motif, replacing the NumPy array with the tensor.
SNV_motif = torch.FloatTensor(SNV_motif)

#### Linear

In [10]:
# Linear embedding: flatten each motif (all dims after the batch dim) into one
# vector of 3 * 25 values, then project it to 512 features.
ln = torch.nn.Sequential(
    torch.nn.Flatten(start_dim=1, end_dim=-1),
    torch.nn.Linear(in_features=3 * 25, out_features=512),
)

In [14]:
# Forward pass through the flatten + linear stack.
result = ln(SNV_motif)

In [15]:
# Shape check: one 512-feature row per sampled motif.
result.shape

torch.Size([10, 512])

In [16]:
# Shape with an extra leading axis of size 1; `result` itself is not modified.
torch.unsqueeze(result, 0).shape

torch.Size([1, 10, 512])

#### Convolution

In [10]:
# Shape of the sampled motif tensor before flattening.
SNV_motif.shape

torch.Size([10, 3, 25])

In [11]:
# Merge every dimension after the first into one, giving one vector per motif.
# Note: this rebinds `data`, which previously held the loaded .npz archive.
data = torch.flatten(SNV_motif, 1, -1)

In [16]:
# Add the channel axis expected by Conv1d. Derive the tensor from SNV_motif rather
# than from `data` itself, so re-running this cell does not stack another axis.
data = torch.flatten(SNV_motif, 1, -1).unsqueeze(1)
data.shape

torch.Size([10, 1, 75])

In [29]:
# Single-channel 1-D convolution: width-5 kernel, no bias term.
conv = torch.nn.Sequential(
    torch.nn.Conv1d(1, 1, kernel_size=5, bias=False),
)

In [30]:
# Apply the convolution to the flattened, single-channel motifs.
A = conv(data)

In [31]:
# Shape check: a width-5 kernel without padding shortens the last axis by 4.
A.shape

torch.Size([10, 1, 71])

In [32]:
# Load this sample's indel table from the DNABERT_motif3 directory and pull out
# the motif ('seq') and reference-allele ('ref') columns.
orig_data_file = pd.read_csv(
    '/csc/epitkane/projects/multimodal/data/train/DNABERT_motif3/Bone-Osteosarc/f82d213f-9ba5-7b6b-e040-11ac0c486882/indel_f82d213f-9ba5-7b6b-e040-11ac0c486882.tsv.gz',
    sep='\t',
    compression='gzip',
)
seq, ref = orig_data_file['seq'], orig_data_file['ref']

In [35]:
# Row positions to select below.
# Note: this rebinds `A`, which previously held the convolution output.
A = np.array([1,5])

In [37]:
# Positional selection of the rows listed in A.
seq.iloc[A]

1    CT
5    TG
Name: seq, dtype: object

In [39]:
# Display the loaded table.
orig_data_file

Unnamed: 0.1,Unnamed: 0,chrom,pos,ref,alt,sample,seq,ref_seq,gc1kb,genic,exonic,strand,histology
0,0,1,6016057,C,-,f82d213f-9ba5-7b6b-e040-11ac0c486882,TC,TCC,,1.0,0.0,-,Bone-Osteosarc
1,1,1,8061059,TT,--,f82d213f-9ba5-7b6b-e040-11ac0c486882,CT,CTTT,,0.0,0.0,=,Bone-Osteosarc
2,2,1,28869918,CT,--,f82d213f-9ba5-7b6b-e040-11ac0c486882,GG,GCTG,,0.0,0.0,=,Bone-Osteosarc
3,3,1,33250566,TT,--,f82d213f-9ba5-7b6b-e040-11ac0c486882,CG,CTTG,,1.0,0.0,-,Bone-Osteosarc
4,4,1,39237324,G,-,f82d213f-9ba5-7b6b-e040-11ac0c486882,AG,AGG,,0.0,0.0,=,Bone-Osteosarc
...,...,...,...,...,...,...,...,...,...,...,...,...,...
373,373,X,123757710,TC,--,f82d213f-9ba5-7b6b-e040-11ac0c486882,AT,ATCT,,1.0,0.0,-,Bone-Osteosarc
374,374,X,126291383,ATAA,----,f82d213f-9ba5-7b6b-e040-11ac0c486882,GA,GATAAA,,0.0,0.0,=,Bone-Osteosarc
375,375,X,142155972,T,-,f82d213f-9ba5-7b6b-e040-11ac0c486882,AT,ATT,,0.0,0.0,=,Bone-Osteosarc
376,376,X,144946862,T,-,f82d213f-9ba5-7b6b-e040-11ac0c486882,CA,CTA,,0.0,0.0,=,Bone-Osteosarc


In [16]:
# Open the .npz archive of an Uterus-AdenoCA sample.
B = np.load('/csc/epitkane/projects/multimodal/data/train/DNABERT_motif3/Uterus-AdenoCA/f7187888-f261-4a39-99b4-966fd7207117/indel_f7187888-f261-4a39-99b4-966fd7207117.npz')

In [17]:
# Shape of the array stored under the 'motif' key.
B['motif'].shape

(12569, 1536)

In [5]:
# Load the same sample's indel table from two preprocessing outputs; in both files
# the first column is used as the index.
# NOTE(review): `df_101` is read from a DNABERT_motif201 directory — confirm that the
# variable name and the path are meant to go together.
df_data = pd.read_csv("/csc/epitkane/projects/multimodal/data/train_new/muat_orig/Bone-Osteosarc/f221c897-6ad0-0df9-e040-11ac0c4813ef/indel_f221c897-6ad0-0df9-e040-11ac0c4813ef.tsv.gz", compression='gzip', sep='\t', index_col= 0)
df_101 = pd.read_csv("/csc/epitkane/projects/multimodal/data/temp/DNABERT_motif201/Bone-Osteosarc/f221c897-6ad0-0df9-e040-11ac0c4813ef/indel_f221c897-6ad0-0df9-e040-11ac0c4813ef.tsv.gz", compression='gzip', sep='\t', index_col= 0)

In [20]:
# Cast both join keys to str and drop fully identical rows, renumbering the index.
df_data = df_data.astype({'chrom': str, 'pos': str}).drop_duplicates(ignore_index=True)
df_101 = df_101.astype({'chrom': str, 'pos': str}).drop_duplicates(ignore_index=True)

# Record each row's index label so rows can be traced through the join.
df_data['original_indices'] = df_data.index

# Inner join on (chrom, pos). Keys that are not unique yield one output row per
# matching pair; de-duplicating on 'original_indices' was tried and left disabled.
df_filtered = df_data.merge(df_101, on=['chrom', 'pos'], how='inner')

# Index labels of the df_data rows that found no partner in df_101.
removed_indices = df_data.index[
    (~df_data['original_indices'].isin(df_filtered['original_indices'])).to_numpy()
].tolist()

In [21]:
# Row count of df_101.
len(df_101)

170

In [22]:
# Row count of df_data.
len(df_data)

173

In [23]:
# Row count of the inner join of df_data and df_101.
len(df_filtered)

171

In [63]:
# Inner join of the two tables on (chrom, pos).
result = df_data.merge(df_101, on=['chrom', 'pos'], how='inner')

In [64]:
# Display df_101.
df_101

Unnamed: 0,chrom,pos,ref,alt,sample,seq,ref_seq,gc1kb,genic,exonic,strand,histology
0,1,8739392,C,-,f221c897-6ad0-0df9-e040-11ac0c4813ef,CTTAGGAAAGACAGTGAGACAGTCCCATAGCAATCTAAGGCTTTAT...,CTTAGGAAAGACAGTGAGACAGTCCCATAGCAATCTAAGGCTTTAT...,,1.0,0.0,-,Bone-Osteosarc
1,1,44519130,-,G,f221c897-6ad0-0df9-e040-11ac0c4813ef,CCCATGAGGCCTGTGTCCCCACTTTGAAGCCTCTCCAGTCCCCTAG...,CCCATGAGGCCTGTGTCCCCACTTTGAAGCCTCTCCAGTCCCCTAG...,,0.0,0.0,=,Bone-Osteosarc
2,1,45684970,C,-,f221c897-6ad0-0df9-e040-11ac0c4813ef,GCCATTGCACTGACTTCTTCTTTTCTTTCTCTAAAGATCACAGCAT...,GCCATTGCACTGACTTCTTCTTTTCTTTCTCTAAAGATCACAGCAT...,,1.0,0.0,-,Bone-Osteosarc
3,1,62682663,---,TTT,f221c897-6ad0-0df9-e040-11ac0c4813ef,TCGAGTTAAATCTTATGTGATTGCAAGCACAATTAAAAATAAAAAT...,TCGAGTTAAATCTTATGTGATTGCAAGCACAATTAAAAATAAAAAT...,,0.0,0.0,=,Bone-Osteosarc
4,1,62683559,-,T,f221c897-6ad0-0df9-e040-11ac0c4813ef,GTAGCAGAGAATTACAGCCTCATGCCACTACTGCCCAGCTAATTTT...,GTAGCAGAGAATTACAGCCTCATGCCACTACTGCCCAGCTAATTTT...,,0.0,0.0,=,Bone-Osteosarc
...,...,...,...,...,...,...,...,...,...,...,...,...
169,X,97691866,T,-,f221c897-6ad0-0df9-e040-11ac0c4813ef,ACGTGCTGACTTTTTGGGCATCTCATTTGACTGTCATACCTTCTTA...,ACGTGCTGACTTTTTGGGCATCTCATTTGACTGTCATACCTTCTTA...,,0.0,0.0,=,Bone-Osteosarc
170,X,114649832,AGGTACCACATCA,-------------,f221c897-6ad0-0df9-e040-11ac0c4813ef,AAGTCTGAGAGTTTATTCTTAGTTATAGTTAAAATACTTACATATG...,AAGTCTGAGAGTTTATTCTTAGTTATAGTTAAAATACTTACATATG...,,0.0,0.0,=,Bone-Osteosarc
171,X,139060093,A,-,f221c897-6ad0-0df9-e040-11ac0c4813ef,CAATGTATATCAAAACATCACATTGTACACTTTAAATATATATAAT...,CAATGTATATCAAAACATCACATTGTACACTTTAAATATATATAAT...,,0.0,0.0,=,Bone-Osteosarc
172,X,144222388,AG,-T,f221c897-6ad0-0df9-e040-11ac0c4813ef,AAACACTTCTTTTCATAACATATCACTTGCTAAGGAGGCTGTGACA...,AAACACTTCTTTTCATAACATATCACTTGCTAAGGAGGCTGTGACA...,,0.0,0.0,=,Bone-Osteosarc


In [37]:
# Keep only the first df_data row for each (chrom, pos) pair and renumber the index.
jee = df_data.drop_duplicates(subset=['chrom', 'pos'],ignore_index=True)

In [41]:
# Number of distinct (chrom, pos) pairs in df_data.
len(jee)

172

In [38]:
# Keep only the first df_101 row for each (chrom, pos) pair and renumber the index.
joo = df_101.drop_duplicates(subset=['chrom', 'pos'],ignore_index=True)

In [42]:
# Number of distinct (chrom, pos) pairs in df_101.
len(joo)

170

In [39]:
# Inner join of the two key-de-duplicated tables on (chrom, pos).
# Note: this rebinds `result`.
result = pd.merge(jee, joo, on = ['chrom', 'pos'], how='inner')

In [40]:
# Row count of the join between the key-de-duplicated tables.
len(result)

170

In [35]:
# Rows of df_101 whose (chrom, pos) pair already appeared in an earlier row.
df_101[df_101.duplicated(subset=['chrom', 'pos'])]

Unnamed: 0,chrom,pos,ref,alt,sample,seq,ref_seq,gc1kb,genic,exonic,strand,histology


In [36]:
# Rows at positions 76 and 77 of df_data.
df_data.iloc[76:78, :]

Unnamed: 0,chrom,pos,ref,alt,sample,seq,ref_seq,gc1kb,genic,exonic,strand,histology,original_indices
76,7,156626760,TA,,f221c897-6ad0-0df9-e040-11ac0c4813ef,241,CTA,,1.0,0.0,+,Bone-Osteosarc,76
77,7,156626760,TAGTACAACCTTAGGACCACTAC,GTACTA,f221c897-6ad0-0df9-e040-11ac0c4813ef,241,CTA,,1.0,0.0,+,Bone-Osteosarc,77


In [27]:
# Rows of `result` that exactly repeat an earlier row.
result[result.duplicated()]

Unnamed: 0,chrom,pos


In [16]:
# Rows at positions 77 to 79 of the joined table.
df_filtered.iloc[77:80, :]

Unnamed: 0,chrom,pos,ref_x,alt_x,sample_x,seq_x,ref_seq_x,gc1kb_x,genic_x,exonic_x,...,ref_y,alt_y,sample_y,seq_y,ref_seq_y,gc1kb_y,genic_y,exonic_y,strand_y,histology_y
77,7,156626760,TA,,f221c897-6ad0-0df9-e040-11ac0c4813ef,241,CTA,,1.0,0.0,...,GTAGTGGTCCTAAGGTTGTACTA,-----------------------,f221c897-6ad0-0df9-e040-11ac0c4813ef,TTTCTGAATTTTAATTTTATAGTAACTTTTGTTGGCTGTTATCACC...,TTTCTGAATTTTAATTTTATAGTAACTTTTGTTGGCTGTTATCACC...,,1.0,0.0,+,Bone-Osteosarc
78,7,156626760,TAGTACAACCTTAGGACCACTAC,GTACTA,f221c897-6ad0-0df9-e040-11ac0c4813ef,241,CTA,,1.0,0.0,...,GTAGTGGTCCTAAGGTTGTACTA,-----------------------,f221c897-6ad0-0df9-e040-11ac0c4813ef,TTTCTGAATTTTAATTTTATAGTAACTTTTGTTGGCTGTTATCACC...,TTTCTGAATTTTAATTTTATAGTAACTTTTGTTGGCTGTTATCACC...,,1.0,0.0,+,Bone-Osteosarc
79,7,156626760,TAGTACAACCTTAGGACCACTAC,GTACTA,f221c897-6ad0-0df9-e040-11ac0c4813ef,241,CTA,,1.0,0.0,...,GTAGTGGTCCTAAGGTTGTACTA,-----------------------,f221c897-6ad0-0df9-e040-11ac0c4813ef,TTTCTGAATTTTAATTTTATAGTAACTTTTGTTGGCTGTTATCACC...,TTTCTGAATTTTAATTTTATAGTAACTTTTGTTGGCTGTTATCACC...,,1.0,0.0,+,Bone-Osteosarc


In [14]:
# Rows at positions 75 to 79 of `jee`.
jee.iloc[75:80,:]

Unnamed: 0,chrom,pos
75,7,144492324
76,7,156626760
77,7,156626760
78,7,156626760
79,7,156626760


In [70]:
# Row at position 76 of df_data, shown as a Series.
df_data.iloc[76,:]

chrom                                                  7
pos                                            156626760
ref                                                   TA
alt                                                  NaN
sample              f221c897-6ad0-0df9-e040-11ac0c4813ef
seq                                                  241
ref_seq                                              CTA
gc1kb                                                NaN
genic                                                1.0
exonic                                               0.0
strand                                                 +
histology                                 Bone-Osteosarc
original_indices                                      76
Name: 76, dtype: object

In [73]:
# Row at position 77 of df_data, shown as a Series.
df_data.iloc[77,:]

chrom                                                  7
pos                                            156626768
ref                                      CCTTAGGACCACTAC
alt                                                  NaN
sample              f221c897-6ad0-0df9-e040-11ac0c4813ef
seq                                                  312
ref_seq                                              GAC
gc1kb                                                NaN
genic                                                1.0
exonic                                               0.0
strand                                                 -
histology                                 Bone-Osteosarc
original_indices                                      78
Name: 78, dtype: object

In [72]:
# Remove the row labelled 77 from df_data.
# NOTE(review): this cell is not idempotent — running it a second time raises KeyError
# because label 77 is already gone. The index is not renumbered afterwards, so
# index labels and row positions differ from this row onwards.
df_data = df_data.drop(77)

In [75]:
# Write the edited table, index included, as a gzip-compressed tab-separated file.
# NOTE(review): df_data still carries the helper column 'original_indices' added
# earlier in the notebook, so that column is written too — confirm this is intended.
df_data.to_csv("/csc/epitkane/projects/multimodal/data/temp/muat_orig_exclude_regions/Bone-Osteosarc/f221c897-6ad0-0df9-e040-11ac0c4813ef/indel_f221c897-6ad0-0df9-e040-11ac0c4813ef.tsv.gz", compression='gzip', sep='\t')

In [52]:
# Row at position 77 of df_data, shown as a Series.
df_data.iloc[77,:]

chrom                                                  7
pos                                            156626768
ref                                      CCTTAGGACCACTAC
alt                                                  NaN
sample              f221c897-6ad0-0df9-e040-11ac0c4813ef
seq                                                  312
ref_seq                                              GAC
gc1kb                                                NaN
genic                                                1.0
exonic                                               0.0
strand                                                 -
histology                                 Bone-Osteosarc
original_indices                                      78
Name: 78, dtype: object

In [53]:
# Open the sample's .npz archive in the muat_orig_exclude_regions directory.
array = np.load("/csc/epitkane/projects/multimodal/data/temp/muat_orig_exclude_regions/Bone-Osteosarc/f221c897-6ad0-0df9-e040-11ac0c4813ef/indel_f221c897-6ad0-0df9-e040-11ac0c4813ef.npz")

In [56]:
# Drop the entry at position 77 along axis 0 from each of the three stored arrays.
# NOTE(review): assumes the arrays are row-aligned with the table the row was
# dropped from — confirm.
fmotif, fposition, fGES = (
    np.delete(array[key], 77, axis=0) for key in ('motif', 'position', 'GES')
)

In [57]:
 # Overwrite the same .npz file with the filtered arrays (compressed).
 # Only the keys motif, position and GES are written.
 np.savez_compressed("/csc/epitkane/projects/multimodal/data/temp/muat_orig_exclude_regions/Bone-Osteosarc/f221c897-6ad0-0df9-e040-11ac0c4813ef/indel_f221c897-6ad0-0df9-e040-11ac0c4813ef.npz", motif=fmotif, position=fposition, GES = fGES)