### Prepare data for SOMA

In [1]:
import pandas as pd

# Read the example SOMA input table from the working directory, then preview
# the first five rows to confirm the columns were parsed as expected.
df = pd.read_csv(filepath_or_buffer='SOMA_example_input.csv')
df.head(n=5)

Unnamed: 0,index_offset,psi,seq
0,ENSG00000000003.15;TSPAN6;chrX-100632484-10063...,7.496831,CTTCGACACCGAGCTCGATATGATCGAAGTATTTATTACCATAAAG...
1,ENSG00000000003.15;TSPAN6;chrX-100633930-10063...,9.633673,GCTTCGACACCGAGCTCGTCGAGAACTTATTTGACCTGAAACCAAA...
2,ENSG00000000003.15;TSPAN6;chrX-100635177-10063...,1.012797,GCTTCGACACCGAGCTCGAGACGACCATTATTTTTTCTTTGACTCC...
3,ENSG00000000419.14;DPM1;chr20-50945736-5094576...,2.55393,TGAGATTGAATCCAGGAAATGAAGCTTCGACACCGAGCTCGTTAGC...
4,ENSG00000000419.14;DPM1;chr20-50948628-5094866...,-2.114327,CTTCGACACCGAGCTCGGTGCAACTATATTTCTATTAAAGTGAGTA...


In [2]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows for evaluation; the fixed random_state keeps the
# train/test partition identical across re-runs.
df_train, df_test = train_test_split(
    df,
    test_size=0.1,
    random_state=42,
)

### Train SOMA model

In [None]:
from SPICE import Soma

# Quick demonstration run: 20 epochs and one random seed keep training short.
# For real applications, train longer and over several seeds
# (suggested: epochs>=100 and num_seeds=10).
# device='cuda' trains on a GPU, which is much faster when one is available.
Soma.train(
    df_train,
    device='cuda',
    epochs=20,
    batch_size=512,
    learning_rate=1e-4,
    num_seeds=1,
)

Training: 100%|██████████| 20/20 [03:17<00:00,  9.87s/epoch, train_loss=14.0551]

Training completed for seed 0. Saving model...





### Predict PSI from sequence

In [4]:
from scipy.stats import spearmanr

# Seed of the trained model to evaluate. Training above used num_seeds=1 and
# its log reports a model saved for seed 0; change this to evaluate another seed.
seed = 0
# NOTE(review): presumably this is the checkpoint written by Soma.train for
# that seed -- confirm the file naming against the SPICE package.
params_file = f'SOMA_params_seed_{seed}.pth'

# Predict PSI for the held-out sequences using the saved parameters.
pred_psi = Soma.predict(df_test, device='cuda', batch_size=512, params=params_file)

# Attach the predictions as a new column on a fresh frame instead of mutating
# the split produced by an earlier cell in place; `df_test['pred_psi']` stays
# available to any later cell.
df_test = df_test.assign(pred_psi=pred_psi)

# Spearman rank correlation between the `psi` column and the predictions.
corr, _ = spearmanr(df_test['psi'], df_test['pred_psi'])
print(f'Spearman correlation: {corr}')

100%|██████████| 9/9 [00:00<00:00, 24.58it/s]

Spearman correlation: 0.6743766026113802



