In [1]:
import Phosformer

# Identifier of the pretrained Phosformer checkpoint passed to `from_pretrained`.
# Defined once so the model and the tokenizer are always loaded from the same source.
PHOSFORMER_CHECKPOINT = 'waylandy/phosformer'

# Load the sequence-classification model and its matching tokenizer
model = Phosformer.RobertaForSequenceClassification.from_pretrained(PHOSFORMER_CHECKPOINT)
tokenizer = Phosformer.RobertaTokenizer.from_pretrained(PHOSFORMER_CHECKPOINT)

# Switch to evaluation mode: this disables dropout for deterministic results.
# As the last expression of the cell, the call also echoes the model structure.
model.eval()


RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(95, 768, padding_idx=1)
      (position_embeddings): Embedding(4000, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNor

In [2]:
# Peptide sequence to score
peptide_sequence = 'LLKLASPELER'

# Kinase domain sequence, supplied directly as a string
kinase_sequence = 'YTNLSYIGEGAYGMVCSAYDNVNKVRVAIKKISPFEHQTYCQRTLREIKILLRFRHENIIGINDIIRAPTIEQMKDVYIVQDLMETDLYKLLKTQHLSNDHICYFLYQILRGLKYIHSANVLHRDLKPSNLLLNTTCDLKICDFGLARVADPDHDHTGFLTEYVATRWYRAPEIMLNSKGYTKSIDIWSVGCILAEMLSNRPIFPGKHYLDQLNHILGILGSPSQEDLNCIINLKARNYLLSLPHKNKVPWNRLFPNADSKALDLLDKMLTFNPHKRIEVEQALAHPYL'

# Score this single kinase/peptide pair; the value is shown as the cell output
Phosformer.predict_one(kinase_sequence, peptide_sequence, model=model, tokenizer=tokenizer)

0.67370486

In [3]:
import pandas as pd

# Read the included csv file of kinase domain sequences
kinase_csv = pd.read_csv('data/reference_human_kinases.csv')

# Take the 'sequence' value of the row whose 'uniprot' column equals 'P28482';
# .item() requires that exactly one row matches
kinase_sequence = kinase_csv.loc[
    kinase_csv['uniprot'] == 'P28482',
    'sequence',
].item()

# Peptide sequence to score
peptide_sequence = 'LLKLASPELER'

# Score this single kinase/peptide pair; the value is shown as the cell output
Phosformer.predict_one(kinase_sequence, peptide_sequence, model=model, tokenizer=tokenizer)


0.67370486

In [4]:
import pandas as pd

# Scores at or above this cutoff are labelled 'Yes'; everything below is labelled 'No'
SCORE_THRESHOLD = 0.5

# Load the example dataset
kinases = pd.read_csv('data/example_input_peptide.csv')

# Only these two columns are needed to make predictions.
# .to_numpy() always returns a NumPy array, whereas .values may return an
# extension array depending on the column dtype.
kinase_sequences_list = kinases['kinase domain sequence'].to_numpy()
peptide_sequences_list = kinases['peptide sequence'].to_numpy()

# Make predictions: one score per (kinase, peptide) row
predictions = Phosformer.predict_many(
    kinase_sequences_list,
    peptide_sequences_list,
    model=model,
    tokenizer=tokenizer,
    batch_size=20, # how many samples to load at once, if you're running out of memory, you can set this number lower
    device='cpu',  # either "cpu" or "cuda"
    threads=1      # specify how many threads to use, can help speed up if running on cpu
)

# Add the raw scores and the thresholded Yes/No label to the dataset
kinases['score'] = predictions
kinases['prediction'] = (kinases['score'] >= SCORE_THRESHOLD).map({True: 'Yes', False: 'No'})

# Keep only the columns of interest in a new table
results = kinases[['kinase name','peptide sequence','score','prediction']]
# results.to_csv('output.csv', index=False) # uncomment this line if you want to save results
display(results)

Unnamed: 0,kinase name,peptide sequence,score,prediction
0,CMGC:MAPK:MK01,LLKLASPELER,0.673705,Yes
1,CMGC:MAPK:MK01,SPSVCSPLNMT,0.544702,Yes
2,CMGC:MAPK:MK01,SNIPETPPPGY,0.606684,Yes
3,CMGC:MAPK:MK01,ASGPESLDGAA,0.122489,No
4,CMGC:MAPK:MK01,SQPNSSKQTVL,0.087166,No
5,AGC:PKA:KAPCA,ALRRNSDR---,0.676278,Yes
6,AGC:PKA:KAPCA,MTRRNTYVCSE,0.702793,Yes
7,AGC:PKA:KAPCA,NIRKDTFFLTV,0.504628,Yes
8,AGC:PKA:KAPCA,TYISETDEDDD,0.029953,No
9,AGC:PKA:KAPCA,DGLLASPDLGL,0.142883,No
