### Check GPU hardware

In [1]:
# Show the GPU visible to this runtime, with driver/CUDA versions and current memory usage.
!nvidia-smi

Thu Jun  8 04:49:54 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.105.17   Driver Version: 525.105.17   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA A100-SXM...  On   | 00000000:48:00.0 Off |                    0 |
| N/A   30C    P0    62W / 400W |      0MiB / 81920MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

### Install D-SCRIPT

In [2]:
!pip install dscript
import os
# Set OMP_NUM_THREADS to "1", limiting OpenMP to single-threaded operation
os.environ["OMP_NUM_THREADS"] = "1"

Collecting dscript
  Downloading dscript-0.2.4-py3-none-any.whl (121 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m121.8/121.8 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting tqdm
  Downloading tqdm-4.65.0-py3-none-any.whl (77 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.1/77.1 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
Collecting biopython
  Downloading biopython-1.81-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting seaborn
  Downloading seaborn-0.12.2-py3-none-any.whl (293 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m293.3/293.3 kB[0m [31m69.4 MB/s[0m eta [36m0:00:00[0m
Collecting scikit-learn
  Downloading scikit_learn-1.2.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (9.8 MB)
[2K     [90m━━━━━━━━━━

### Download trained models

In [3]:
# Download Human D-SCRIPT model 
!wget http://cb.csail.mit.edu/cb/dscript/data/models/human_v1.sav

--2023-06-08 04:51:27--  http://cb.csail.mit.edu/cb/dscript/data/models/human_v1.sav
Resolving cb.csail.mit.edu (cb.csail.mit.edu)... 128.30.2.148
Connecting to cb.csail.mit.edu (cb.csail.mit.edu)|128.30.2.148|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2542758 (2.4M)
Saving to: ‘human_v1.sav’


2023-06-08 04:51:28 (3.15 MB/s) - ‘human_v1.sav’ saved [2542758/2542758]



### Download sequence and interaction files for test datasets

In [4]:
!wget https://raw.githubusercontent.com/anhvt00/MCAPS/master/data/Dscript-data/pairs/ecoli_test.tsv
!wget https://raw.githubusercontent.com/anhvt00/MCAPS/master/data/Dscript-data/seqs/ecoli.fasta


--2023-06-08 04:51:29--  https://raw.githubusercontent.com/anhvt00/MCAPS/master/data/Dscript-data/pairs/ecoli_test.tsv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.110.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 812468 (793K) [text/plain]
Saving to: ‘ecoli_test.tsv’


2023-06-08 04:51:29 (5.04 MB/s) - ‘ecoli_test.tsv’ saved [812468/812468]

--2023-06-08 04:51:30--  https://raw.githubusercontent.com/anhvt00/MCAPS/master/data/Dscript-data/seqs/ecoli.fasta
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.111.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5413511 (5.2M) [text/plain]
Saving to: ‘ecoli.fasta’


2023-06-08 04:51:30 

### Choose a subset of the dataset

In [5]:
import pandas as pd
# Load the full list of test pairs; the file has no header row.
df_pairs = pd.read_csv('ecoli_test.tsv', sep='\t', header=None)

# select a random subset of the data
#   N_SAMPLES - number of pairs to keep; change it to the number of samples you want.
#               It is capped at the number of rows available, because sample()
#               raises when asked for more rows than exist (without replacement).
#   SEED      - fixed so that the same subset (and therefore the same downstream
#               metrics) is produced on every Restart & Run All.
N_SAMPLES = 6000
SEED = 42
df_sub = (
    df_pairs
    .sample(n=min(N_SAMPLES, len(df_pairs)), random_state=SEED)
    .reset_index(drop=True)
)

# save the subset tsv file (no index column, no header row)
df_sub.to_csv('ecoli_test_sub.tsv', sep='\t', index=False, header=False)

### Create a new subset fasta file

In [6]:
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord

# Collect every protein identifier that appears in either column of the pair subset.
unique_ids = set(df_sub.iloc[:, 0]).union(df_sub.iloc[:, 1])
print("Number of sequences in the interaction subset:", len(unique_ids))

# Walk through the full FASTA file and keep only the records whose id belongs to
# the subset. Records are keyed by id, so a repeated id keeps the last record seen.
record_dict = {}
for rec in SeqIO.parse('ecoli.fasta', 'fasta'):
    if rec.id in unique_ids:
        record_dict[rec.id] = rec

# Write each kept record as a two-line FASTA entry: ">description", then the sequence.
with open('ecoli_sub.fasta', 'w') as output_file:
    output_file.writelines(
        '>' + kept.description + '\n' + str(kept.seq) + '\n'
        for kept in record_dict.values()
    )

Number of sequences in the interaction subset: 5045


### Generate embeddings

In [7]:
!rm ecoli.h5

rm: cannot remove 'ecoli.h5': No such file or directory


In [8]:
# Embed the sequences in ecoli_sub.fasta and store the result in ecoli.h5.
# -d 0 runs the embedding on CUDA device 0.
# NOTE: a recorded run of this cell estimated ~40 GB of storage for 5045 sequences
# and took about 6 minutes on an A100, so check free disk space first.
!dscript embed --seqs ecoli_sub.fasta -o ecoli.h5 -d 0

[2023-06-08-04:51:34] # Using CUDA device 0 - NVIDIA A100-SXM4-80GB
[2023-06-08-04:51:34] # Loading Model...
[2023-06-08-04:51:34] Downloading model lm_v1 from http://cb.csail.mit.edu/cb/dscript/data/models/dscript_lm_v1.pt...
[2023-06-08-04:51:51] # Loading Sequences...
100%|████████████████████████████████████| 5045/5045 [00:00<00:00, 38693.41it/s]
[2023-06-08-04:51:51] # 5045 Sequences Loaded
[2023-06-08-04:51:51] # Approximate Storage Required (varies by average sequence length): ~40.36GB
[2023-06-08-04:51:51] # Storing to ecoli.h5...
100%|███████████████████████████████████████| 5045/5045 [06:11<00:00, 13.58it/s]


### Evaluate on test dataset

In [9]:
# Evaluate with Human D-SCRIPT model; results are written with the prefix
# "ecoli_dscript" (the predictions are read back from ecoli_dscript.predictions.tsv).
# -d 0 runs on CUDA device 0; keep this flag when the GPU has more memory than the host RAM.
# If OOM occurs, you can downsample the subset of interactions by lowering the number of samples n
!dscript evaluate --model human_v1.sav --test ecoli_test_sub.tsv --embedding ecoli.h5 --outfile ecoli_dscript -d 0

[2023-06-08-04:58:07] Using CUDA device 0 - NVIDIA A100-SXM4-80GB
100%|██████████████████████████████████████| 5045/5045 [00:09<00:00, 536.03it/s]
Predicting pairs: 100%|████████████████████| 6000/6000 [00:30<00:00, 196.04it/s]
[2023-06-08-04:58:53] AUPR: 0.5643158867395094
[2023-06-08-04:58:53] AUROC: 0.8613230310902109


### Read prediction file

In [12]:
import pandas as pd
# Load the per-pair predictions produced by the evaluation step.
# The file is tab-separated with no header row, so columns are numbered 0-3:
# columns 0 and 1 hold the two protein identifiers; the metrics cell treats
# column 2 as the true label and column 3 as the predicted probability.
df = pd.read_table('ecoli_dscript.predictions.tsv', header=None)
df

Unnamed: 0,0,1,2,3
0,362663.ecp:ECP_2658,362663.ECP_0557,0.0,0.004366
1,362663.ECP_0690,362663.ECP_1579,0.0,0.006131
2,362663.ecp:ECP_3352,362663.ECP_2911,0.0,0.004175
3,362663.ECP_4615,362663.ECP_2536,0.0,0.004326
4,362663.ECP_0746,362663.ECP_1247,0.0,0.004174
...,...,...,...,...
5995,362663.ECP_1497,362663.ECP_3071,0.0,0.007715
5996,362663.ECP_1654,362663.ECP_1905,0.0,0.004181
5997,362663.ECP_2608,362663.ECP_1830,0.0,0.007167
5998,362663.ECP_3182,362663.ecp:ECP_0350,0.0,0.004200


### Compute performance metrics

In [13]:
# Import necessary libraries
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, average_precision_score, matthews_corrcoef

import numpy as np

# Extract true labels and predicted probabilities (columns 2 and 3 of the
# predictions table). Labels are cast to int so that labels and hard predictions
# share one dtype; this assumes the label column only contains 0/1 values.
y_true = df.iloc[:, 2].astype(int)
y_prob = df.iloc[:, 3]

# Hard predictions: a pair is called positive when its probability exceeds the
# threshold. For probabilities in [0, 1] this gives the same result as the
# previous np.round(y_prob) (which rounds exactly 0.5 down to 0), but the
# decision threshold is now explicit and easy to tune.
THRESHOLD = 0.5
y_pred = (y_prob > THRESHOLD).astype(int)

# Accuracy
acc = accuracy_score(y_true, y_pred)

# Precision
prec = precision_score(y_true, y_pred)

# Recall
rec = recall_score(y_true, y_pred)

# Specificity
# labels=[0, 1] forces a 2x2 matrix even if one class is missing from both
# y_true and y_pred; without it the matrix can be 1x1 and the 4-way unpacking fails.
cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
tn, fp, fn, tp = cm.ravel()
# Undefined (NaN) when the data contains no negative pairs at all.
spec = tn / (tn + fp) if (tn + fp) > 0 else float('nan')

# MCC score
mcc = matthews_corrcoef(y_true, y_pred)

# F1 score
f1 = f1_score(y_true, y_pred)

# AUC-ROC (threshold-free: computed from the raw probabilities)
auroc = roc_auc_score(y_true, y_prob)

# AUPRC (threshold-free: computed from the raw probabilities)
auprc = average_precision_score(y_true, y_prob)

print (f'accuracy: {acc}, precision: {prec}, recall: {rec}, specificity: {spec}, mcc: {mcc} ,f1-score: {f1}, auc: {auroc}, prc: {auprc}')



accuracy: 0.9266666666666666, precision: 0.7672727272727272, recall: 0.3594548551959114, specificity: 0.9881766118603362, mcc: 0.49385909128776545 ,f1-score: 0.48955916473317856, auc: 0.8613194118141354, prc: 0.5642862257138574
