### Check GPU hardware

In [1]:
# Show the NVIDIA driver / CUDA versions and the current state of each visible GPU
!nvidia-smi

Mon Jun  5 06:29:15 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.105.17   Driver Version: 525.105.17   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA A100-SXM...  On   | 00000000:0B:00.0 Off |                    0 |
| N/A   28C    P0    61W / 400W |      0MiB / 81920MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
                                                                               
+---------------------------------------------------------------------------

### Save hardware configuration

In [2]:
# Create the summary file
!echo "Hardware Summary" > hardware_summary.txt

# Fetch and write CPU Information
!echo "\nCPU Information:" >> hardware_summary.txt
!echo "-----------------" >> hardware_summary.txt
!lscpu | egrep 'Model name|Socket|Thread|CPU\(s\)' >> hardware_summary.txt

# Fetch and write Total RAM Information
!echo "\nTotal RAM Information:" >> hardware_summary.txt
!echo "-----------------" >> hardware_summary.txt
!free -h | grep Mem | awk '{print $2}' >> hardware_summary.txt


# Fetch and write GPU Information
!echo "\nGPU Information:" >> hardware_summary.txt
!echo "-----------------" >> hardware_summary.txt

# If you have a Nvidia GPU
!nvidia-smi --query-gpu=gpu_name,driver_version,memory.total --format=csv >> hardware_summary.txt

# Alternatively for other GPUs
# !lspci | grep VGA >> hardware_summary.txt


### Install D-SCRIPT

In [3]:
!pip install dscript
import os
# Set OMP_NUM_THREADS to "1", limiting OpenMP to single-threaded operation
os.environ["OMP_NUM_THREADS"] = "1"

Collecting dscript
  Downloading dscript-0.2.4-py3-none-any.whl (121 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m121.8/121.8 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting biopython
  Downloading biopython-1.81-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m19.3 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting scikit-learn
  Downloading scikit_learn-1.2.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (9.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.8/9.8 MB[0m [31m72.1 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0mm
Collecting tqdm
  Downloading tqdm-4.65.0-py3-none-any.whl (77 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.1/77.1 kB[0m [31m23.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting seaborn
  Downloading seaborn-0.12.2-py3-none-any.whl (29

### Download trained models

In [4]:
# Download Human D-SCRIPT model 
!wget http://cb.csail.mit.edu/cb/dscript/data/models/human_v1.sav

--2023-06-05 06:31:00--  http://cb.csail.mit.edu/cb/dscript/data/models/human_v1.sav
Resolving cb.csail.mit.edu (cb.csail.mit.edu)... 128.30.2.148
Connecting to cb.csail.mit.edu (cb.csail.mit.edu)|128.30.2.148|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2542758 (2.4M)
Saving to: ‘human_v1.sav’


2023-06-05 06:31:01 (3.17 MB/s) - ‘human_v1.sav’ saved [2542758/2542758]



### Download sequence and interaction files for test datasets

In [5]:
!wget https://raw.githubusercontent.com/anhvt00/MCAPS/master/data/Dscript-data/pairs/ecoli_test.tsv
!wget https://raw.githubusercontent.com/anhvt00/MCAPS/master/data/Dscript-data/seqs/ecoli.fasta


--2023-06-05 06:31:24--  https://raw.githubusercontent.com/anhvt00/MCAPS/master/data/Dscript-data/pairs/ecoli_test.tsv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.110.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 812468 (793K) [text/plain]
Saving to: ‘ecoli_test.tsv’


2023-06-05 06:31:24 (4.48 MB/s) - ‘ecoli_test.tsv’ saved [812468/812468]

--2023-06-05 06:31:24--  https://raw.githubusercontent.com/anhvt00/MCAPS/master/data/Dscript-data/seqs/ecoli.fasta
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.109.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5413511 (5.2M) [text/plain]
Saving to: ‘ecoli.fasta’


2023-06-05 06:31:25 

### Choose a subset of the dataset

In [6]:
import pandas as pd

# The pairs file is read as a headerless TSV, so its columns are addressed by position.
df_pairs = pd.read_csv('ecoli_test.tsv', sep='\t', header=None)

# select a random subset of the data
# change n to the number of samples you want; random_state fixes the draw so
# the subset (and every result derived from it) is the same on each run.
df_sub = df_pairs.sample(n=6000, random_state=42).reset_index(drop=True)

# save the subset tsv file
# header=False keeps the file headerless like the input. Without it pandas
# writes the positional column labels (0, 1, ...) as a first row, which a
# headerless reader would then treat as an extra data row.
df_sub.to_csv('ecoli_test_sub.tsv', sep='\t', index=False, header=False)

In [7]:
from Bio import SeqIO

# Collect every protein ID that appears on either side of a sampled pair
# (the first two positional columns of the subset table).
identifiers = set(df_sub.iloc[:, 0]).union(df_sub.iloc[:, 1])

# Stream the original FASTA and keep only the records whose ID was sampled.
records = SeqIO.parse('ecoli.fasta', 'fasta')
subset_records = list(filter(lambda rec: rec.id in identifiers, records))

# Write the kept records to the subset FASTA. The call is left as the last
# expression so its return value is shown as the cell output.
SeqIO.write(subset_records, 'ecoli_sub.fasta', 'fasta')


12795

### Generate embeddings

In [None]:
# Embed the sequences in ecoli_sub.fasta and store the embeddings in ecoli.h5.
# With "-d 0" the recorded run logged "Using CUDA device 0". The tool prints an
# approximate storage estimate before it starts writing, so check free disk
# space against that figure.
!dscript embed --seqs ecoli_sub.fasta -o ecoli.h5 -d 0

[2023-06-05-06:33:09] # Using CUDA device 0 - NVIDIA A100-SXM4-80GB
[2023-06-05-06:33:09] # Loading Model...
[2023-06-05-06:33:09] Downloading model lm_v1 from http://cb.csail.mit.edu/cb/dscript/data/models/dscript_lm_v1.pt...
[2023-06-05-06:33:28] # Loading Sequences...
100%|██████████████████████████████████| 12795/12795 [00:00<00:00, 46950.38it/s]
[2023-06-05-06:33:28] # 12795 Sequences Loaded
[2023-06-05-06:33:28] # Approximate Storage Required (varies by average sequence length): ~102.36GB
[2023-06-05-06:33:28] # Storing to ecoli.h5...
  8%|███                                   | 1046/12795 [00:25<04:47, 40.91it/s]

### Evaluate on test dataset

In [6]:
# Evaluate with Human D-SCRIPT model.
# Scores the pairs in ecoli_test_sub.tsv using the embeddings in ecoli.h5 and
# logs AUPR / AUROC. "ecoli_dscript" is the output prefix: the predictions are
# read back from ecoli_dscript.predictions.tsv in the next step.
# NOTE(review): no device flag is passed here and the recorded run logged
# "Using CPU" -- presumably "-d 0" (as in the embed step) would use the GPU; confirm.
!dscript evaluate --model human_v1.sav --test ecoli_test_sub.tsv --embedding ecoli.h5 --outfile ecoli_dscript

[2023-06-05-05:16:29] Using CPU
100%|██████████████████████████████████████| 7138/7138 [00:28<00:00, 249.02it/s]
Predicting pairs: 100%|███████████████████| 22000/22000 [09:37<00:00, 38.08it/s]
[2023-06-05-05:26:35] AUPR: 0.5349755785480388
[2023-06-05-05:26:35] AUROC: 0.8603408


### Read prediction file

In [17]:
import pandas as pd

# Load the headerless predictions TSV written by the evaluation step.
# Positional columns: 0 and 1 are the two protein IDs, 2 is the true label
# and 3 is the predicted score.
df = pd.read_table('ecoli_dscript.predictions.tsv', sep='\t', header=None)
df

Unnamed: 0,0,1,2,3
0,362663.ECP_3406,362663.ECP_4448,1.0,0.022358
1,362663.ECP_0442,362663.ecp:ECP_0985,1.0,0.004174
2,362663.ECP_3384,362663.ECP_4447,1.0,0.879780
3,362663.ECP_0161,362663.ecp:ECP_3117,1.0,0.004236
4,362663.ecp:ECP_1481,362663.ECP_2475,1.0,0.008213
...,...,...,...,...
21995,362663.ECP_2384,362663.ECP_2922,0.0,0.004224
21996,362663.ECP_1743,362663.ECP_1562,0.0,0.004226
21997,362663.ECP_3662,362663.ECP_4685,0.0,0.004199
21998,362663.ECP_3212,362663.ECP_3077,0.0,0.004173


### Compute performance metrics

In [31]:
# Import necessary libraries
import numpy as np
from sklearn.metrics import (
    accuracy_score,
    average_precision_score,
    confusion_matrix,
    f1_score,
    matthews_corrcoef,
    precision_score,
    recall_score,
    roc_auc_score,
)

# Column 2 holds the true labels and column 3 the predicted probabilities;
# hard predictions are the probabilities rounded to the nearest integer.
y_true, y_prob = df.iloc[:, 2], df.iloc[:, 3]
y_pred = np.round(y_prob)

# Ranking metrics, computed from the raw probabilities (AUC-ROC and AUPRC)
auroc = roc_auc_score(y_true, y_prob)
auprc = average_precision_score(y_true, y_prob)

# Metrics computed from the rounded predictions
acc = accuracy_score(y_true, y_pred)
prec = precision_score(y_true, y_pred)
rec = recall_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred)
mcc = matthews_corrcoef(y_true, y_pred)

# Specificity = TN / (TN + FP), taken from the flattened confusion matrix
cm = confusion_matrix(y_true, y_pred)
tn, fp, fn, tp = cm.ravel()
spec = tn / (fp + tn)

print(f'accuracy: {acc}, precision: {prec}, recall: {rec}, specificity: {spec}, mcc: {mcc} ,f1-score: {f1}, auc: {auroc}, prc: {auprc}')



accuracy: 0.9325454545454546, precision: 0.7539370078740157, recall: 0.383, specificity: 0.9875, mcc: 0.5074896801009214 ,f1-score: 0.5079575596816976, auc: 0.8603386625, prc: 0.5348992344437004
