### Check GPU hardware

In [6]:
# Show the NVIDIA driver/CUDA versions and the current memory and utilisation of each GPU
!nvidia-smi

Wed Jun  7 17:16:58 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.116.04   Driver Version: 525.116.04   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ...  On   | 00000000:03:00.0 Off |                  Off |
|  0%   46C    P8    19W / 450W |      0MiB / 24564MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+---------------------------------------------------------------------------

### Save hardware configuration

In [1]:
# Create the summary file
!echo "Hardware Summary" > hardware_summary.txt

# Fetch and write CPU Information
!echo "\nCPU Information:" >> hardware_summary.txt
!echo "-----------------" >> hardware_summary.txt
!lscpu | egrep 'Model name|Socket|Thread|CPU\(s\)' >> hardware_summary.txt

# Fetch and write Total RAM Information
!echo "\nTotal RAM Information:" >> hardware_summary.txt
!echo "-----------------" >> hardware_summary.txt
!free -h | grep Mem | awk '{print $2}' >> hardware_summary.txt


# Fetch and write GPU Information
!echo "\nGPU Information:" >> hardware_summary.txt
!echo "-----------------" >> hardware_summary.txt

# If you have a Nvidia GPU
!nvidia-smi --query-gpu=gpu_name,driver_version,memory.total --format=csv >> hardware_summary.txt

# Alternatively for other GPUs
# !lspci | grep VGA >> hardware_summary.txt


### Install D-SCRIPT

In [3]:
!pip install dscript

Collecting dscript
  Downloading dscript-0.2.4-py3-none-any.whl (121 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m121.8/121.8 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting pandas
  Downloading pandas-2.0.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.3/12.3 MB[0m [31m42.1 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting tqdm
  Downloading tqdm-4.65.0-py3-none-any.whl (77 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.1/77.1 kB[0m [31m19.2 MB/s[0m eta [36m0:00:00[0m
Collecting seaborn
  Downloading seaborn-0.12.2-py3-none-any.whl (293 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m293.3/293.3 kB[0m [31m48.0 MB/s[0m eta [36m0:00:00[0m
Collecting matplotlib
  Downloading matplotlib-3.7.1-cp38-cp38-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (9.2 MB)
[2K     [90m━━━━━━━━━

In [None]:
import os

# Pin OpenMP to one thread for this process by exporting OMP_NUM_THREADS=1
os.environ.update({"OMP_NUM_THREADS": "1"})

### Download trained models

In [7]:
# Download Human Topsy-Turvy model
!wget http://cb.csail.mit.edu/cb/dscript/data/models/topsy_turvy_v1.sav

--2023-06-07 17:17:05--  http://cb.csail.mit.edu/cb/dscript/data/models/topsy_turvy_v1.sav
Resolving cb.csail.mit.edu (cb.csail.mit.edu)... 128.30.2.148
Connecting to cb.csail.mit.edu (cb.csail.mit.edu)|128.30.2.148|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2543014 (2.4M)
Saving to: ‘topsy_turvy_v1.sav.1’


2023-06-07 17:17:09 (1.38 MB/s) - ‘topsy_turvy_v1.sav.1’ saved [2543014/2543014]



### Download sequence and interaction files for test datasets

In [8]:
!wget https://raw.githubusercontent.com/anhvt00/MCAPS/master/data/Dscript-data/pairs/ecoli_test.tsv
!wget https://raw.githubusercontent.com/anhvt00/MCAPS/master/data/Dscript-data/seqs/ecoli.fasta


--2023-06-07 17:17:11--  https://raw.githubusercontent.com/anhvt00/MCAPS/master/data/Dscript-data/pairs/ecoli_test.tsv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.108.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 812468 (793K) [text/plain]
Saving to: ‘ecoli_test.tsv’


2023-06-07 17:17:12 (5.92 MB/s) - ‘ecoli_test.tsv’ saved [812468/812468]

--2023-06-07 17:17:12--  https://raw.githubusercontent.com/anhvt00/MCAPS/master/data/Dscript-data/seqs/ecoli.fasta
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.110.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5413511 (5.2M) [text/plain]
Saving to: ‘ecoli.fasta’


2023-06-07 17:17:14 

### Choose a subset of the dataset

In [9]:
import pandas as pd

# Tunables for the subset. A fixed seed makes the sampled pairs -- and every
# embedding and metric derived from them -- reproducible across re-runs.
SUBSET_SIZE = 1000  # change to the number of samples you want
SUBSET_SEED = 42

# The pairs file has no header row, so columns are addressed by position.
df_pairs = pd.read_csv('ecoli_test.tsv', sep='\t', header=None)

# select a random subset of the data; cap the request at the number of
# available rows, because sample() raises when n exceeds the frame length
df_sub = (
    df_pairs
    .sample(n=min(SUBSET_SIZE, len(df_pairs)), random_state=SUBSET_SEED)
    .reset_index(drop=True)
)

# save the subset tsv file (no index column, no header row)
df_sub.to_csv('ecoli_test_sub.tsv', sep='\t', index=False, header=False)

### Create a new subset fasta file

In [10]:
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord

# read the identifiers from the subset tsv file: the first two columns of
# df_sub are used as sequence IDs
unique_ids = set(df_sub.iloc[:, 0]) | set(df_sub.iloc[:, 1])
print("Number of sequences in the interaction subset:", len(unique_ids))

# stream the original fasta file and keep only the records referenced by the
# subset; keying by id means a repeated id collapses to a single entry
record_dict = {
    rec.id: rec
    for rec in SeqIO.parse('ecoli.fasta', 'fasta')
    if rec.id in unique_ids
}

# Surface IDs that have no sequence instead of dropping them silently: a pair
# whose protein is absent from the FASTA cannot be embedded downstream.
missing_ids = unique_ids.difference(record_dict)
if missing_ids:
    preview = sorted(map(str, missing_ids))[:5]
    print(f"WARNING: {len(missing_ids)} of {len(unique_ids)} IDs not found in ecoli.fasta, e.g. {preview}")

# open the output file
with open('ecoli_sub.fasta', 'w') as output_file:
    for rec in record_dict.values():
        # manually construct the FASTA record (full description line, sequence
        # on a single unwrapped line) and write it to file
        output_file.write('>' + rec.description + '\n' + str(rec.seq) + '\n')

Number of sequences in the interaction subset: 1630


### Generate embeddings

In [11]:
# Compute embeddings for the sequences in ecoli_sub.fasta and store them in ecoli.h5.
# `-d 0` selects CUDA device 0.
!dscript embed --seqs ecoli_sub.fasta -o ecoli.h5 -d 0

[2023-06-07-17:17:25] # Using CUDA device 0 - NVIDIA GeForce RTX 4090
[2023-06-07-17:17:25] # Loading Model...
[2023-06-07-17:17:25] Downloading model lm_v1 from http://cb.csail.mit.edu/cb/dscript/data/models/dscript_lm_v1.pt...
[2023-06-07-17:18:11] # Loading Sequences...
100%|████████████████████████████████████| 1630/1630 [00:00<00:00, 26368.99it/s]
[2023-06-07-17:18:12] # 1630 Sequences Loaded
[2023-06-07-17:18:12] # Approximate Storage Required (varies by average sequence length): ~13.040000000000001GB
[2023-06-07-17:18:12] # Storing to ecoli.h5...
100%|███████████████████████████████████████| 1630/1630 [02:08<00:00, 12.70it/s]


### Evaluate on test dataset

In [4]:
# Evaluate with Human Topsy-Turvy model 
# Scores the pairs in ecoli_test_sub.tsv using the embeddings in ecoli.h5 on CUDA device 0.
# `--outfile` is a filename prefix: the next cell reads ecoli_topsy_turvy.predictions.tsv.
!dscript evaluate --model topsy_turvy_v1.sav --test ecoli_test_sub.tsv --embedding ecoli.h5 --outfile ecoli_topsy_turvy -d 0

[2023-06-07-17:30:59] Using CUDA device 0 - NVIDIA GeForce RTX 4090
100%|██████████████████████████████████████| 1630/1630 [00:05<00:00, 306.62it/s]
Predicting pairs: 100%|████████████████████| 1000/1000 [00:06<00:00, 158.19it/s]
[2023-06-07-17:31:12] AUPR: 0.40263452596628857
[2023-06-07-17:31:12] AUROC: 0.6525066596794621


### Read prediction file

In [5]:
import pandas as pd

# Load the per-pair predictions; the file is read without a header row, so
# the columns are labelled by position.
predictions_path = 'ecoli_topsy_turvy.predictions.tsv'
df = pd.read_csv(predictions_path, header=None, sep='\t')
df

Unnamed: 0,0,1,2,3
0,362663.ecp:ECP_3894,362663.ECP_0518,0.0,0.498930
1,362663.ECP_4577,362663.ECP_0508,0.0,0.493070
2,362663.ECP_2026,362663.ECP_2825,0.0,0.493020
3,362663.ECP_0189,362663.ECP_2075,0.0,0.009941
4,362663.ECP_2795,362663.ECP_2100,0.0,0.384530
...,...,...,...,...
995,362663.ECP_2538,362663.ECP_2541,0.0,0.019657
996,362663.ECP_0363,362663.ECP_2211,0.0,0.496060
997,362663.ECP_3273,362663.ECP_3404,1.0,0.006206
998,362663.ECP_2647,362663.ECP_2610,0.0,0.004187


### Compute performance metrics

In [6]:
# Import necessary libraries
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, average_precision_score, matthews_corrcoef

import numpy as np

# Decision threshold applied to the predicted score. Stated explicitly because
# np.round() rounds halves to even, which silently maps a score of exactly 0.5
# to class 0.
THRESHOLD = 0.5

# Extract prediction and true labels: column 2 is used as the ground-truth
# label and column 3 as the predicted score
y_true = df.iloc[:, 2]
y_prob = df.iloc[:, 3]
# Same float dtype that np.round() produced, so the downstream metrics see the same label types
y_pred = (y_prob >= THRESHOLD).astype(float)

# Accuracy
acc = accuracy_score(y_true, y_pred)

# Precision
prec = precision_score(y_true, y_pred)

# Recall
rec = recall_score(y_true, y_pred)

# Specificity
# labels=[0, 1] forces a 2x2 matrix even when a class is absent from the
# sample, so the four-way unpacking below cannot fail on a small subset.
cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
tn, fp, fn, tp = cm.ravel()
# Undefined when the sample contains no negatives; report NaN instead of dividing by zero
spec = tn / (tn + fp) if (tn + fp) > 0 else float('nan')

# MCC score
mcc = matthews_corrcoef(y_true, y_pred)

# F1 score
f1 = f1_score(y_true, y_pred)

# AUC-ROC (threshold-free: computed from the raw scores)
auroc = roc_auc_score(y_true, y_prob)

# AUPRC (threshold-free: computed from the raw scores)
auprc = average_precision_score(y_true, y_prob)

print (f'accuracy: {acc}, precision: {prec}, recall: {rec}, specificity: {spec}, mcc: {mcc} ,f1-score: {f1}, auc: {auroc}, prc: {auprc}')



accuracy: 0.88, precision: 0.423728813559322, recall: 0.49019607843137253, specificity: 0.9242761692650334, mcc: 0.3888287903905035 ,f1-score: 0.4545454545454545, auc: 0.6524902834184898, prc: 0.40261421520432356
