### Check GPU hardware

In [1]:
!nvidia-smi

Wed Jun  7 16:13:07 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.85.12    Driver Version: 525.85.12    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   51C    P8    10W /  70W |      0MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

### Save hardware configuration

In [2]:
# Create the summary file
!echo "Hardware Summary" > hardware_summary.txt

# Fetch and write CPU Information
!echo "\nCPU Information:" >> hardware_summary.txt
!echo "-----------------" >> hardware_summary.txt
!lscpu | egrep 'Model name|Socket|Thread|CPU\(s\)' >> hardware_summary.txt

# Fetch and write Total RAM Information
!echo "\nTotal RAM Information:" >> hardware_summary.txt
!echo "-----------------" >> hardware_summary.txt
!free -h | grep Mem | awk '{print $2}' >> hardware_summary.txt


# Fetch and write GPU Information
!echo "\nGPU Information:" >> hardware_summary.txt
!echo "-----------------" >> hardware_summary.txt

# If you have a Nvidia GPU
!nvidia-smi --query-gpu=gpu_name,driver_version,memory.total --format=csv >> hardware_summary.txt

# Alternatively for other GPUs
# !lspci | grep VGA >> hardware_summary.txt


### Install D-SCRIPT

In [3]:
!pip install dscript

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting dscript
  Downloading dscript-0.2.4-py3-none-any.whl (121 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m121.8/121.8 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
Collecting biopython (from dscript)
  Downloading biopython-1.81-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m84.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: biopython, dscript
Successfully installed biopython-1.81 dscript-0.2.4


### Download trained models

In [5]:
# Download Human Topsy-Turvy model
!wget http://cb.csail.mit.edu/cb/dscript/data/models/topsy_turvy_v1.sav

--2023-06-07 16:14:01--  http://cb.csail.mit.edu/cb/dscript/data/models/topsy_turvy_v1.sav
Resolving cb.csail.mit.edu (cb.csail.mit.edu)... 128.30.2.148
Connecting to cb.csail.mit.edu (cb.csail.mit.edu)|128.30.2.148|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2543014 (2.4M)
Saving to: ‘topsy_turvy_v1.sav’


2023-06-07 16:14:04 (1.56 MB/s) - ‘topsy_turvy_v1.sav’ saved [2543014/2543014]



### Download sequence and interaction files for test datasets

In [6]:
!wget https://raw.githubusercontent.com/anhvt00/MCAPS/master/data/Dscript-data/pairs/ecoli_test.tsv
!wget https://raw.githubusercontent.com/anhvt00/MCAPS/master/data/Dscript-data/seqs/ecoli.fasta


--2023-06-07 16:14:09--  https://raw.githubusercontent.com/anhvt00/MCAPS/master/data/Dscript-data/pairs/ecoli_test.tsv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 812468 (793K) [text/plain]
Saving to: ‘ecoli_test.tsv’


2023-06-07 16:14:09 (80.2 MB/s) - ‘ecoli_test.tsv’ saved [812468/812468]

--2023-06-07 16:14:09--  https://raw.githubusercontent.com/anhvt00/MCAPS/master/data/Dscript-data/seqs/ecoli.fasta
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.111.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5413511 (5.2M) [text/plain]
Saving to: ‘ecoli.fasta’


2023-06-07 16:14:10 

### Chooose a subset of the dataset

In [43]:
import pandas as pd
df_pairs = pd.read_csv('ecoli_test.tsv', sep='\t', header=None)

# select a random subset of the data
df_sub = df_pairs.sample(n=1000).reset_index(drop=True)  # change n to the number of samples you want

# save the subset tsv file
df_sub.to_csv('ecoli_test_sub.tsv', sep='\t', index=False, header=None)

### Create a new subset fasta file

In [47]:
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord

# read the identifiers from the subset tsv file
unique_ids = set(df_sub.iloc[:,0]) | set(df_sub.iloc[:,1])  
print("Number of sequences in the interaction subset:", len(unique_ids))

# read the original fasta file and extract sequences
records = SeqIO.parse('ecoli.fasta', 'fasta')

# create a dictionary to hold unique records
record_dict = {rec.id: rec for rec in records if rec.id in unique_ids}

# open the output file
with open('ecoli_sub.fasta', 'w') as output_file:
    for rec in record_dict.values():
        # manually construct the FASTA record and write it to file
        output_file.write('>' + rec.description + '\n' + str(rec.seq) + '\n')

Number of sequences in the interaction subset: 1638


### Generate embeddings

In [45]:
!dscript embed --seqs ecoli_sub.fasta -o ecoli.h5 -d 0

[2023-06-07-16:33:15] # Using CUDA device 0 - Tesla T4
[2023-06-07-16:33:15] # Loading Model...
[2023-06-07-16:33:19] # Loading Sequences...
100% 1638/1638 [00:00<00:00, 32510.29it/s]
[2023-06-07-16:33:19] # 1638 Sequences Loaded
[2023-06-07-16:33:19] # Approximate Storage Required (varies by average sequence length): ~13.104000000000001GB
[2023-06-07-16:33:19] # Storing to ecoli.h5...
100% 1638/1638 [04:39<00:00,  5.85it/s]


### Evaluate on test dataset

In [49]:
# Evaluate with Human Topsy-Turvy model 
!dscript evaluate --model topsy_turvy_v1.sav --test ecoli_test_sub.tsv --embedding ecoli.h5 --outfile ecoli_topsy_turvy -d 1

[2023-06-07-16:48:12] Using CUDA device 0 - Tesla T4
 54% 884/1638 [00:29<00:24, 30.44it/s]
Traceback (most recent call last):
  File "/usr/local/bin/dscript", line 8, in <module>
    sys.exit(main())
  File "/usr/local/lib/python3.10/dist-packages/dscript/__main__.py", line 77, in main
    args.func(args)
  File "/usr/local/lib/python3.10/dist-packages/dscript/commands/evaluate.py", line 217, in main
    embeddings = load_hdf5_parallel(embPath, allProteins)
  File "/usr/local/lib/python3.10/dist-packages/dscript/utils.py", line 75, in load_hdf5_parallel
    all_embs = list(
  File "/usr/local/lib/python3.10/dist-packages/tqdm/std.py", line 1178, in __iter__
    for obj in iterable:
  File "/usr/lib/python3.10/multiprocessing/pool.py", line 873, in next
    raise value
multiprocessing.pool.MaybeEncodingError: Error sending result: 'tensor([[[ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ..., -4.5643e-01,
           3.9842e-02,  5.7106e-04],
         [ 0.0000e+00,  0.0000e+00,  1.0000e+00,  .

### Read prediction file

In [41]:
import pandas as pd
df = pd.read_csv('ecoli_topsy_turvy.predictions.tsv', sep='\t', header=None)
df

Unnamed: 0,0,1,2,3
0,362663.ECP_2819,362663.ECP_2750,0.0,0.4301
1,362663.ecp:ECP_3270,362663.ECP_0479,0.0,0.007664


### Compute performance metrics

In [42]:
# Import necessary libraries
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, average_precision_score, matthews_corrcoef

import numpy as np

# Extract prediction and true labels
y_true = df.iloc[:, 2]
y_prob = df.iloc[:, 3]
y_pred = np.round(y_prob)

# Accuracy
acc = accuracy_score(y_true, y_pred)

# Precision
prec = precision_score(y_true, y_pred)

# Recall
rec = recall_score(y_true, y_pred)

# Specificity
cm = confusion_matrix(y_true, y_pred)
tn, fp, fn, tp = cm.ravel()
spec = tn / (tn+fp)

# MCC score
mcc = matthews_corrcoef(y_true, y_pred)

# F1 score
f1 = f1_score(y_true, y_pred)

# AUC-ROC
auroc = roc_auc_score(y_true, y_prob)

# AUPRC
auprc = average_precision_score(y_true, y_prob)

print (f'accuracy: {acc}, precision: {prec}, recall: {rec}, specificity: {spec}, mcc: {mcc} ,f1-score: {f1}, auc: {auroc}, prc: {auprc}')



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


ValueError: ignored