<a href="https://colab.research.google.com/github/areias/viral-escape/blob/main/load-hiv-data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

:
# Learning the language of viral evolution and escape - Demo on HIV dataset
1.   paper  https://pubmed.ncbi.nlm.nih.gov/33446556/
2.   supplementary material https://www.science.org/doi/suppl/10.1126/science.abd7331/suppl_file/abd7331-hie-sm.pdf
3.   code https://github.com/brianhie/viral-mutation

## Loading the data

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
! ls drive/MyDrive/viral-mutation/data/hiv

bg505_regions.txt  escape_dingens2019  fitness_haddox2018  HIV-1_env_samelen.fa


In [None]:
# check connected to gpu
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

Sun Jan 30 21:06:50 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 495.46       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   38C    P0    28W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
# check ram
from psutil import virtual_memory
ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

if ram_gb < 20:
  print('Not using a high-RAM runtime')
else:
  print('You are using a high-RAM runtime!')

Your runtime has 27.3 gigabytes of available RAM

You are using a high-RAM runtime!


In [6]:
# clone repository
#! git clone https://github.com/brianhie/viral-mutation.git

In [7]:
# download data
#!wget http://cb.csail.mit.edu/cb/viral-mutation/data.tar.gz


In [8]:
# unzip data
#!tar xvf data.tar.gz

In [10]:
# install dependencies
! pip install anndata scanpy bio

Collecting anndata
  Using cached anndata-0.8.0-py3-none-any.whl (96 kB)
Collecting scanpy
  Downloading scanpy-1.9.1-py3-none-any.whl (2.0 MB)
[K     |████████████████████████████████| 2.0 MB 35.1 MB/s 
[?25hCollecting bio
  Downloading bio-1.3.8-py3-none-any.whl (269 kB)
[K     |████████████████████████████████| 269 kB 92.1 MB/s 
Collecting session-info
  Downloading session_info-1.0.0.tar.gz (24 kB)
Collecting umap-learn>=0.3.10
  Downloading umap-learn-0.5.3.tar.gz (88 kB)
[K     |████████████████████████████████| 88 kB 9.0 MB/s 
[?25hCollecting matplotlib>=3.4
  Downloading matplotlib-3.5.2-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl (11.2 MB)
[K     |████████████████████████████████| 11.2 MB 70.5 MB/s 
Collecting fonttools>=4.22.0
  Downloading fonttools-4.33.3-py3-none-any.whl (930 kB)
[K     |████████████████████████████████| 930 kB 60.5 MB/s 
Collecting pynndescent>=0.5
  Downloading pynndescent-0.5.7.tar.gz (1.1 MB)
[K     |████████████████████████████████| 

In [None]:
# python bin/hiv.py bilstm --train --test > hiv_train.log 2>&1

In [11]:
# add mutation to path
import sys
sys.path.append('drive/MyDrive/viral-mutation/bin')

In [12]:
from mutation import *

In [13]:
np.random.seed(1)
random.seed(1)

In [14]:
AAs = [
    'A', 'R', 'N', 'D', 'C', 'Q', 'E', 'G', 'H',
    'I', 'L', 'K', 'M', 'F', 'P', 'S', 'T', 'W',
    'Y', 'V', 'X', 'Z', 'J', 'U', 'B',
]
vocabulary = { aa: idx + 1 for idx, aa in enumerate(sorted(AAs)) }


In [15]:
from hiv import *

In [21]:
from collections import namedtuple
arguments = namedtuple('arguments', ['model_name','train','test',
                                     'dim','n_epochs','batch_size',
                                     'namespace', 'seed','checkpoint','train_split'])

args = arguments('bilstm', True, True,
                 512,3,100, # defaults were batch-size 1000, 14 epochs
                 'hiv',1, None,None)
args

arguments(model_name='bilstm', train=True, test=True, dim=512, n_epochs=3, batch_size=100, namespace='hiv', seed=1, checkpoint=None, train_split=None)

In [24]:
def setup(args):
    fnames = [ 'drive/MyDrive/viral-mutation/data/hiv/HIV-1_env_samelen.fa' ]
    meta_fnames = [ 'drive/MyDrive/viral-mutation/data/hiv/HIV-1_env_samelen.fa' ]

    seqs = process(args, fnames, meta_fnames)

    seq_len = max([ len(seq) for seq in seqs ]) + 2
    vocab_size = len(AAs) + 2

    return seqs


In [25]:
seqs = setup(args)

In [26]:
len(seqs)

57730

In [27]:
seqs

{Seq('MRVKEKYQHLWRWGWKWGTMLLGILMICSATEKLWVTVYYGVPVWKEATTTLFC...ILL'): [{'country': 'FR',
   'seqlen': 861,
   'strain': 'IIIB_LAI',
   'subtype': 'B',
   'year': 1983},
  {'country': 'FR',
   'seqlen': 861,
   'strain': 'LAI-J19',
   'subtype': 'B',
   'year': 1983},
  {'country': 'FR',
   'seqlen': 861,
   'strain': 'LAI_BRU',
   'subtype': 'B',
   'year': 1983}],
 Seq('MRVRGTLRNYQQWWIWGVLGFWMLMICNGGGNLWVTVYYGVPVWKEAKTTLLCA...ALQ'): [{'country': 'IN',
   'seqlen': 857,
   'strain': '93IN101',
   'subtype': 'C',
   'year': 1993}],
 Seq('MRVKETQMNWPNLWKWGTLILGLVIICSASENLWVTVYYGVPVWRDADTTLFCA...ALL'): [{'country': 'TH',
   'seqlen': 857,
   'strain': '95TNIH022',
   'subtype': 'AE',
   'year': 1995},
  {'country': 'TH',
   'seqlen': 857,
   'strain': 'ThaiNIH01_C1h',
   'subtype': 'AE',
   'year': 2000}],
 Seq('MRVKETQMSWPNLWKWRTLILGLVIICSASDNLWVTVYYGVPVWRDADTTLFCA...ALL'): [{'country': 'TH',
   'seqlen': 861,
   'strain': '95TNIH047',
   'subtype': 'AE',
   'year': 1995}],
 Seq('MRVMGIQ

In [28]:
# get sequences subset
from collections import defaultdict 

seqs_subset= defaultdict(dict)

for x in list(seqs)[0:1000]:
    seqs_subset[x] = seqs[x]


In [29]:
len(seqs_subset)

1000