# AntiRef: sequence filtering

Sequences from the [Observed Antibody Space](http://opig.stats.ox.ac.uk/webapps/oas/) repository are filtered to retain only sequences that are:

| criteria | filter |
| :- | :- |
| productive | `'productive' == 'T'` |
| full length | `'complete_vdj' == 'T'` |
| no frameshift V-gene indels | `'v_frameshift' == 'F'` |
| in-frame | `'vj_in_frame' == 'T'` | 
| no stop codons | `'stop_codon' == 'F'` |
| no ambiguous amino acids | `'X' not in 'sequence_aa'` |
| not missing a conserved Cysteine | `'Missing Conserved Cysteine' not in 'ANARCI_status'` |


Also during the filtering process, a random ID is created for each sequence (stored in the `'sequence_id'` field) and the full AA sequence is added to the annotation (stored in the `'sequence_aa'` field). 

Filtered data is saved in both CSV (containing all annotations) and FASTA formats. Optionally, but occuring by default using the code below, a CSV file containing metadata for each processed file will created. This metadata file contains all metadata included in the downloaded OAS file as well as information on the number of sequences passing our filtering process. 

Filtering retains a total of:

* **260,373,862** heavy chains
* **190,684,852** light chains
* **451,058,704** total antibody sequences



In [None]:
# install dependencies
%pip install pandas tqdm biopython abutils

In [None]:
import warnings
warnings.simplefilter('ignore')

from collections import Counter
import json
import os
import subprocess as sp
from typing import Optional
import uuid

import pandas as pd

from tqdm.notebook import tqdm

from Bio import SeqIO

import abutils

In [3]:
def oas_filter(
    input_dir: str, 
    csv_dir: str, 
    fasta_dir: str, 
    limit: Optional[int] = None,
    skip: int = 0,
    metadata_file: Optional[str] = None
) -> pd.DataFrame:
    '''
    Filters sequence data downloaded from the `Observed Antibody Space`_ 
    repository to retain only full-length, productive sequences.
    
    Parameters
    ----------
    input_dir : str
        Path to a directory containing one or more CSV-formatted input files. Files must either
        be downloaded from OAS or use the same schema.
    
    [csv|fasta]_dir : str
        Path to a directory into which filtered CSV or FASTA files will be written. If the directory
        does not exist, it will be created.
        
    limit : int, default=``None``
        Limit the number of files to be processed. Must be an integer or coercable to an integer.
        Default is `None`, which does not limit the number of files processed.
    
    skip : int, default=0
        Skips the first `skip` files in `input_dir`. Must be an integer or coercable to an integer.
        Files are processed in alphabetical order. Default is `0`, which does not skip any lines.
        
    metadata_file : str, default=``None``
        Path to a file into which the metadata (in CSV format) will be written. If not provided,
        metadata is not written. In both cases, metadata is also returned as a ``DataFrame``.
    
    Returns
    -------
    metadata : ``pd.DataFrame``
        A ``DataFrame`` containing the metadata from all processed files. Metadata is parsed
        from the first line of the OAS CSV file.
        
        
    .. _Observed Antibody Space:
        http://opig.stats.ox.ac.uk/webapps/oas/
    '''
    # configure input/output directories
    csv_dir = os.path.abspath(csv_dir)
    fasta_dir = os.path.abspath(fasta_dir)
    if not os.path.isdir(csv_dir):
        abutils.io.make_dir(csv_dir)
    if not os.path.isdir(fasta_dir):
        abutils.io.make_dir(fasta_dir)
    input_dir = os.path.abspath(input_dir)
    input_files = abutils.io.list_files(input_dir, extension='csv')
    if limit is not None:
        limit = int(limit)
        skip = int(skip)
        input_files = input_files[skip:skip + limit]
    
    metadata = []
    aa_cols = ['fwr1_aa', 'cdr1_aa', 'fwr2_aa', 'cdr2_aa', 'fwr3_aa', 'cdr3_aa', 'fwr4_aa']
    leading_cols = ['sequence_id', 'sequence', 'sequence_aa']
    
    pbar = tqdm(input_files)
    for ifile in pbar:
        fname = '.'.join(os.path.basename(ifile).split('.')[:-1])
        pbar.set_description(f"{fname} - reading CSV")
        with open(ifile, 'r') as f:
            # read metadata from the first line, rest of the file is CSV data
            meta = json.loads(next(f).strip().replace('""', '"')[1:-1])
            df = pd.read_csv(f)
            pbar.set_description(f"{fname} - filtering")
            # add sequence_id and sequence_aa fields
            df['sequence_id'] = [uuid.uuid4() for _ in range(df.shape[0])]
            df['sequence_aa'] = df[aa_cols].apply(lambda row: ''.join(row.values.astype(str)), 
                                                  axis=1)
            # filter the data for productive, full-length sequences 
            filtered = df[(df['complete_vdj'] == 'T') 
                          & (df['stop_codon'] == 'F')
                          & (df['vj_in_frame'] == 'T')
                          & (df['v_frameshift'] == 'F')
                          & (df['productive'] == 'T')
                          & ~(df['sequence_aa'].str.contains('X')
                          & ~(df['ANARCI_status'].str.contains('Missing Conserved Cysteine')))
                         ]
            meta['Filtered sequences'] = filtered.shape[0]
            if filtered.shape[0] > 0:
                meta['Unique filtered sequences'] = len(filtered['sequence_aa'].unique())
                # write filtered CSV 
                trailing_cols = [c for c in filtered.columns.values if c not in leading_cols]
                filtered = filtered[leading_cols + trailing_cols]
                filtered.to_csv(os.path.join(csv_dir, fname + '.csv'), index=False)
                # write filtered FASTA
                ids = filtered['sequence_id']
                seqs = filtered['sequence_aa']
                fastas = [f">{i}\n{s}" for i, s in zip(ids, seqs)]
                with open(os.path.join(fasta_dir, fname + '.fasta'), 'w') as f:
                    f.write('\n'.join(fastas))
            else:
                meta['Unique filtered sequences'] = 0
        metadata.append(meta)
    meta_df = pd.DataFrame(metadata)
    if metadata_file is not None:
        meta_df.to_csv(metadata_file)
    return meta_df


### heavy chains

First, we filter all of the downloaded heavy chain sequences:

In [10]:
heavy_metadata = oas_filter(
    input_dir='./data/raw/csv/heavy/', 
    csv_dir='./data/filtered/csv/heavy/',
    fasta_dir='./data/filtered/fasta/heavy/'
    metadata_file='./data/filtered/metadata_heavy.csv'
    )

And create a single FASTA file containing all filtered heavy chain sequences:

In [None]:
pooled_heavy_file = './data/filtered/fasta/heavy.fasta'

pool_cmd = f"cat ./data/filtered/fasta/heavy/*.fasta > {pooled_heavy_file}"
p = sp.Popen(pool_cmd, shell=True, stdout=sp.PIPE, stderr=sp.PIPE)
stdout, stderr = p.communicate()

### light chains

Next, we filter all of the light chain sequences:

In [None]:
light_metadata = oas_filter(
    input_dir='./data/raw/csv/light/', 
    csv_dir='./data/filtered/csv/light/',
    fasta_dir='./data/filtered/fasta/light/'
    metadata_file='./data/filtered/metadata_light.csv'
    )

And create a single FASTA file containing all filtered light chain sequences:

In [None]:
pooled_light_file = './data/filtered/fasta/light.fasta'

pool_cmd = f"cat ./data/filtered/fasta/light/*.fasta > {pooled_light_file}"
p = sp.Popen(pool_cmd, shell=True, stdout=sp.PIPE, stderr=sp.PIPE)
stdout, stderr = p.communicate()

### combine all sequences into a single FASTA file

In [None]:
pooled_fasta_file = './data/filtered/fasta/all.fasta'

pool_cmd = f"cat ./data/filtered/fasta/heavy.fasta ./data/filtered/fasta/light.fasta > {pooled_fasta_file}"
p = sp.Popen(pool_cmd, shell=True, stdout=sp.PIPE, stderr=sp.PIPE)
stdout, stderr = p.communicate()