# Data preprocessing

---

## TODO

1. Prepare positive & negative pairs from all of the features

---

## Setup

1. Imports

In [1]:
import csv
from pathlib import Path
from collections import defaultdict
import json
import os
import pandas as pd
import re

2. Paths

In [2]:
# Project layout: all inputs live under the current user's workspace checkout.
user = os.environ['USER']  # raises KeyError when USER is not set
PROJECT_DIR = Path(f'/home/{user}/workspace/wirusy')
HOST_JSON = PROJECT_DIR / 'host.json'
VIRUS_JSON = PROJECT_DIR / 'virus.json'
FEATURES_DIR = PROJECT_DIR / 'features'
HOST_DIR = PROJECT_DIR / 'data' / 'host'
VIRUS_DIR = PROJECT_DIR / 'data' / 'virus'
PROTEIN_DIR = PROJECT_DIR / 'protein_domains'

# Fail fast and name every missing input, instead of a bare AssertionError
# that does not say which of the seven paths is absent.
missing_paths = [
    str(path)
    for path in (PROJECT_DIR, HOST_JSON, VIRUS_JSON, FEATURES_DIR,
                 HOST_DIR, VIRUS_DIR, PROTEIN_DIR)
    if not path.exists()
]
assert not missing_paths, f'Missing required paths: {missing_paths}'

---

Import features from `tsv` files into dictionaries

## Add blastn_rank feature

In [3]:
# Load the raw blastn scores; the first two columns become a two-level index.
with open(FEATURES_DIR / 'blastn.tsv') as fp:
    blastn_score = pd.read_csv(fp, header=0, index_col=[0, 1], sep='\t')

# For each virus, number the distinct scores 1, 2, 3, ... in the order they
# first appear in the file, and replace every score with that number.
# NOTE(review): this is a best-to-worst rank only if the rows of each virus
# are already sorted by score in blastn.tsv -- confirm.
per_virus_ranks = []
for _, virus_rows in blastn_score.groupby('#virus'):
    scores = virus_rows['score']
    distinct_scores = scores.unique()
    rank_of_score = dict(zip(distinct_scores, range(1, len(distinct_scores) + 1)))
    per_virus_ranks.append(scores.map(rank_of_score))

blastn_rank = pd.concat(per_virus_ranks).to_frame()

# Persist the ranks next to the other feature files (same index, 'score' column).
with open(FEATURES_DIR / 'blastn_rank.tsv', 'w') as fp:
    blastn_rank.to_csv(fp, sep='\t')

## Add jaccard_index_protein_domains feature

In [4]:
def _read_pfam_domains(pfam_path):
    """Collect the distinct values of the sixth column of a Pfam result file.

    Lines starting with '#' and blank lines are skipped; every other line is
    split on whitespace and its field at index 5 is added to the result.
    NOTE(review): index 5 is presumably the domain identifier -- confirm
    against the file format.

    Raises IndexError if a data line has fewer than six fields.
    """
    domains = set()
    for raw_line in pfam_path.read_text().split('\n'):
        if raw_line.startswith('#'):
            continue
        fields = raw_line.split()
        if not fields:
            continue
        domains.add(fields[5])
    return domains


virus_json = json.loads(VIRUS_JSON.read_text())
host_json = json.loads(HOST_JSON.read_text())

# One domain set per genome, read from <PROTEIN_DIR>/<kind>/pfam/<id>.txt.
virus_domains = {
    virus: _read_pfam_domains(PROTEIN_DIR / 'virus' / 'pfam' / f'{virus}.txt')
    for virus in virus_json
}
host_domains = {
    host: _read_pfam_domains(PROTEIN_DIR / 'host' / 'pfam' / f'{host}.txt')
    for host in host_json
}

data_dict = {'virus': [], 'host': [], 'jaccard_index': [],
             'jaccard_index_low_divide': [], 'jaccard_index_no_divide': []}

for virus in virus_json:
    domains_of_virus = virus_domains[virus]
    for host in host_json:
        domains_of_host = host_domains[host]
        shared = len(domains_of_virus & domains_of_host)
        union_size = len(domains_of_virus | domains_of_host)
        smaller_size = min(len(domains_of_virus), len(domains_of_host))

        # Both sets empty -> union is empty; report 0.0 instead of dividing by zero.
        jaccard_index = shared / union_size if union_size else 0.0
        # Clamp the denominator to at least 1 on BOTH sides, so a genome with
        # no domains (virus or host) yields 0.0 rather than ZeroDivisionError.
        jaccard_index_low_divide = shared / max(smaller_size, 1)
        jaccard_index_no_divide = shared

        data_dict['virus'].append(virus)
        data_dict['host'].append(host)
        data_dict['jaccard_index'].append(jaccard_index)
        data_dict['jaccard_index_low_divide'].append(jaccard_index_low_divide)
        data_dict['jaccard_index_no_divide'].append(jaccard_index_no_divide)

jaccard_df = pd.DataFrame.from_dict(data_dict)
jaccard_df.to_csv(FEATURES_DIR / 'jaccard_protein.tsv', sep='\t', index=False)

In [5]:
def _load_pair_tables(path, value_columns=('score',), key_columns=('#virus', 'bacteria')):
    """Read a tab-separated feature file into one lookup table per value column.

    Parameters:
        path: file to read.
        value_columns: names of the columns whose values are stored.
        key_columns: names of the two columns forming the (virus, host) key.

    Returns:
        A list of defaultdicts, one per entry of ``value_columns`` and in the
        same order. Each maps (virus, host) to the raw string read from the
        file; pairs absent from the file default to the integer 0.
    """
    tables = [defaultdict(lambda: 0) for _ in value_columns]
    first_key, second_key = key_columns
    # newline='' lets the csv module do its own line-ending handling.
    with open(path, newline='') as fp:
        for row in csv.DictReader(fp, delimiter='\t'):
            pair = (row[first_key], row[second_key])
            for table, column in zip(tables, value_columns):
                table[pair] = row[column]
    return tables


def _load_score_table(filename):
    """Load the 'score' column of FEATURES_DIR/<filename> keyed by (virus, host)."""
    return _load_pair_tables(FEATURES_DIR / filename)[0]


# Single-score feature files share the same '#virus' / 'bacteria' / 'score' layout.
blastn = _load_score_table('blastn.tsv')
crisprdetect_2mismatch = _load_score_table('crisprdetect-2mismatch.tsv')
gc_content = _load_score_table('gc_content.tsv')
k6_chebyshev = _load_score_table('k6-chebyshev.tsv')
k6_kendalltau = _load_score_table('k6-kendalltau.tsv')
k6_manhattan = _load_score_table('k6-manhattan.tsv')
k25 = _load_score_table('k25.tsv')
piler_2mismatch = _load_score_table('piler-2mismatch.tsv')
wish = _load_score_table('wish.tsv')
blastn_rank = _load_score_table('blastn_rank.tsv')

# The jaccard file uses 'virus' / 'host' keys and carries three value columns;
# it is read once and split into three tables.
jaccard_protein, jaccard_protein_low_divide, jaccard_protein_no_divide = _load_pair_tables(
    FEATURES_DIR / 'jaccard_protein.tsv',
    value_columns=('jaccard_index', 'jaccard_index_low_divide', 'jaccard_index_no_divide'),
    key_columns=('virus', 'host'),
)


Open `json` files

In [6]:
# Parse the virus and host metadata files into Python objects.
with VIRUS_JSON.open() as fp:
    virus_json = json.load(fp)
with HOST_JSON.open() as fp:
    host_json = json.load(fp)

Create positive and negative pairs and save `csv` file

In [9]:
# Output column name -> lookup table keyed by (virus, host).
# Dict order defines the column order of dataframe.csv.
feature_tables = {
    'blastn_score': blastn,
    'blastn_rank': blastn_rank,
    'jaccard_protein': jaccard_protein,
    'jaccard_protein_low_divide': jaccard_protein_low_divide,
    'jaccard_protein_no_divide': jaccard_protein_no_divide,
    'crisprdetect_2mismatch_score': crisprdetect_2mismatch,
    'gc_content_score': gc_content,
    'k6_chebyshev_score': k6_chebyshev,
    'k6_kendalltau_score': k6_kendalltau,
    'k6_manhattan_score': k6_manhattan,
    'k25_score': k25,
    'piler_2mismatch_score': piler_2mismatch,
    'wish_score': wish,
}
headers = ['virus', 'host', *feature_tables, 'interaction']

# Last lineage entry of every host (-1 is for species name), looked up once
# instead of once per virus.
host_species = {host: host_data['lineage_names'][-1]
                for host, host_data in host_json.items()}

csv_data = []

for virus, virus_data in virus_json.items():
    # -1 is for species name
    real_host = virus_data['host']['lineage_names'][-1]
    for host, species in host_species.items():
        pair = (virus, host)
        data = {'virus': virus, 'host': host}
        for column, table in feature_tables.items():
            # .get() returns 0 for a missing pair without inserting it, so the
            # lookup tables do not grow to |virus| x |host| entries.
            data[column] = table.get(pair, 0)
        # Positive pair when the virus's recorded host species equals this host's species.
        data['interaction'] = 1 if real_host == species else 0
        csv_data.append(data)

# newline='' is what the csv module expects for files it writes.
with open(PROJECT_DIR / 'dataframe.csv', mode='w', newline='') as fp:
    writer = csv.DictWriter(fp, fieldnames=headers)
    writer.writeheader()
    writer.writerows(csv_data)

## Leave-One-Out Cross Validation

- Take viruses that infect the same host family
- 820 viruses, group by host family
  - Train on every other group
  - Create positive and negative pairs from every group except the one that I'm testing

---

Group the viruses by the bacterial host family they infect

In [None]:
# Bucket virus ids by the third-from-last entry of their host lineage.
virus_groups = defaultdict(list)
for virus in virus_json:
    host_lineage = virus_json[virus]['host']['lineage_names']
    # -3 is for family name
    host_family = host_lineage[-3]
    virus_groups[host_family].append(virus)

# Save the grouping as pretty-printed JSON in the project directory.
groups_path = PROJECT_DIR / 'virus-groups.json'
with groups_path.open(mode='w') as fp:
    json.dump(virus_groups, fp, indent=2)