# Simplified architectures

This script takes the domain clusters specified in 'custom_domain_clusters.csv' and uses them to translate the myriad domain accessions from UniProt into non-redundant domain architectures for all phage lytic proteins in PhaLP. It writes these to the file 'simplified_architectures.csv', which is used in further analyses.

In [1]:
import csv
from itertools import combinations, permutations, chain
import matplotlib.pyplot as plt
import mysql.connector
import numpy as np
import pandas as pd
import pickle

## Wrangle data

### Remove known contaminants from further investigation

In [2]:
### The accessions below have been established to be contaminants rather than
### actual phage lytic proteins, so no non-redundant architecture is generated
### for them.
skip = [
    'P21270',
    'G9IA41',
    'A0A482MF56',
]

### Accessions

In [3]:
### Query the database for every distinct UniProt accession, leaving out the
### accessions listed in `skip`.
### NOTE(review): credentials and the socket path are hardcoded for a local
### MAMP install -- consider reading them from the environment before sharing.
cnx = mysql.connector.connect(user='root', password='root', database='phalp',
                              unix_socket='/Applications/MAMP/tmp/mysql/mysql.sock')
try:
    cursor = cnx.cursor()
    cursor.execute("SELECT DISTINCT(UniProt_ID) FROM UniProt;")
    ### fetchall() drains the result set so the cursor can be closed cleanly
    acc = [row[0] for row in cursor.fetchall() if row[0] not in skip]  # bypass contaminated sequences
    cursor.close()
finally:
    ### always release the connection, even when the query fails
    cnx.close()
print("Amount of unique uniprot accessions: ", len(acc))

Amount of unique uniprot accessions:  11836


### Domains

In [4]:
### For every accession, collect the domains linked to its UniRef entry as
### [domains_ID, start, end] lists.
acc2dom = {}
query = ("SELECT l.domains_ID, l.start, l.end FROM link_UniRef_domains as l "
         "JOIN UniProt as up WHERE up.UniProt_ID = %s AND up.UniRef_ID = l.UniRef_ID;")
### One connection serves the whole loop; previously a new connection was
### opened for every accession and none of them was ever closed.
cnx = mysql.connector.connect(user='root', password='root', database='phalp',
                              unix_socket='/Applications/MAMP/tmp/mysql/mysql.sock')
try:
    cursor = cnx.cursor()
    for i in acc:
        ### parameterised query: the driver takes care of quoting the accession
        cursor.execute(query, (i,))
        ### rows with an empty domain ID are left out
        acc2dom[i] = [[j[0], j[1], j[2]] for j in cursor.fetchall() if j[0]]
    cursor.close()
finally:
    ### always release the connection, even when a query fails
    cnx.close()

### Domain clusters

In [5]:
### Map every domain accession (first column) to its cluster (third column) and
### record the clusters in order of first appearance.
dom2clst = {}
sorted_clsts = []
### newline='' is how the csv module expects files for csv.reader to be opened
with open('domains/custom_domain_clusters.csv', newline='') as file:
    reader = csv.reader(file, delimiter=',')
    next(reader, None)  ### skip the first row (presumably the header); no-op on an empty file
    for row in reader:
        if not row:  ### csv.reader yields [] for blank lines
            continue
        dom2clst[row[0]] = row[2]
        if row[2] not in sorted_clsts:
            sorted_clsts.append(row[2])
print(len(dom2clst.keys()), 'domains were simplified into', len(sorted_clsts), 'clusters.')

188 domains were simplified into 105 clusters.


## Simplification functions

In [6]:
def check_overlap(doms):
    """Classify how two domains overlap.

    Parameters
    ----------
    doms : sequence of [name, start, end]
        The domains to compare. Only the two entries with the lowest upper
        boundary are examined.

    Returns
    -------
    str
        'Full overlap' when one domain lies within the other,
        'Partial overlap' when they share only part of their range,
        'No overlap' otherwise.
    """
    ### Order domains on upper boundary. When the upper boundaries are equal
    ### the domain with the higher start (the shorter one) goes first, so a
    ### contained domain is reported as 'Full overlap' regardless of the order
    ### in which the pair was passed in.
    ordered_doms = sorted(doms, key=lambda x: (x[2], -x[1]))
    first, second = ordered_doms[0], ordered_doms[1]

    if first[1] >= second[1]:
        ### first ends no later and starts no earlier than second
        return 'Full overlap'
    if first[2] > second[1]:
        return 'Partial overlap'
    return 'No overlap'

def get_smallest(doms):
    """Return the shorter of the first two domains in `doms`.

    Each domain is indexed as [name, start, end]; its length is end - start.
    When both lengths are equal the second domain is returned.
    """
    first, second = doms[0], doms[1]
    first_span = first[2] - first[1]
    second_span = second[2] - second[1]
    return first if first_span < second_span else second

def simplify(accession):
    """Build the simplified domain architecture of one accession.

    Domains from acc2dom[accession] that have a cluster in dom2clst are
    renamed to that cluster; the others are dropped. Overlapping domain pairs
    of the same cluster lose their smallest member, while overlapping pairs of
    different clusters cause the accession to be appended to the module-level
    list acc_of_interest.

    Returns the remaining [cluster, start, end] entries sorted on end position.
    """
    ### keep only the clustered domains, renamed to their cluster
    clustered = []
    for dom in acc2dom[accession]:
        if dom[0] in dom2clst:
            clustered.append([dom2clst[dom[0]], dom[1], dom[2]])

    ### examine domains pairwise; the pairs are taken from the list as it is
    ### at this point, so later removals do not change which pairs are visited
    for pair in combinations(clustered, 2):
        if check_overlap(pair) == 'No overlap':
            continue
        if pair[0][0] != pair[1][0]:
            ### overlapping domains from different clusters: flag for inspection
            acc_of_interest.append(accession)
            continue
        ### same cluster: drop the smallest, unless it is already gone
        smallest = get_smallest(pair)
        if smallest in clustered:
            clustered.remove(smallest)

    clustered.sort(key=lambda dom: dom[2])  ### sort remaining domains
    return clustered

## Simplify architectures

In [7]:
### simplify() appends to acc_of_interest, so the list has to exist beforehand
acc_of_interest = []
acc2arch = {accession: simplify(accession) for accession in acc}
acc_of_interest = set(acc_of_interest)
print('There are', len(acc_of_interest), 'peculiar architectures that should be checked.\n')

### accessions for which no domain survived the simplification
to_rem = [accession for accession, arch in acc2arch.items() if arch == []]
print('There are', len(to_rem), 'PhaLPs without viable architectures;')
for accession in to_rem:
    del acc2arch[accession]

print(len(acc2arch), 'simplified PhaLP-architectures remain.')

There are 154 peculiar architectures that should be checked.

There are 252 PhaLPs without viable architectures;
11584 simplified PhaLP-architectures remain.


In [8]:
### Write one line per accession: the accession followed by the cluster name of
### each domain in its simplified architecture, comma-separated.
with open('simplified_architectures.csv', 'w') as f:
    ### The loop variable is deliberately not named `acc`: that would overwrite
    ### the accession list that earlier cells iterate over.
    for accession, architecture in acc2arch.items():
        fields = [str(accession)]
        for arch in architecture:
            fields.append(arch[0])
            ### uncomment the following line to output domain boundaries as well:
            #fields[-1] += '(' + str(arch[1]) + '-' + str(arch[2]) + ')'
        f.write(','.join(fields) + '\n')
    ### no explicit close needed: the with-block closes the file

In [8]:
### get individual domain accession abundances
sorted_doms = list(dom2clst.keys())
dom_ab = np.zeros((len(sorted_doms)))
for i in acc2dom:
    for j in acc2dom[i]:
        if j[0] in sorted_doms:
            dom_ab[sorted_doms.index(j[0])] += 1
dom_ab

In [10]:
### get domain cluster abundances
clst_ab = np.zeros((len(sorted_clsts)))
for i in acc2arch:
    for j in acc2arch[i]:
        clst_ab[sorted_clsts.index(j[0])] += 1
clst_ab