In [2]:
import requests
import json
import numpy as np
import random
import matplotlib.pyplot as plt
from matplotlib import mlab
import seaborn as sns
import pandas as pd
from scipy import stats
sns.set_palette("husl")
%matplotlib inline

In [3]:
#Url of the phageParser API
apiurl = 'http://127.0.0.1:8000'
#Get the initial page for listing of accessible objects and get url for organisms and casproteins
url_json=requests.get(apiurl).json()
organisms_url = url_json['organisms']
cas_proteins_url = url_json['casproteins']

In [4]:
#Iterate through each page and merge the json response into a dictionary for cas proteins
cas_protein_dict = {}
r=requests.get(cas_proteins_url)
last_page = r.json()['meta']['total_pages']
for page in range(1,last_page+1):
    url = cas_proteins_url+'?page={}'.format(page)
    payload = requests.get(url).json()
    cas_protein_objs = payload['cas_proteins']
    for cas_protein_obj in cas_protein_objs:
        cas_protein_dict[cas_protein_obj['id']] = cas_protein_obj

In [5]:
#Iterate through each page and merge the json response into a dictionary for organisms
organism_dict = {}
r=requests.get(organisms_url)
last_page = r.json()['meta']['total_pages']
for page in range(1,last_page+1):
    url = organisms_url+'?page={}&include[]=cas_proteins&include[]=loci.spacers'.format(page)
    payload = requests.get(url).json()
    organism_objs = payload['organisms']
    for organism_obj in organism_objs:
        organism_dict[organism_obj['id']] = organism_obj

In [6]:
#Make a set of specific cas proteins by grouping them from gene field
cas3_proteins = {k: v for k, v in cas_protein_dict.items() if 'cas3' in v['gene']}
cas9_proteins = {k: v for k, v in cas_protein_dict.items() if 'cas9' in v['gene']}
cas10_proteins = {k: v for k, v in cas_protein_dict.items() if 'cas10' in v['gene']}

In [32]:
#Convert dictionary to dataframe with added annotations
org_df={}
loc_df={}
for k,v in organism_dict.items():
    org = {'accession':v['accession'], 'name':v['name'], 'CRISPR_type':'','single_spacer':False}
    if v['cas_proteins']:
        org_proteins = set(v['cas_proteins'])
        if not org_proteins.isdisjoint(cas3_proteins): #has cas3 proteins
            org['CRISPR_type'] += 'Type I'
        if not org_proteins.isdisjoint(cas9_proteins): #has cas9 proteins
            org['CRISPR_type'] += 'Type II'
        if not org_proteins.isdisjoint(cas10_proteins): #has cas10 proteins
            org['CRISPR_type'] += 'Type III'
    if v['loci']:
        if all([len(loc['spacers']) < 2 for loc in v['loci']]):
            org['single_spacer'] = True
    for locus in v['loci']:
        loc = {'org_id':k, 'num_spacers':len(locus['spacers'])}
        spacerlens = [spacer['length'] for spacer in locus['spacers']]
        avg_spacerlens = sum(spacerlens)/len(spacerlens)
        loc['avg_spacerlens'] = avg_spacerlens
        loc_df[locus['id']] = loc
    org_df[k] = org
loc_df = pd.DataFrame.from_dict(loc_df, orient='index')
org_df = pd.DataFrame.from_dict(org_df, orient='index')

In [51]:
a=loc_df[loc_df['num_spacers'] == 1].copy()

In [19]:
org_df.head()

Unnamed: 0,accession,name,CRISPR_type,single_spacer
1,NZ_LT632614,Legionella pneumophila,Type IType II,False
2,NC_014206,Geobacillus sp. C56-T3,Type IType III,False
3,NZ_CP013216,Streptococcus salivarius,Type IType III,False
4,NC_020411,Hydrogenobaculum sp. HO,Type IType III,False
5,NC_010337,Heliobacterium modesticaldum Ice1,Type IType II,False


In [52]:
a['single_spacer_organism'] = list(org_df.iloc[a.org_id].single_spacer)

In [57]:
len(a[a['single_spacer_organism']  == True])/len(a)*100

5.29595015576324

In [58]:
len(a[a['single_spacer_organism'] == False])/len(a)*100

94.70404984423676

In [48]:
len(a)

963