## SPIDER2 Secondary Structure

Creates a dictionary of SPIDER2-predicted structural features for each domain.

SPIDER2 with HSE can be downloaded from http://sparks-lab.org/index.php/Main/Downloads.
Follow the instructions in the README to set it up.

### Other Requirements:
1. List of domains
2. List of genes associated with each domain and sequences

### Running:
Run the cells in order

### Output:
A dictionary for each domain where each position has a solvent accessibility score, contact number, and predicted secondary structure, along with the other outputs of SPIDER2

### Get list of genes for all domains

In [1]:
import os
import pandas as pd
import numpy as np
import math
from collections import defaultdict
import cPickle as pickle

# Base locations: the project data root and the SPIDER2 results folder
files_dir = '/Users/davidandrewtodd/summer_research/ExAC/'
spider_dir = '/Users/davidandrewtodd/summer_research/ExAC/SPIDER/SPIDER2/protein_seq_results'

# Load the filtered domain list and sort it in place
with open(files_dir+"5.domains_stats/filtered10_list.pik", 'rb') as handle:
    filtered_domains_list10 = pickle.load(handle)
filtered_domains_list10.sort()

# Load the mapping from every gene to its protein sequences
with open(files_dir+"3.parse_HMMER/canonic_prot_seq/pfam-v30/all_domains_genes_prot_seq.pik", 'rb') as handle:
    all_genes = pickle.load(handle)

# Chromosome names accepted downstream: X, Y, then 1 through 22 as strings
chromosome_names = ['X','Y'] + [str(number) for number in range(1,23)]

In [2]:
# Per-domain lookup: gene ID -> cleaned sequence of its canonic protein
gene_dict = defaultdict(dict)

genes_list = []

# Every per-domain table is read from the same directory
in_path = files_dir+"3.parse_HMMER/hmm_domains/pfam-v30/"

for domain_name in filtered_domains_list10:
    filename = domain_name+".csv"
    domain_data = pd.read_csv(in_path+filename, sep='\t', index_col=0, dtype={"chrom_num": str})

    # Order the rows by chromosome, gene and target start, then renumber them
    sorted_domain_data = (
        domain_data
        .sort_values(by=["chrom_num", "gene", "TargetStart"])
        .reset_index(drop=True)
    )

    # Load the gene -> canonic protein ID mapping of this domain
    with open(files_dir+"4.parse_Uniprot/domains_canonic_prot/pfam-v30/"+domain_name+"_canonic_prot.pik", 'rb') as handle:
        canonic_protein = pickle.load(handle)

    for gene in sorted_domain_data['gene']:
        # A gene already stored for this domain is not processed again
        if gene in gene_dict[domain_name]:
            continue

        # Skip genes whose first listed chromosome is not in chromosome_names
        gene_rows = sorted_domain_data["gene"] == gene
        chrom = sorted_domain_data.loc[gene_rows, "chrom_num"].unique()[0]
        if chrom not in chromosome_names:
            continue

        # Take the Target_Seq of the first row belonging to the canonic protein
        protein = canonic_protein[gene]
        protein_rows = sorted_domain_data['prot'] == protein
        seq = sorted_domain_data.loc[protein_rows, 'Target_Seq'].values[0]

        # Drop '-' and 'X', turn '.' into a space, then upper-case the result
        cleaned_seq = seq.replace('-','').replace('X','')
        cleaned_seq = cleaned_seq.replace('.',' ').upper()
        gene_dict[domain_name][gene] = cleaned_seq

        genes_list.append(gene)
    print("Finished domain "+domain_name)

# Persist the mapping for the later cells
with open(files_dir+'SPIDER/gene_dict.pik', 'wb') as handle:
    pickle.dump(gene_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

Finished domain 2OG-FeII_Oxy_3
Finished domain 7TM_GPCR_Srsx
Finished domain 7tm_1
Finished domain 7tm_2
Finished domain 7tm_3
Finished domain 7tm_4
Finished domain A2M
Finished domain A2M_N
Finished domain A2M_N_2
Finished domain A2M_comp
Finished domain A2M_recep
Finished domain AAA
Finished domain AAA_11
Finished domain AAA_12
Finished domain AAA_17
Finished domain AAA_18
Finished domain AAA_33
Finished domain AAA_5
Finished domain AAA_6
Finished domain AAA_7
Finished domain AAA_8
Finished domain AAA_9
Finished domain AA_permease
Finished domain AA_permease_2
Finished domain ABC2_membrane_3
Finished domain ABC_membrane
Finished domain ABC_tran
Finished domain ADAM_CR
Finished domain ADAM_spacer1
Finished domain ADH_N
Finished domain ADH_zinc_N
Finished domain ADK
Finished domain AMP-binding
Finished domain AMP-binding_C
Finished domain ANAPC3
Finished domain ANAPC4_WD40
Finished domain ANATO
Finished domain ANF_receptor
Finished domain APC_r
Finished domain APOBEC_C
Finished domain 

Finished domain LCE
Finished domain LIM
Finished domain LRRC37
Finished domain LRRCT
Finished domain LRRNT
Finished domain LRR_4
Finished domain LRR_5
Finished domain LRR_6
Finished domain LRR_8
Finished domain LRR_9
Finished domain LSM
Finished domain Laminin_B
Finished domain Laminin_EGF
Finished domain Laminin_G_1
Finished domain Laminin_G_2
Finished domain Laminin_G_3
Finished domain Laminin_N
Finished domain Ldl_recept_a
Finished domain Ldl_recept_b
Finished domain Lectin_C
Finished domain Lig_chan
Finished domain Lig_chan-Glu_bd
Finished domain Linker_histone
Finished domain Lipocalin
Finished domain Lys
Finished domain MAGE
Finished domain MAGE_N
Finished domain MAM
Finished domain MARVEL
Finished domain MBOAT
Finished domain MBT
Finished domain MFS_1
Finished domain MH1
Finished domain MHC_I
Finished domain MHC_II_alpha
Finished domain MHC_II_beta
Finished domain MHC_I_C
Finished domain MIF4G
Finished domain MIP
Finished domain MIT
Finished domain MMR_HSR1
Finished domain MORN


Finished domain zf-C3HC4_2
Finished domain zf-C3HC4_3
Finished domain zf-C3HC4_4
Finished domain zf-C4
Finished domain zf-CCCH
Finished domain zf-CCHC
Finished domain zf-CXXC
Finished domain zf-DHHC
Finished domain zf-FCS
Finished domain zf-H2C2_2
Finished domain zf-H2C2_5
Finished domain zf-HC5HC2H
Finished domain zf-HC5HC2H_2
Finished domain zf-MYND
Finished domain zf-NF-X1
Finished domain zf-RING_11
Finished domain zf-RING_2
Finished domain zf-RING_5
Finished domain zf-RING_UBOX
Finished domain zf-RanBP
Finished domain zf-TRAF
Finished domain zf-UBP
Finished domain zf-met
Finished domain zf-rbx1


### Run SPIDER2 program

For each gene, generate a .seq file and make the call:

    run_local filename.seq

Alternatively, use the code in run_spider.py and spider2.py for larger-scale jobs with many genes.

### Check for genes that crashed and rerun as needed

Skip these next two cells if everything worked perfectly!

In [3]:
dirfiles = !ls -t $'./SPIDER2/protein_seq_results/out'

# Only look at new out jobs
threshold = 4018018

for f in dirfiles:
    # Only check out files
    if f[len(f)-3:len(f)] != 'out' or int(f.split("-")[1].split(".")[0]) <= threshold:
        continue
    # Check for print statement at end
    out = open(spider_dir+'/out/'+f).read()
    if not 'SUCCESS!!!' in out or 'Error' in out:
        print(f)

In [21]:
# Rerunning might take a bit, so only do it if you're sure
import subprocess
rerun_list = ['ENSG00000157766.11-ENSP00000453581.1']

for term in rerun_list:
    # Each entry is "<gene ID>-<protein ID>"
    parts = term.split("-")
    gene = parts[0]
    prot = parts[1]
    # Save sequence to file (the with-block closes it; no explicit close needed)
    filename = files_dir+"/SPIDER/seq/"+term.replace('.','-')+'.seq'
    with open(filename,'w') as f:
        f.write(all_genes[gene][prot].replace('*',''))

    # Run SPIDER2 and report a non-zero exit status instead of ignoring it
    status = subprocess.call([files_dir+'/SPIDER/misc/run_local.sh',filename])
    if status != 0:
        print("SPIDER2 rerun failed for "+term+" (exit status "+str(status)+")")

### Process the raw output files into dictionaries

In [5]:
# Extract features from files
def read_file(filepath,output_dict):
    """Merge the per-position features of one output file into output_dict.

    A line whose first character is '#' is a header: its whitespace-separated
    tokens after the first one name the feature columns. Every other
    non-blank line is a data line whose first token is an integer position
    and whose following tokens are the feature values, in header order.

    Args:
        filepath: path of the file to read. The text after its last '.' is
            prefixed to every feature key, so files that share feature names
            do not overwrite each other.
        output_dict: dict updated in place. Maps int position ->
            {"<extension>_<feature>": value}; values are kept as strings.

    Returns:
        None; results are written into output_dict.

    Raises:
        IOError: if the file cannot be opened.
        ValueError: if a data line appears before any header line, or a
            position token is not an integer.
        IndexError: if a data line has fewer columns than the header.
    """
    # Differentiate between files with same feature names
    extension = filepath.split('.')[-1]
    features = None

    # Read files and save to dict
    with open(filepath,'r') as f:
        # Stream the file rather than loading every line up front
        for line in f:
            tokens = line.split()
            # Blank lines carry no data; skipping them avoids an IndexError
            if not tokens:
                continue

            # Parse header
            if line[0] == '#':
                features = tokens[1:]
                continue

            # Fail with a clear message rather than an unbound-variable error
            if features is None:
                raise ValueError("data line before header in "+filepath)

            # Parse lines
            position = int(tokens[0])
            row = output_dict.setdefault(position, {})
            # Column 0 is the position, so feature values start at column 1
            for column, feature in enumerate(features, 1):
                row[extension+"_"+feature] = tokens[column]


# Restore the per-domain gene -> amino-acid sequence mapping saved earlier
with open(files_dir+"SPIDER/gene_dict.pik", 'rb') as handle:
    gene_dict = pickle.load(handle)


# For testing
count = []


# Tallies of skipped genes and of genes missing each kind of result file
num_excluded = 0
num_geneprot = 0
no_spd3 = []
no_hsa2 = []
no_hsb2 = []

# Result types to load, each paired with the list recording genes that lack it
result_types = (("spd3", no_spd3), ("hsa2", no_hsa2), ("hsb2", no_hsb2))

# Read files and save to dict
for domain in gene_dict:
    # Gene -> canonic protein ID mapping for this domain
    with open(files_dir+"4.parse_Uniprot/domains_canonic_prot/pfam-v30/"+domain+"_canonic_prot.pik", 'rb') as handle:
        canonic_protein = pickle.load(handle)

    secondary_struct_dict = defaultdict(dict)

    for gene in gene_dict[domain]:
        # Genes absent from the global gene map are counted and skipped
        if gene not in all_genes:
            num_excluded += 1
            continue

        secondary_struct_dict[gene] = {}

        # Some files also include the protein name, so try that form first
        prot = canonic_protein[gene]
        gene_part = gene.replace('.','-')
        prot_part = prot.replace('.','-')
        prefix = gene_part + "-" + prot_part
        if os.path.isfile(spider_dir+"/hsa2/"+prefix+".hsa2"):
            num_geneprot += 1
            count.append(gene_part + "." + prot_part)
        # Without a gene-protein .pssm file, fall back to the gene-only name
        if not os.path.isfile(spider_dir+"/pssm/"+prefix+".pssm"):
            prefix = gene_part

        # Load each result type, noting the gene when its file cannot be read
        for ext, missing in result_types:
            try:
                read_file(spider_dir+"/"+ext+"/"+prefix+"."+ext,secondary_struct_dict[gene])
            except IOError:
                missing.append(gene)

    # Save to file
    #with open(spider_dir+'/domain_dicts/'+domain+'_secondary_struct_dict.pik', 'wb') as handle:
    #    pickle.dump(secondary_struct_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [7]:
# Report the number of excluded genes, then how many genes lack each file type
print(num_excluded)
for missing_genes in (no_spd3, no_hsa2, no_hsb2):
    print(len(missing_genes))

0
0
0
0
