## SPIDER2 Secondary Structure

Creates dictionaries with SPIDER2 characteristics for each domain

SPIDER2 with HSE can be downloaded from http://sparks-lab.org/index.php/Main/Downloads.
Follow the instructions in its README to set it up.

### Other Requirements:
1. List of domains
2. List of genes associated with each domain and sequences

### Running:
Run the cells in order

### Output:
One pickled dictionary per domain, keyed by gene and then by sequence position. Each position holds a solvent accessibility score, contact number, and predicted secondary structure, along with the other outputs of SPIDER2.

### Get list of genes for all domains

In [1]:
import os
import pandas as pd
import numpy as np
import math
from collections import defaultdict
import cPickle as pickle

# Root folders: the ExAC project tree and the SPIDER2 results stored under it
files_dir = '/Users/davidandrewtodd/summer_research/ExAC/'
spider_dir = '/Users/davidandrewtodd/summer_research/ExAC/SPIDER/SPIDER2/protein_seq_results'


def _load_pickle(path):
    """Return the object stored in the pickle file at *path*."""
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Filtered domains, sorted in place
filtered_domains_list10 = _load_pickle(files_dir+"5.domains_stats/pfam-v31/filtered10_list.pik")
filtered_domains_list10.sort()

# Map of all genes to protein seqs
all_genes = _load_pickle(files_dir+"3.parse_HMMER/canonic_prot_seq/pfam-v31/all_domains_genes_prot_seq.pik")

# Diff between v30 and v31
diff_genes = _load_pickle(spider_dir+"/../../v30-v31_diff_domains_genes_prot_seq.pik")

# 'X', 'Y', then '1' .. '22'
chromosome_names = ['X','Y'] + [str(i) for i in range(1,23)]

In [2]:
# For every filtered domain, collect the cleaned protein sequences of its genes

domain_to_protein_seq_dict = defaultdict(dict)

genes_list = []

canonic_prot_dir = files_dir+"4.parse_Uniprot/domains_canonic_prot/pfam-v31/"

for domain_name in filtered_domains_list10:
    # Load the per-domain mapping of genes to their canonic protein
    with open(canonic_prot_dir+domain_name+"_canonic_prot.pik", 'rb') as handle:
        canonic_protein = pickle.load(handle)

    for gene in canonic_protein:
        # Genes absent from the diff dictionary are ignored
        if gene not in diff_genes:
            continue
        # Looked up only after the filter above, so a domain without any
        # diff gene never gets an (empty) entry in the defaultdict
        seqs_by_gene = domain_to_protein_seq_dict[domain_name]
        # A gene already recorded for this domain is not processed again
        if gene in seqs_by_gene:
            continue
        seqs_by_gene[gene] = {}

        # Value stored for the gene in the canonic mapping (not used further here)
        protein = canonic_protein[gene]

        for prot, raw_seq in diff_genes[gene].items():
            # Drop '*', '-' and 'X', then upper-case the remainder.
            # NOTE(review): '.' is replaced by a space rather than removed -
            # confirm this is intended.
            stripped = raw_seq.replace('*','').replace('-','').replace('X','')
            seqs_by_gene[gene][prot] = stripped.replace('.',' ').upper()

        genes_list.append(gene)

# Write the mapping to disk
with open(files_dir+'SPIDER/domain_to_protein_seq_dict.pik', 'wb') as handle:
    pickle.dump(domain_to_protein_seq_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

### Run SPIDER2 program

After downloading SPIDER from the link above, generate a .seq file in the seq/ directory for each gene, and make the call:

    ./run_local.sh filename.seq
    
from the directory that contains run_local.sh

See the rerun code below as an example.

-------------

Alternatively, to run on a cluster, copy the domain_to_protein_seq_dict file created above into a directory that contains run_spider.py and spider2.py and that has the same file structure as the local environment described above (that is, including the SPIDER code). Then make the call:

    python run_spider.py
    
to submit all necessary jobs.

### Check for genes that crashed and rerun as needed

Skip these next two cells if everything worked perfectly!

In [3]:
dirfiles = !ls -t $'./SPIDER2/protein_seq_results/out'

# Only look at new out jobs
threshold = 4018018

for f in dirfiles:
    # Only check out files
    if f[len(f)-3:len(f)] != 'out' or int(f.split("-")[1].split(".")[0]) <= threshold:
        continue
    # Check for print statement at end
    out = open(spider_dir+'/out/'+f).read()
    if not 'SUCCESS!!!' in out or 'Error' in out:
        print(f)

In [9]:
# Rerunning might take a bit, so only do it if you're sure
import subprocess
rerun_list = ['ENSG00000085276.13-ENSP00000417899.1']

for term in rerun_list:
    # Each term is "<gene>-<protein>"
    parts = term.split("-")
    gene, prot = parts[0], parts[1]
    # Save sequence to file (the with-block closes it; no explicit close needed)
    filename = files_dir+"/SPIDER/seq/"+term.replace('.','-')+'.seq'
    with open(filename,'w') as f:
        f.write(diff_genes[gene][prot].replace('*',''))

    # Run SPIDER2 and report a failing run instead of silently ignoring it
    return_code = subprocess.call([files_dir+'/SPIDER/misc/run_local.sh',filename])
    if return_code != 0:
        print("run_local.sh exited with code "+str(return_code)+" for "+term)

### Process the raw output files into dictionaries

In [37]:
# Extract features from files
def read_file(filepath,output_dict):
    """Parse a whitespace-delimited results file into ``output_dict``.

    A line starting with ``#`` is a header; its tokens after the first give
    the feature names. On every other non-blank line the first token is
    converted to ``int`` and used as the key, and the following tokens are
    stored, as strings, under ``"<extension>_<feature>"`` where
    ``<extension>`` is the text after the last ``.`` of ``filepath``.

    Args:
        filepath: Path of the file to read.
        output_dict: Dictionary updated in place; entries already present
            for a key are kept and extended.

    Raises:
        IOError: If the file cannot be opened.
        ValueError: If a data line appears before any header line, or its
            first token is not an integer.
        IndexError: If a data line has fewer values than the header has
            feature names.
    """
    # Differentiate between files with same feature names
    extension = filepath.split('.')[-1]

    features = None
    # Read files and save to dict
    with open(filepath,'r') as f:
        for line in f:
            tokens = line.split()
            # Blank lines carry no data
            if not tokens:
                continue

            # Parse header
            if line[0] == '#':
                features = tokens[1:]
                continue

            if features is None:
                raise ValueError("data line found before the '#' header in "+filepath)

            # Parse lines
            row = output_dict.setdefault(int(tokens[0]), {})
            for i, feature in enumerate(features):
                row[extension+"_"+feature] = tokens[i+1]


# Reload the domain -> gene -> protein -> sequence mapping saved earlier
with open(files_dir+"SPIDER/domain_to_protein_seq_dict.pik", 'rb') as handle:
    domain_to_protein_seq_dict = pickle.load(handle)


# Bookkeeping: skipped genes and genes whose result file could not be read
num_excluded = 0
no_spd3 = []
no_hsa2 = []
no_hsb2 = []

# (sub-directory, file suffix, list collecting genes with an unreadable file)
result_sources = (
    ("/spd3/", ".spd3", no_spd3),
    ("/hsa2/", ".hsa2", no_hsa2),
    ("/hsb2/", ".hsb2", no_hsb2),
)

# Build and store one dictionary per domain
for domain, genes_of_domain in domain_to_protein_seq_dict.items():
    secondary_struct_dict = defaultdict(dict)

    for gene, prots in genes_of_domain.items():
        # Genes missing from the all-genes map are counted and skipped
        if gene not in all_genes:
            num_excluded += 1
            continue

        gene_features = {}
        secondary_struct_dict[gene] = gene_features

        # Should only be one protein
        prot = list(prots)[0]

        # Result files are named after the gene, sometimes followed by the
        # protein; the presence of the .pssm file tells which form was used
        gene_tag = gene.replace('.','-')
        prefix = gene_tag + "-" + prot.replace('.','-')
        if not os.path.isfile(spider_dir+"/pssm/"+prefix+".pssm"):
            prefix = gene_tag

        # Read the three result files in turn; an unreadable file only
        # records the gene in the matching list
        for subdir, suffix, missing in result_sources:
            try:
                read_file(spider_dir+subdir+prefix+suffix, gene_features)
            except IOError:
                missing.append(gene)

    print(secondary_struct_dict.keys())

    # If a dict for this domain already exists under domain_dicts, merge into
    # it; entries for overlapping genes are overwritten by the new ones
    previous_path = spider_dir+'/domain_dicts/'+domain+'_secondary_struct_dict.pik'
    try:
        with open(previous_path, 'rb') as handle:
            old_dict = pickle.load(handle)
        print(old_dict.keys())
        old_dict.update(secondary_struct_dict)
        secondary_struct_dict = old_dict
    except IOError:
        pass

    # The result is written under domain_dicts_diff
    target_path = spider_dir+'/domain_dicts_diff/'+domain+'_secondary_struct_dict.pik'
    with open(target_path, 'wb') as handle:
        pickle.dump(secondary_struct_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [11]:
# Print the number of skipped genes, then the number of genes with a
# missing spd3, hsa2 and hsb2 file
for count in (num_excluded, len(no_spd3), len(no_hsa2), len(no_hsb2)):
    print(count)

0
0
0
0
