In [None]:
import os
import pandas as pd
import numpy as np
import math
from collections import defaultdict
import cPickle as pickle

# Pfam release whose derived files are read below
PFAM = 'pfam-v31'

# Directories, all expressed relative to the current working directory
curr_dir = os.getcwd()
files_dir = curr_dir+'/../'
spider_dir = curr_dir+'/SPIDER2/protein_seq_results/'

# Locations of the three pickles loaded by this cell
filtered_list_path = files_dir+"5.domains_stats/"+PFAM+"/filtered10_list.pik"
all_genes_path = files_dir+"3.parse_HMMER/canonic_prot_seq/"+PFAM+"/all_domains_genes_prot_seq.pik"
diff_genes_path = spider_dir+"/../../v30-v31_diff_domains_genes_prot_seq.pik"

# List of filtered domains, sorted in place once loaded
with open(filtered_list_path, 'rb') as handle:
    filtered_domains_list10 = pickle.load(handle)
filtered_domains_list10.sort()

# Map of all genes to protein seqs
with open(all_genes_path, 'rb') as handle:
    all_genes = pickle.load(handle)

# Diff between v30 and v31
with open(diff_genes_path, 'rb') as handle:
    diff_genes = pickle.load(handle)

# The genes -> proteins -> sequence dictionary that the following cells run on
# (an alias of the diff dictionary, not a copy)
sequence_dict = diff_genes

In [None]:
# Map domain to sequences of canonic proteins
#
# Builds domain_to_protein_seq_dict[domain][gene][protein] -> cleaned sequence,
# restricted to genes that are present in sequence_dict.

domain_to_protein_seq_dict = defaultdict(dict)

for domain_name in filtered_domains_list10:
    # Load the per-domain canonic protein pickle; only its gene keys are used below.
    # NOTE(review): the canonic protein id stored for each gene is not used to pick
    # a sequence - every protein listed under the gene in sequence_dict is kept.
    # Confirm this is intended.
    with open(files_dir+"4.parse_Uniprot/domains_canonic_prot/"+PFAM+"/"+domain_name+"_canonic_prot.pik", 'rb') as handle:
        canonic_protein = pickle.load(handle)

    for gene in canonic_protein:
        # Restrict to diff dictionary
        if gene not in sequence_dict:
            continue

        # Looked up only after the check above so that a domain with no matching
        # gene never gets an (empty) entry created by the defaultdict.
        gene_to_seqs = domain_to_protein_seq_dict[domain_name]

        # No need to process a gene twice
        if gene in gene_to_seqs:
            continue

        # Clean every sequence of the gene: '*', '-' and 'X' are removed and the
        # result is upper-cased.
        # NOTE(review): '.' is replaced by a space rather than removed like the
        # other characters - looks like a typo for '', confirm before changing.
        gene_to_seqs[gene] = {
            prot: seq.replace('*','').replace('-','').replace('X','').replace('.',' ').upper()
            for prot, seq in sequence_dict[gene].items()
        }

In [None]:
# Extract features from files
def read_file(filepath,output_dict):
    """Parse a whitespace-delimited feature file and merge it into ``output_dict``.

    A line whose first character is '#' is a header: its tokens after the first
    one name the feature columns for the lines that follow. For every other
    non-blank line, the first token is converted with ``int()`` and used as the
    key into ``output_dict``; the following tokens are stored (as strings) under
    ``"<extension>_<feature>"``, where ``<extension>`` is the text after the last
    '.' in ``filepath``. The extension prefix keeps features with the same name
    but coming from different file types apart.

    Args:
        filepath: path of the file to read.
        output_dict: dict updated in place; maps the integer row key to a dict
            of feature name -> raw string value. Existing entries are extended.

    Returns:
        None. ``output_dict`` is modified in place.

    Raises:
        IOError: if the file cannot be opened.
        ValueError: if a data line appears before any header line, or if the
            first token of a data line is not an integer.
        IndexError: if a data line has fewer values than the header has features.
    """
    # Differentiate between files with same feature names
    extension = filepath.split('.')[-1]

    # Stays None until a header line has been seen
    features = None

    # Read files and save to dict
    with open(filepath,'r') as f:
        for line in f:
            tokens = line.split()

            # Blank or whitespace-only lines carry no data
            if not tokens:
                continue

            # Parse header
            if line[0] == '#':
                features = tokens[1:]
                continue

            if features is None:
                raise ValueError("data line found before any '#' header line in " + filepath)

            # Parse lines: one dict per row key, shared across calls on the same output_dict
            row = output_dict.setdefault(int(tokens[0]), {})
            for i, feature in enumerate(features):
                row[extension+"_"+feature] = tokens[i+1]


# Load genes to aa sequence mapping for each domain
# NOTE(review): this rebinds domain_to_protein_seq_dict from a pickle on disk, so any
# dict of the same name built earlier in the kernel is discarded here - confirm the
# pickle is the intended input.
with open(files_dir+"SPIDER/domain_to_protein_seq_dict.pik", 'rb') as handle:
    domain_to_protein_seq_dict = pickle.load(handle)


# For testing
num_excluded = 0
no_spd3 = []
no_hsa2 = []
no_hsb2 = []

# File extension (also the name of the results sub-directory) paired with the list
# that collects the genes whose file of that kind could not be read
missing_by_extension = (("spd3", no_spd3), ("hsa2", no_hsa2), ("hsb2", no_hsb2))

# Read files and save to dict
for domain in domain_to_protein_seq_dict:
    secondary_struct_dict = defaultdict(dict)

    for gene in domain_to_protein_seq_dict[domain]:
        if gene not in all_genes:
            num_excluded += 1
            continue

        secondary_struct_dict[gene] = {}

        # Result files are named after the gene alone, or after gene + protein
        gene_prefix = gene.replace('.','-')
        prefix = gene_prefix

        # Should only be one protein; take the first one. A gene with no protein
        # recorded simply falls back to the gene-only file name instead of failing.
        prot = next(iter(domain_to_protein_seq_dict[domain][gene]), None)
        if prot is not None:
            # Some files also include the protein name, so check if this is the case first
            full_prefix = gene_prefix + "-" + prot.replace('.','-')
            if os.path.isfile(spider_dir+"/pssm/"+full_prefix+".pssm"):
                prefix = full_prefix

        # Merge the features of each file type into the gene's dict; a file that
        # cannot be opened is recorded in the matching no_* list and skipped
        for extension, missing_genes in missing_by_extension:
            try:
                read_file(spider_dir+"/"+extension+"/"+prefix+"."+extension,secondary_struct_dict[gene])
            except IOError:
                missing_genes.append(gene)

    # Check if dict for domain already present
    # NOTE(review): the existing dict is read from domain_dicts/ while the merged
    # result is written to domain_dicts_diff/ - confirm the two directories are
    # meant to differ.
    try:
        with open(spider_dir+'/domain_dicts/'+domain+'_secondary_struct_dict.pik', 'rb') as handle:
            old_dict = pickle.load(handle)
    except IOError:
        # No readable saved dict for this domain: keep only the genes read above
        pass
    else:
        # Overwrites old info if overlap
        old_dict.update(secondary_struct_dict)
        secondary_struct_dict = old_dict

    # Save to file
    with open(spider_dir+'/domain_dicts_diff/'+domain+'_secondary_struct_dict.pik', 'wb') as handle:
        pickle.dump(secondary_struct_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Check if file missing: print, one per line, the number of genes skipped for not
# being in all_genes, then the number of genes whose .spd3, .hsa2 and .hsb2 file
# could not be read
for missing_count in (num_excluded, len(no_spd3), len(no_hsa2), len(no_hsb2)):
    print(missing_count)