In [10]:
import pandas as pd
import numpy as np
import pylab as pl
import pickle
from IPython.core.display import HTML
# Display-only tweak: inject CSS that stretches the notebook's ".container" element to 100% width.
# As the last expression of the cell, the HTML object is rendered as the cell's output.
HTML("<style>.container { width:100% !important; }</style>")

In [11]:
# IPython shell capture: the output of `pwd` is returned as a list of lines,
# so curr_dir[0] is the directory the notebook is running in.
curr_dir = !pwd
# Pfam release number; it is embedded in the versioned directory names built from it.
pfam_version = "31"
# Directory read for the per-domain tables (one "<domain>.csv" file per domain).
domains_path = curr_dir[0]+"/../3.parse_HMMER/hmm_domains/pfam-v"+pfam_version+"/"
# Directory read for the per-domain "<domain>_canonic_prot.pik" pickles.
canonic_prot_path = curr_dir[0]+"/../4.parse_Uniprot/domains_canonic_prot/pfam-v"+pfam_version+"/"

In [12]:
def count_domain_instances(domain_gene_table, count_overlaps=True):
    """Count the domain instances listed in a per-gene hits table.

    Args:
        domain_gene_table: DataFrame with one row per domain hit. The
            "TargetStart" and "TargetEnd" columns are only read when
            count_overlaps is False.
        count_overlaps: when True, every row counts as an instance. When
            False, a row only counts if its start lies strictly after the
            furthest end seen so far, so a run of overlapping rows counts once.

    Returns:
        The number of instances as an int.

    Note:
        Rows are scanned in table order and the function does not sort them;
        the overlap-collapsing mode expects rows ordered by "TargetStart".
    """
    if count_overlaps:
        return domain_gene_table.shape[0]

    n_instances = 0
    covered_until = 0  # furthest target end reached by the current run of hits

    for _, hit in domain_gene_table.iterrows():
        hit_start = int(hit["TargetStart"])
        hit_end = int(hit["TargetEnd"])

        if hit_start > covered_until:
            # The hit starts past everything seen so far: a new instance
            n_instances += 1
            covered_until = hit_end
        elif hit_end > covered_until:
            # The hit overlaps the current run: extend the run, do not count it
            covered_until = hit_end

    return n_instances

### Counting the number of genes and number of instances per domain

In [None]:
domains_stats = {}
domains_filenames = !ls $domains_path

for dom_filename in domains_filenames:
    curr_domain_stats = []
    
    domain_sym = dom_filename[:dom_filename.find(".")]
    domain_data = pd.read_csv(domains_path+dom_filename, sep='\t', index_col=0)
    domain_ens_genes = (domain_data["gene"]).unique()
    
    with open(canonic_prot_path+domain_sym+"_canonic_prot.pik", 'rb') as handle:
        canonic_protein = pickle.load(handle)
    
    domain_intance_num = 0
    for ens_gene in domain_ens_genes:
        #Filtering the domain data for this gene according to the canonical protein id
        canonic_prot = canonic_protein[ens_gene]
        canonic_prot_t = canonic_prot[:canonic_prot.find(".")] #Trimming the ".#" at the end
        domain_gene_table = domain_data[domain_data["prot"] == canonic_prot]
        
        #Count the number of domain instances in this gene
        domain_gene_table = domain_gene_table.sort_values(by=["TargetStart", "BitScore"], ascending=[True, False])
        gene_intance_num = count_domain_instances(domain_gene_table, count_overlaps=True)
        domain_intance_num += gene_intance_num
    
    #Saving domains stats:
    curr_domain_stats.append(len(domain_ens_genes)) # No. of genes
    curr_domain_stats.append(domain_intance_num) #No. of domain instances
    
    #Updating the big dict
    domains_stats[domain_sym] = curr_domain_stats

In [6]:
#Saving in dictionary format: pickles the domains_stats dict under the
#"pfam-v<version>" directory next to the notebook (the directory must already exist,
#since open() does not create it)
with open(curr_dir[0]+"/pfam-v"+pfam_version+"/domains_stats_dict.pik", 'wb') as handle:
    pickle.dump(domains_stats, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [7]:
#Saving in data-frame format
# One row per domain (dict keys become the index); the two list entries become the columns.
domains_stats_df = pd.DataFrame.from_dict(domains_stats,orient='index')
domains_stats_df.columns = ["genes", "instances"]
# Most frequent domains first; ties on "instances" are broken by the number of genes.
domains_stats_df = domains_stats_df.sort_values(by=["instances", "genes"], ascending=[False, False])
# Written tab-separated despite the ".csv" extension.
domains_stats_df.to_csv(curr_dir[0]+"/pfam-v"+pfam_version+"/domains_stats_df.csv", sep='\t')

In [4]:
# Reload the saved per-domain stats table (tab-separated; the first column is the index),
# so the cells below can run without recomputing the statistics.
domains_stats_df = pd.read_csv(curr_dir[0]+"/pfam-v"+pfam_version+"/domains_stats_df.csv", sep='\t', index_col=0)

### Counting the number of genes covered

In [5]:
# Union of the gene ids found in the tables of every domain listed in domains_stats_df
domain_all_names = domains_stats_df.index.tolist()
domain_all_genes = set()

for dom_name in domain_all_names:
    # One tab-separated table per domain, named "<domain>.csv"
    domain_data = pd.read_csv(domains_path+dom_name+".csv", sep='\t', index_col=0)
    domain_ens_genes = (domain_data["gene"]).unique()
    # Collecting straight into a set de-duplicates genes shared between domains
    domain_all_genes.update(domain_ens_genes)

# print() in call form works on both Python 2 and Python 3 for a single argument
print(len(domain_all_genes))

19823


In [6]:
# Load the stats table of the domains kept by the 10-instance filter.
# NOTE(review): "filtered10_domains_df.csv" is only written by the filtering cell further
# down this notebook, so on a fresh top-to-bottom run the file must already exist.
domain_stats_df_10 = pd.read_csv(curr_dir[0]+"/pfam-v"+pfam_version+"/filtered10_domains_df.csv", sep='\t', index_col=0)

In [8]:
# Union of the gene ids found in the tables of the domains listed in domain_stats_df_10
domain_10_names = domain_stats_df_10.index.tolist()
domain_10_genes = set()

for dom_name in domain_10_names:
    # One tab-separated table per domain, named "<domain>.csv"
    domain_data = pd.read_csv(domains_path+dom_name+".csv", sep='\t', index_col=0)
    domain_ens_genes = (domain_data["gene"]).unique()
    # Collecting straight into a set de-duplicates genes shared between domains
    domain_10_genes.update(domain_ens_genes)

# print() in call form works on both Python 2 and Python 3 for a single argument
print(len(domain_10_genes))

13254


In [12]:
# Persist the names of all domains (the index of the stats table).
# NOTE(review): despite the name, this is a pandas Index object rather than a plain list
# (no .tolist() call), so reading the pickle back requires pandas - confirm that
# downstream readers expect an Index.
all_domains_list = domains_stats_df.index
with open(curr_dir[0]+"/pfam-v"+pfam_version+"/all_domains_list.pik", 'wb') as handle:
    pickle.dump(all_domains_list, handle, protocol=pickle.HIGHEST_PROTOCOL)

### Creating datasets filtered by number of instances

In [19]:
#Keep only the domains with strictly more than `instance_threshold` instances
instance_threshold = 10
filtered_domains_df = domains_stats_df[domains_stats_df["instances"] > instance_threshold]

#Save to file (tab-separated); the threshold value is embedded in the file name
filtered_domains_df.to_csv(curr_dir[0]+"/pfam-v"+pfam_version+"/filtered"+str(instance_threshold)+"_domains_df.csv", sep='\t')

In [20]:
#Names of the domains whose instance count (second entry of the stats list)
#is strictly above the threshold
filtered_domains_list = [
    domain for domain in domains_stats.keys()
    if (domains_stats[domain][1] > instance_threshold)
]

#Pickling the list; the threshold value is embedded in the file name
with open(curr_dir[0]+"/pfam-v"+pfam_version+"/filtered"+str(instance_threshold)+"_list.pik", 'wb') as handle:
    pickle.dump(filtered_domains_list, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [22]:
#Counting the number of genes covered in the filtered datasets
domain_filtered_names = filtered_domains_df.index.tolist()
domain_filtered_genes = set()

for dom_name in domain_filtered_names:
    # One tab-separated table per domain, named "<domain>.csv"
    domain_data = pd.read_csv(domains_path+dom_name+".csv", sep='\t', index_col=0)
    domain_ens_genes = (domain_data["gene"]).unique()
    # Collecting straight into a set de-duplicates genes shared between domains
    domain_filtered_genes.update(domain_ens_genes)

# print() in call form works on both Python 2 and Python 3 for a single argument
print(len(domain_filtered_genes))

13254


In [5]:
# Load the pickled object stored in "domains_binding_dict.pik" from the sibling
# binding_score directory.
# pickle.load can execute arbitrary code: only load pickles produced by this pipeline.
with open(curr_dir[0]+"/../binding_score/domains_binding_dict.pik", 'rb') as handle:
    binding_scores_dict = pickle.load(handle)

In [59]:
# Display the names of the domains with at least 100 instances.
# NOTE(review): the cutoff is a hard-coded 100 and inclusive (>=), unlike the strict
# "> instance_threshold" filter used to build the filtered datasets - confirm this is intended.
domains_stats_df[domains_stats_df["instances"] >= 100].index

Index([u'zf-C2H2', u'zf-H2C2_2', u'ig', u'Cadherin', u'I-set', u'7tm_1',
       u'fn3', u'Ig_3', u'Ig_2', u'EGF', u'Ank', u'Collagen', u'Ank_4',
       u'LRR_8', u'Ank_5', u'Pkinase', u'Ank_3', u'Ank_2', u'Pkinase_Tyr',
       u'7tm_4', u'EGF_CA', u'KRAB', u'V-set', u'RRM_1', u'Sushi', u'WD40',
       u'EGF_2', u'DUF1220', u'zf-C3HC4', u'Homeobox', u'Spectrin', u'Kelch_1',
       u'PH', u'Keratin_B2_2', u'SH3_2', u'PDZ', u'SH3_1', u'C1-set', u'cEGF',
       u'zf-RING_UBOX', u'zf-RING_2', u'zf-C3HC4_2', u'SH3_9', u'zf-C2H2_6',
       u'zf-C2H2_4', u'Ldl_recept_a', u'zf-C3HC4_3', u'C2', u'Laminin_EGF',
       u'hEGF', u'zf-RING_5', u'LRR_4', u'Ion_trans', u'EF-hand_1', u'TSP_1',
       u'LIM', u'Nebulin', u'Ras', u'Mito_carr', u'FXa_inhibition', u'Roc',
       u'CUB', u'BTB', u'7TM_GPCR_Srsx', u'Lectin_C', u'Kelch_3',
       u'Ldl_recept_b', u'Arm', u'Trypsin', u'zf-C3HC4_4', u'TPR_1', u'IQ',
       u'Helicase_C', u'Plectin', u'Kelch_6', u'Kelch_2', u'EF-hand_7',
       u'zf-B_box', u'PH

### Training dataset stats

In [8]:
# Reading training dataset
# The date string is part of the features file name and selects which export is read.
datafile_date = "08.06.18"
input_path = curr_dir[0]+"/../10.Prediction/domains_similarity/filtered_features_table/"
filename = "windowed_positions_features_mediode_filter_"+datafile_date+".csv"

#Features table (tab-separated; the first column is the index)
features_all = pd.read_csv(input_path+filename, sep='\t', index_col=0)
# Distinct values of the "domain_name" column, as a plain list
training_domains = features_all["domain_name"].unique().tolist()

In [13]:
#Counting number of genes covered by the domains of the training dataset
training_domain_genes = set()

for dom_name in training_domains:
    # One tab-separated table per domain, named "<domain>.csv"
    domain_data = pd.read_csv(domains_path+dom_name+".csv", sep='\t', index_col=0)
    domain_ens_genes = (domain_data["gene"]).unique()
    # Collecting straight into a set de-duplicates genes shared between domains
    training_domain_genes.update(domain_ens_genes)

# print() in call form works on both Python 2 and Python 3 for a single argument
print(len(training_domain_genes))

9515
