diff --git a/CHANGES.txt b/CHANGES.md similarity index 100% rename from CHANGES.txt rename to CHANGES.md diff --git a/spice/data_set.py b/spice/data_set.py index 4ca28d2..06bb67f 100644 --- a/spice/data_set.py +++ b/spice/data_set.py @@ -58,7 +58,7 @@ def read_data_source(self, src_id, data_path, mapping_file=None): assert(self.proteins) ds = self.ds_dict[src_id] ds.read_data(data_path, mapping_file=mapping_file, - object_ids=self.get_protein_ids()) + object_ids=self.get_protein_ids()) self.propagate_data_source_data(ds) # TODO mapping? like in the function above @@ -97,7 +97,7 @@ def set_mutation_data(self, mutation_data): pdb_id, pdb_i) in mutation_data: if(pid in protein_dict.keys()): protein = protein_dict[pid] - + MissenseMutation(protein, pos, fr, to, label, pep, pep_i, codons, codon_fr, codons_to, pdb_id, pdb_i) ''' @@ -113,8 +113,8 @@ def set_mutation_data(self, mutation_data): mismut_list = list(mismut_tuple) mismut_list[0] = protein mismut_tuple = tuple(mismut_list) - - # create mutation object, which will imediately linked to the + + # create mutation object, which will imediately linked to the # protein object MissenseMutation.from_tuple(mismut_tuple) @@ -166,7 +166,7 @@ def mutation_f(self): class DataSource(): def __init__(self, data_set, uid, name, read_func, write_func, - set_data_func, check_funcs, data_path, mapping_file): + set_data_func, check_funcs, data_path, mapping_file): # callback data set self.data_set = data_set @@ -218,7 +218,7 @@ def read_data(self, data_path, mapping_file=None, object_ids=None): # get mapping from our uniprot ids to data source ids if(mapping_file): object_to_data = [t for t in file_io.read_tuple_list(mapping_file, - (str, str))] + (str, str))] # 'unzip' into list of mapped ids and list of data file names uni_othe_dict = dict(object_to_data) @@ -234,7 +234,7 @@ def read_data(self, data_path, mapping_file=None, object_ids=None): # set the data self.set_data(data, data_mapping=uni_othe_dict, - object_ids=object_ids) + object_ids=object_ids) # or from a single data file else: @@ -245,7 +245,7 @@ def read_data(self, data_path, mapping_file=None, object_ids=None): data_dict = dict(data) data = [(i, data_dict[uni_othe_dict[i]]) for i in object_ids] self.set_data(data, data_mapping=uni_othe_dict, - object_ids=object_ids) + object_ids=object_ids) else: self.set_data(data, object_ids=object_ids) @@ -265,7 +265,8 @@ def set_data(self, data, data_mapping=None, object_ids=None): if (any(map(func, items_to_check))): self.data = None raise ValueError('Error in %s data, contains item that %s.' % - (self.name.lower(), ' '.join(func.__name__.split('_')))) + (self.name.lower(), + ' '.join(func.__name__.split('_')))) def get_data_path(self): return(os.path.join(self.root_dir, self.data_path)) @@ -325,6 +326,7 @@ def load(self): def available(self): return True if self.data else False + # TODO store this in configuration file class DataSourceFactory(object): @@ -340,72 +342,76 @@ def __init__(self): # secondary structure sequences corresponds to the protein sequence # lengths. self.data_sources = { - 'prot_seq': ('Protein sequence', - file_io.read_fasta, file_io.write_fasta, - Protein.set_protein_sequence, - [ - sequtil.is_empty, - sequtil.is_not_an_amino_acid_sequence - ], 'protein.fsa', None), - 'orf_seq': ('ORF sequence', - file_io.read_fasta, file_io.write_fasta, - Protein.set_orf_sequence, - [ - sequtil.is_empty, - sequtil.is_not_a_nucleotide_sequence - ], 'orf.fsa', 'uni_orf.map'), - 'ss_seq': ('Secondary structure sequence', - file_io.read_fasta, file_io.write_fasta, - Protein.set_ss_sequence, - [ - sequtil.is_empty, - sequtil.is_not_a_sec_struct_sequence - ], 'ss.fsa', 'uni_ss.map'), - 'sa_seq': ('Solvent accessible sequence', - file_io.read_fasta, file_io.write_fasta, - Protein.set_sa_sequence, - [ - sequtil.is_empty, - sequtil.is_not_a_solv_access_sequence - ], 'sa.fsa', 'uni_sa.map'), - 'prot_struct': ('protein structure', - file_io.read_pdb_dir, file_io.write_pdb_dir, - Protein.set_protein_structure, - [ - ], os.path.join('structure_data', 'pdb'), 'uni_pdb.map'), - 'residue_rasa': ('residue relative accessible surface area', - file_io.read_rasa_dir, file_io.write_rasa_dir, - Protein.set_rasa, - [ - ], os.path.join('structure_data', 'rasa'), - 'uni_rasa.map'), - #'residue_rank': ('protein residue ranking', - # file_io.read_residue_rank_dir, - # file_io.write_residue_rank_dir, - # Protein.set_msa_data, - # [ - # ], os.path.join('msa_data', 'residue_rank'), - # 'uni_rank.map'), - 'msa': ('Multiple sequence alignment with homologous proteins', - file_io.read_msa_dir, - file_io.write_msa_dir, - Protein.set_msa, - [ - ], os.path.join('msa_data', 'msa'), - 'uni_msa.map'), - 'pfam': ('protein family data', - file_io.read_pfam, file_io.write_pfam, - Protein.set_pfam_annotations, - [], 'pfam.txt', None), - 'flex': ('backbone dynamics data', - file_io.read_flex, file_io.write_flex, - Protein.set_backbone_dynamics, - [], 'flex.txt', None), - 'interaction': ('interaction counts data', - file_io.read_interaction_counts, - file_io.write_interaction_counts, - Protein.set_interaction_counts, - [], 'interaction.txt', None) + 'prot_seq': ( + 'Protein sequence', + file_io.read_fasta, file_io.write_fasta, + Protein.set_protein_sequence, + [ + sequtil.is_empty, + sequtil.is_not_an_amino_acid_sequence + ], 'protein.fsa', None), + 'orf_seq': ( + 'ORF sequence', + file_io.read_fasta, file_io.write_fasta, + Protein.set_orf_sequence, + [ + sequtil.is_empty, + sequtil.is_not_a_nucleotide_sequence + ], 'orf.fsa', 'uni_orf.map'), + 'ss_seq': ( + 'Secondary structure sequence', + file_io.read_fasta, file_io.write_fasta, + Protein.set_ss_sequence, + [ + sequtil.is_empty, + sequtil.is_not_a_sec_struct_sequence + ], 'ss.fsa', 'uni_ss.map'), + 'sa_seq': ( + 'Solvent accessible sequence', + file_io.read_fasta, file_io.write_fasta, + Protein.set_sa_sequence, + [ + sequtil.is_empty, + sequtil.is_not_a_solv_access_sequence + ], 'sa.fsa', 'uni_sa.map'), + 'prot_struct': ( + 'protein structure', + file_io.read_pdb_dir, file_io.write_pdb_dir, + Protein.set_protein_structure, + [], + os.path.join('structure_data', 'pdb'), + 'uni_pdb.map'), + 'residue_rasa': ( + 'residue relative accessible surface area', + file_io.read_rasa_dir, file_io.write_rasa_dir, + Protein.set_rasa, + [], + os.path.join('structure_data', 'rasa'), + 'uni_rasa.map'), + 'msa': ( + 'Multiple sequence alignment with homologous proteins', + file_io.read_msa_dir, + file_io.write_msa_dir, + Protein.set_msa, + [ + ], os.path.join('msa_data', 'msa'), + 'uni_msa.map'), + 'pfam': ( + 'protein family data', + file_io.read_pfam, file_io.write_pfam, + Protein.set_pfam_annotations, + [], 'pfam.txt', None), + 'flex': ( + 'backbone dynamics data', + file_io.read_flex, file_io.write_flex, + Protein.set_backbone_dynamics, + [], 'flex.txt', None), + 'interaction': ( + 'interaction counts data', + file_io.read_interaction_counts, + file_io.write_interaction_counts, + Protein.set_interaction_counts, + [], 'interaction.txt', None) } # make sure that all ids are in the ids list diff --git a/spice/featext.py b/spice/featext.py index 1e5f995..ac35770 100644 --- a/spice/featext.py +++ b/spice/featext.py @@ -548,7 +548,7 @@ def available_protein_featcat_ids(self): ''' featcat_ids = set() - + for f in self.fm_protein.feature_ids: parts = f.split('_') if(len(parts) == 2): diff --git a/spice/featmat.py b/spice/featmat.py index 376931d..510c10a 100644 --- a/spice/featmat.py +++ b/spice/featmat.py @@ -311,7 +311,8 @@ def remove_features(self, feature_ids): del self.feature_matrix else: # otherwise delete columns from feature matrix - self._feature_matrix = numpy.delete(self.feature_matrix, fis, 1) + self._feature_matrix = numpy.delete(self.feature_matrix, + fis, 1) # and delete feature ids and names for fid in feature_ids: @@ -355,8 +356,8 @@ def add_custom_features(self, feature_matrix): last_cust_feat = sorted(cust_feats)[-1] print last_cust_feat print len(self.CUSTOM_FEAT_PRE) + 1 - new_cust_feat_i =\ - int(last_cust_feat[(len(self.CUSTOM_FEAT_PRE)):]) + 1 + new_cust_feat_i = int( + last_cust_feat[(len(self.CUSTOM_FEAT_PRE)):]) + 1 featvec_id = '%s%i' % (self.CUSTOM_FEAT_PRE, new_cust_feat_i) feat_ids = ['%s_%i' % (featvec_id, i) for i in xrange(num_feat)] @@ -425,7 +426,7 @@ def class_indices(self, labeling_name, class_ids): return sorted([labeling.class_names.index(c) for c in class_ids]) def get_custom_features(self): - ''' + ''' This function returns the available custom feature vector ids. Returns a dictionary with the custom feature vector ids as keys and the @@ -471,7 +472,7 @@ def get_dataset(self, feat_ids=None, labeling_name=None, class_ids=None, # map target to use 0,1,2,... as labels target_map = dict(zip(class_is, range(len(class_is)))) - + # targets are floats because liblinear classification wants this... target = numpy.array([float(target_map[t]) for t in target]) else: @@ -722,7 +723,7 @@ def save_histogram(self, feat_id, labeling_name, class_ids=None, def save_scatter(self, feat_id0, feat_id1, labeling_name=None, class_ids=None, colors=None, img_format='png', - root_dir='.', feat0_pre=None, feat1_pre=None, + root_dir='.', feat0_pre=None, feat1_pre=None, standardized=False): try: @@ -761,7 +762,7 @@ def save_scatter(self, feat_id0, feat_id1, labeling_name=None, if not(os.path.exists(d)): os.makedirs(d) out_f = os.path.join(d, 'scatter.%s' % (img_format)) - + if(standardized): # standardize data NOTE that fm is standardized before the objects # are sliced out!!! @@ -887,7 +888,7 @@ class Labeling(object): #def __init__(self, name, feature_matrix): def __init__(self, name, object_ids, labels, class_names): ''' - Is it really necesary to retain the order of the object ids? Why not + Is it really necesary to retain the order of the object ids? Why not initiate with a dict? ''' diff --git a/spice/mutation.py b/spice/mutation.py index 10cfe3d..7e52489 100644 --- a/spice/mutation.py +++ b/spice/mutation.py @@ -88,8 +88,8 @@ def pdb_resnum(self): def set_protein_data(self, protein, position, aa_from, aa_to): if not(protein.protein_sequence[position - 1] == aa_from): - raise ValueError('Amino acid %s not ' % (aa_from) +\ - 'on position %i ' % (position) +\ + raise ValueError('Amino acid %s not ' % (aa_from) + + 'on position %i ' % (position) + 'in protein %s.' % (protein.pid)) self._protein = protein @@ -112,7 +112,7 @@ def set_peptide_data(self, aa_pep, aa_pep_i): if(self.protein is None): raise ValueError('Protein data must be set.') if not(aa_pep[aa_pep_i] == self.aa_from): - raise ValueError('Amino acid on aa_pep_i in aa_pep does not ' +\ + raise ValueError('Amino acid on aa_pep_i in aa_pep does not ' + 'correspond to aa_from.') self._aa_pep = aa_pep @@ -209,7 +209,7 @@ def from_tuple(cls, tuple): mismut.set_peptide_data(tuple[5], tuple[6]) mismut.set_codon_data(tuple[7], tuple[8], tuple[9]) mismut.set_struct_data(tuple[10], tuple[11]) - return mismut + return mismut def tuple_representation(self): return (self.protein.pid, self.position, self.aa_from, self.aa_to, @@ -248,8 +248,8 @@ def signal_diff(self, scale, feature_ids=False): return (ids, names) def signal_auc(self, scale, env_window=21, sig_window=9, edge=1.0, - threshold=1.5, below_threshold=False, - feature_ids=False): + threshold=1.5, below_threshold=False, + feature_ids=False): # TODO scale num_scales = 19 diff --git a/spice/project_management.py b/spice/project_management.py index 828ede6..d6f3d75 100644 --- a/spice/project_management.py +++ b/spice/project_management.py @@ -253,13 +253,11 @@ def parse_classify_job_files(self, cl_id): assert(tokens[1] == '-f') assert(tokens[3] == '-c') cid = os.path.basename( - os.path.dirname( - os.path.dirname(tokens[4]))) + os.path.dirname(os.path.dirname(tokens[4]))) if(cid == cl_id): data_set = os.path.basename( - os.path.dirname( - os.path.dirname(tokens[2]))) + os.path.dirname(os.path.dirname(tokens[2]))) data_set_list.append(data_set) status_dirs[status] = data_set_list @@ -849,7 +847,6 @@ def run_classify(self, cl_id, project_id): settings_dict = self.get_classifier_settings(cl_id) feature_ids = settings_dict['feature_names'] - feature_cats = set() for f in feature_ids: fparts = f.split('_') @@ -880,7 +877,7 @@ def run_classify(self, cl_id, project_id): time.sleep(2) # store path to feature matrix dir - fm_dir = self.fm_dir + fm_dir = self.fm_dir # SWITCH BACK TO ORIGINAL PROJECT self.set_project(prev_proj) @@ -893,7 +890,7 @@ def run_classify(self, cl_id, project_id): # output files progress_f = os.path.join(out_d, 'progress.txt') error_f = os.path.join(out_d, 'error.txt') - + # create the list of options for the classification command options = [ '-f %s' % (fm_dir), @@ -908,4 +905,3 @@ def run_classify(self, cl_id, project_id): fout.write('%s\n' % (cmd)) fout.write('%s\n' % (progress_f)) fout.write('%s\n' % (error_f)) -