Skip to content
This repository
Browse code

Added beginnings of a martel grammer for hmmpfam

  • Loading branch information...
commit 21c051398189a34a54cd2c82cb520cf216fee854 1 parent b3309fd
authored February 16, 2004

Showing 1 changed file with 217 additions and 0 deletions. Show diff stats Hide diff stats

  1. 217  Bio/expressions/hmmpfam.py
217  Bio/expressions/hmmpfam.py
... ...
@@ -0,0 +1,217 @@
  1
+"""Martel expression for the hmmpfam database search program in hmmer.
  2
+
  3
+This has been tested with version 2.2g. I should also make it work with
  4
+2.1.1 output.
  5
+
  6
+XXX This isn't completely finished and doesn't do everything quite right.
  7
+The main two problems being that it isn't well tested for a wide variety of
  8
+outputs and that the family line is not parsed into it's respective parts 
  9
+(see multitude of comments on this below).
  10
+"""
  11
+from Martel import *
  12
+from Martel import RecordReader
  13
+from Bio import Std
  14
+
  15
+# -- header
  16
+# hmmpfam - search one or more sequences against HMM database
  17
+program_description = (Std.application_name(Str("hmmpfam")) + 
  18
+                       ToEol())
  19
+
  20
+# HMMER 2.2g (August 2001)
  21
+program_version = (Str("HMMER ") +
  22
+                   Std.application_version(Re(r"\d\.\d\w") |
  23
+                                           Re(r"\d\.\d\.\d")) +
  24
+                   ToEol())
  25
+
  26
+# Copyright (C) 1992-2001 HHMI/Washington University School of Medicine
  27
+# Freely distributed under the GNU General Public License (GPL)
  28
+
  29
+copyright = (ToEol() +
  30
+             ToEol())
  31
+
  32
+# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  33
+# HMM file:                 /data/hmm/Pfam_fs
  34
+# Sequence file:            hmmpfam_test.fasta
  35
+# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
  36
+         
  37
+files = (ToEol() +
  38
+         Str("HMM file:") + Spaces() +
  39
+         Std.database_name(UntilEol()) + AnyEol() +
  40
+         Str("Sequence file:") + Spaces() +
  41
+         Group("inputfile_name", UntilEol()) + AnyEol() +
  42
+         ToEol())
  43
+
  44
+header = Std.search_header(program_description + program_version +
  45
+                           copyright + files)
  46
+
  47
+# -- record
  48
+
  49
+#
  50
+# Query sequence: AT1G01040
  51
+# Accession:      [none]
  52
+# Description:    Test Sequence for hmmpfam
  53
+
  54
+sequence_info = (ToEol() + # blank line
  55
+                 Str("Query sequence:") + Spaces() +
  56
+                 Group("query_name", UntilEol()) + AnyEol() +
  57
+                 Str("Accession:") + Spaces() +
  58
+                 Group("query_accession", UntilEol()) + AnyEol() +
  59
+                 Str("Description:") + Spaces() +
  60
+                 Std.query_description(UntilEol()) + AnyEol())
  61
+
  62
+# generic name for models
  63
+# most model names are something like PFwhatever, but we also have some
  64
+# different kinds like AAA and Sigma54_activat
  65
+model_name = Re(r"[\w-]+")
  66
+
  67
+#
  68
+# Scores for sequence family classification (score includes all domains):
  69
+# Model          Description                              Score    E-value  N 
  70
+# --------       -----------                              -----    ------- ---
  71
+
  72
+family_header = (ToEol() + # blank line
  73
+                 Str("Scores") + ToEol() +
  74
+                 Str("Model") + ToEol() +
  75
+                 Str("-----") + ToEol())
  76
+
  77
+# family lines can either be a hit:
  78
+# PF00636        RNase3 domain                            354.9   1.1e-118   2
  79
+# or no hit:
  80
+#         [no hits above thresholds]
  81
+
  82
+no_hit_line = (Spaces() + Str("[no hits above thresholds]") + AnyEol())
  83
+
  84
+family_hit_line = (Group("family_model", model_name) + Spaces() +
  85
+                   # XXX This is a pain in the ass, so I gave up
  86
+                   # We need to match descriptions but stop when we get to the
  87
+                   # score value.
  88
+                   # This is very difficult since the description can contain
  89
+                   # letters, numbers, (, ), /, - and periods. The numbers and
  90
+                   # periods leave me unable to think of a way to distinguish
  91
+                   # a "description word" from a score word.
  92
+                   # You also can't use spaces as descriptions can have 2
  93
+                   # spaces in them (not just one separating every word) and
  94
+                   # the description to score value can also be separated by
  95
+                   # only 2 spaces.
  96
+                   # I can't for the life of me figure this so I just
  97
+                   # eat the rest of the line for right now and downstream
  98
+                   # apps will have to get the information out from there.
  99
+                    ToEol("family_information"))
  100
+                   #Group("family_description",
  101
+                   #    Rep1(Re(r"[a-zA-Z0-9_()/-]+") +
  102
+                   #         Alt(Str("  "), Str(" ")))) +
  103
+                   # Alt(Spaces()) +
  104
+                   # Float("family_score") + Spaces() +
  105
+                   # Float("family_evalue") + Spaces() +
  106
+                   # Integer("family_n") + AnyEol())
  107
+
  108
+#
  109
+# Parsed for domains:
  110
+# Model          Domain  seq-f seq-t    hmm-f hmm-t      score  E-value
  111
+# --------       ------- ----- -----    ----- -----      -----  -------
  112
+domain_header = (ToEol() + # blank line
  113
+                 Str("Parsed for domains:") + AnyEol() +
  114
+                 Str("Model") + ToEol() +
  115
+                 Str("-----") + ToEol())
  116
+
  117
+# domain lines can also be either a hit:
  118
+# PF00270          1/1     239   382 ..     1   141 [.    30.3  2.2e-09
  119
+# or not hit:
  120
+#         [no hits above thresholds]
  121
+
  122
+# I have no idea what these things exactly mean, but parse 'em anyways
  123
+symbol_forward = Str(".") | Str("[")
  124
+symbol_reverse = Str(".") | Str("]")
  125
+match_symbols = symbol_forward + symbol_reverse
  126
+
  127
+domain_hit_line = (Group("domain_model", model_name) + Spaces() +
  128
+                   Group("domain_domain", Integer() + Str("/") + Integer()) +
  129
+                   Spaces() +
  130
+                   Integer("domain_seq-f") + Spaces() +
  131
+                   Integer("domain_seq-r") + Spaces() +
  132
+                   Group("domain_seq_symbols", match_symbols) + Spaces() +
  133
+                   Integer("domain_hmm-f") + Spaces() +
  134
+                   Integer("domain_hmm-t") + Spaces() +
  135
+                   Group("domain_hmm_symbols", match_symbols) + Spaces() +
  136
+                   Float("domain_score") + Spaces() +
  137
+                   Float("domain_evalue") + AnyEol())
  138
+
  139
+# 
  140
+# Alignments of top-scoring domains:
  141
+alignment_header = (ToEol() + # empty line
  142
+                    Str("Alignments of top-scoring domains:") +
  143
+                    AnyEol())
  144
+
  145
+# PF00271: domain 1 of 1, from 686 to 767: score 58.6, E = 8.9e-17
  146
+#                    *->ell.epgikvarlhG.....glsqeeReeilekFrngkskvLvaTdv
  147
+#                       el ++  i++a++ G+++++++++++ ++++ kFr+g + +LvaT+v
  148
+#    AT1G01040   686    ELPsLSFIRCASMIGhnnsqEMKSSQMQDTISKFRDGHVTLLVATSV 732  
  149
+# 
  150
+#                    aarGlDipdvnlVInydlpwnpesyiQRiGRaGRaG<-*
  151
+#                    a++GlDi ++n+V  +dl +++  yiQ +GR +R     
  152
+#    AT1G01040   733 AEEGLDIRQCNVVMRFDLAKTVLAYIQSRGR-ARKP    767
  153
+#
  154
+domain_align_header = (Group("dalign_name", model_name) +
  155
+                       Str(": domain ") +
  156
+                       Integer("dalign_of_domain") +
  157
+                       Str(" of ") +
  158
+                       Integer("dalign_total_domain") +
  159
+                       Str(", from ") +
  160
+                       Integer("dalign_domain_start") +
  161
+                       Str(" to ") +
  162
+                       Integer("dalign_domain_end") +
  163
+                       Str(": score ") +
  164
+                       Float("dalign_score") +
  165
+                       Str(", E = ") +
  166
+                       Float("dalign_evalue") +
  167
+                       AnyEol())
  168
+
  169
+# occasionally we have a line like:
  170
+#                 RF xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
  171
+# preceeding the alignment. I'm not sure what this means.
  172
+rf_line = (Spaces() + Str("RF ") + ToEol())
  173
+
  174
+domain_align_top = (Spaces() +
  175
+                    UntilEol("dalign_match_top") + AnyEol())
  176
+
  177
+domain_align_middle = (Spaces() +
  178
+                       UntilEol("dalign_match_middle") + AnyEol())
  179
+
  180
+domain_align_bottom = (Spaces() +
  181
+                       ToSep("dalign_query_name", " ") + Spaces() +
  182
+                       # some match ends can have:
  183
+                       # AT1G01230     -   -
  184
+                       # instead of an actual line, so there are fixes
  185
+                       # in here for that messed up case
  186
+                       Alt(Integer("dalign_query_start") + Spaces() +
  187
+                           Group("dalign_match_bottom",
  188
+                               Re("[\w\-]+")) + Spaces() +
  189
+                           Integer("dalign_query_end") +
  190
+                           Spaces() + AnyEol(),
  191
+                           Str("- ") + ToEol()) +
  192
+                       ToEol()) # blank line
  193
+
  194
+domain_alignment = (domain_align_header +
  195
+                    Rep1(Opt(rf_line) +
  196
+                         domain_align_top +
  197
+                         domain_align_middle +
  198
+                         domain_align_bottom))
  199
+
  200
+# //
  201
+record_end = Str("//") + AnyEol()
  202
+
  203
+
  204
+record = Std.record(Rep(sequence_info +
  205
+                        family_header +
  206
+                        (no_hit_line | Rep1(family_hit_line)) +
  207
+                        domain_header +
  208
+                        (no_hit_line | Rep1(domain_hit_line)) +
  209
+                        alignment_header +
  210
+                        (no_hit_line | Rep1(domain_alignment)) +
  211
+                        record_end
  212
+                    ))
  213
+
  214
+format = HeaderFooter("hmmpfam", {},
  215
+                      header, RecordReader.CountLines, (8,),
  216
+                      record, RecordReader.EndsWith, ("//\n",),
  217
+                      None, None, None)

0 notes on commit 21c0513

Please sign in to comment.
Something went wrong with that request. Please try again.