Find file
Fetching contributors…
Cannot retrieve contributors at this time
executable file 182 lines (160 sloc) 13.2 KB
#!/usr/bin/perl
# PROGRAM : simplealign.pl
# PURPOSE : Simple driver for Bio::SimpleAlign
# AUTHOR : Ewan Birney birney@sanger.ac.uk
# CREATED : Tue Oct 27 1998
#
# INSTALLATION
# If you have installed bioperl using the standard
# makefile system everything should be fine and
# dandy.
#
# if not edit the use lib "...." line to point the directory
# containing your Bioperl modules.
#
#use lib "/nfs/disk21/birney/prog/bioperl/";
# Modified 3/5/01 to use AlignIO by Peter Schattner schattner@alum.mit.edu
#
# This uses the internal DATA stream (past the end of this
# file, on the __END__ tag) to load in the data. We then
# do some reformats, sort in a different way and a quick
# getting into the alignment. All pretty simple ;)
#
#
# The simplealign module does not do the following things
# a) give you sensible ways of asking if residues are a
# column of gaps or conservation
# b) provide ways of editing the alignment
# c) making alignments
#
#
# a) and b) are probably best done by UnivAlign from Georg Fuellen
# c) is done for pairwise alignments in Bio::Tools::pSW; and
# also you can read in stuff from programs like clustal and hmmer
# into this.
#
use strict;
use Bio::SimpleAlign;
use Bio::AlignIO;
# read from a stream
my $str = Bio::AlignIO->newFh('-fh'=> \*DATA, '-format' => 'pfam' );
my $al = <$str>;
# write out a MSF file
my $out = Bio::AlignIO->newFh('-fh'=> \*STDOUT, '-format' => 'msf');
my $status = print $out $al;
# order by alphabetically then start end
$al->sort_alphabetically();
# write in Pfam format now...
my $out2=Bio::AlignIO->newFh( '-fh'=> \*STDOUT, '-format' => 'pfam');
$status = print $out2 $al;
# now set the display name to be
# name_# like roa1_human_1, roa1_human_2 etc
# This **doesn't** change the underlying names of the
# sequences you'll be glad to hear.
$al->set_displayname_count();
# dump again... bored of this yet?
$status = print $out2 $al;
# get into the alignment and get things out
# we just want to see how many unique names
# there are in this alignment
my ($seq, $id, %hash) ;
# loop over the alignment
foreach $seq ( $al->eachSeq() ) {
# increment a hash on the name by one each time
$hash{$seq->id()}++;
}
# disgorge the hash
foreach $id ( keys %hash ) {
print "$id has $hash{$id} subsequences in this alignment\n";
}
__END__
GR10_BRANA/8-79 CFVGGL......AWATGDAELERTFS.....Q.FGEV..IDSKIIND.............RETGRSRGFGFVTFKDEKSMKDAIDEMNG.K...ELDGRTITV
HUD_HUMAN/48-119 LIVNYL......PQNMTQEEFRSLFG.....S.IGEI..ESCKLVRD.............KITGQSLGYGFVNYIDPKDAEKAINTLNG.L...RLQTKTIKV
IF32_SCHPO/41-124 VVIEGAP....VVEEAKQQDFFRFLSSKVLAK.IGKVKENGFYMPFE.........EKNGK..KMSLGLVFADFENVDGADLCVQELDGKQ...ILKNHTFVV
IF32_YEAST/79-157 IVVNGAPVIPSAKVPVLKKALTSLFS.....K.AGKV..VNMEFPID.............EATGKTKGFLFVECGSMNDAKKIIKSFHGKR...LDLKHRLFL
IF4B_HUMAN/98-168 AFLGNL......PYDVTEESIKEFFR.....G.LNIS...AVRLPR............EPSNPERLKGFGYAEFEDLDSLLSALS.LNE.E...SLGNRRIRV
LA_DROME/151-225 AYAKGF......PLDSQISELLDFTA.....N.YDKV..VNLTMRNS.........YDKPTKSYKFKGSIFLTFETKDQAKAFLE.QEK.I...VYKERELLR
LA_HUMAN/113-182 VYIKGF......PTDATLDDIKEWLE.....D.KGQV..LNIQMRR..............TLHKAFKGSIFVVFDSIESAKKFVE.TPG.Q...KYKETDLLI
MEI2_SCHPO/197-265 LFVTNL......PRIVPYATLLELFS.....K.LGDV..KGIDTSSL.................STDGICIVAFFDIRQAIQAAKSLRSQR...FFNDRLLYF
MODU_DROME/177-246 VFVTNL......PNEYLHKDLVALFA.....K.FGRL..SALQRFTN................LNGNKSVLIAFDTSTGAEAVLQAKPKAL...TLGDNVLSV
MODU_DROME/260-326 VVVGLI......GPNITKDDLKTFFE.....K.VAPV..EAVTISSN.................RLMPRAFVRLASVDDIPKALK.LHS.T...ELFSRFITV
MODU_DROME/342-410 LVVENVG....KHESYSSDALEKIFK.....K.FGDV..EEIDVVC..................SKAVLAFVTFKQSDAATKALAQLDG.K...TVNKFEWKL
MODU_DROME/422-484 ILVTNL......TSDATEADLRKVFN.....D.SGEI..ESIIMLG.....................QKAVVKFKDDEGFCKSFL.ANE.S...IVNNAPIFI
MSSP_HUMAN/31-102 LYIRGL......PPHTTDQDLVKLCQ.....P.YGKI..VSTKAILD.............KTTNKCKGYGFVDFDSPAAAQKAVSALKA.S...GVQAQKAKQ
NAM8_YEAST/165-237 IFVGDL......APNVTESQLFELFI.....NRYAST..SHAKIVHD.............QVTGMSKGYGFVKFTNSDEQQLALSEMQG.V...FLNGRAIKV
NONA_DROME/304-369 LYVGNL......TNDITDDELREMFK.....P.YGEI..SEIFSNLD...................KNFTFLKVDYHPNAEKAKRALDG.S...MRKGRQLRV
NONA_DROME/378-448 LRVSNL......TPFVSNELLYKSFE.....I.FGPI..ERASITVD..............DRGKHMGEGIVEFAKKSSASACLRMCNE.K...CFFLTASLR
NOP3_YEAST/127-190 LFVRPF......PLDVQESELNEIFG.....P.FGPM..KEVKILN.....................GFAFVEFEEAESAAKAIEEVHG.K...SFANQPLEV
NOP3_YEAST/202-270 ITMKNL......PEGCSWQDLKDLAR.....E.NSLE..TTFSSVN................TRDFDGTGALEFPSEEILVEALERLNN.I...EFRGSVITV
NOP4_YEAST/28-98 LFVRSI......PQDVTDEQLADFFS.....N.FAPI..KHAVVVKD..............TNKRSRGFGFVSFAVEDDTKEALAKARK.T...KFNGHILRV
NOP4_YEAST/292-363 VFVRNV......PYDATEESLAPHFS.....K.FGSV..KYALPVID.............KSTGLAKGTAFVAFKDQYTYNECIKNAPA.A...GSTSLLIGD
NSR1_YEAST/170-241 IFVGRL......SWSIDDEWLKKEFE.....H.IGGV..IGARVIYE.............RGTDRSRGYGYVDFENKSYAEKAIQEMQG.K...EIDGRPINC
NSR1_YEAST/269-340 LFLGNL......SFNADRDAIFELFA.....K.HGEV..VSVRIPTH.............PETEQPKGFGYVQFSNMEDAKKALDALQG.E...YIDNRPVRL
NUCL_CHICK/283-352 LFVKNL......TPTKDYEELRTAIK.....EFFGKK...NLQVSEV..............RIGSSKRFGYVDFLSAEDMDKALQ.LNG.K...KLMGLEIKL
PABP_DROME/4-75 LYVGDL......PQDVNESGLFDKFS.....S.AGPV..LSIRVCRD.............VITRRSLGYAYVNFQQPADAERALDTMNF.D...LVRNKPIRI
PABP_DROME/92-162 VFIKNL......DRAIDNKAIYDTFS.....A.FGNI..LSCKVATD..............EKGNSKGYGFVHFETEEAANTSIDKVNG.M...LLNGKKVYV
PABP_DROME/183-254 VYVKNF......TEDFDDEKLKEFFE.....P.YGKI..TSYKVMS..............KEDGKSKGFGFVAFETTEAAEAAVQALNGKD...MGEGKSLYV
PABP_SCHPO/249-319 VYIKNL......DTEITEQEFSDLFG.....Q.FGEI..TSLSLVKD..............QNDKPRGFGFVNYANHECAQKAVDELND.K...EYKGKKLYV
PES4_YEAST/93-164 LFIGDL......HETVTEETLKGIFK.....K.YPSF..VSAKVCLD.............SVTKKSLGHGYLNFEDKEEAEKAMEELNY.T...KVNGKEIRI
PES4_YEAST/305-374 IFIKNL......PTITTRDDILNFFS.....E.VGPI..KSIYLSN...............ATKVKYLWAFVTYKNSSDSEKAIKRYNN.F...YFRGKKLLV
PR24_YEAST/43-111 VLVKNL......PKSYNQNKVYKYFK.....H.CGPI..IHVDVAD...............SLKKNFRFARIEFARYDGALAAIT.KTH.K...VVGQNEIIV
PR24_YEAST/119-190 LWMTNF......PPSYTQRNIRDLLQ.....D.INVV.ALSIRLPSL..............RFNTSRRFAYIDVTSKEDARYCVEKLNG.L...KIEGYTLVT
PR24_YEAST/212-284 IMIRNL.....STELLDENLLRESFE.....G.FGSI..EKINIPAG............QKEHSFNNCCAFMVFENKDSAERALQ.MNR.S...LLGNREISV
PSF_HUMAN/373-443 LSVRNL......SPYVSNELLEEAFS.....Q.FGPI..ERAVVIVD..............DRGRSTGKGIVEFASKPAARKAFERCSE.G...VFLLTTTPR
PTB_HUMAN/61-128 IHIRKL......PIDVTEGEVISLGL.....P.FGKV..TNLLMLKG...................KNQAFIEMNTEEAANTMVN.YYT.SVTPVLRGQPIYI
PTB_HUMAN/186-253 IIVENL......FYPVTLDVLHQIFS.....K.FGTV....LKIIT...............FTKNNQFQALLQYADPVSAQHAKLSLDG.Q...NIYNACCTL
PUB1_YEAST/76-146 LYVGNL......DKAITEDILKQYFQ.....V.GGPI..ANIKIMID..............KNNKNVNYAFVEYHQSHDANIALQTLNG.K...QIENNIVKI
PUB1_YEAST/163-234 LFVGDL......NVNVDDETLRNAFK.....D.FPSY..LSGHVMWD.............MQTGSSRGYGFVSFTSQDDAQNAMDSMQG.Q...DLNGRPLRI
PUB1_YEAST/342-407 AYIGNI......PHFATEADLIPLFQ.....N.FGFI..LDFKHYPE...................KGCCFIKYDTHEQAAVCIVALAN.F...PFQGRNLRT
RB97_DROME/34-104 LFIGGL......APYTTEENLKLFYG.....Q.WGKV..VDVVVMRD.............AATKRSRGFGFITYTKSLMVDRAQE..NRPH...IIDGKTVEA
RN12_YEAST/200-267 IVIKFQ......GPALTEEEIYSLFR.....R.YGTI....IDIFP...............PTAANNNVAKVRYRSFRGAISAKNCVSG.I...EIHNTVLHI
RN15_YEAST/20-91 VYLGSI......PYDQTEEQILDLCS.....N.VGPV..INLKMMFD.............PQTGRSKGYAFIEFRDLESSASAVRNLNG.Y...QLGSRFLKC
RNP1_YEAST/37-109 LYVGNL......PKNCRKQDLRDLFE.....PNYGKI..TINMLKKK.............PLKKPLKRFAFIEFQEGVNLKKVKEKMNG.K...IFMNEKIVI
RO28_NICSY/99-170 LFVGNL......PYDIDSEGLAQLFQ.....Q.AGVV..EIAEVIYN.............RETDRSRGFGFVTMSTVEEADKAVELYSQ.Y...DLNGRLLTV
RO33_NICSY/116-187 LYVGNL......PFSMTSSQLSEIFA.....E.AGTV..ANVEIVYD.............RVTDRSRGFAFVTMGSVEEAKEAIRLFDG.S...QVGGRTVKV
RO33_NICSY/219-290 LYVANL......SWALTSQGLRDAFA.....D.QPGF..MSAKVIYD.............RSSGRSRGFGFITFSSAEAMNSALDTMNE.V...ELEGRPLRL
ROA1_BOVIN/106-176 IFVGGI......KEDTEEHHLRDYFE.....Q.YGKI..EVIEIMTD.............RGSGKKRGFAFVTFDDHDSVDKIVI.QKY.H...TVNGHNCEV
ROC_HUMAN/18-82 VFIGNL.....NTLVVKKSDVEAIFS.....K.YGKI..VGCSVHK.....................GFAFVQYVNERNARAAVAGEDG.R...MIAGQVLDI
ROF_HUMAN/113-183 VRLRGL......PFGCTKEEIVQFFS.....G.LEIV.PNGITLPVD..............PEGKITGEAFVQFASQELAEKALG.KHK.E...RIGHRYIEV
ROG_HUMAN/10-81 LFIGGL......NTETNEKALEAVFG.....K.YGRI..VEVLLMKD.............RETNKSRGFAFVTFESPADAKDAARDMNG.K...SLDGKAIKV
RT19_ARATH/33-104 LYIGGL......SPGTDEHSLKDAFS.....S.FNGV..TEARVMTN.............KVTGRSRGYGFVNFISEDSANSAISAMNG.Q...ELNGFNISV
RU17_DROME/104-175 LFIARI......NYDTSESKLRREFE.....F.YGPI..KKIVLIHD.............QESGKPKGYAFIEYEHERDMHAAYKHADG.K...KIDSKRVLV
RU1A_HUMAN/12-84 IYINNLNE..KIKKDELKKSLYAIFS.....Q.FGQI..LDILVSR................SLKMRGQAFVIFKEVSSATNALRSMQG.F...PFYDKPMRI
RU1A_HUMAN/210-276 LFLTNL......PEETNELMLSMLFN.....Q.FPGF..KEVRLVPG..................RHDIAFVEFDNEVQAGAARDALQG.F...KITQNNAMK
RU1A_YEAST/229-293 LLIQNL......PSGTTEQLLSQILG.....N.EALV...EIRLVSV...................RNLAFVEYETVADATKIKNQLGS.T...YKLQNNDVT
RU2B_HUMAN/9-81 IYINNMND..KIKKEELKRSLYALFS.....Q.FGHV..VDIVALK................TMKMRGQAFVIFKELGSSTNALRQLQG.F...PFYGKPMRI
RU2B_HUMAN/153-220 LFLNNL......PEETNEMMLSMLFN.....Q.FPGF..KEVRLVPG..................RHDIAFVEFENDGQAGAARDALQGFK...ITPSHAMKI
SC35_CHICK/16-87 LKVDNL......TYRTSPDTLRRVFE.....K.YGRV..GDVYIPRD.............RYTKESRGFAFVRFHDKRDAEDAMDAMDG.A...VLDGRELRV
SP33_HUMAN/17-85 IYVGNL......PPDIRTKDIEDVFY.....K.YGAI..RDIDLKNR................RGGPPFAFVEFEDPRDAEDAVYGRDG.Y...DYDGYRLRV
SP33_HUMAN/122-186 VVVSGL......PPSGSWQDLKDHMR.....E.AGDV..CYADVYRD....................GTGVVEFVRKEDMTYAVRKLDN.T...KFRSHEGET
SQD_DROME/58-128 LFVGGL......SWETTEKELRDHFG.....K.YGEI..ESINVKTD.............PQTGRSRGFAFIVFTNTEAIDKVSA.ADE.H...IINSKKVDP
SQD_DROME/138-208 IFVGGL......TTEISDEEIKTYFG.....Q.FGNI..VEVEMPLD.............KQKSQRKGFCFITFDSEQVVTDLLK.TPK.Q...KIAGKEVDV
SR55_DROME/5-68 VYVGGL......PYGVRERDLERFFK.....G.YGRT..RDILIKN.....................GYGFVEFEDYRDADDAVYELNG.K...ELLGERVVV
SSB1_YEAST/39-114 IFIGNV......AHECTEDDLKQLFV.....EEFGDE..VSVEIPIK..........EHTDGHIPASKHALVKFPTKIDFDNIKENYDT.K...VVKDREIHI
SXLF_DROME/127-198 LIVNYL......PQDMTDRELYALFR.....A.IGPI..NTCRIMRD.............YKTGYSFGYAFVDFTSEMDSQRAIKVLNG.I...TVRNKRLKV
SXLF_DROME/213-285 LYVTNL......PRTITDDQLDTIFG.....K.YGSI..VQKNILRD.............KLTGRPRGVAFVRYNKREEAQEAISALNNVI...PEGGSQPLS
TIA1_HUMAN/9-78 LYVGNL......SRDVTEALILQLFS.....Q.IGPC..KNCKMIMD...............TAGNDPYCFVEFHEHRHAAAALAAMNG.R...KIMGKEVKV
TIA1_HUMAN/97-168 VFVGDL......SPQITTEDIKAAFA.....P.FGRI..SDARVVKD.............MATGKSKGYGFVSFFNKWDAENAIQQMGG.Q...WLGGRQIRT
TIA1_HUMAN/205-270 VYCGGV......TSGLTEQLMRQTFS.....P.FGQI..MEIRVFPD...................KGYSFVRFNSHESAAHAIVSVNG.T...TIEGHVVKC
TRA2_DROME/99-170 IGVFGL......NTNTSQHKVRELFN.....K.YGPI..ERIQMVID.............AQTQRSRGFCFIYFEKLSDARAAKDSCSG.I...EVDGRRIRV
U2AF_HUMAN/261-332 LFIGGL......PNYLNDDQVKELLT.....S.FGPL..KAFNLVKD.............SATGLSKGYAFCEYVDINVTDQAIAGLNG.M...QLGDKKLLV
U2AF_SCHPO/312-383 IYISNL......PLNLGEDQVVELLK.....P.FGDL..LSFQLIKN.............IADGSSKGFCFCEFKNPSDAEVAISGLDG.K...DTYGNKLHA
U2AG_HUMAN/67-142 CAVSDVEM..QEHYDEFFEEVFTEME.....EKYGEV..EEMNVCDN..............LGDHLVGNVYVKFRREEDAEKAVIDLNN.R...WFNGQPIHA
WHI3_YEAST/540-614 LYVGNL......PSDATEQELRQLFS.....G.QEGF..RRLSFRNK..........NTTSNGHSHGPMCFVEFDDVSFATRALAELYG.R...QLPRSTVSS
X16_HUMAN/12-78 VYVGNL......GNNGNKTELERAFG.....Y.YGPL..RSVWVARN..................PPGFAFVEFEDPRDAADAVRELDG.R...TLCGCRVRV
YHC4_YEAST/348-415 IFVGQL......DKETTREELNRRFS.....T.HGKI..QDINLIFK.................PTNIFAFIKYETEEAAAAALESENH.A...IFLNKTMHV
YHH5_YEAST/315-384 ILVKNL......PSDTTQEEVLDYFS.....T.IGPI..KSVFISEK...............QANTPHKAFVTYKNEEESKKAQKCLNK.T...IFKNHTIWV
YIS1_YEAST/66-136 IFVGNI......TPDVTPEQIEDHFK.....D.CGQI..KRITLLYD.............RNTGTPKGYGYIEFESPAYREKALQ.LNG.G...ELKGKKIAV
YIS5_YEAST/33-104 IYIGNL......NRELTEGDILTVFS.....E.YGVP..VDVILSRD.............ENTGESQGFAYLKYEDQRSTILAVDNLNG.F...KIGGRALKI
ARP2_PLAFA/364-438 VEVTYLF....STYLVNGQTL..IYS.....N.ISVV....LVILY........HQKFKETVLGRNSGFGFVSYDNVISAQHAIQFMNG.Y...FVNNKYLKV
CABA_MOUSE/77-147 MFVGGL......SWDTSKKDLKDYFT.....K.FGEV..VDCTIKMD.............PNTGRSRGFGFILFKDSSSVEKVLD.QKE.H...RLDGRVIDP
CABA_MOUSE/161-231 IFVGGL......NPEATEEKIREYFG.....Q.FGEI..EAIELPID.............PKLNKRRGFVFITFKEEDPVKKVLE.KKF.H...TVSGSKCEI
CPO_DROME/453-526 LFVSGL......PMDAKPRELYLLFR.....A.YEGY..EGSLLKV............TSKNGKTASPVGFVTFHTRAGAEAAKQDLQGVR...FDPDMPQTI
CST2_HUMAN/18-89 VFVGNI......PYEATEEQLKDIFS.....E.VGPV..VSFRLVYD.............RETGKPKGYGFCEYQDQETALSAMRNLNG.R...EFSGRALRV
D111_ARATH/281-360 LLLRNMVG.PGQVDDELEDEVGGECA.....K.YGTV..TRVLIFE..........ITEPNFPVHEAVRIFVQFSRPEETTKALVDLDG.R...YFGGRTVRA
ELAV_DROME/250-322 LYVSGL......PKTMTQQELEAIFA.....P.FGAI..ITSRILQN............AGNDTQTKGVGFIRFDKREEATRAIIALNG.T...TPSSCTDPI
ELAV_DROME/404-475 IFIYNL......APETEEAALWQLFG.....P.FGAV..QSVKIVKD.............PTTNQCKGYGFVSMTNYDEAAMAIRALNG.Y...TMGNRVLQV
EWS_HUMAN/363-442 IYVQGL......NDSVTLDDLADFFK.....Q.CGVV..K.MNKRTG....QPMIHIYLDKETGKPKGDATVSYEDPPTAKAAVEWFDG.K...DFQGSKLKV
GBP2_YEAST/124-193 IFVRNL......TFDCTPEDLKELFG.....T.VGEV..VEADIIT...............SKGHHRGMGTVEFTKNESVQDAISKFDG.A...LFMDRKLMV
GBP2_YEAST/221-291 VFIINL......PYSMNWQSLKDMFK.....E.CGHV..LRADVELD..............FNGFSRGFGSVIYPTEDEMIRAIDTFNG.M...EVEGRVLEV
GBP2_YEAST/351-421 IYCSNL......PFSTARSDLFDLFG.....P.IGKI..NNAELKP..............QENGQPTGVAVVEYENLVDADFCIQKLNN.Y...NYGGCSLQI