-
Notifications
You must be signed in to change notification settings - Fork 182
/
extract_genes.pl
executable file
·129 lines (95 loc) · 3.3 KB
/
extract_genes.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
#!/usr/bin/perl -w
# $Id$
=pod
=head1 NAME
extract_genes.pl - extract genomic sequences from NCBI files
using BioPerl
=head1 DESCRIPTION
This script is a simple solution to the problem of
extracting genomic regions corresponding to genes. There are other
solutions; this particular approach uses genomic sequence
files from NCBI and gene coordinates from Entrez Gene.
The first time this script is run it will be slow as it will
extract species-specific data from the gene2accession file and create
a storable hash (retrieving the positional data from this hash is
significantly faster than reading gene2accession each time the script
runs). The subsequent runs should be fast.
=head1 INSTALLATION
=head2 Install BioPerl
Install BioPerl; full instructions are at http://bioperl.org.
=head2 Download gene2accession.gz
Download this file from ftp://ftp.ncbi.nlm.nih.gov/gene/DATA into
your working directory and gunzip it.
=head2 Download sequence files
Create one or more species directories in the working directory, the
directory names do not have to match those at NCBI (e.g. "Sc", "Hs").
Download the nucleotide fasta files for a given species from its CHR*
directories at ftp://ftp.ncbi.nlm.nih.gov/genomes and put these files into a
species directory. The sequence files will have the suffix ".fna" or
".fa.gz"; gunzip them if necessary.
=head2 Determine Taxon id
Determine the taxon id for the given species. This id is the first column
in the gene2accession file. Modify the %species hash in this script
so that the name of your species directory is a key and its taxon id is the
value.
=head2 Command-line options
-i Gene id
-s Name of species directory
-h Help
Example:
extract_genes.pl -i 850302 -s Sc
=cut
use strict;
use Bio::DB::Fasta;
use Getopt::Long;
use Storable;
# Map of species directory name => NCBI taxon id (the first column of the
# gene2accession file). Add one entry per species directory you create.
my %species = (
    "Sc" => 4932,     # Saccharomyces cerevisiae
    "Ec" => 83333,    # Escherichia coli K12
    "Hs" => 9606,     # H. sapiens
);

# Command-line options: -h (help), -i (numeric gene id), -s (species directory).
my ($help, $id, $name);
GetOptions(
    "s=s" => \$name,
    "i=i" => \$id,
    "h"   => \$help,
) or usage();    # unknown or malformed option: show the documentation

usage() if ($help || !$id || !$name);

# Fail early with a clear message rather than continuing with a species
# directory that has no taxon id registered in %species.
exists $species{$name}
    or die "Unknown species '$name'; known species directories: "
         . join(", ", sort keys %species) . "\n";
# Name of the per-species Storable cache written next to the script's
# working files, e.g. "Sc.dump".
my $storedHash = $name . ".dump";

# create index for a directory of fasta files
my $db = Bio::DB::Fasta->new($name, -makeid => \&make_my_id);

# extract species-specific data from gene2accession (only when no cache
# file exists yet; later runs reuse the cache)
unless (-e $storedHash) {
    my $taxon = $species{$name};
    defined $taxon
        or die "No taxon id configured for species directory '$name'\n";

    my $ref = {};

    # Three-argument open with a lexical handle; report the OS error too.
    open my $in, '<', "gene2accession"
        or die "No gene2accession file: $!\n";

    while (my $line = <$in>) {
        next if $line =~ /^#/;    # header/comment line, not a data row
        chomp $line;

        my @arr = split /\t/, $line;
        next if @arr < 12;        # too short to hold the columns read below

        # Column 0 is the taxon id; keep only rows for the requested species.
        next unless $arr[0] =~ /\A\d+\z/ && $arr[0] == $taxon;

        # Keep only rows whose start and end columns are plain integers.
        next unless $arr[9] =~ /\A\d+\z/ && $arr[10] =~ /\A\d+\z/;

        # Keyed by column 1 (the gene id given with -i); a later row for the
        # same gene replaces an earlier one.
        $ref->{ $arr[1] } = {
            "start"  => $arr[9],
            "end"    => $arr[10],
            "strand" => $arr[11],
            "id"     => $arr[7],
        };
    }
    close $in;

    die "No positional data found for taxon id $taxon in gene2accession\n"
        unless %$ref;

    # save species-specific information using Storable
    store($ref, $storedHash)
        or die "Cannot write $storedHash: $!\n";
}
# retrieve the species-specific data from a stored hash
my $ref = retrieve($storedHash);
defined $ref or die "Cannot read $storedHash: $!\n";

# retrieve sequence and sub-sequence
if (defined $ref->{$id}) {
    my $gene = $ref->{$id};

    # Look up the sequence record by the id stored for this gene.
    my $chr = $db->get_Seq_by_id($gene->{"id"});
    defined $chr
        or die "Sequence '$gene->{id}' for gene $id not found in the indexed fasta files\n";

    # NCBI's gene README documents the gene2accession start/end positions as
    # 0-based, while trunc() takes 1-based inclusive coordinates, so both
    # ends are shifted by one. The cache keeps the raw values, so existing
    # .dump files stay valid.
    my $seq = $chr->trunc($gene->{"start"} + 1, $gene->{"end"} + 1);

    # Genes on the minus strand are reported as their reverse complement.
    $seq = $seq->revcom if ($gene->{"strand"} eq "-");

    # Insert SeqIO options here...
    print $seq->seq, "\n";
} else {
    print "Cannot find id: $id\n";
}
# Callback passed to Bio::DB::Fasta as -makeid: turn a fasta header line into
# the id used as the index key.
#
# Takes:   the header line.
# Returns: the accession following "ref|" when the header has one
#          (e.g. ">gi|1234|ref|NC_001133.9| ..." gives "NC_001133.9");
#          otherwise the first whitespace-delimited word after an optional
#          ">" (e.g. ">NC_001133.9 description" gives "NC_001133.9");
#          nothing (undef in scalar context) for an empty or undefined line.
sub make_my_id {
    my $line = shift;
    return unless defined $line;

    # Return straight from a successful match so a failed match can never
    # hand back a stale $1 left over from an earlier regex.
    return $1 if $line =~ /ref\|([^|\s]+)/;

    # Fall back to the leading word for headers without a "ref|" field.
    return $1 if $line =~ /\A>?\s*(\S+)/;

    return;
}
# Show this script's POD documentation via perldoc, then exit.
# Never returns to the caller.
sub usage {
    # List form bypasses the shell, so a script path containing spaces or
    # shell metacharacters reaches perldoc intact.
    system("perldoc", $0) == 0
        or print STDERR "Usage: $0 -i <gene id> -s <species directory>\n";
    exit;
}
__END__