/
spider
executable file
·202 lines (180 loc) · 5.47 KB
/
spider
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
#!/usr/bin/env bash
#
# Download baseball data (XML) files from MLBAM.
#
# All files are run through `xmllint --format`. Content is verified after
# formatting to ensure xmllint isn't doing anything funny. If there is a
# difference between the file contents pre- and post-processing, a message
# will be printed to STDERR and the downloaded file is stored in the same
# directory with a .orig extension.
#
# Progress reports and non-critical warnings are sent to standard output
# and errors that need user attention are sent to standard error, so you
# can run the script and keep two separate log files, for example:
#
# spider [options] >spider.log 2>spider.error.log
#
# or, if you want to view progress and just log errors:
#
# spider [options] 2>spider.error.log
#
# This script strives for POSIX-compliance where possible but, among other
# things, requires the GNU date command which supports the -d option for
# specifying a custom date. It works on Ubuntu and does not work on OSX,
# which comes with the BSD date command.
#
# REQUIREMENTS
#
# You must have curl and xmllint installed. On Ubuntu:
#
# sudo apt-get install libxml2-utils curl
#
# Print the command-line help text to STDOUT.
#
# The script name shown is the basename of $0, so the help stays correct
# when the script is invoked through a path or a renamed copy.
usage() {
  echo "Usage:"
  printf " %s [options] [output_dir]\n" "${0##*/}"
  echo ""
  echo "Options:"
  echo " -y <year> Year for which to download files (default: $(date +%Y))."
  echo " -l <league> League: mlb, milb, etc (default: mlb)."
  echo " -p <days> Start <days> days back from the end date (default: start on Feb 1)."
  echo " -t Include today's files (otherwise stops at yesterday)."
  echo " -q Be quiet: output errors only (on STDERR)."
  echo " -v Be verbose: output detailed progress messages."
  echo " -w Overwrite existing files."
  echo " -h Print this help."
}
# Print a message of the given level.
#
# Globals:   quiet, verbose - the strings "true" or "false"
# Arguments: $1 - level: error, notice, progress, warning or detail
#            $2 - message text
# Outputs:   error messages go to STDERR (prefixed with "Error: ");
#            notice/progress go to STDOUT unless quiet is "true";
#            warning/detail go to STDOUT only when verbose is "true".
# Returns:   always 0. A suppressed message is not a failure, and callers
#            that end with a bare `return` right after msg must not pick
#            up a non-zero status from it.
msg() {
  case "$1" in
    error)
      echo "Error: $2" >&2
      ;;
    notice | progress)
      if [[ "$quiet" != true ]]; then
        printf '%s\n' "$2"
      fi
      ;;
    warning)
      if [[ "$verbose" == true ]]; then
        echo "Warning: $2"
      fi
      ;;
    detail)
      if [[ "$verbose" == true ]]; then
        printf '%s\n' "$2"
      fi
      ;;
  esac
  return 0
}
# Render a number of seconds as a formatted date string.
#
# Arguments: $1 - number of seconds, added to "1970-01-01" (the date
#                 string handed to date carries no time zone)
#            $2 - date output format, without the leading "+"
# Outputs:   the formatted result on STDOUT
# Requires the GNU date command (-d / --date).
formatdate() {
  local seconds=$1
  local format=$2
  date --date="1970-01-01 ${seconds} sec" "+${format}"
}
# Build the "month_MM/day_DD" fragment shared by URLs and local paths.
#
# Arguments: $1 - timestamp, passed through to formatdate
# Outputs:   the fragment on STDOUT
datepath() {
  local stamp=$1
  formatdate "${stamp}" 'month_%m/day_%d'
}
# Base MLBAM URL for a given date.
#
# Globals:   baseurl (read), year (read)
# Arguments: $1 - timestamp, passed to datepath as a single argument
# Outputs:   "<baseurl>/year_<year>/<datepath>" on STDOUT
dateurl() {
  # Quote the argument so it always reaches datepath as exactly one word.
  printf '%s\n' "$baseurl/year_$year/$(datepath "$1")"
}
# List the ids of the games played on the given date.
#
# Downloads the directory listing for that date and extracts every link
# target of the form gid_.../ from it.
#
# Arguments: $1 - timestamp identifying the date
# Outputs:   one game id per line on STDOUT; an error message via msg
#            when the listing cannot be downloaded
# Returns:   1 if curl fails, otherwise the exit status of sed
gameids() {
  local url html curlexit
  url="$(dateurl "$1")/"
  # Assign separately from `local` so that $? is curl's exit status.
  html=$(curl -sf "$url")
  curlexit=$?
  if (( curlexit > 0 )); then
    # Report the URL that failed rather than the raw timestamp argument.
    msg error "couldn't download $url (curl exited with code $curlexit)"
    return 1
  fi
  sed -n 's:.*href="\(gid_[^/"]*\)/".*:\1:p' <<<"$html"
}
# Names of the files fetched for every game, one per line on STDOUT.
datafiles() {
  printf '%s\n' \
    "boxscore.xml" \
    "players.xml" \
    "game.xml" \
    "gameday_Syn.xml" \
    "linescore.xml"
}
# Download one file and store a reformatted copy of it.
#
# The raw download is written to "<path>.orig" and run through
# `xmllint --format` into "<path>". Both versions are then compared with
# XML declarations and all whitespace removed; if they match, the .orig
# file is deleted, otherwise it is kept and an error is reported.
#
# Globals:   downloads (incremented after each completed download; every
#            10th download is followed by a one-second pause)
# Arguments: $1 - URL to download
#            $2 - filesystem path of the destination file
# Returns:   1 if curl failed for a reason other than an HTTP error
#            response, 0 otherwise (an HTTP error is only a warning)
fetchfile() {
  local url=$1
  local dest=$2
  local orig="${dest}.orig"
  local curlexit

  # Derive the name from our own argument instead of the caller's loop
  # variable.
  msg detail " ${dest##*/}"

  curl -sf "$url" -o "$orig"
  curlexit=$?
  if (( curlexit == 22 )); then
    # With -f, curl exits with 22 when the server answers with HTTP >= 400.
    msg warning "couldn't download $url (server responded with HTTP >=400)"
    return 0
  elif (( curlexit > 0 )); then
    msg error "couldn't download $url (curl exited with code $curlexit)"
    return 1
  fi

  XMLLINT_INDENT=" " xmllint --format "$orig" > "$dest"

  # Some files have an XML declaration, xmllint adds one if it's missing.
  # We remove the XML declarations and all whitespace before comparing files.
  if diff -q <(sed '1{s/<?xml.*?>//}' "$dest" | tr -d " \t\n") \
    <(sed '1{s/<?xml.*?>//}' "$orig" | tr -d " \t\n") > /dev/null; then
    rm -- "$orig"
  else
    msg error "unexpected xmllint output for $dest"
  fi

  (( ++downloads ))
  # Use `if` rather than `&&` so a skipped pause does not become the
  # function's (non-zero) exit status.
  if (( downloads % 10 == 0 )); then
    sleep 1
  fi
  return 0
}
# Verify that the external tools used by this script are available.
#
# Outputs: an installation hint on STDERR for the first missing tool
# Exits:   with status 1 if xmllint or curl cannot be found
check_dependencies() {
  # `command -v` looks the tool up without running it.
  if ! command -v xmllint > /dev/null 2>&1; then
    echo "xmllint not found, please install it (on Ubuntu: sudo apt-get install libxml2-utils)." >&2
    exit 1
  fi
  if ! command -v curl > /dev/null 2>&1; then
    echo "curl not found, please install it (on Ubuntu: sudo apt-get install curl)." >&2
    exit 1
  fi
}
# ---------------------------------------------------------------------------
# Main script: parse options, work out the date range, then download the
# data files of every game on every date in that range.
# ---------------------------------------------------------------------------
check_dependencies

# Option defaults.
year=$(date +%Y)
league=mlb
previous=
overwrite=
today=
quiet=false
verbose=false
# keep track of downloads so we can sleep occasionally
downloads=0

while getopts "y:l:p:vqwth" opt; do
  case "$opt" in
    y) year="$OPTARG" ;;
    l) league="$OPTARG" ;;
    p) previous="$OPTARG" ;;
    t) today=true ;;
    q) quiet=true ;;
    v) verbose=true ;;
    w) overwrite=true ;;
    h) usage; exit 0 ;;
    *) usage; exit 1 ;;
  esac
done
shift $(( OPTIND - 1 ))

# Both values end up in date strings and arithmetic expressions, so refuse
# anything that is not a plain number.
if [[ ! "$year" =~ ^[0-9]{4}$ ]]; then
  msg error "invalid year: $year"
  exit 1
fi
if [[ -n "$previous" && ! "$previous" =~ ^[1-9][0-9]*$ ]]; then
  msg error "invalid number of days: $previous"
  exit 1
fi

baseurl="http://gdx.mlb.com/components/game/$league"
# The output directory may contain spaces; default to ./mlbam when omitted.
outputdir="${1:-./mlbam}"

# All dates are kept as seconds since 1970-01-01 and anchored at local noon.
# The loop below advances in fixed 24-hour steps and formatdate works on a
# date string without a time zone, so a noon anchor keeps a one-hour DST
# shift or the local UTC offset from moving a value onto a neighbouring
# calendar day.

# set ending date
if [[ "$year" == "$(date +%Y)" ]]; then
  if [[ -n "$today" ]]; then
    lastdate=$(date -d "today 12:00" +%s)
  else
    lastdate=$(date -d "yesterday 12:00" +%s)
  fi
else
  lastdate=$(date -d "$year-11-10 12:00" +%s)
fi

# set starting date
if [[ -n "$previous" ]]; then
  firstdate=$(( lastdate - 60*60*24*(previous - 1) ))
else
  firstdate=$(date -d "$year-02-01 12:00" +%s)
fi

msg notice "Downloading data for $league ($(formatdate "$firstdate" "%b %d") - $(formatdate "$lastdate" "%b %d"), $year) to $outputdir"

starttime=$(date +%s)
lastday=$(formatdate "$lastdate" %Y%m%d)
d="$firstdate"
# Compare calendar days as YYYYMMDD integers. Using <= (rather than testing
# for inequality) processes the last date itself and also guarantees that
# the loop ends when the first date lies after the last one.
while (( $(formatdate "$d" %Y%m%d) <= lastday )); do
  # Word splitting is intended here: gameids prints one id per line.
  for gid in $(gameids "$d"); do
    msg progress "Processing game $gid"
    gameurl="$(dateurl "$d")/$gid"
    gamedir="$outputdir/$(datepath "$d")/$gid"
    mkdir -p "$gamedir"
    for file in $(datafiles); do
      # Skip files that already exist unless -w was given.
      if [[ ! -f "$gamedir/$file" || -n "$overwrite" ]]; then
        fetchfile "$gameurl/$file" "$gamedir/$file"
      fi
    done
  done
  (( d += 60*60*24 ))
done

elapsed=$(( $(date +%s) - starttime ))
msg notice "Finished in $(formatdate "$elapsed" "%H hrs %M mins %S secs")."