-
Notifications
You must be signed in to change notification settings - Fork 0
/
flashdownloader.py
282 lines (220 loc) · 9.51 KB
/
flashdownloader.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
"""Download higher quality videos from Echo360.
Given a lecture URL, downloads the individual .swf files, converts them and
stitches them together. Also downloads the audio file and adds it to the video.
Include the URL for the lecture as a command line argument when running the script.
The video will be saved in the download directory specified in the configuration file,
named with the GUID. The URL to use can be found in the RSS feed for the lecture.
Example URL:
http://lectureplayback.qut.edu.au/1723/3/b526eb48-ddb7-418a-8051-3b7b4295dbc7/whatever
Does not work with URLs like:
http://lecturecapture.qut.edu.au/ess/echo/presentation/b526eb48-ddb7-418a-8051-3b7b4295dbc7/whatever
Todo:
* Move the temp folder so that Dropbox doesn't see it
* Moar progress bars
* Maybe add the QUT intro to the videos
* Restructure how the video path works, so that when run from the command
line, it saves with the correct name
"""
import json
import os
import sys
from urllib.request import URLopener, urlopen, urlretrieve
from xml.dom import minidom

import ffmpy
# Open the configuration file and save the config as module-level constants.
# JSON text is read as UTF-8 explicitly rather than with the locale default.
with open("config.json", "r", encoding="utf-8") as config_file:
    CONFIG = json.load(config_file)

# The functions in this module read DOWNLOAD_DIRECTORY, which was never
# defined, so any call that touched the file system raised NameError.
# NOTE(review): the key name "DOWNLOAD_DIRECTORY" is an assumption -- confirm
# it against config.json. When the key is absent the current working
# directory is used so that importing this module keeps working.
DOWNLOAD_DIRECTORY = CONFIG.get("DOWNLOAD_DIRECTORY", os.getcwd())
def get_swf_url(rssurl):
    """Get the base url for the lecture's files from the url given by the rss feed.

    The base url is a fixed-length prefix of the rss url: the first 78
    characters for an ``http`` url, and one more for an ``https`` url to make
    room for the extra letter in the scheme.

    Args:
        - rssurl (str): The URL given by the rss feed

    Returns:
        - str: base URL for lecture, to be used with other functions in this module
    """
    # A four-character slice can never equal the five-character "https", so
    # test the prefix directly; otherwise https urls lose their last character.
    if rssurl.startswith("https"):
        return rssurl[:79]
    return rssurl[:78]
def get_xml(url):
    """Download and parse the presentation xml document.

    Fetches ``presentation.xml`` by appending that name directly to ``url``,
    so ``url`` must already end with a ``/``.

    Args:
        - url (str): The base url for the lecture

    Returns:
        - obj: The minidom object of the xml document for the lecture
    """
    # Use the response as a context manager so the connection is closed even
    # if reading fails part-way through.
    with urlopen(url + "presentation.xml") as response:
        xml_file = response.read()
    return minidom.parseString(xml_file)
def get_max_time(xmldoc):
    """Find the largest swf time code listed in the presentation document.

    Looks at every ``data`` element whose ``type`` attribute is ``swf`` and
    reads its ``time`` attribute as an integer.

    Args:
        - xmldoc (obj): The minidom object for the lecture's xml document

    Returns:
        - int: The largest time code from the lecture, or 0 when the document
          lists no swf entries with a positive time
    """
    swf_times = [
        int(node.getAttribute("time"))
        for node in xmldoc.getElementsByTagName("data")
        if node.getAttribute("type") == "swf"
    ]
    # Seeding with 0 keeps the result at 0 when nothing larger is found.
    return max([0] + swf_times)
def get_title(xmldoc):
    """Get the title from the presentation xml document.

    The title is the text of the first ``name`` element in the document.

    Args:
        - xmldoc (obj): The minidom object for the lecture's xml document

    Returns:
        - str: The text of the first ``name`` element
    """
    name_element = xmldoc.getElementsByTagName("name")[0]
    return str(name_element.firstChild.nodeValue)
def get_guid(xmldoc):
    """Get the guid from the presentation xml document.

    The guid is a unique identifier for the lecture. Temporary files for the
    download are stored in a folder named with the guid, inside
    DOWNLOAD_DIRECTORY; that folder is created here if it does not exist yet.

    Args:
        - xmldoc (obj): The minidom object for the lecture's xml document

    Returns:
        - str: The lecture's guid, a unique identifier
    """
    guid = str(xmldoc.getElementsByTagName("guid")[0].firstChild.nodeValue)
    # exist_ok avoids the race between checking for the folder and creating it.
    os.makedirs(os.path.join(DOWNLOAD_DIRECTORY, guid), exist_ok=True)
    return guid
def download_swf_video_file(time, url, guid):
    """Download single swf video file.

    The file is fetched from ``<url>/slides/<time>.swf``, where the time code
    is zero-padded to eight digits, and stored under the same file name in
    the temporary folder named using the GUID.

    Args:
        - time (int): Time code to be downloaded
        - url (str): The lecture's base url
        - guid (str): The lecture's guid
    """
    filename = "{0:08d}.swf".format(time)
    # NOTE(review): the example base url produced by get_swf_url already ends
    # with "/", so this requests ".../<guid>//slides/..." -- confirm the
    # server is happy with the doubled slash.
    # urlretrieve replaces the deprecated URLopener class and accepts the
    # same reporthook callback.
    urlretrieve(
        url + "/slides/" + filename,
        os.path.join(DOWNLOAD_DIRECTORY, guid, filename),
        reporthook=download_progress_bar,
    )
def download_progress_bar(count, block_size, total_size):
    """Write a 20-character text progress bar for a download to stdout.

    Used as the ``reporthook`` callback of the download functions in this
    module. Each call rewrites the current terminal line (the output starts
    with a carriage return and has no trailing newline).

    Args:
        - count (int): Number of blocks transferred so far
        - block_size (int): Size of one block
        - total_size (int): Total size of the file; zero or negative when the
          size is not known, in which case the bar stays at 0%
    """
    if total_size > 0:
        # The final block usually overshoots the real size, so clamp to 100.
        percent = min(100, max(0, count * block_size * 100 // total_size))
    else:
        # Unknown or empty size: avoid dividing by zero or drawing a
        # negative-length bar.
        percent = 0
    numhash = percent // 5
    numdash = 20 - numhash
    sys.stdout.write("\r" + "[" + numhash*"#" + numdash*"-" + "] {0}%".format(percent))
    sys.stdout.flush()
def download_all_swf_videos(max_time, url, guid):
    """Download all the videos from time 0 to time max_time.

    One file is downloaded for every 8000 units of time code. The downloads
    are stored in the temporary folder named using the GUID.

    Args:
        - max_time (int): The maximum time code that needs to be downloaded
        - url (str): The lecture's base url
        - guid (str): The lecture's guid
    """
    times = range(0, max_time + 1, 8000)
    # Count the files from the range itself; rounding max_time/8000 as a
    # float reported the wrong total when max_time was not a multiple of 8000.
    total = len(times)
    for index, time in enumerate(times, start=1):
        print("\nDownloading video file {} of {}...".format(index, total))
        download_swf_video_file(time, url, guid)
def download_audio_file(url, guid):
    """Download the audio file for the lecture.

    Downloads the mp3 audio recording of the lecture from ``<url>/audio.mp3``.
    The file is stored as ``audio.mp3`` in the temporary folder named using
    the GUID.

    Args:
        - url (str): The lecture's base url
        - guid (str): The lecture's guid
    """
    print("\nDownloading audio file")
    # urlretrieve replaces the deprecated URLopener class and accepts the
    # same reporthook callback.
    urlretrieve(
        url + "/audio.mp3",
        os.path.join(DOWNLOAD_DIRECTORY, guid, "audio.mp3"),
        reporthook=download_progress_bar,
    )
def convert_videos(max_time, guid):
    """Convert every downloaded swf file into an mkv file.

    Walks the time codes from 0 to max_time in steps of 8000, runs FFmpeg on
    the matching swf file in the GUID folder to produce an mkv file of the
    same name, then deletes the swf file.

    Args:
        - max_time (int): The maximum time code to be converted
        - guid (str): The lecture's guid
    """
    for timecode in range(0, max_time + 1, 8000):
        stem = '{0:08d}'.format(timecode)
        swf_path = os.path.join(DOWNLOAD_DIRECTORY, guid, stem + '.swf')
        mkv_path = os.path.join(DOWNLOAD_DIRECTORY, guid, stem + ".mkv")
        converter = ffmpy.FFmpeg(
            inputs={swf_path: None},
            outputs={mkv_path: None},
        )
        converter.run()
        # The swf source is no longer needed once its mkv exists.
        os.remove(swf_path)
def concat_videos(max_time, guid):
    """Concatenate all the videos together, into one video file.

    Joins together the mkv videos from time 0 to max_time contained within
    the GUID folder into ``video_output.mkv``, then deletes the temporary
    video files and the FFmpeg input list.

    Args:
        - max_time (int): The maximum time code to be joined
        - guid (str): The lecture's guid
    """
    folder = os.path.join(DOWNLOAD_DIRECTORY, guid)
    list_path = os.path.join(folder, "input.txt")
    mkv_paths = [
        os.path.join(folder, '{0:08d}'.format(time) + ".mkv")
        for time in range(0, max_time + 1, 8000)
    ]

    # Create the text file listing all input files for FFmpeg's concat mode.
    # The context manager closes the file even if a write fails.
    with open(list_path, "w") as list_file:
        for mkv_path in mkv_paths:
            list_file.write("file '" + mkv_path + "'\n")

    # Run FFmpeg, copying the streams rather than re-encoding them.
    ff_command = ffmpy.FFmpeg(
        inputs={list_path: "-f concat -safe 0"},
        outputs={os.path.join(folder, "video_output.mkv"): "-codec copy"},
    )
    ff_command.run()

    # Remove the pieces now that they have been joined.
    for mkv_path in mkv_paths:
        os.remove(mkv_path)
    os.remove(list_path)
def trim_audio_file(guid):
    """Cut the intro sound off the start of the lecture audio.

    Runs FFmpeg on ``audio.mp3`` in the GUID folder, skipping the first 15
    seconds and copying the rest without re-encoding into
    ``trimmed_audio.mp3``. The untrimmed file is deleted afterwards.

    Args:
        - guid (str): The lecture's guid
    """
    source_path = os.path.join(DOWNLOAD_DIRECTORY, guid, "audio.mp3")
    trimmed_path = os.path.join(DOWNLOAD_DIRECTORY, guid, "trimmed_audio.mp3")
    trimmer = ffmpy.FFmpeg(
        inputs={source_path: None},
        outputs={trimmed_path: "-ss 00:00:15 -acodec copy"},
    )
    trimmer.run()
    os.remove(source_path)
def combine_audio_and_video(guid, video_path):
    """Merge the joined video and the trimmed audio into the final file.

    Takes ``video_output.mkv`` and ``trimmed_audio.mp3`` from the GUID folder
    and has FFmpeg copy both streams into ``video_path``, stopping at the end
    of the shorter one. Both inputs and the then-empty GUID folder are removed
    afterwards.

    Args:
        - guid (str): The lecture's guid
        - video_path (str): Where to save the final video, including the file
          name and extension
    """
    video_source = os.path.join(DOWNLOAD_DIRECTORY, guid, "video_output.mkv")
    audio_source = os.path.join(DOWNLOAD_DIRECTORY, guid, "trimmed_audio.mp3")
    # Video first, then audio: the inputs are passed in this order.
    sources = {video_source: None, audio_source: None}
    muxer = ffmpy.FFmpeg(
        inputs=sources,
        outputs={video_path: "-codec copy -shortest"},
    )
    muxer.run()
    os.remove(video_source)
    os.remove(audio_source)
    # rmdir only succeeds on an empty folder.
    os.rmdir(os.path.join(DOWNLOAD_DIRECTORY, guid))
def high_quality_download(url, video_path):
    """Download a lecture from Echo360 and save it as a single video file.

    Runs the whole pipeline: work out the base url, read the presentation
    document, download every swf video piece and the audio, convert and join
    the pieces, trim the audio, and combine both into ``video_path``.

    Args:
        - url (str): The URL given by the rss feed for the lecture
        - video_path (str): The path for the final video to be saved to
    """
    base_url = get_swf_url(url)
    presentation = get_xml(base_url)
    last_time = get_max_time(presentation)
    lecture_guid = get_guid(presentation)

    # Fetch everything first, then process it.
    download_all_swf_videos(last_time, base_url, lecture_guid)
    download_audio_file(base_url, lecture_guid)

    convert_videos(last_time, lecture_guid)
    concat_videos(last_time, lecture_guid)
    trim_audio_file(lecture_guid)
    combine_audio_and_video(lecture_guid, video_path)
# Only act when executed as a script. The previous check compared the last 19
# characters of sys.argv[0] with the 18-character file name, so it only
# matched when the script was started exactly as "flashdownloader.py" and did
# nothing when started through any longer path.
if __name__ == "__main__":
    if len(sys.argv) != 2:
        print("Enter url for lecture as a command line argument")
    else:
        url = sys.argv[1]
        newurl = get_swf_url(url)
        xmldoc = get_xml(newurl)
        guid = get_guid(xmldoc)
        try:
            high_quality_download(url, os.path.join(DOWNLOAD_DIRECTORY, guid+".mkv"))
        except Exception as error:
            # Catch Exception rather than everything so Ctrl-C still stops
            # the script, and say why the fallback is being used.
            print("\nHigh quality download failed ({0}); "
                  "downloading the given url directly instead".format(error),
                  file=sys.stderr)
            # Fallback: save whatever the given url points at as an mp4.
            urlretrieve(url, os.path.join(DOWNLOAD_DIRECTORY, guid+".mp4"))