forked from yt-dlp/yt-dlp
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[ie/Sejm,RedCDNLivx] Add extractors (yt-dlp#8676)
Authored by: selfisekai
- Loading branch information
1 parent
99e67e4
commit 8450a26
Showing
3 changed files
with
355 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,135 @@ | ||
import functools | ||
|
||
from .common import InfoExtractor | ||
from ..networking import HEADRequest | ||
from ..utils import ( | ||
float_or_none, | ||
int_or_none, | ||
join_nonempty, | ||
parse_qs, | ||
update_url_query, | ||
) | ||
from ..utils.traversal import traverse_obj | ||
|
||
|
||
class RedCDNLivxIE(InfoExtractor):
    # Extractor for `.livx` stream URLs served from *.dcs.redcdn.pl / *.atmcdn.pl.
    # A single stream can be requested through several delivery methods; all of the
    # ones handled below are probed and merged into one list of formats.
    _VALID_URL = r'https?://[^.]+\.(?:dcs\.redcdn|atmcdn)\.pl/(?:live(?:dash|hls|ss)|nvr)/o2/(?P<tenant>[^/?#]+)/(?P<id>[^?#]+)\.livx'
    IE_NAME = 'redcdnlivx'

    _TESTS = [{
        'url': 'https://r.dcs.redcdn.pl/livedash/o2/senat/ENC02/channel.livx?indexMode=true&startTime=638272860000&stopTime=638292544000',
        'info_dict': {
            'id': 'ENC02-638272860000-638292544000',
            'ext': 'mp4',
            'title': 'ENC02',
            'duration': 19683.982,
            'live_status': 'was_live',
        },
    }, {
        'url': 'https://r.dcs.redcdn.pl/livedash/o2/sejm/ENC18/live.livx?indexMode=true&startTime=722333096000&stopTime=722335562000',
        'info_dict': {
            'id': 'ENC18-722333096000-722335562000',
            'ext': 'mp4',
            'title': 'ENC18',
            'duration': 2463.995,
            'live_status': 'was_live',
        },
    }, {
        'url': 'https://r.dcs.redcdn.pl/livehls/o2/sportevolution/live/triathlon2018/warsaw.livx/playlist.m3u8?startTime=550305000000&stopTime=550327620000',
        'info_dict': {
            'id': 'triathlon2018-warsaw-550305000000-550327620000',
            'ext': 'mp4',
            'title': 'triathlon2018/warsaw',
            'duration': 22619.98,
            'live_status': 'was_live',
        },
    }, {
        'url': 'https://n-25-12.dcs.redcdn.pl/nvr/o2/sejm/Migacz-ENC01/1.livx?startTime=722347200000&stopTime=722367345000',
        'only_matching': True,
    }, {
        'url': 'https://redir.atmcdn.pl/nvr/o2/sejm/ENC08/1.livx?startTime=503831270000&stopTime=503840040000',
        'only_matching': True,
    }]

    # Known methods (first component of the URL path):
    # - `livedash` - DASH MPD
    # - `livehls` - HTTP Live Streaming
    # - `livess` - IIS Smooth Streaming
    # - `nvr` - CCTV mode, directly returns a file, typically flv, avc1, aac
    # - `sc` - shoutcast/icecast (audio streams, like radio)

    def _real_extract(self, url):
        tenant, path = self._match_valid_url(url).group('tenant', 'id')
        query = parse_qs(url)
        start_time = traverse_obj(query, ('startTime', 0, {int_or_none}))
        stop_time = traverse_obj(query, ('stopTime', 0, {int_or_none}))

        # Extra path component that has to follow `.livx` for some methods
        manifest_suffixes = {
            'livess': '/manifest',
            'livehls': '/playlist.m3u8',
        }

        def livx_mode(mode):
            # Build the URL of this same stream as served through the given method
            suffix = manifest_suffixes.get(mode, '')
            stream_query = {}
            if start_time:
                stream_query['startTime'] = start_time
            if stop_time:
                stream_query['stopTime'] = stop_time
            if mode == 'nvr':
                stream_query['nolimit'] = 1
            elif mode != 'sc':
                stream_query['indexMode'] = 'true'
            stream_url = f'https://r.dcs.redcdn.pl/{mode}/o2/{tenant}/{path}.livx{suffix}'
            return update_url_query(stream_url, stream_query)

        # A transmission carries neither an id nor a title, so both are derived
        # from the stream path with the generic "live"/"channel" segments removed.
        title = path
        for generic_part in ('/live', 'live/', '/channel', 'channel/'):
            title = title.replace(generic_part, '')
        title = title.strip('/')
        video_id = join_nonempty(title.replace('/', '-'), start_time, stop_time)

        formats = []
        ism_doc = None
        # The ISM manifest is fetched by hand (rather than via _extract_ism_formats)
        # because its root attributes are also needed for stream metadata below.
        ism_res = self._download_xml_handle(
            livx_mode('livess'), video_id,
            note='Downloading ISM manifest',
            errnote='Failed to download ISM manifest',
            fatal=False)
        if ism_res is not False:
            ism_doc, ism_urlh = ism_res
            formats, _ = self._parse_ism_formats_and_subtitles(ism_doc, ism_urlh.url, 'ss')

        # Any HTTP status is accepted here; only a final 200 yields a direct format
        nvr_urlh = self._request_webpage(
            HEADRequest(livx_mode('nvr')), video_id, 'Follow flv file redirect', fatal=False,
            expected_status=lambda _: True)
        if nvr_urlh and nvr_urlh.status == 200:
            formats.append({
                'url': nvr_urlh.url,
                'ext': 'flv',
                'format_id': 'direct-0',
                'preference': -1,  # might be slow
            })

        formats.extend(self._extract_mpd_formats(
            livx_mode('livedash'), video_id, mpd_id='dash', fatal=False))
        formats.extend(self._extract_m3u8_formats(
            livx_mode('livehls'), video_id, m3u8_id='hls', ext='mp4', fatal=False))

        # @Duration is expressed in @TimeScale units; fall back to 10000000 when absent
        time_scale = traverse_obj(ism_doc, ('@TimeScale', {int_or_none})) or 10000000
        duration = traverse_obj(
            ism_doc, ('@Duration', {functools.partial(float_or_none, scale=time_scale)})) or None

        if traverse_obj(ism_doc, '@IsLive') == 'TRUE':
            live_status = 'is_live'
        elif duration:
            live_status = 'was_live'
        else:
            live_status = None

        return {
            'id': video_id,
            'title': title,
            'formats': formats,
            'duration': duration,
            'live_status': live_status,
        }
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,218 @@ | ||
import datetime | ||
|
||
from .common import InfoExtractor | ||
from .redge import RedCDNLivxIE | ||
from ..utils import ( | ||
clean_html, | ||
join_nonempty, | ||
js_to_json, | ||
strip_or_none, | ||
update_url_query, | ||
) | ||
from ..utils.traversal import traverse_obj | ||
|
||
|
||
def is_dst(date):
    """Tell whether ``date`` falls inside the EU (Central European) summer-time period.

    Summer time runs from the last Sunday of March to the last Sunday of October.

    * A naive ``datetime`` is treated as local wall-clock time and checked against
      02:00 on the March Sunday and 03:00 on the October Sunday, both inclusive.
    * A timezone-aware ``datetime`` is converted to UTC first and checked against
      the actual switch instants (01:00 UTC on both Sundays, end exclusive).
      Comparing an aware value with the naive boundaries would otherwise raise
      ``TypeError``.

    :param date: ``datetime.datetime`` instance, naive or aware.
    :return: ``True`` when summer time is in effect, ``False`` otherwise.
    """
    aware = date.tzinfo is not None and date.utcoffset() is not None
    if aware:
        # Normalise to a naive UTC value so it can be compared with naive boundaries
        moment = date.astimezone(datetime.timezone.utc).replace(tzinfo=None)
    else:
        moment = date

    def last_sunday(month):
        # Both March and October have 31 days; `isoweekday() % 7` is the number
        # of days elapsed since the most recent Sunday (0 when it is a Sunday).
        last_day = datetime.datetime(moment.year, month, 31)
        return last_day - datetime.timedelta(days=last_day.isoweekday() % 7)

    if aware:
        return last_sunday(3).replace(hour=1) <= moment < last_sunday(10).replace(hour=1)
    return last_sunday(3).replace(hour=2) <= moment <= last_sunday(10).replace(hour=3)
|
||
|
||
def rfc3339_to_atende(date):
    """Convert an ISO 8601 / RFC 3339 date string to an integer millisecond timestamp.

    The string is parsed with ``datetime.fromisoformat``; one hour is added when
    ``is_dst`` reports summer time for the parsed value. The result is counted in
    milliseconds from 2001-01-01T00:00:00Z.

    :param date: date string accepted by ``datetime.datetime.fromisoformat``.
    :return: ``int`` number of milliseconds.
    """
    # Unix timestamp of 2001-01-01T00:00:00Z, the zero point of the returned value
    epoch_offset = 978307200
    moment = datetime.datetime.fromisoformat(date)
    if is_dst(moment):
        moment = moment + datetime.timedelta(hours=1)
    return int((moment.timestamp() - epoch_offset) * 1000)
|
||
|
||
class SejmIE(InfoExtractor):
    # Extractor for transmissions published on sejm.gov.pl. Each transmission is
    # returned as a playlist with one entry per camera (plus the sign language
    # interpreter stream when there is one).
    _VALID_URL = (
        r'https?://(?:www\.)?sejm\.gov\.pl/[Ss]ejm(?P<term>\d+)\.nsf/transmisje(?:_arch)?\.xsp(?:\?[^#]*)?#(?P<id>[\dA-F]+)',
        r'https?://(?:www\.)?sejm\.gov\.pl/[Ss]ejm(?P<term>\d+)\.nsf/transmisje(?:_arch)?\.xsp\?(?:[^#]+&)?unid=(?P<id>[\dA-F]+)',
        r'https?://sejm-embed\.redcdn\.pl/[Ss]ejm(?P<term>\d+)\.nsf/VideoFrame\.xsp/(?P<id>[\dA-F]+)',
    )
    IE_NAME = 'sejm'

    _TESTS = [{
        # multiple cameras, Polish sign language interpreter
        'url': 'https://www.sejm.gov.pl/Sejm10.nsf/transmisje_arch.xsp#6181EF1AD9CEEBB5C1258A6D006452B5',
        'info_dict': {
            'id': '6181EF1AD9CEEBB5C1258A6D006452B5',
            'title': '1. posiedzenie Sejmu X kadencji',
            'duration': 20145,
            'live_status': 'was_live',
            'location': 'Sala Posiedzeń',
        },
        'playlist': [{
            'info_dict': {
                'id': 'ENC01-722340000000-722360145000',
                'ext': 'mp4',
                'duration': 20145,
                'title': '1. posiedzenie Sejmu X kadencji - ENC01',
                'live_status': 'was_live',
            },
        }, {
            'info_dict': {
                'id': 'ENC30-722340000000-722360145000',
                'ext': 'mp4',
                'duration': 20145,
                'title': '1. posiedzenie Sejmu X kadencji - ENC30',
                'live_status': 'was_live',
            },
        }, {
            'info_dict': {
                'id': 'ENC31-722340000000-722360145000',
                'ext': 'mp4',
                'duration': 20145,
                'title': '1. posiedzenie Sejmu X kadencji - ENC31',
                'live_status': 'was_live',
            },
        }, {
            'info_dict': {
                'id': 'ENC32-722340000000-722360145000',
                'ext': 'mp4',
                'duration': 20145,
                'title': '1. posiedzenie Sejmu X kadencji - ENC32',
                'live_status': 'was_live',
            },
        }, {
            # sign language interpreter
            'info_dict': {
                'id': 'Migacz-ENC01-1-722340000000-722360145000',
                'ext': 'mp4',
                'duration': 20145,
                'title': '1. posiedzenie Sejmu X kadencji - Migacz-ENC01',
                'live_status': 'was_live',
            },
        }],
    }, {
        'url': 'https://www.sejm.gov.pl/Sejm8.nsf/transmisje.xsp?unid=9377A9D65518E9A5C125808E002E9FF2',
        'info_dict': {
            'id': '9377A9D65518E9A5C125808E002E9FF2',
            'title': 'Debata "Lepsza Polska: obywatelska"',
            'description': 'KP .Nowoczesna',
            'duration': 8770,
            'live_status': 'was_live',
            'location': 'sala kolumnowa im. Kazimierza Pużaka (bud. C-D)',
        },
        'playlist': [{
            'info_dict': {
                'id': 'ENC08-1-503831270000-503840040000',
                'ext': 'mp4',
                'duration': 8770,
                'title': 'Debata "Lepsza Polska: obywatelska" - ENC08',
                'live_status': 'was_live',
            },
        }],
    }, {
        # the 7th term is a special case: it does not use redcdn livx
        'url': 'https://www.sejm.gov.pl/sejm7.nsf/transmisje_arch.xsp?rok=2015&month=11#A6E6D475ECCC6FE5C1257EF90034817F',
        'info_dict': {
            'id': 'A6E6D475ECCC6FE5C1257EF90034817F',
            'title': 'Konferencja prasowa - Stanowisko SLD ws. składu nowego rządu',
            'description': 'SLD - Biuro Prasowe Klubu',
            'duration': 514,
            'location': 'sala 101/bud. C',
            'live_status': 'was_live',
        },
        'playlist': [{
            'info_dict': {
                'id': 'A6E6D475ECCC6FE5C1257EF90034817F',
                'ext': 'mp4',
                'title': 'Konferencja prasowa - Stanowisko SLD ws. składu nowego rządu',
                'duration': 514,
            },
        }],
    }, {
        'url': 'https://sejm-embed.redcdn.pl/Sejm10.nsf/VideoFrame.xsp/FED58EABB97FBD53C1258A7400386492',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        term, video_id = self._match_valid_url(url).group('term', 'id')
        frame = self._download_webpage(
            f'https://sejm-embed.redcdn.pl/Sejm{term}.nsf/VideoFrame.xsp/{video_id}',
            video_id)
        # Although the endpoint is named "transmisje_arch", it serves live streams too
        data = self._download_json(
            f'https://www.sejm.gov.pl/Sejm{term}.nsf/transmisje_arch.xsp/json/{video_id}',
            video_id)
        params = data['params']

        title = strip_or_none(data.get('title'))

        status = data.get('status')
        if status == 'VIDEO_ENDED':
            live_status = 'was_live'
        elif status == 'VIDEO_PLAYING':
            live_status = 'is_live'
        else:
            live_status = None
            self.report_warning(f'unknown status: {status}')

        start_time = rfc3339_to_atende(params['start'])
        # For ongoing streams the advertised stop time is only the *expected* end of
        # the session and may change while it is running. Passing it on would cut
        # the stream off at that moment even though the session continues, so a
        # stop time (and thus a duration) is only used for finished transmissions.
        stop_time = duration = None
        if live_status == 'was_live':
            stop_time = rfc3339_to_atende(params['stop'])
            duration = (stop_time - start_time) // 1000

        entries = []

        def add_entry(file, legacy_file=False):
            # Append a playlist entry for one stream URL; empty values are ignored
            if not file:
                return
            file = self._proto_relative_url(file)

            if legacy_file:
                # Plain media file: no time window parameters, no livx stream id
                entries.append({
                    'url': file,
                    'duration': duration,
                    'id': video_id,
                    'title': title,
                })
                return

            file = update_url_query(file, {'startTime': start_time})
            if stop_time is not None:
                file = update_url_query(file, {'stopTime': stop_time})
            stream_id = self._search_regex(r'/o2/sejm/([^/]+)/[^./]+\.livx', file, 'stream id')
            entries.append({
                'url': file,
                'duration': duration,
                '_type': 'url_transparent',
                'ie_key': RedCDNLivxIE.ie_key(),
                'id': stream_id,
                'title': join_nonempty(title, stream_id, delim=' - '),
            })

        cameras = self._search_json(
            r'var\s+cameras\s*=', frame, 'camera list', video_id,
            contains_pattern=r'\[(?s:.+)\]', transform_source=js_to_json,
            fatal=False) or []
        for camera_file in traverse_obj(cameras, (..., 'file', {dict})):
            flv_url = camera_file.get('flv')
            mp4_url = camera_file.get('mp4')
            if flv_url:
                add_entry(flv_url)
            elif mp4_url:
                # mp4 files only exist for the 7th term: there were no streams
                # before it, and from the 8th term on redcdn livx is used
                add_entry(mp4_url, legacy_file=True)
            else:
                self.report_warning('Unknown camera stream type found')

        if params.get('mig'):
            sli_url = self._search_regex(
                r"var sliUrl\s*=\s*'([^']+)'", frame, 'sign language interpreter url', fatal=False)
            add_entry(sli_url)

        return {
            '_type': 'playlist',
            'entries': entries,
            'id': video_id,
            'title': title,
            'description': clean_html(data.get('desc')) or None,
            'duration': duration,
            'live_status': live_status,
            'location': strip_or_none(data.get('location')),
        }