Skip to content

Commit

Permalink
Added video size and fixed tags
Browse files Browse the repository at this point in the history
  • Loading branch information
ZuluPro committed Dec 30, 2016
1 parent 15a26f0 commit 6fc8383
Show file tree
Hide file tree
Showing 5 changed files with 134 additions and 174 deletions.
1 change: 1 addition & 0 deletions requirements.txt
@@ -1,3 +1,4 @@
beautifulsoup4
pdfminer
chardet
Pillow
4 changes: 2 additions & 2 deletions web_rich_object/__init__.py
Expand Up @@ -4,10 +4,10 @@
except ImportError:
pass

VERSION = (0, 2, 2)
VERSION = (0, 3, 0)
__version__ = '.'.join([str(i) for i in VERSION])
__author__ = 'Anthony Monthe (ZuluPro)'
__email__ = 'anthony.monthe@gmail.com'
__url__ = 'https://github.com/ZuluPro/web-rich-object'
__license__ = 'BSD'
__keywords__ = ['web rich object', 'opengraph', 'facebook', 'web']
__keywords__ = ['web rich object', 'opengraph', 'facebook', 'web', 'twitter']
232 changes: 62 additions & 170 deletions web_rich_object/api.py
Expand Up @@ -164,6 +164,7 @@ def type(self):
self.contextly_info.get('type')):
type_ = self._valid_string(self.contextly_info['type'])
self._type = self.contextly_info['type']

else:
self._type = self.info.get('type')
if self._type:
Expand Down Expand Up @@ -388,18 +389,66 @@ def video(self):
if og_video_tag is not None:
self._video = og_video_tag.attrs['content']
# From HTML5 video_tag
video_tag = self.soup.find('video')
if video_tag is not None:
source_tag = video_tag.find('source')
if source_tag is not None:
self._video = source_tag.attrs['src']
if self._video is None:
video_tag = self.soup.find('video')
if video_tag is not None:
source_tag = video_tag.find('source')
if source_tag is not None:
self._video = source_tag.attrs['src']
elif self.info['maintype'] == 'video':
self._video = self.base_url
# Format URL
if self._video is not None and not self._video.startswith('http'):
self._video = self._format_url(self._video)
return self._video

@property
def video_width(self):
    """Return the Open Graph video width, or ``None`` when unavailable.

    The value is the raw ``content`` string of the ``og:video:width`` meta
    tag. It is only looked up when ``subtype`` is ``'html'`` and the soup
    holds at least one tag. The result is cached in ``_video_width``.
    """
    if not hasattr(self, '_video_width'):
        self._video_width = None
        if self.subtype == 'html' and self.soup.find():
            # From open graph
            og_video_tag = self.soup.find('meta', property='og:video:width')
            if og_video_tag is not None:
                # A tag without ``content`` yields None instead of KeyError.
                self._video_width = og_video_tag.attrs.get('content')
    return self._video_width

@property
def video_height(self):
    """Return the Open Graph video height, or ``None`` when unavailable.

    The value is the raw ``content`` string of the ``og:video:height`` meta
    tag. It is only looked up when ``subtype`` is ``'html'`` and the soup
    holds at least one tag. The result is cached in ``_video_height``.
    """
    if not hasattr(self, '_video_height'):
        self._video_height = None
        if self.subtype == 'html' and self.soup.find():
            # From open graph
            og_video_tag = self.soup.find('meta', property='og:video:height')
            if og_video_tag is not None:
                # A tag without ``content`` yields None instead of KeyError.
                self._video_height = og_video_tag.attrs.get('content')
    return self._video_height

@property
def video_duration(self):
    """Return the Open Graph video duration, or ``None`` when unavailable.

    The value is the raw ``content`` string of the ``og:video:duration``
    meta tag. It is only looked up when ``subtype`` is ``'html'`` and the
    soup holds at least one tag. The result is cached in
    ``_video_duration``.
    """
    if not hasattr(self, '_video_duration'):
        self._video_duration = None
        if self.subtype == 'html' and self.soup.find():
            # From open graph
            og_video_tag = self.soup.find('meta', property='og:video:duration')
            if og_video_tag is not None:
                # A tag without ``content`` yields None instead of KeyError.
                self._video_duration = og_video_tag.attrs.get('content')
    return self._video_duration

@property
def video_info(self):
    """Return a dict summarising the page's video.

    The keys ``url``, ``width``, ``height`` and ``duration`` are taken from
    the ``video``, ``video_width``, ``video_height`` and ``video_duration``
    properties respectively; a key is left out when its value is falsy.

    The result is cached in ``_video_info``. A dedicated attribute is used
    so this cache cannot be mistaken for one stored under
    ``_struct_video``.
    """
    if not hasattr(self, '_video_info'):
        sources = (
            ('url', self.video),
            ('width', self.video_width),
            ('height', self.video_height),
            ('duration', self.video_duration),
        )
        self._video_info = dict(
            (key, value) for key, value in sources if value
        )
    return self._video_info

@property
def images(self):
if not hasattr(self, '_images'):
Expand Down Expand Up @@ -586,6 +635,14 @@ def tags(self):
if (not self._tags and self.contextly_info and
self.contextly_info.get('tags')):
self._tags = self.contextly_info['tags']
# From meta keywords
if not self._tags:
keywords_tag = self.soup.find('meta', attrs={'name': 'keywords'})
if keywords_tag is not None:
keywords = [k.strip() for k in keywords_tag.attrs['content'].split(',')]
self._tags.extend(keywords)
self._tags = [self._valid_string(t) for t in self._tags
if self._valid_string(t)]
return self._tags

@property
Expand All @@ -611,168 +668,3 @@ def struct_image(self):
elif img_meta.attrs['property'] == 'og:image:secure_url':
self._struct_image['secure_url'] = img_meta.attrs['content']
return self._struct_image

@property
def struct_video(self):
    """Return the structured properties of the first ``og:video`` tag.

    The dict holds ``url`` plus any of ``width``, ``height``, ``type`` and
    ``secure_url`` found in the ``meta`` siblings that follow the tag.
    Scanning stops at the next ``og:video`` tag. An empty dict is returned
    when there is no ``og:video`` tag or it has no ``content`` attribute.
    The result is cached in ``_struct_video``.
    """
    if not hasattr(self, '_struct_video'):
        info = {}
        _video = self.soup.find('meta', property='og:video')
        url = None if _video is None else _video.attrs.get('content')
        if url is not None:
            info['url'] = url
            wanted = {
                'og:video:width': 'width',
                'og:video:height': 'height',
                'og:video:type': 'type',
                'og:video:secure_url': 'secure_url',
            }
            next_vid_metas = _video.find_next_siblings('meta',
                                                       property=True)
            for vid_meta in next_vid_metas:
                prop = vid_meta.attrs['property']
                if prop == 'og:video':
                    # The next video starts here; its properties are not ours.
                    break
                content = vid_meta.attrs.get('content')
                if prop in wanted and content is not None:
                    info[wanted[prop]] = content
        self._struct_video = info
    return self._struct_video

@property
def struct_audio(self):
    """Return the structured properties of the first ``og:audio`` tag.

    The dict holds ``url`` plus any of ``type`` and ``secure_url`` found in
    the ``meta`` siblings that follow the tag. Scanning stops at the next
    ``og:audio`` tag. An empty dict is returned when there is no
    ``og:audio`` tag or it has no ``content`` attribute. The result is
    cached in ``_struct_audio``.
    """
    if not hasattr(self, '_struct_audio'):
        info = {}
        _audio = self.soup.find('meta', property='og:audio')
        url = None if _audio is None else _audio.attrs.get('content')
        if url is not None:
            info['url'] = url
            wanted = {
                'og:audio:type': 'type',
                'og:audio:secure_url': 'secure_url',
            }
            next_aud_metas = _audio.find_next_siblings('meta',
                                                       property=True)
            for aud_meta in next_aud_metas:
                prop = aud_meta.attrs['property']
                if prop == 'og:audio':
                    # The next audio starts here; its properties are not ours.
                    break
                content = aud_meta.attrs.get('content')
                if prop in wanted and content is not None:
                    info[wanted[prop]] = content
        self._struct_audio = info
    return self._struct_audio

@property
def obj_music_song(self):
    """Return the ``music:*`` song properties as a dict, or ``None``.

    Each property found is stored under its name with ``:`` removed, for
    example ``album:disc`` becomes ``albumdisc``. ``None`` is returned when
    no property is present. The result is cached in ``_obj_music_song``.
    """
    if not hasattr(self, '_obj_music_song'):
        song = {}
        keys = ('duration', 'album', 'album:disc', 'album:track',
                'musician')
        for key in keys:
            key_tag = self.soup.find('meta', property='music:' + key)
            content = None if key_tag is None else key_tag.attrs.get('content')
            if content is not None:
                song[key.replace(':', '')] = content
        # Keep the collected data; only an empty dict collapses to None.
        self._obj_music_song = song or None
    return self._obj_music_song

@property
def obj_music_album(self):
    """Return the ``music:*`` album properties as a dict, or ``None``.

    Each property found is stored under its name with ``:`` removed, for
    example ``song:track`` becomes ``songtrack``. ``None`` is returned when
    no property is present. The result is cached in ``_obj_music_album``.
    """
    if not hasattr(self, '_obj_music_album'):
        album = {}
        keys = ('song', 'song:disc', 'song:track', 'release_date',
                'musician')
        for key in keys:
            key_tag = self.soup.find('meta', property='music:' + key)
            content = None if key_tag is None else key_tag.attrs.get('content')
            if content is not None:
                album[key.replace(':', '')] = content
        # Keep the collected data; only an empty dict collapses to None.
        self._obj_music_album = album or None
    return self._obj_music_album

@property
def obj_music_playlist(self):
    """Return the ``music:*`` playlist properties as a dict, or ``None``.

    Each property found is stored under its name with ``:`` removed, for
    example ``song:disc`` becomes ``songdisc``. ``None`` is returned when
    no property is present. The result is cached in
    ``_obj_music_playlist``.
    """
    if not hasattr(self, '_obj_music_playlist'):
        playlist = {}
        keys = ('song', 'song:disc', 'song:track', 'creator')
        for key in keys:
            key_tag = self.soup.find('meta', property='music:' + key)
            content = None if key_tag is None else key_tag.attrs.get('content')
            if content is not None:
                playlist[key.replace(':', '')] = content
        # Keep the collected data; only an empty dict collapses to None.
        self._obj_music_playlist = playlist or None
    return self._obj_music_playlist

@property
def obj_music_radio_station(self):
    """Return the ``music:creator`` content, or ``None`` when absent.

    The result is cached in ``_obj_music_radio_station``; the same
    attribute name is used for the cache check, both assignments and the
    return value.
    """
    if not hasattr(self, '_obj_music_radio_station'):
        creator = self.soup.find('meta', property='music:creator')
        if creator is None:
            self._obj_music_radio_station = None
        else:
            # A tag without ``content`` yields None instead of KeyError.
            self._obj_music_radio_station = creator.attrs.get('content')
    return self._obj_music_radio_station

@property
def obj_video_movie(self):
    """Return the ``video:*`` movie properties as a dict, or ``None``.

    ``duration`` and ``release_date`` are stored as single strings.
    ``actor``, ``actor:role``, ``director``, ``writer`` and ``tag`` may be
    repeated and are stored as lists. Keys are the property names with
    ``:`` removed, for example ``actorrole``. ``None`` is returned when no
    property is present. The result is cached in ``_obj_video_movie``.
    """
    if not hasattr(self, '_obj_video_movie'):
        movie = {}

        # Property names follow the Open Graph ``video.movie`` object type.
        keys = ('duration', 'release_date')
        for key in keys:
            key_tag = self.soup.find('meta', property='video:' + key)
            content = None if key_tag is None else key_tag.attrs.get('content')
            if content is not None:
                movie[key.replace(':', '')] = content

        array_keys = ('actor', 'actor:role', 'director', 'writer', 'tag')
        for key in array_keys:
            key_tags = self.soup.find_all('meta', property='video:' + key)
            for key_tag in key_tags:
                content = key_tag.attrs.get('content')
                if content is not None:
                    movie.setdefault(key.replace(':', ''), []).append(content)

        # Keep the collected data; only an empty dict collapses to None.
        self._obj_video_movie = movie or None
    return self._obj_video_movie

@property
def obj_article(self):
    """Return the ``article:*`` properties as a dict, or ``None``.

    ``published_time``, ``modified_time``, ``expiration_time`` and
    ``section`` are stored as single strings. ``author`` and ``tag`` may be
    repeated and are stored as lists holding every occurrence. ``None`` is
    returned when no property is present. The result is cached in
    ``_obj_article``.
    """
    if not hasattr(self, '_obj_article'):
        article = {}

        keys = ('published_time', 'modified_time', 'expiration_time',
                'section')
        for key in keys:
            key_tag = self.soup.find('meta', property='article:' + key)
            content = None if key_tag is None else key_tag.attrs.get('content')
            if content is not None:
                article[key.replace(':', '')] = content

        array_keys = ('author', 'tag')
        for key in array_keys:
            key_tags = self.soup.find_all('meta', property='article:' + key)
            for key_tag in key_tags:
                content = key_tag.attrs.get('content')
                if content is not None:
                    # Append so every occurrence is kept, not just the last.
                    article.setdefault(key.replace(':', ''), []).append(content)

        # Keep the collected data; only an empty dict collapses to None.
        self._obj_article = article or None
    return self._obj_article

@property
def obj_book(self):
    """Return the ``book:*`` properties as a dict, or ``None``.

    ``isbn`` and ``release_date`` are stored as single strings. ``author``
    and ``tag`` may be repeated and are stored as lists. ``None`` is
    returned when no property is present. The result is cached in
    ``_obj_book``.
    """
    if not hasattr(self, '_obj_book'):
        # Build locally so a failure mid-way leaves no partial cache behind.
        book = {}

        keys = ('isbn', 'release_date')
        for key in keys:
            key_tag = self.soup.find('meta', property='book:' + key)
            content = None if key_tag is None else key_tag.attrs.get('content')
            if content is not None:
                book[key.replace(':', '')] = content

        array_keys = ('author', 'tag')
        for key in array_keys:
            key_tags = self.soup.find_all('meta', property='book:' + key)
            for key_tag in key_tags:
                # Tags without ``content`` are skipped instead of raising.
                content = key_tag.attrs.get('content')
                if content is not None:
                    book.setdefault(key.replace(':', ''), []).append(content)

        self._obj_book = book or None
    return self._obj_book

@property
def obj_profile(self):
    """Return the ``profile:*`` properties as a dict, or ``None``.

    ``first_name``, ``last_name``, ``username`` and ``gender`` are stored
    as single strings when present. ``None`` is returned when no property
    is present. The result is cached in ``_obj_profile``.
    """
    if not hasattr(self, '_obj_profile'):
        # Build locally so a failure mid-way leaves no partial cache behind.
        profile = {}

        keys = ('first_name', 'last_name', 'username', 'gender')
        for key in keys:
            key_tag = self.soup.find('meta', property='profile:' + key)
            # Tags without ``content`` are skipped instead of raising.
            content = None if key_tag is None else key_tag.attrs.get('content')
            if content is not None:
                profile[key.replace(':', '')] = content

        self._obj_profile = profile or None
    return self._obj_profile
5 changes: 3 additions & 2 deletions web_rich_object/functional_tests.py
Expand Up @@ -29,9 +29,10 @@
('http://www.pdf995.com/samples/pdf.pdf', {'title': 'PDF', 'url': 'http://www.pdf995.com/samples/pdf.pdf', 'site_name': 'www.pdf995.com', 'type': 'application', 'subtype': 'pdf', 'tags': ['pdf,', 'create', 'pdf,', 'software,', 'acrobat,', 'adobe']}),
# ('http://www.cbu.edu.zm/downloads/pdf-sample.pdf', {'title': 'This is a test PDF file', 'url': 'http://www.cbu.edu.zm/downloads/pdf-sample.pdf', 'site_name': 'www.cbu.edu.zm', 'type': 'application', 'subtype': 'pdf', 'tags': []}),
# Video HTML
('https://www.youtube.com/watch?v=4nzaATIOAAE', {'title': 'Sir Samuel - Urban Classik [CLIP OFFICIEL]', 'url': 'https://www.youtube.com/watch?v=4nzaATIOAAE', 'site_name': 'YouTube', 'type': 'video', 'subtype': 'html', 'image': 'https://i.ytimg.com/vi/4nzaATIOAAE/maxresdefault.jpg', 'generator': None, 'author': None, 'tags': [], 'video': 'https://www.youtube.com/embed/4nzaATIOAAE'}),
('https://www.youtube.com/watch?v=4nzaATIOAAE', {'title': 'Sir Samuel - Urban Classik [CLIP OFFICIEL]', 'url': 'https://www.youtube.com/watch?v=4nzaATIOAAE', 'site_name': 'YouTube', 'type': 'video', 'subtype': 'html', 'image': 'https://i.ytimg.com/vi/4nzaATIOAAE/maxresdefault.jpg', 'generator': None, 'author': None, 'tags': ['sir', 'samuel', 'teaser', 'gallery', 'miroir', 'aime', 'casquette', 'reggae', 'ragga', 'dub', 'hip', 'hop', 'rap', 'saian', 'supa', 'crew', 'fefe', 'mental', 'offishall', 'secher', 'larmes', 'urban', '...'], 'video': 'https://www.youtube.com/embed/4nzaATIOAAE'}),
('http://www.dailymotion.com/video/x2h2pgt_yannick-van-doorne-l-electroculture-une-technologie-d-avenir-meta-tv-1-4_tv', {'title': "Yannick Van Doorne - L'\xe9lectroculture une technologie d'avenir - Meta TV 3/4 - vid\xe9o Dailymotion", 'url': 'http://www.dailymotion.com/video/x2h2pgt', 'site_name': 'Dailymotion', 'type': 'video', 'subtype': 'html', 'image': 'http://s1.dmcdn.net/I8M_O/526x297-FRU.jpg', 'generator': None, 'author': None, 'tags': ['Agriculture'], 'video': 'http://www.dailymotion.com/embed/video/x2h2pgt?autoplay=1'}),
('https://vine.co/v/i525v6rlxPA', {'title': 'other guys vs me', 'url': 'https://vine.co/v/i525v6rlxPA', 'type': 'video', 'subtype': 'html', 'image': 'https://v.cdn.vine.co/r/videos/17150F7AAA1305700433301770240_47e22cb821d.5.1.5396368302495662565.mp4.jpg?versionId=TeHU5w.JHRPaMKOGTXWluoiPdLh7sEaG', 'generator': None, 'author': None, 'tags': [], 'video': 'http://vine.co/v/i525v6rlxPA/fb-card?audio=1'}),

('http://www.koreus.com/video/faux-professeur-chimie.html', {'title': 'Un faux professeur le premier jour de cours (Blague)', 'url': 'http://www.koreus.com/video/faux-professeur-chimie.html', 'type': 'video.other', 'subtype': 'html', 'image': 'http://thumbshigh.koreus.com/201309/faux-professeur-chimie.jpg', 'generator': None, 'author': None, 'tags': ['uid Vid\xe9o', 'amphi', 'blague', 'chimie', 'cours', 'faux', 'professeur', 'vostfr', 'clip', 'fun', 'jeu', 'divertissement', 'loisir', 'humour', 'animation', 'gratuit'], 'video': 'http://www.koreus.com/video/faux-professeur-chimie/autostart', 'video_width': '1280', 'video_height': '720'}),
# ('https://vimeo.com/39075039', {'title': 'Nike SB: Eric Koston - Mr. Control It All', 'url': 'https://vimeo.com/39075039', 'type': 'video', 'subtype': 'html', 'image': 'https://i.vimeocdn.com/video/269350328_1280x720.jpg', 'generator': None, 'author': None, 'tags': ['Nike', 'Nike SB', 'Nike Skateboarding', 'skate', 'skateboard', 'skateboarding', 'Koston', 'Eric Koston', 'Malto', 'Sean Malto', 'Justin Brock', 'Mr. Control It All'], 'video': 'https://player.vimeo.com/video/39075039?autoplay=1'}),
# Page with HTML5 video
('http://www.w3schools.com/html/html5_video.asp', {'title': "HTML5 Video", 'url': 'http://www.w3schools.com/html/html5_video.asp', 'type': 'website', 'subtype': 'html', 'image': 'http://www.w3schools.com/favicon.ico', 'generator': None, 'author': None, 'tags': [], 'video': 'http://www.w3schools.com/html/mov_bbb.mp4'}),
Expand Down
66 changes: 66 additions & 0 deletions web_rich_object/tests/test_html.py
Expand Up @@ -209,6 +209,72 @@ def test_from_og_video(self):
'return_value.info.return_value.__dict__': utils.HTML_RESPONSE_INFO,
}

def test_from_og_video_url(self):
    # A relative ``og:video:url`` should come back as an absolute URL.
    expected_url = 'http://example.com/foo.mp4'
    rich_object = WRO(self.url)
    self.assertEqual(rich_object.video, expected_url)
# NOTE(review): presumably read by the base test case to configure the
# mocked HTTP response; confirm in ``utils``.
test_from_og_video_url.mock_attrs = {
    'return_value.read.return_value': '<html><meta property="og:video:url" content="foo.mp4"/></html>',
    'return_value.info.return_value.__dict__': utils.HTML_RESPONSE_INFO,
}

def test_from_og_video_secure_url(self):
    # A relative ``og:video:secure_url`` should come back as an absolute URL.
    expected_url = 'http://example.com/foo.mp4'
    rich_object = WRO(self.url)
    self.assertEqual(rich_object.video, expected_url)
# NOTE(review): presumably read by the base test case to configure the
# mocked HTTP response; confirm in ``utils``.
test_from_og_video_secure_url.mock_attrs = {
    'return_value.read.return_value': '<html><meta property="og:video:secure_url" content="foo.mp4"/></html>',
    'return_value.info.return_value.__dict__': utils.HTML_RESPONSE_INFO,
}

def test_from_html5_tag(self):
    # The ``src`` of a ``<video><source>`` should come back as an absolute URL.
    expected_url = 'http://example.com/foo.mp4'
    rich_object = WRO(self.url)
    self.assertEqual(rich_object.video, expected_url)
# NOTE(review): presumably read by the base test case to configure the
# mocked HTTP response; confirm in ``utils``.
test_from_html5_tag.mock_attrs = {
    'return_value.read.return_value': '<html><video><source src="foo.mp4"></source></video></html>',
    'return_value.info.return_value.__dict__': utils.HTML_RESPONSE_INFO,
}


class WebRichObjectVideoWidthTest(utils.BaseWebRichObjectTestCase):
    # Covers the ``video_width`` property.

    def test_from_og_video_width(self):
        # The ``og:video:width`` content is exposed as the raw string.
        expected_width = '1280'
        rich_object = WRO(self.url)
        self.assertEqual(rich_object.video_width, expected_width)
    # NOTE(review): presumably read by the base test case to configure the
    # mocked HTTP response; confirm in ``utils``.
    test_from_og_video_width.mock_attrs = {
        'return_value.read.return_value': '<html><meta property="og:video:width" content="1280"/></html>',
        'return_value.info.return_value.__dict__': utils.HTML_RESPONSE_INFO,
    }


class WebRichObjectVideoHeightTest(utils.BaseWebRichObjectTestCase):
    # Covers the ``video_height`` property.

    def test_from_og_video_height(self):
        # The ``og:video:height`` content is exposed as the raw string.
        expected_height = '720'
        rich_object = WRO(self.url)
        self.assertEqual(rich_object.video_height, expected_height)
    # NOTE(review): presumably read by the base test case to configure the
    # mocked HTTP response; confirm in ``utils``.
    test_from_og_video_height.mock_attrs = {
        'return_value.read.return_value': '<html><meta property="og:video:height" content="720"/></html>',
        'return_value.info.return_value.__dict__': utils.HTML_RESPONSE_INFO,
    }


# HTML fixture: an ``og:video`` meta tag followed by its width and height tags.
VIDEO_INFO = """<html>
<meta property="og:video" content="foo.mp4"/>
<meta property="og:video:width" content="1280"/>
<meta property="og:video:height" content="720"/>
</html>"""


class WebRichObjectVideoInfoTest(utils.BaseWebRichObjectTestCase):
    # Covers the aggregated ``video_info`` property.

    def test_property(self):
        # URL, width and height from the fixture end up in a single dict.
        rich_object = WRO(self.url)
        expected_info = dict(
            url='http://example.com/foo.mp4',
            width='1280',
            height='720',
        )
        self.assertEqual(rich_object.video_info, expected_info)
    # NOTE(review): presumably read by the base test case to configure the
    # mocked HTTP response; confirm in ``utils``.
    test_property.mock_attrs = {
        'return_value.read.return_value': VIDEO_INFO,
        'return_value.info.return_value.__dict__': utils.HTML_RESPONSE_INFO,
    }


class WebRichObjectImagesTest(utils.BaseWebRichObjectTestCase):
def test_from_og_images(self):
Expand Down

0 comments on commit 6fc8383

Please sign in to comment.