# Парсер информации о мемах

[Know Your Meme](https://knowyourmeme.com/) — сайт, посвящённый описанию интернет-мемов.

In [1]:
from requests import get
import numpy as np
import pandas as pd
import time

In [2]:
page_link = 'https://knowyourmeme.com/'

In [3]:
response = get(page_link)
response

<Response [403]>

Ошибка, так как сайт имеет дополнительную защиту. Попробуем использовать fake_useragent, чтобы сделать запрос похожим на "человеческий"

In [4]:
response.request.headers

{'User-Agent': 'python-requests/2.26.0', 'Accept-Encoding': 'gzip, deflate, br', 'Accept': '*/*', 'Connection': 'keep-alive'}

In [5]:
!pip install fake_useragent

Collecting fake_useragent
  Downloading fake_useragent-1.1.1-py3-none-any.whl (50 kB)
Collecting importlib-resources>=5.0
  Downloading importlib_resources-5.12.0-py3-none-any.whl (36 kB)
Installing collected packages: importlib-resources, fake-useragent
Successfully installed fake-useragent-1.1.1 importlib-resources-5.12.0


In [6]:
from fake_useragent import UserAgent

In [9]:
UserAgent().chrome

'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.206.1 Safari/532.0'

In [15]:
response = get(page_link, headers={'User-Agent': UserAgent().chrome})
response

<Response [200]>

In [None]:
html = response.content

__Подгрузим необходимую библиотеку__

In [12]:
pip install beautifulsoup4




In [13]:
from bs4 import BeautifulSoup

In [None]:
soup = BeautifulSoup(html, 'html.parser')

__Посмотрим отдельно на части сайта, используя его разметку__

In [20]:
soup.html.head.title

<title>Internet Meme Database | Know Your Meme</title>

In [21]:
soup.html.title.text

'Internet Meme Database | Know Your Meme'

In [23]:
x = soup.find('a', {'class': 'newsfeed-title'})
x.text

'Waffle Store That Sells Erotic-Shaped Food Goes Viral In Colombia'

In [26]:
x = soup.find_all('a', {'class': 'newsfeed-title'})
[item.text for item in x]

['Waffle Store That Sells Erotic-Shaped Food Goes Viral In Colombia',
 'All Star Voice Actress And Singer Rie Takahashi Implores Fans To Shower Before Her Concert',
 'The Real Reason They Want To Lock Up Young Thug',
 'This Image Of A Girl Standing Over A Dejected Boy Has Become An Exploitable Meme Template',
 '20 Familiar Images To Give You Nostalgia']

In [28]:
x = soup.find('a', {'class': 'newsfeed-title'})
x.get('href')

'/memes/severo-sinverguenza'

In [38]:
x = soup.find_all('a', {'class': 'newsfeed-title'})
meme_links = [item.get('href') for item in x]
meme_links[:3]

['/memes/severo-sinverguenza',
 '/news/all-star-voice-actress-and-singer-rie-takahashi-implores-fans-to-shower-before-her-concert',
 '/photos/2546034-in-possession-of-an-ancient-staff']

## Собираем парсер

__Функция получения ссылок на мемы__

In [40]:
def getPageLinks(page_number):
    page_link = 'https://knowyourmeme.com/page/{}'.format(page_number)
    response = get(page_link, headers = {'User-Agent': UserAgent().chrome})
    
    if not response.ok:
        return []
    html = response.content
    soup = BeautifulSoup(html, 'html.parser')
    
    meme_links = soup.find_all('a', {'class': 'newsfeed-title'})
    meme_links = ['https://knowyourmeme.com' + link.get('href') for link in meme_links]
    
    return meme_links

In [43]:
meme_links = getPageLinks(200)
meme_links[:2]

['https://knowyourmeme.com/photos/2519452-crossover',
 'https://knowyourmeme.com/editorials/collections/15-history-memes-to-distract-from-the-present']

In [49]:
meme_page = 'https://knowyourmeme.com/memes/doge'
response = get(meme_page, headers={'User-Agent': UserAgent().chrome})

html = response.content
soup = BeautifulSoup(html, 'html.parser')

In [50]:
views = soup.find('dd', {'class':'views'})
views

<dd class="views" title="14,029,460 Views">
<a href="/memes/doge" rel="nofollow">14,029,460</a>
</dd>

In [51]:
views = views.find('a').text
views

'14,029,460'

In [52]:
views = int(views.replace(',', ''))
views

14029460

In [53]:
views = soup.find('dd', {'class':'videos'})
views

<dd class="videos" title="104 Videos">
<a href="/memes/doge#videos" rel="nofollow">104</a>
</dd>

In [54]:
views = views.find('a').text
views

'104'

__Парсинг показателей (просмотры, комментарии и тд)__

In [55]:
def getStats(soup, stats):
    try:
        obj = soup.find('dd', {'class': stats})
        obj = obj.find('a').text
        obj = int(obj.replace(',', ''))
    except:
        obj = None
    return obj

In [56]:
views = getStats(soup, stats='views')
videos = getStats(soup, stats='videos')
photos = getStats(soup, stats='photos')
comments = getStats(soup, stats='comments')
print('Просмотры: {}\nВидео: {}\nФото: {}\nКомментарии: {}'.format(views, videos, photos, comments))

Просмотры: 14029460
Видео: 104
Фото: 1791
Комментарии: 923


In [61]:
date = soup.find('abbr', attrs={'class': 'timeago'}).attrs['title']
date

'2023-03-06T15:03:39-05:00'

In [66]:
info = soup.find('aside', {'class': 'left'})
meme_status = info.dl.dd.text.strip()

'Confirmed'

In [67]:
#избегаем ошибок при поиске
# например оказалось, что meme_status = None
meme_status = None
try:
    print(meme_status.text.strip())
    
except:
    print('Exception')
    
if meme_status:
    print(meme_status.text.strip())
else:
    print('Empty')

Exception
Empty


__Функция для описания харктеристик мема (название, статус, тип, год добавления и тд)__

In [75]:
def getProperties(soup):
    meme_name = soup.find('section', attrs={'class': 'info'}).find('h1').text.strip()
    
    properties = soup.find('aside', attrs = {'class': 'left'})
    
    meme_status = properties.find('dd')
    meme_status = '' if not meme_status else meme_status.text.strip()
    
    meme_type = properties.find('a', attrs = {'class': 'entry-type-link'})
    meme_type = '' if not meme_type else meme_type.text
    
    meme_origin_year = properties.find(text='\nYear\n')
    meme_origin_year = '' if not meme_origin_year else meme_origin_year.parent.find_next()
    meme_origin_year = meme_origin_year.text.strip()
    
    meme_origin_place = properties.find('dd', attrs={'class': 'entry_origin_link'})
    meme_origin_place = '' if not meme_origin_place else meme_origin_place.text.strip()
    
    meme_tags = properties.find('dl', attrs={'id': 'entry_tags'}).find('dd')
    meme_tags = '' if not meme_tags else meme_tags.text.strip()
    
    return meme_name, meme_status, meme_type, meme_origin_year, meme_origin_place, meme_tags

In [76]:
getProperties(soup)

('Doge',
 'Confirmed',
 'Animal',
 '2010',
 'Tumblr',
 'animal, dog, shiba inu, shibe, such doge, super shibe, japanese, tumblr, comic sans, photoshop meme, doges, dogges, reddit, bitcoin, dogecoin, canine, doge meme, atsuko sato, kabosu, doge memes, dogelore, kabosumama')

In [77]:
def getProperties(soup):
    """
        Возвращает список (tuple) с названием, статусом, типом, 
        годом и местом происхождения и тэгами
        
        soup: объект bs4.BeautifulSoup 
            представление текущей страницы
    
    """
    # название - идёт с самым большим заголовком h1
    meme_name = soup.find('section', attrs={'class':'info'}).find('h1').text.strip()
    
    # достаём все данные справа от картинки 
    properties = soup.find('aside', attrs={'class':'left'})
    
    # статус идет первым - можно не уточнять класс
    meme_status = properties.find("dd")
    
    # oneliner, заменяющий try-except: если тэга нет в properties, вернётся объект NoneType,
    # у которого аттрибут text отсутствует, и в этом случае он заменится на пустую строку
    meme_status = "" if not meme_status else meme_status.text.strip()
    
    # тип мема - обладает уникальным классом
    meme_type = properties.find('a', attrs={'class':'entry-type-link'})
    meme_type = "" if not meme_type else meme_type.text 
    
    # год происхождения первоисточника можно найти после заголовка Year, 
    # находим заголовок, определяем родителя и ищем следущего за родителем - наш раздел
    meme_origin_year = properties.find(text='\nYear\n')
    meme_origin_year = "" if not meme_origin_year else meme_origin_year.parent.find_next()
    meme_origin_year = meme_origin_year.text.strip()
    
    # сам первоисточник
    meme_origin_place = properties.find('dd', attrs={'class':'entry_origin_link'})
    meme_origin_place = "" if not meme_origin_place else meme_origin_place.text.strip()
    
    # тэги, связанные с мемом
    meme_tags = properties.find('dl', attrs={'id':'entry_tags'}).find('dd')
    meme_tags = "" if not meme_tags else meme_tags.text.strip()
    
    return meme_name, meme_status, meme_type, meme_origin_year, meme_origin_place, meme_tags

In [78]:
getProperties(soup)

('Doge',
 'Confirmed',
 'Animal',
 '2010',
 'Tumblr',
 'animal, dog, shiba inu, shibe, such doge, super shibe, japanese, tumblr, comic sans, photoshop meme, doges, dogges, reddit, bitcoin, dogecoin, canine, doge meme, atsuko sato, kabosu, doge memes, dogelore, kabosumama')

__Функция для получения описания мема__

In [92]:
def getText(soup):
    """
        Возвращает текстовые описания мема
        
        soup: объект bs4.BeautifulSoup 
            представление текущей страницы
            
    """
    
    # достаём все тексты под картинкой
    body = soup.find('section', attrs={'class':'bodycopy'})
    
    # раздел about (если он есть), должен идти первым, берем его без уточнения класса
    meme_about = body.find('p')
    meme_about = "" if not meme_about else meme_about.text
    
    # раздел origin можно найти после заголовка Origin или History, 
    # находим заголовок, определяем родителя и ищем следущего ребенка - наш раздел
    meme_origin = body.find(text='Origin') or body.find(text='History')
    meme_origin = "" if not meme_origin else meme_origin.parent.find_next().text
    
    # весь остальной текст (если он есть) можно положить в одно текстовое поле
    if body.text:
        other_text = body.text.strip().split('\n')[5:]
        other_text = " ".join(other_text).strip()
    else:
        other_text = ""
        
    return meme_about, meme_origin, other_text

In [93]:
meme_about, meme_origin, other_text = getText(soup)

print("О чем мем:\n{}\n\nПроисхождение:\n{}\n\nОстальной текст:\n{}...\n"\
      .format(meme_about, meme_origin, other_text[:200]))

О чем мем:
Doge (pronounced /ˈdoʊdʒ/ DOHJ) is a slang term for "dog" that is primarily associated with pictures of Shiba Inus (nicknamed "Shibe") and internal monologue captions on Tumblr. These photos may be photoshopped to change the dog's face or captioned with interior monologues in Comic Sans font. The primary meme and iconography associated with Doge is the Shiba Inu named Kabosu, whose photos taken by her owner Atsuko Sato in early 2010 went viral across the internet, spawning numerous memes and larger trends in the following decades. Starting in 2017, Ironic Doge formats gained prevalence over the original wholesome version as the memetic character continued to evolve.

Происхождение:
The use of the misspelled word "doge" to refer to a dog dates back to June 24th, 2005, when it was mentioned in an episode of Homestar Runner's puppet show. In the episode titled "Biz Cas Fri 1"[2], Homestar calls Strong Bad his "d-o-g-e" while trying to distract him from his work.

Остальной текс

## Итоговый код парсера

In [102]:
def getMemeData(meme_page):
    response = get(meme_page, headers = {'User-Agent': UserAgent().chrome})
    
    if not response.ok:
        return response.status_code
    
    html = response.content
    soup = BeautifulSoup(html, 'html.parser')
    
    views = getStats(soup, stats='views')
    videos = getStats(soup, stats='videos')
    photos = getStats(soup, stats='photos')
    comments = getStats(soup, stats='comments')
    
    date = soup.find('abbr', attrs={'class': 'timeago'}).attrs['title']
    
    meme_name, meme_status, meme_type, meme_origin_year, meme_origin_place, meme_tags = getProperties(soup=soup)
    
    meme_about, meme_origin, other_text = getText(soup=soup)

    data_row = {"name":meme_name, "status":meme_status, 
                "type":meme_type, "origin_year":meme_origin_year, 
                "origin_place":meme_origin_place,
                "date_added":date, "views":views, 
                "videos":videos, "photos":photos, "comments":comments, "tags":meme_tags,
                "about":meme_about, "origin":meme_origin, "other_text":other_text}
    return data_row

In [None]:
data_row = getMemeData('https://knowyourmeme.com/memes/doge')

In [104]:
final_df = pd.DataFrame(columns=['name', 'status', 'type', 'origin_year', 'origin_place',
                                 'date_added', 'views', 'videos', 'photos', 'comments', 
                                 'tags', 'about', 'origin', 'other_text'])

In [106]:
final_df = final_df.append(data_row, ignore_index=True)
final_df

Unnamed: 0,name,status,type,origin_year,origin_place,date_added,views,videos,photos,comments,tags,about,origin,other_text
0,Doge,Confirmed,Animal,2010,Tumblr,2023-03-06T15:03:39-05:00,14029782,104,1791,923,"animal, dog, shiba inu, shibe, such doge, supe...",Doge (pronounced /ˈdoʊdʒ/ DOHJ) is a slang ter...,"The use of the misspelled word ""doge"" to refer...","IdentityOn February 13th, 2010, Japanese kinde..."


In [107]:
def getPageLinks(page_number):
    page_link = 'https://knowyourmeme.com/page/{}'.format(page_number)
    response = get(page_link, headers = {'User-Agent': UserAgent().chrome})
    
    if not response.ok:
        return []
    html = response.content
    soup = BeautifulSoup(html, 'html.parser')
    
    meme_links = soup.find_all('a', {'class': 'newsfeed-title'})
    meme_links = ['https://knowyourmeme.com' + link.get('href') for link in meme_links]
    
    return meme_links

In [108]:
from tqdm import tqdm_notebook

In [109]:
meme_links

['https://knowyourmeme.com/photos/2519452-crossover',
 'https://knowyourmeme.com/editorials/collections/15-history-memes-to-distract-from-the-present',
 'https://knowyourmeme.com/memes/got-any-decent-fatherly-advice-for-me',
 'https://knowyourmeme.com/news/the-latest-target-for-conservatives-in-the-anti-woke-agenda-is-xbox',
 'https://knowyourmeme.com/memes/me-peacefully-passes-away-at-90-years-old-x-the-next-day']

In [110]:
for meme_link in tqdm_notebook(meme_links):
    try:
        data_row = getMemeData(meme_link)
        final_df = final_df.append(data_row, ignore_index=True)
    except:
        print(meme_link)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for meme_link in tqdm_notebook(meme_links):


  0%|          | 0/5 [00:00<?, ?it/s]

https://knowyourmeme.com/photos/2519452-crossover
https://knowyourmeme.com/editorials/collections/15-history-memes-to-distract-from-the-present
https://knowyourmeme.com/news/the-latest-target-for-conservatives-in-the-anti-woke-agenda-is-xbox


In [None]:
# общий цикл

from tqdm import tqdm_notebook

final_df = pd.DataFrame(columns = ['name', 'status', 'type', 'origin_year', 'origin_place',
                                 'date_added', 'views', 'videos', 'photos', 'comments', 
                                 'tags', 'about', 'origin', 'other_text'])

for page_number in tqdm_notebook(range(20), desc = 'Pages'):
    
    meme_links = getPageLinks(page_number)
    
    for meme_link in tqdm_notebook(meme_links, desc = 'Memes', leave=False):
    
        for i in range(1):
            try:
                data_row = getMemeData(meme_link)
                final_df = final_df.append(data_row, ignore_index=True)
                break
            except:
                print('AHTUNG! parsing once again:', meme_link)
                continue

In [119]:
final_df.shape

(52, 14)

__Итоговая таблица__

In [121]:
final_df

Unnamed: 0,name,status,type,origin_year,origin_place,date_added,views,videos,photos,comments,tags,about,origin,other_text
0,Mario Dandry Satrio / Son Of Indonesian Tax Of...,Submission,Crime,2023,Indonesia,2023-03-07T02:42:41-05:00,246,0,6,0,"indonesia, tax official, crime, beating, guy",Son Of Indonesian Tax Official Beating A Man r...,,Dandy agnes lo itu psikopat 😠😠 gilakkk biadab ...
1,Los Polinesios,Submission,Comedian,2012,Mexico,2023-03-08T16:25:59-05:00,201,9,5,0,"los polinesios, youtube, youtuber, latin ameri...",Los Polinesios is a trio of Mexican YouTuber s...,,"According to Uno Tv[5], the name of the channe..."
2,Man,Confirmed,Exploitable,2020,Reddit,2023-03-07T01:59:50-05:00,116035,0,42,7,"horse, beach, man, horse man",Man refers to an image macro of a horse on a b...,"On April 20th, 2020, Reddit[1] user an__dy upl...",SpreadThe image was popularized in 2020 with m...
3,Mario Dandry Satrio / Son Of Indonesian Tax Of...,Submission,Crime,2023,Indonesia,2023-03-07T02:42:41-05:00,246,0,6,0,"indonesia, tax official, crime, beating, guy",Son Of Indonesian Tax Official Beating A Man r...,,Dandy agnes lo itu psikopat 😠😠 gilakkk biadab ...
4,Los Polinesios,Submission,Comedian,2012,Mexico,2023-03-08T16:25:59-05:00,201,9,5,0,"los polinesios, youtube, youtuber, latin ameri...",Los Polinesios is a trio of Mexican YouTuber s...,,"According to Uno Tv[5], the name of the channe..."
5,Man,Confirmed,Exploitable,2020,Reddit,2023-03-07T01:59:50-05:00,116035,0,42,7,"horse, beach, man, horse man",Man refers to an image macro of a horse on a b...,"On April 20th, 2020, Reddit[1] user an__dy upl...",SpreadThe image was popularized in 2020 with m...
6,'Real Emo' Copypasta,Submission,Copypasta,2017,Facebook,2023-03-08T14:20:58-05:00,318,1,4,0,"emo, copypasta, fake emo, real emo, dc, emotio...","""Real Emo"" Copypasta refers to a screed made i...","On January 7th, 2017, Facebook page ""Memelords...","""Real Emo"" only consists of the dc Emotional H..."
7,You Know Other Men?,Submission,,2021,Twitter,2023-03-08T16:55:04-05:00,590,1,15,0,"you're the most jealous man i know, you're the...",You Know Other Men? or You're the Most Jealous...,"On October 18th, 2021, Twitter[1] user @EIJICH...","The meme is captioned, ""You're the most jealou..."
8,Sneako's Cuck Story,Submission,Pop Culture Reference,2021,YouTube,2023-03-08T18:10:41-05:00,859,0,16,2,"sneako, pear to pear pdocast, cuck, cuckold, o...",Sneako's Cuck Story refers to a story of watch...,"On November 25th, episode 185 of Peer-Peer Pod...","I'm feeling traumatic thoughts, seeing her wit..."
9,Gotta Be One Of My Favorite Genders,Confirmed,Catchphrase,2021,Instagram,2021-06-09T13:02:01-04:00,67714,0,46,1,"i love women's day fr, i love women fr, one of...",Gotta Be One Of My Favorite Genders refers to ...,"On Monday, March 8th, 2021, International Wome...",According to an exchange of Twitter DMs with K...


__Полезные библиотеки__

In [123]:
import time
time.sleep(3) #ходить на сайт с перерывом в 3 секунды

In [None]:
import requests
requests.get('http://qooqle.com', timeout=1) #ждем ответа от сервера 1 секунду, если ответа нет - сбрасываем