# COVID-19 Memes, a Facebook album

This notebook downloads the top n most popular photos for each month.

Tip: [How to generate a long-lived Facebook token.](https://medium.com/@yasithlokuge/how-to-generate-a-never-expiring-facebook-page-access-token-24ac5c1a95f1)

Set n and download directory:

In [364]:
n = 11
download_dir = './memes/'
album_id = '10223159753881444'

Import dependencies and Facebook API token.

In [186]:
import configparser
import pandas as pd
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
import requests
import json
import facebook
configs = configparser.ConfigParser()
configs.read('../config.ini')
token = configs['FACEBOOK']['ACCESS_TOKEN']

Set up API access and some variables.

In [188]:
graph = facebook.GraphAPI(access_token=token, version=3.1)
user_id = '10225913074352735'
love = 'reactions.type(LOVE).limit(0).summary(total_count).as(love)'
wow = 'reactions.type(WOW).limit(0).summary(total_count).as(wow)'
haha = 'reactions.type(HAHA).limit(0).summary(total_count).as(haha)'
like = 'reactions.type(LIKE).limit(0).summary(total_count).as(like)'
angry = 'reactions.type(ANGRY).limit(0).summary(total_count).as(angry)'
sad = 'reactions.type(SAD).limit(0).summary(total_count).as(sad)'

Get photos and videos.

This endpoint gives us the IDs and the photo or video source URL.

In [299]:
def get_photos():
    '''Gets some details of photos in the album.'''
    photos = []
    photo_url = "https://graph.facebook.com/10223159753881444?fields=photos{created_time,message,name,images}&access_token=" + token
    while photo_url: 
        response = requests.get(photo_url)
        response_parsed = json.loads(response.content)
        if 'photos' in response_parsed.keys():
            data = response_parsed['photos']['data']
            photo_url = response_parsed['photos']['paging']['next']
        else:
            data = response_parsed['data']
            if 'next' not in response_parsed['paging']:
                photos.extend(data)
                break
            photo_url = response_parsed['paging']['next']
        photos.extend(data)
    return photos

def get_videos():
    '''Gets details of videos in the album.'''
    videos = []
    videos_url = 'https://graph.facebook.com/me/videos/uploaded/?fields=privacy,format,video_insights&access_token=' + token
    while videos_url:
        data = None
        response = requests.get(videos_url)
        response_parsed = json.loads(response.content)
        if 'data' in response_parsed.keys():
            data = response_parsed['data']
            if 'next' not in response_parsed['paging']:
                videos.extend(data)
                break
            videos_url = response_parsed['paging']['next']
        videos.extend(data)
    #print('Number of videos collected:', len(videos))
    
    # Drop all videos that are not in the album. This is the best way I could find to do it.
    # It is not apparent how we can identify a video's album using the API. 
    # We also can't download videos from the API, only photos.
    browser = webdriver.Chrome(executable_path='../chromedriver')
    filtered_videos = videos.copy()
    for video in videos:
        url = 'https://www.facebook.com/1210377595/videos/a.10223159753881444/' + video['id']
        browser.get(url)
        time.sleep(1)
        has_video = bool(browser.find_elements(By.XPATH, "//div[contains(@class, 'k4urcfbm kr520xx4 pmk7jnqg datstx6m')]"))
        if not has_video:
            #print('Removing video...')
            filtered_videos.remove(video)
    #print('Removed', len(videos) - len(filtered_videos), 'videos that were not in the album.')
    return filtered_videos
    

photos = get_photos()
videos = get_videos()
print('Number of photos collected:', len(photos))
print('Number of videos collected:', len(videos))

Number of photos collected: 1218
Number of videos collected: 51


Query photos and videos again to get their reactions. The API is not well documented, so we need to make this query separately from the query to get the photo and video IDs and source URLs.

In [306]:
def get_reactions(items, reaction_query):
    reactions = {}
    ids = [user_id + '_' + x['id'] for x in items]
    for i in range(0, len(ids), 50):
        reactions.update(graph.get_objects(ids=ids[i:i+50], fields=reaction_query)) # only can query 50 at a time
    return reactions

video_reactions = 'created_time,message,shares,' + love + ',' + wow + ',' + haha + ',' + angry + ',' + sad + ',' + like
photo_reactions = 'created_time,shares,message,' + love + ',' + wow + ',' + haha + ',' + angry + ',' + sad + ',' + like
reactions = get_reactions(filtered_videos, video_reactions)
reactions.update(get_reactions(photos, photo_reactions))
print('We have reactions for', len(reactions), 'items.')

We have reactions for 1269 items.


Collect both videos and photos into a dataframe.

In [356]:
video_ids = [x['id'] for x in videos]
photo_ids = [x['id'] for x in photos]

data = {'id': [],
        'date': [],
        'loves': [],
        'angry': [],
        'wow': [],
        'likes': [],
        'sad': [],
        'haha': [],
        'shares': [],
        'type': [],
        'source': []}

for idx, item_id in enumerate(reactions):
    short_id = item_id.split('_')[1]
    data['id'].append(item_id)
    data['date'].append(reactions[item_id]['created_time'])
    data['loves'].append(reactions[item_id]['love']['summary']['total_count'])
    data['angry'].append(reactions[item_id]['angry']['summary']['total_count'])
    data['wow'].append(reactions[item_id]['wow']['summary']['total_count'])
    data['likes'].append(reactions[item_id]['like']['summary']['total_count'])
    data['sad'].append(reactions[item_id]['sad']['summary']['total_count'])
    data['haha'].append(reactions[item_id]['haha']['summary']['total_count'])
    if 'shares' in reactions[item_id]:
        data['shares'].append(reactions[item_id]['shares']['count'])
    else:
        data['shares'].append(0)
    if short_id in video_ids:
        data['type'].append('video')
        data['source'].append(videos[idx]['format'][-1]['embed_html'].split("\"")[1])
    elif short_id in photo_ids:
        data['type'].append('photo')
        data['source'].append(next((x for x in photos if x['id'] == short_id), None)['images'][0]['source'])

df = pd.DataFrame(data)
df

Unnamed: 0,id,date,loves,angry,wow,likes,sad,haha,shares,type,source
0,10225913074352735_10225925051372153,2020-12-27T01:34:38+0000,1,0,0,8,0,3,0,video,https://www.facebook.com/plugins/video.php?hre...
1,10225913074352735_10225924567800064,2020-12-27T00:21:22+0000,0,0,0,2,0,5,0,video,https://www.facebook.com/plugins/video.php?hre...
2,10225913074352735_10225924372835190,2020-12-26T23:52:42+0000,0,0,0,0,0,6,0,video,https://www.facebook.com/plugins/video.php?hre...
3,10225913074352735_10225879222026448,2020-12-21T09:53:05+0000,3,0,0,4,0,0,0,video,https://www.facebook.com/plugins/video.php?hre...
4,10225913074352735_10225855556314820,2020-12-18T10:30:05+0000,3,0,0,8,0,12,2,video,https://www.facebook.com/plugins/video.php?hre...
...,...,...,...,...,...,...,...,...,...,...,...
1264,10225913074352735_10225924299993369,2020-12-26T23:43:16+0000,0,0,0,3,0,5,0,photo,https://scontent.feau1-1.fna.fbcdn.net/v/t1.0-...
1265,10225913074352735_10225926826336526,2020-12-27T10:26:01+0000,0,0,0,2,0,5,0,photo,https://scontent.feau1-1.fna.fbcdn.net/v/t1.0-...
1266,10225913074352735_10225938230101613,2020-12-28T20:32:18+0000,0,0,0,3,0,9,0,photo,https://scontent.feau1-1.fna.fbcdn.net/v/t1.0-...
1267,10225913074352735_10225939521453896,2020-12-29T00:07:09+0000,0,0,0,3,0,13,0,photo,https://scontent.feau1-1.fna.fbcdn.net/v/t1.0-...


Remove photos that failed. In other words, their details are actually those of the album itself. Reason is unknown.

In [357]:
def detect_failed_items(reactions):
    '''Returns failed item ids. Some photos fail for some reason... Their IDs seem correct 
    but for some reason the API accesses the album itself instead. So we will detect them.'''
    failed_ids = []
    album_message = 'Coronavirus memes are spreading like, well, coronavirus. A collection of the better ones to help get us through social distancing. Additions welcome. Stay healthy!'
    for x in reactions:
        if 'message' in reactions[x].keys():
            if reactions[x]['message'] == album_message:
                failed_ids.append(x)
    return failed_ids
                
failed_ids = detect_failed_items(reactions)
orig = len(df)
df = df[~df['id'].isin(failed_ids)].reset_index(drop=True)
print('Dropped', orig - len(df), 'items.')

Dropped 12 items.


Get the top n memes for each month, based on number of likes, hahas, loves, and shares. 

In [366]:
df['total'] = df.apply(lambda x: x['loves'] + x['haha'] + x['likes'] + x['shares'], axis=1)
df['date'] = pd.to_datetime(df['date'])
df['month'] = df['date'].dt.month
top_memes = df.sort_values(['month','total'], ascending=False).groupby('month', sort=False).head(n)
top_memes

Unnamed: 0,id,date,loves,angry,wow,likes,sad,haha,shares,type,source,total,month
7,10225913074352735_10225787702098507,2020-12-10 03:08:27+00:00,0,0,0,9,0,29,12,video,https://www.facebook.com/plugins/video.php?hre...,50,12
1207,10225913074352735_10225784275092834,2020-12-09 18:28:22+00:00,0,0,0,6,0,30,0,photo,https://scontent.feau1-1.fna.fbcdn.net/v/t1.0-...,36,12
1212,10225913074352735_10225795830301707,2020-12-11 04:09:39+00:00,0,0,0,8,0,19,8,photo,https://scontent.feau1-1.fna.fbcdn.net/v/t1.0-...,35,12
1203,10225913074352735_10225771448132168,2020-12-08 01:28:40+00:00,0,0,0,15,0,17,1,photo,https://scontent.feau1-1.fna.fbcdn.net/v/t1.0-...,33,12
1245,10225913074352735_10225924110148623,2020-12-26 22:53:31+00:00,0,0,0,5,0,25,3,photo,https://scontent.feau1-1.fna.fbcdn.net/v/t1.0-...,33,12
...,...,...,...,...,...,...,...,...,...,...,...,...,...
231,10225913074352735_10223227755301437,2020-03-22 20:31:03+00:00,0,0,0,5,0,17,6,photo,https://scontent.feau1-1.fna.fbcdn.net/v/t1.0-...,28,3
305,10225913074352735_10223318596172402,2020-03-30 00:59:45+00:00,0,0,0,2,0,23,3,photo,https://scontent.feau1-1.fna.fbcdn.net/v/t1.0-...,28,3
33,10225913074352735_10223243186207200,2020-03-24 00:43:51+00:00,0,0,0,6,0,13,8,video,https://www.facebook.com/plugins/video.php?hre...,27,3
54,10225913074352735_10223162376987020,2020-03-17 15:29:34+00:00,0,0,0,5,0,16,5,photo,https://scontent.feau1-1.fna.fbcdn.net/v/t1.0-...,26,3


Download all these memes. 

They will be named `[id]_month[num]_total[num]`. Not downloading videos because my goal is to upload these memes to Medium, and they only accept embeddable videos, so we'll print the video URLs.

In [367]:
def download(meme):
    meme_id = meme['id'].split('_')[1]
    print(meme_id, '...', 'month', meme['month'], '... total:', meme['total'], '... type:', meme['type'])
    if meme['type'] == 'video':
        print(meme['source'])
        return
    else:
        ext = '.png'
    filename = download_dir + meme_id + '_month' + str(meme['month']) + '_total' + str(meme['total']) + ext
    response = requests.get(meme['source'], allow_redirects=True)
    open(filename, 'wb').write(response.content)

top_memes.apply(download, axis=1)

10225787702098507 ... month 12 ... total: 50 ... type: video
https://www.facebook.com/plugins/video.php?href=https%3A%2F%2Fwww.facebook.com%2Fbriennakh%2Fvideos%2F10225787702098507%2F&width=576
10225784275092834 ... month 12 ... total: 36 ... type: photo
10225795830301707 ... month 12 ... total: 35 ... type: photo
10225771448132168 ... month 12 ... total: 33 ... type: photo
10225924110148623 ... month 12 ... total: 33 ... type: photo
10225784291533245 ... month 12 ... total: 32 ... type: photo
10225854522488975 ... month 12 ... total: 29 ... type: photo
10225761563605061 ... month 12 ... total: 28 ... type: photo
10225788176150358 ... month 12 ... total: 27 ... type: video
https://www.facebook.com/plugins/video.php?href=https%3A%2F%2Fwww.facebook.com%2Fbriennakh%2Fvideos%2F10225788176150358%2F&width=540
10225945639766850 ... month 12 ... total: 27 ... type: photo
10225827894823300 ... month 12 ... total: 26 ... type: photo
10225549357100031 ... month 11 ... total: 39 ... type: photo
10

7       None
1207    None
1212    None
1203    None
1245    None
        ... 
231     None
305     None
33      None
54      None
286     None
Length: 110, dtype: object

Old code, saving as a reminder that [we cannot scrape Facebook pages using BeautifulSoup](https://stackoverflow.com/questions/58689247/scrapping-facebook-likes-comments-and-shares-with-beautiful-soup). They use JavaScript to render the HTML. We need Selenium, which unfortunately goes against their robots.txt. 

In [225]:
from bs4 import BeautifulSoup
url = 'https://www.facebook.com/1210377595/videos/a.10223159753881444/10225925051372153'
response = BeautifulSoup(requests.get(url, time.sleep(10)).content, 'html.parser')
response.find_all('div')

[<div id="mount_0_0"></div>]