In [1]:
####################################################################################################
# Author: Alexander O. Smith
# Created: Oct 25, 2021
# Updated: Nov 8, 2021
####################################################################################################

In [2]:
####################################################################################################
# IMPORTS ##########################################################################################
from bs4 import BeautifulSoup as bs  #To Output Clean HTML files (and general scraping)
from lxml import html, etree         #For XPath
from io import StringIO, BytesIO
from urllib.request import Request, urlopen
import urllib.request
import pandas as pd
from datetime import date
from random import randint
from time import sleep
import os, re, random
####################################################################################################


In [3]:
####################################################################################################
# Version 1 ########################################################################################
def kym_gallery_pull(name='rickroll'):

    # Setup an output file...
    out = open(f'{name}_collection_output.txt',"a+")
    today = date.today()
    dateMDY = today.strftime("%m-%d-%y")
    date_time = today.strftime("%m/%d/%Y, %H:%M:%S")
    out.write(f'KYM_GALLERY_PULL Function Initiated: {date_time}...')
    print(f'KYM_GALLERY_PULL Function Initiated: {date_time}...')

    # Try/Except captures errors in the outfile: 'out'
    try:
        # Set up url connection with BeautifulSoup
        name = re.sub('\s', '-', str(name))
        out.write(f'Collecting image gallery "{name}" from KnowYourMeme.\n')
        #print(f'Collecting image gallery "{name}" from KnowYourMeme.')
        url = f'https://knowyourmeme.com/memes/{name}'
        memeURL = f'{url}/photos/page/1'
        req = Request(str(memeURL), headers={'User-Agent': 'Mozilla/5.0'})
        webpage = urlopen(req).read()
        soup = bs(webpage, 'html.parser', from_encoding="iso-8859-1")

        # Load webpage into ELement Tree format for XPath search for first page image count:
        root = etree.HTML(webpage)
        result = etree.tostring(root, pretty_print=True, method="html")

        n = 1 #begin iterable for each KYM Gallery Page URL
        url_suffixes = [] #being the list of url directorys after the url prefix for each image

        # While loop runs so long as the photo_gallery is not empty
        while len(root.xpath("//a[@rel='photo_gallery']")) != 0:

            # Set up iterable url connection
            out.write(f'Retrieving image hrefs from page {n} of {name} gallery...\n')
            #print(f'Retrieving image hrefs from page {n} of {name} gallery...')
            memeURL = f'{url}/photos/page/{n}'
            req = Request(memeURL, headers={'User-Agent': 'Mozilla/5.0'})
            webpage = urlopen(req).read()
            root = etree.HTML(webpage)
            result = etree.tostring(root, pretty_print=True, method="html")

            url_suffixes = url_suffixes + root.xpath(f'//div/div[@id="photo_gallery"]/div[@class="item"]/a/@href')

            # Iterate to the next n to get the next KYM Gallery Page in 'memeURL' above
            n+=1

        out.write(f'Completed gallery href collection for {name} gallery.\n')
        #print(f'Completed gallery href collection for {name} gallery.')

        # Debugging to check to make sure we got all the images in the gallery
        total_url = len(url_suffixes)
        unique_url = len(pd.unique(url_suffixes))
        if total_url != unique_url:
            out.write(f'''COLLECTION WARNING: Difference between the total URLs - unique URLs is non-zero.\n
                This indicates a loss of {total_url - unique_url} gallery images from KYM.\n
                The total number of unique URLs is {unique_url}.\n''')
            #print(f'''COLLECTION WARNING: Difference between the total URLs - unique URLs is non-zero.\n
            #      This indicates a loss of {total_url - unique_url} gallery images from KYM.\n
            #      The total number of unique URLs is {unique_url}.''')
        else:
            out.write(f'The total number of URLs is {unique_url}.\n')
            #print(f'The total number of URLs is {unique_url}.')

        # Building lists for a metadata dataframe
        lrg_img = []          # large img URLs
        img_IDs = []          # img_IDs
        img_locs = []         # locations of image
        alt_txt = []          # image alt text
        date_upload = []      # image upload date
        views = []            # number of views of each image
        views_today = []      # number of views today for each image
        img_source = []       # given source of image
        pos_votes = []        # positive votes on an image
        neg_votes = []        # negative votes on an image
        kym_tags = []         # tags offered by KYM
        img_edits_url = []    # url extension to image metadata edits
        author = []           # author handle assigned to image

        # Loop saves images and metadata for each image.
        for u in url_suffixes:
            out.write(f'Retrieving img at href {u}\n')
            print(f'Retrieving img at href {u}')
            imgURL = f'{url}{u}'
            out.write(imgURL+'\n')
            print(imgURL)
            reqIMG = Request(str(imgURL), headers={'User-Agent': 'Mozilla/5.0'})
            imgpage = urlopen(reqIMG).read()
            soup = bs(imgpage, 'html.parser') #, from_encoding="iso-8859-1"
            # Load webpage into ELement Tree format for XPath search for first page image count:
            imgRoot = etree.HTML(imgpage)
            # Putting all final image URL locations into 'lrg_img' list.
            img = imgRoot.xpath("*//div[@class='thumbable']//img[contains(@data-src, 'newsfeed')]//@data-src")[0]
            lrg_img.append(img)

            # Using 'img' urls to create a unique ID
            i = img.partition("newsfeed/")[2]
            img_file = re.sub('/', '_',i)
            out.write(f'Retrieving Image {img_file}\n')
            print(f'Retrieving Image {img_file}')
            img_ID = img_file.partition('.')[0]
            # Should be a unique image ID
            img_ID = f'IMG_{img_ID}'
            img_IDs.append(img_ID)

            # Creating an image location name, and making a metadata variable for it
            img_loc = f'{str(name)}-meme'
            img_locs.append(img_loc)

            # KYM image metadata imgRoot location, under "Image Details"
            # For explanation of meanings of the appended metadata, see empty lists defined before loop.
            view = imgRoot.xpath(
              "//aside[@id='sidebar']//span[@class='view_count']/text()")[0]
            views.append(view)
            view_today = imgRoot.xpath(
              "//aside[@id='sidebar']//span[@class='today_view_count']/text()")[0][1:-1] 
            view_today = re.sub('\sfrom\stoday', '', view_today)
            views_today.append(view_today)
            upload = imgRoot.xpath(
              "//aside[@id='sidebar']//abbr[@class='timeago']/@title")[0]
            date_upload.append(upload)
            try:
                source = imgRoot.xpath(
                  "//aside[@id='sidebar']//div//p/a[@href]/@href")[1]
            except IndexError:
                source = 'No Source Given'
            img_source.append(source)
            thumbs_alt = imgRoot.xpath(
              "//div[@class='thumbable']//div[@class='thumb_mini_container']//span/@alt")[0]
            thumbs_alt = thumbs_alt.split(', ')
            thumbs_alt[0] = re.sub('\sup', '',thumbs_alt[0])
            thumbs_alt[1] = re.sub('\sdown', '',thumbs_alt[1])
            pos_votes.append(thumbs_alt[0])
            neg_votes.append(thumbs_alt[1])
            tag = imgRoot.xpath(
              "//aside[@id='sidebar']//div//p[@id='tag_list']//@data-tag")
            delim = '|'
            tags = ''
            for t in tag:
                tags = tags + str(t) + delim 
            kym_tags.append(tags)
            edits =str(
              'https://knowyourmeme.com'
              +imgRoot.xpath("//aside[@id='sidebar']//div[@class='tc']//a[contains(@href, 'edits')]/@href")[0])
            img_edits_url.append(edits)
            alt =  imgRoot.xpath("*//div[@class='thumbable']//img[@alt]/@alt")[0]
            alt_txt.append(alt)

            author_tag = imgRoot.xpath("//div[@id='author_info']//div//h6/a/text()")
            author.append(author_tag)

            # Output Image Files To Created Meme Directory
            os.makedirs(f'./GCV_FuncTestData/{img_loc}', exist_ok=True)
            urllib.request.urlretrieve(img, f'./GCV_FuncTestData/{img_loc}/{img_file}')

            # Build dataframe of metadata
            # Jeff says "short columns sooner, long columns later... with the more important columns prioritized sooner."
        metadata_df = pd.DataFrame({
            'img_ID': img_IDs, 
            'meme_name': img_locs,
            'date_upload': date_upload,
            'kym_tags': kym_tags,
            'pos_votes': pos_votes,
            'neg_votes': neg_votes,
            'img_source': img_source,
            'views': views,
            'views_today': views_today,
            'img_edits_url': img_edits_url,
            'author': author,
            'alt_txt': alt_txt,
            'img_url': url_suffixes,
            'lrg_img_url': lrg_img
        })

        # Output a metadata csv with csv name containing the date captured.
        metadata_df.to_csv(
            f'./GCV_FuncTestData/{str(name)}-meme/{name}_KYMmetadata_{dateMDY}.csv'
          , index=False)
    except:
        traceback.print_exc(file=out)
    out.write('Metadata file completed.\n')
    #print('Metadata file completed.')
    out.write('KYM_GALLERY_PULL Function Completed.\n\n')
    #print('KYM_GALLERY_PULL Function Completed.\n\n')
    # Close Output File
    out.close()

In [5]:
kym_gallery_pull('loss')

KYM_GALLERY_PULL Function Initiated: 12/03/2021, 00:00:00...
Retrieving img at href /photos/2247003-loss
https://knowyourmeme.com/memes/loss/photos/2247003-loss
Retrieving Image 002_247_003_774.png
Retrieving img at href /photos/2231456-loss
https://knowyourmeme.com/memes/loss/photos/2231456-loss
Retrieving Image 002_231_456_f6d.jpg
Retrieving img at href /photos/2228646-loss
https://knowyourmeme.com/memes/loss/photos/2228646-loss
Retrieving Image 002_228_646_313.jpg
Retrieving img at href /photos/2212542-loss
https://knowyourmeme.com/memes/loss/photos/2212542-loss
Retrieving Image 002_212_542_769.jpg
Retrieving img at href /photos/2146625-loss
https://knowyourmeme.com/memes/loss/photos/2146625-loss
Retrieving Image 002_146_625_a13.png
Retrieving img at href /photos/2111385-loss
https://knowyourmeme.com/memes/loss/photos/2111385-loss
Retrieving Image 002_111_385_d6e.jpg
Retrieving img at href /photos/2090511-loss
https://knowyourmeme.com/memes/loss/photos/2090511-loss
Retrieving Image 

Retrieving Image 001_550_449_2f2
Retrieving img at href /photos/1549524-loss
https://knowyourmeme.com/memes/loss/photos/1549524-loss
Retrieving Image 001_549_524_3f8.jpg
Retrieving img at href /photos/1518836-loss
https://knowyourmeme.com/memes/loss/photos/1518836-loss
Retrieving Image 001_518_836_5e0.pnj
Retrieving img at href /photos/1513877-loss
https://knowyourmeme.com/memes/loss/photos/1513877-loss
Retrieving Image 001_513_877_3fb.jpg
Retrieving img at href /photos/1513304-loss
https://knowyourmeme.com/memes/loss/photos/1513304-loss
Retrieving Image 001_513_304_0ff.jpg
Retrieving img at href /photos/1511635-loss
https://knowyourmeme.com/memes/loss/photos/1511635-loss
Retrieving Image 001_511_635_694.pnj
Retrieving img at href /photos/1511121-loss
https://knowyourmeme.com/memes/loss/photos/1511121-loss
Retrieving Image 001_511_121_694.png
Retrieving img at href /photos/1511120-loss
https://knowyourmeme.com/memes/loss/photos/1511120-loss
Retrieving Image 001_511_120_5d5.png
Retrievi

Retrieving Image 001_428_638_e36.png
Retrieving img at href /photos/1425436-loss
https://knowyourmeme.com/memes/loss/photos/1425436-loss
Retrieving Image 001_425_436_a0e.png
Retrieving img at href /photos/1425199-loss
https://knowyourmeme.com/memes/loss/photos/1425199-loss
Retrieving Image 001_425_199_e10.png
Retrieving img at href /photos/1424983-loss
https://knowyourmeme.com/memes/loss/photos/1424983-loss
Retrieving Image 001_424_983_c8d.png
Retrieving img at href /photos/1422835-loss
https://knowyourmeme.com/memes/loss/photos/1422835-loss
Retrieving Image 001_422_835_9b7.png
Retrieving img at href /photos/1421925-loss
https://knowyourmeme.com/memes/loss/photos/1421925-loss
Retrieving Image 001_421_925_a82.png
Retrieving img at href /photos/1421710-loss
https://knowyourmeme.com/memes/loss/photos/1421710-loss
Retrieving Image 001_421_710_205.jpg
Retrieving img at href /photos/1417181-loss
https://knowyourmeme.com/memes/loss/photos/1417181-loss
Retrieving Image 001_417_181_faa.jpeg
Ret

Retrieving Image 001_391_717_fd4.png
Retrieving img at href /photos/1391587-loss
https://knowyourmeme.com/memes/loss/photos/1391587-loss
Retrieving Image 001_391_587_292.jpg
Retrieving img at href /photos/1391363-loss
https://knowyourmeme.com/memes/loss/photos/1391363-loss
Retrieving Image 001_391_363_fb3.jpg
Retrieving img at href /photos/1391254-loss
https://knowyourmeme.com/memes/loss/photos/1391254-loss
Retrieving Image 001_391_254_6e5.jpg
Retrieving img at href /photos/1391115-loss
https://knowyourmeme.com/memes/loss/photos/1391115-loss
Retrieving Image 001_391_115_832.png
Retrieving img at href /photos/1390756-loss
https://knowyourmeme.com/memes/loss/photos/1390756-loss
Retrieving Image 001_390_756_bc5.jpg
Retrieving img at href /photos/1390571-loss
https://knowyourmeme.com/memes/loss/photos/1390571-loss
Retrieving Image 001_390_571_876.jpg
Retrieving img at href /photos/1390458-loss
https://knowyourmeme.com/memes/loss/photos/1390458-loss
Retrieving Image 001_390_458_5c4.png
Retr

Retrieving Image 001_383_523_8fb.jpg
Retrieving img at href /photos/1383509-loss
https://knowyourmeme.com/memes/loss/photos/1383509-loss
Retrieving Image 001_383_509_fa9.jpg
Retrieving img at href /photos/1383505-loss
https://knowyourmeme.com/memes/loss/photos/1383505-loss
Retrieving Image 001_383_505_184.jpg
Retrieving img at href /photos/1383493-loss
https://knowyourmeme.com/memes/loss/photos/1383493-loss
Retrieving Image 001_383_493_7b3.png
Retrieving img at href /photos/1383431-loss
https://knowyourmeme.com/memes/loss/photos/1383431-loss
Retrieving Image 001_383_431_11b.png
Retrieving img at href /photos/1383382-loss
https://knowyourmeme.com/memes/loss/photos/1383382-loss
Retrieving Image 001_383_382_fe5.png
Retrieving img at href /photos/1383380-loss
https://knowyourmeme.com/memes/loss/photos/1383380-loss
Retrieving Image 001_383_380_67b.jpg
Retrieving img at href /photos/1383301-loss
https://knowyourmeme.com/memes/loss/photos/1383301-loss
Retrieving Image 001_383_301_c37.png
Retr

Retrieving Image 001_378_316_4a8.png
Retrieving img at href /photos/1378306-loss
https://knowyourmeme.com/memes/loss/photos/1378306-loss
Retrieving Image 001_378_306_ed1.png
Retrieving img at href /photos/1378275-loss
https://knowyourmeme.com/memes/loss/photos/1378275-loss
Retrieving Image 001_378_275_9eb.jpg
Retrieving img at href /photos/1378274-loss
https://knowyourmeme.com/memes/loss/photos/1378274-loss
Retrieving Image 001_378_274_9c4.jpeg
Retrieving img at href /photos/1378213-loss
https://knowyourmeme.com/memes/loss/photos/1378213-loss
Retrieving Image 001_378_213_814.jpg
Retrieving img at href /photos/1378200-loss
https://knowyourmeme.com/memes/loss/photos/1378200-loss
Retrieving Image 001_378_200_1d1.png
Retrieving img at href /photos/1378172-loss
https://knowyourmeme.com/memes/loss/photos/1378172-loss
Retrieving Image 001_378_172_f4b.jpg
Retrieving img at href /photos/1377968-loss
https://knowyourmeme.com/memes/loss/photos/1377968-loss
Retrieving Image 001_377_968_00f.png
Ret

Retrieving Image 001_369_780_296.jpg
Retrieving img at href /photos/1369471-loss
https://knowyourmeme.com/memes/loss/photos/1369471-loss
Retrieving Image 001_369_471_cf3.png
Retrieving img at href /photos/1369463-loss
https://knowyourmeme.com/memes/loss/photos/1369463-loss
Retrieving Image 001_369_463_3c4.jpg
Retrieving img at href /photos/1368847-loss
https://knowyourmeme.com/memes/loss/photos/1368847-loss
Retrieving Image 001_368_847_0b0.png
Retrieving img at href /photos/1368740-loss
https://knowyourmeme.com/memes/loss/photos/1368740-loss
Retrieving Image 001_368_740_a60.jpg
Retrieving img at href /photos/1368694-loss
https://knowyourmeme.com/memes/loss/photos/1368694-loss
Retrieving Image 001_368_694_711.png
Retrieving img at href /photos/1368576-loss
https://knowyourmeme.com/memes/loss/photos/1368576-loss
Retrieving Image 001_368_576_09b.jpeg
Retrieving img at href /photos/1368486-loss
https://knowyourmeme.com/memes/loss/photos/1368486-loss
Retrieving Image 001_368_486_d07.jpg
Ret

Retrieving Image 001_362_049_c66.png
Retrieving img at href /photos/1361667-loss
https://knowyourmeme.com/memes/loss/photos/1361667-loss
Retrieving Image 001_361_667_d8d.png
Retrieving img at href /photos/1361649-loss
https://knowyourmeme.com/memes/loss/photos/1361649-loss
Retrieving Image 001_361_649_d86.jpg
Retrieving img at href /photos/1361460-loss
https://knowyourmeme.com/memes/loss/photos/1361460-loss
Retrieving Image 001_361_460_60f.png
Retrieving img at href /photos/1361419-loss
https://knowyourmeme.com/memes/loss/photos/1361419-loss
Retrieving Image 001_361_419_349.jpg
Retrieving img at href /photos/1361103-loss
https://knowyourmeme.com/memes/loss/photos/1361103-loss
Retrieving Image 001_361_103_000.gif
Retrieving img at href /photos/1361054-loss
https://knowyourmeme.com/memes/loss/photos/1361054-loss
Retrieving Image 001_361_054_4ac.jpg
Retrieving img at href /photos/1360990-loss
https://knowyourmeme.com/memes/loss/photos/1360990-loss
Retrieving Image 001_360_990_3fd.png
Retr

Retrieving Image 001_354_645_2cc.jpg
Retrieving img at href /photos/1354642-loss
https://knowyourmeme.com/memes/loss/photos/1354642-loss
Retrieving Image 001_354_642_fb2.jpg
Retrieving img at href /photos/1354533-loss
https://knowyourmeme.com/memes/loss/photos/1354533-loss
Retrieving Image 001_354_533_345.jpg_large
Retrieving img at href /photos/1354214-loss
https://knowyourmeme.com/memes/loss/photos/1354214-loss
Retrieving Image 001_354_214_728.jpg
Retrieving img at href /photos/1354165-loss
https://knowyourmeme.com/memes/loss/photos/1354165-loss
Retrieving Image 001_354_165_e0c.jpg
Retrieving img at href /photos/1354061-loss
https://knowyourmeme.com/memes/loss/photos/1354061-loss
Retrieving Image 001_354_061_84b.png
Retrieving img at href /photos/1353891-loss
https://knowyourmeme.com/memes/loss/photos/1353891-loss
Retrieving Image 001_353_891_5d1.png
Retrieving img at href /photos/1353482-loss
https://knowyourmeme.com/memes/loss/photos/1353482-loss
Retrieving Image 001_353_482_4eb.jp

Retrieving Image 001_339_336_a96.png
Retrieving img at href /photos/1339225-loss
https://knowyourmeme.com/memes/loss/photos/1339225-loss
Retrieving Image 001_339_225_7c4.jpg
Retrieving img at href /photos/1338866-loss
https://knowyourmeme.com/memes/loss/photos/1338866-loss
Retrieving Image 001_338_866_38b.png
Retrieving img at href /photos/1336877-loss
https://knowyourmeme.com/memes/loss/photos/1336877-loss
Retrieving Image 001_336_877_010.png
Retrieving img at href /photos/1336591-loss
https://knowyourmeme.com/memes/loss/photos/1336591-loss
Retrieving Image 001_336_591_f35.jpg
Retrieving img at href /photos/1336291-loss
https://knowyourmeme.com/memes/loss/photos/1336291-loss
Retrieving Image 001_336_291_ff3.jpg
Retrieving img at href /photos/1335462-loss
https://knowyourmeme.com/memes/loss/photos/1335462-loss
Retrieving Image 001_335_462_850.jpeg
Retrieving img at href /photos/1335379-loss
https://knowyourmeme.com/memes/loss/photos/1335379-loss
Retrieving Image 001_335_379_859.jpg
Ret

Retrieving Image 001_313_334_182.jpg
Retrieving img at href /photos/1313163-loss
https://knowyourmeme.com/memes/loss/photos/1313163-loss
Retrieving Image 001_313_163_549.png
Retrieving img at href /photos/1313161-loss
https://knowyourmeme.com/memes/loss/photos/1313161-loss
Retrieving Image 001_313_161_79e.png
Retrieving img at href /photos/1313160-loss
https://knowyourmeme.com/memes/loss/photos/1313160-loss
Retrieving Image 001_313_160_f69.png
Retrieving img at href /photos/1312795-loss
https://knowyourmeme.com/memes/loss/photos/1312795-loss
Retrieving Image 001_312_795_da9.png
Retrieving img at href /photos/1312598-loss
https://knowyourmeme.com/memes/loss/photos/1312598-loss
Retrieving Image 001_312_598_5dd.png
Retrieving img at href /photos/1312502-loss
https://knowyourmeme.com/memes/loss/photos/1312502-loss
Retrieving Image 001_312_502_2ef.png
Retrieving img at href /photos/1310942-loss
https://knowyourmeme.com/memes/loss/photos/1310942-loss
Retrieving Image 001_310_942_97a.png
Retr

Retrieving Image 001_291_449_997.jpg
Retrieving img at href /photos/1291348-loss
https://knowyourmeme.com/memes/loss/photos/1291348-loss
Retrieving Image 001_291_348_772.png
Retrieving img at href /photos/1291339-loss
https://knowyourmeme.com/memes/loss/photos/1291339-loss
Retrieving Image 001_291_339_192.png
Retrieving img at href /photos/1291338-loss
https://knowyourmeme.com/memes/loss/photos/1291338-loss
Retrieving Image 001_291_338_656.jpg
Retrieving img at href /photos/1291293-loss
https://knowyourmeme.com/memes/loss/photos/1291293-loss
Retrieving Image 001_291_293_590.jpg
Retrieving img at href /photos/1291233-loss
https://knowyourmeme.com/memes/loss/photos/1291233-loss
Retrieving Image 001_291_233_d6f.jpg
Retrieving img at href /photos/1291193-loss
https://knowyourmeme.com/memes/loss/photos/1291193-loss
Retrieving Image 001_291_193_2d5.png
Retrieving img at href /photos/1290998-loss
https://knowyourmeme.com/memes/loss/photos/1290998-loss
Retrieving Image 001_290_998_451.png
Retr

Retrieving Image 001_255_023_96b.png
Retrieving img at href /photos/1250663-loss
https://knowyourmeme.com/memes/loss/photos/1250663-loss
Retrieving Image 001_250_663_042.png
Retrieving img at href /photos/1250633-loss
https://knowyourmeme.com/memes/loss/photos/1250633-loss
Retrieving Image 001_250_633_75b.jpg
Retrieving img at href /photos/1250470-loss
https://knowyourmeme.com/memes/loss/photos/1250470-loss
Retrieving Image 001_250_470_509.png
Retrieving img at href /photos/1250266-loss
https://knowyourmeme.com/memes/loss/photos/1250266-loss
Retrieving Image 001_250_266_0b8.jpg
Retrieving img at href /photos/1250086-loss
https://knowyourmeme.com/memes/loss/photos/1250086-loss
Retrieving Image 001_250_086_d7e.jpg
Retrieving img at href /photos/1250049-loss
https://knowyourmeme.com/memes/loss/photos/1250049-loss
Retrieving Image 001_250_049_c79.jpg
Retrieving img at href /photos/1249321-loss
https://knowyourmeme.com/memes/loss/photos/1249321-loss
Retrieving Image 001_249_321_1af.jpg
Retr

Retrieving Image 001_204_511_863.jpg
Retrieving img at href /photos/1202763-loss
https://knowyourmeme.com/memes/loss/photos/1202763-loss
Retrieving Image 001_202_763_24c.png
Retrieving img at href /photos/1201895-loss
https://knowyourmeme.com/memes/loss/photos/1201895-loss
Retrieving Image 001_201_895_2cc.jpg
Retrieving img at href /photos/1198704-loss
https://knowyourmeme.com/memes/loss/photos/1198704-loss
Retrieving Image 001_198_704_562.png
Retrieving img at href /photos/1198361-loss
https://knowyourmeme.com/memes/loss/photos/1198361-loss
Retrieving Image 001_198_361_f4c.jpg
Retrieving img at href /photos/1198107-loss
https://knowyourmeme.com/memes/loss/photos/1198107-loss
Retrieving Image 001_198_107_05d.png
Retrieving img at href /photos/1197639-loss
https://knowyourmeme.com/memes/loss/photos/1197639-loss
Retrieving Image 001_197_639_945.png
Retrieving img at href /photos/1197052-loss
https://knowyourmeme.com/memes/loss/photos/1197052-loss
Retrieving Image 001_197_052_7ae.jpg
Retr

Retrieving Image 001_172_421_2e6.png
Retrieving img at href /photos/1171931-loss
https://knowyourmeme.com/memes/loss/photos/1171931-loss
Retrieving Image 001_171_931_7dc.jpg
Retrieving img at href /photos/1171123-loss
https://knowyourmeme.com/memes/loss/photos/1171123-loss
Retrieving Image 001_171_123_798.png
Retrieving img at href /photos/1170223-loss
https://knowyourmeme.com/memes/loss/photos/1170223-loss
Retrieving Image 001_170_223_da8.png
Retrieving img at href /photos/1168663-loss
https://knowyourmeme.com/memes/loss/photos/1168663-loss
Retrieving Image 001_168_663_a94.png
Retrieving img at href /photos/1168387-loss
https://knowyourmeme.com/memes/loss/photos/1168387-loss
Retrieving Image 001_168_387_331.png
Retrieving img at href /photos/1168384-loss
https://knowyourmeme.com/memes/loss/photos/1168384-loss
Retrieving Image 001_168_384_b8f.png
Retrieving img at href /photos/1168192-loss
https://knowyourmeme.com/memes/loss/photos/1168192-loss
Retrieving Image 001_168_192_97e.png
Retr

NameError: name 'traceback' is not defined

In [32]:
###############################################################################################################
# Meme Library Scrape: Version 1 ##############################################################################
###############################################################################################################
# TO DO:
# 1. Figure out how to get proxy ips that aren't 1xx or 4xx errors
# 2. Call this function within an exception in the above function
# 3. Create outfile statements
# 4. Give better comments
#def meme_entry_db():
n = 1

def get_proxies():
    url = 'https://free-proxy-list.net/'
    response = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    proxy_pg = urlopen(response).read()
    proxy_root = etree.HTML(proxy_pg)
    result = etree.tostring(proxy_root, pretty_print=True, method="html")
    #bs(proxy_pg, 'html.parser', from_encoding="iso-8859-1")
    proxy_xpth = "//*[@id='list']/div/div[2]/div/table/tbody/tr/td[1]/text()"
    ips = proxy_root.xpath(proxy_xpth)
    return ips 

ips = get_proxies()
#print(ips)
#print(f'Retrieving meme hrefs from page {n} of All Memes on KYM...')

In [37]:
cols = ['meme_entry_ID','meme_names','meme_sufs', 'meme_gallery_ID', 'meme_gallery','meme_labels']
meme_entry_df = pd.DataFrame(columns=cols)
url_pg = 'https://knowyourmeme.com/memes/all'
req_pg = Request(url_pg, headers={'User-Agent': 'Mozilla/5.0'})
ip = random.choice(tuple(ips))
print(ip)
req_pg.set_proxy(ip, 'http')
print(req_pg)
webpage_pg = urlopen(req_pg).read()
root_pg = etree.HTML(webpage_pg)
while root_pg.xpath("//section[@id='entries']//table[@class='entry_list']//td[contains(@class, 'entry_')]") != 0:
    # Connecting to Meme Entry's Page N
    row = []
    print(f'Retrieving meme hrefs from page {n} of All Memes on KYM...')
    req_pg = Request(url_pg, headers={'User-Agent': 'Mozilla/5.0'})
    req_pg.set_proxy(random.choice(tuple(ips)), 'http')

    webpage_pg = urlopen(req_pg).read()
    root_pg = etree.HTML(webpage_pg)
    result = etree.tostring(root_pg, pretty_print=True, method="html")
    xpth = "//section[@id='entries']//table[@class='entry_list']//td[contains(@class, 'entry_')]/a[@href]//@href"
    meme_sufs = root_pg.xpath(xpth)
    xpth2 = "//section[@id='entries']//table[@class='entry_list']//td[contains(@class, 'entry_')]/a/img[@title]/@title"
    meme_names = root_pg.xpath(xpth2)
    if len(meme_sufs) == 0:
        print('There are no more memes to scrape./nENDING SCRIPT')
        break

    xpth2 = "//section[@id='entries']//table[@class='entry_list']//td[contains(@class, 'entry_')]/@class"
    entry = root_pg.xpath(xpth2)
    meme_labels = []
    for id in entry:
        labs = f"//section[@id='entries']//table[@class='entry_list']//td[@class='{id}']//span/text()"
        lab = root_pg.xpath(labs)
        meme_labels.append(lab)
    meme_urls = []
    meme_IDurls = []
    for i in range(0,len(entry)):
        meme_url = 'https://knowyourmeme.com' + meme_sufs[i] + '/photos/'
        entry_id = [re.sub('entry_', '', i) for i in entry]
        #print(entry_id)
        meme_entryID_url = 'https://knowyourmeme.com/memes/' + entry_id[i] + '/photos/'
        meme_urls.append(meme_url)
        meme_IDurls.append(meme_entryID_url)

    # Building row to append to meme_entry_df
    for i in range(0,len(entry)):
        #Append row in this order
        row = ([entry[i],meme_names[i], meme_sufs[i], meme_IDurls[i],  meme_urls[i], meme_labels[i]])
        df = pd.DataFrame([row], columns=cols)
        #print(df2)
        meme_entry_df = meme_entry_df.append(df, ignore_index=True)

    # Sleep statement to avoid 403 from KYM    
    sleep(randint(1,60))

    n+=1
    url = 'https://knowyourmeme.com/memes/all'
    url_pg = f'{url}/page/{n}'
    url_len = len(root_pg.xpath(
        "//section[@id='entries']//table[@class='entry_list']//td[contains(@class, 'entry_')]"))

# From the list of meme_names create the url to each meme submission.

metadata_df.to_csv(
    f'./GCV_FuncTestData/{name}_KYMEntry_{dateMDY}.csv'
    ,index=False)

print(f'Retreived all meme galleries.\n')

#    return meme_entry_df

202.75.97.82
<urllib.request.Request object at 0x7f5b4987dd90>


URLError: <urlopen error [Errno 111] Connection refused>

In [35]:
# Metadata Test Script
url = 'https://knowyourmeme.com/memes/all/'
req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
webpage = urlopen(req).read()
soup = bs(webpage, 'html.parser', from_encoding="iso-8859-1")

# Load webpage into ELement Tree format for XPath search for first page image count:
root = etree.HTML(webpage)
result = etree.tostring(root, pretty_print=True, method="html")

n = 1 #begin iterable for each KYM Gallery Page URL
url_suffixes = [] #being the list of url directorys after the url prefix for each image

# While loop runs so long as the photo_gallery is not empty
url_len = len(root.xpath("//section[@id='entries']//table[@class='entry_list']//td[contains(@class, 'entry_')]//a[@href]//@href"))

meme_sufs = []
meme_names = []

url_pg = url
# While loop runs so long as the photo_gallery is not empty
while  url_len != 0:
    print(f'Retrieving meme hrefs from page {n} of All Memes on KYM...')
    req_pg = Request(url_pg, headers={'User-Agent': 'Mozilla/5.0'})
    webpg = urlopen(req_pg).read()
    root_pg = etree.HTML(webpg)
    result = etree.tostring(root_pg, pretty_print=True, method="html")
    xpth = "//section[@id='entries']//table[@class='entry_list']//td[contains(@class, 'entry_')]/a[@href]//@href"
    meme_sufs = meme_sufs + root_pg.xpath(xpth)
    xpth2 = "//section[@id='entries']//table[@class='entry_list']//td[contains(@class, 'entry_')]/a/img[@title]/@title"
    meme_names = meme_names + root_pg.xpath(xpth2)

    #print(meme_sufs)
    # Iterate to the next n to get the next KYM Gallery Page in 'memeURL' above
    n+=1
    url_pg = f'{url}/page/{n}'
    url_len = len(root_pg.xpath(
        "//section[@id='entries']//table[@class='entry_list']//td[contains(@class, 'entry_')]"))

print(f'Retreived all meme galleries.\n')

Retrieving meme hrefs from page 1 of All Memes on KYM...
Retrieving meme hrefs from page 2 of All Memes on KYM...
Retrieving meme hrefs from page 3 of All Memes on KYM...
Retrieving meme hrefs from page 4 of All Memes on KYM...
Retrieving meme hrefs from page 5 of All Memes on KYM...
Retrieving meme hrefs from page 6 of All Memes on KYM...
Retrieving meme hrefs from page 7 of All Memes on KYM...
Retrieving meme hrefs from page 8 of All Memes on KYM...
Retrieving meme hrefs from page 9 of All Memes on KYM...
Retrieving meme hrefs from page 10 of All Memes on KYM...
Retrieving meme hrefs from page 11 of All Memes on KYM...
Retrieving meme hrefs from page 12 of All Memes on KYM...
Retrieving meme hrefs from page 13 of All Memes on KYM...
Retrieving meme hrefs from page 14 of All Memes on KYM...
Retrieving meme hrefs from page 15 of All Memes on KYM...
Retrieving meme hrefs from page 16 of All Memes on KYM...
Retrieving meme hrefs from page 17 of All Memes on KYM...
Retrieving meme hrefs f

KeyboardInterrupt: 

In [5]:
#kym_md_test()

['118.201.86.149', '89.20.48.118', '213.230.69.193', '103.156.216.170', '14.177.236.212', '191.103.219.225', '160.16.149.58', '209.141.55.228', '206.253.164.122', '68.183.59.38', '20.81.62.32', '41.76.155.26', '139.99.237.62', '51.195.201.93', '199.19.226.12', '58.26.138.170', '103.216.103.25', '217.11.184.30', '154.16.63.16', '43.228.180.60', '54.219.16.162', '206.253.164.198', '179.96.28.58', '176.9.75.42', '46.4.96.137', '206.253.164.28', '68.183.56.232', '81.163.52.25', '191.96.42.80', '194.224.192.226', '162.241.76.185', '187.60.163.234', '43.224.10.27', '34.125.131.164', '18.218.101.232', '206.253.164.120', '206.253.164.110', '206.253.164.101', '88.198.50.103', '005.252.161.48', '139.162.78.109', '187.111.160.8', '199.19.225.250', '121.78.139.75', '199.19.224.3', '121.78.139.77', '201.55.170.221', '3.25.85.198', '176.9.119.170', '103.206.254.170', '212.50.36.245', '46.28.109.236', '178.124.152.21', '80.17.254.230', '37.187.146.176', '58.234.116.197', '64.17.30.238', '34.133.138.1