# **Gathering Data**

In this notebook I use HTTP requests and Beautiful Soup to gather information (origin year and origin platform) about randomly selected memes from KnowYourMeme.com. I then use pytrends (a Python library for Google Trends) to get data on the interest in each meme over time. Finally, all of this information is combined into a single CSV file in which rows are memes and columns are origin year, origin platform, and the number of days the meme stayed trending.

In [1]:
!pip install requests==2.25.1



In [2]:
import requests
import re
from bs4 import BeautifulSoup
import pandas as pd
import random
import time
from urllib.parse import urljoin


#This part builds the url of every listing page by appending the page number to base_url.
#The sort=oldest query parameter asks for the memes in chronological order.
#Example is given below
#https://knowyourmeme.com/memes?kind=all&sort=oldest&page={page_number}

base_url="https://knowyourmeme.com/memes?kind=all&sort=oldest&page="
maximum_page_number=1

memes_urls=[]

for page_num in range(1,maximum_page_number+1):

  #This loop collects the link of every meme listed on the page, giving the url of each meme's own KnowYourMeme page

  current_url=base_url+str(page_num)
  print(current_url)

  try:
    #timeout keeps a stalled connection from hanging the cell forever;
    #raise_for_status stops an error page from being parsed as if it were a listing
    res=requests.get(current_url,timeout=30)
    res.raise_for_status()
  except requests.RequestException as e:
    #report the failed page and move on to the next one
    print(f"Could not fetch {current_url}: {e}")
  else:
    soup=BeautifulSoup(res.content,'html.parser')

    for a_tag in soup.select('a.item[href*="/memes/"]'):
      relative_url = a_tag.get('href')
      title=a_tag.get('data-title')
      #anchors without an href or a data-title are skipped
      if relative_url and title:
        full_url = urljoin("https://knowyourmeme.com", relative_url)
        memes_urls.append(full_url)

  #random 2-4 second pause between page requests
  time.sleep(random.uniform(2, 4))


print(len(memes_urls))

https://knowyourmeme.com/memes?kind=all&sort=oldest&page=1
16


In [3]:
#Spot-check: display the fifth collected url
memes_urls[4]

'https://knowyourmeme.com/memes/chocolate-rain'

In [16]:
from IPython.utils import text

#Getting data from pages of each meme page
#Each Url looks like "https://knowyourmeme.com/memes/chocolate-rain"

def _linked_dd_text(soup,label,default):
  """Return the stripped text of the first link inside the dd element that follows a dt element matching label.

  soup: parsed page (BeautifulSoup object).
  label: regular expression searched in the dt element's string, e.g. r'Year'.
  default: value returned when the dt element, its dd sibling or the link is missing.
  """
  dt_tag=soup.find('dt',string=re.compile(label))
  if not dt_tag:
    return default
  dd_tag=dt_tag.find_next_sibling('dd')
  if not dd_tag:
    return default
  link_tag=dd_tag.find('a')
  if not link_tag:
    return default
  return link_tag.text.strip()


memesDict={}

for url in memes_urls:
  try:
    #timeout keeps a stalled connection from hanging the cell forever;
    #raise_for_status stops an error page from being stored as a meme with default values
    res=requests.get(url,timeout=30)
    res.raise_for_status()
  except requests.RequestException as e:
    #report the failed page and leave this meme out of memesDict
    print(f"Could not fetch {url}: {e}")
  else:
    soup=BeautifulSoup(res.content,'html.parser')

    #Origin year of the meme (stays 0 when the page does not list one)
    origin_year=_linked_dd_text(soup,r'Year',0)

    #Origin platform of the meme (stays "" when the page does not list one)
    origin_platform=_linked_dd_text(soup,r'Origin',"")

    #Creating a dictionary with the values we have scraped so that we can create csv easily
    #The key is the last url segment with dashes turned into spaces, e.g. "chocolate rain"

    name=url.split('/')[-1].replace("-"," ")

    memesDict[name]={
        'origin_year':origin_year,
        'origin_platform':origin_platform}

  #same random 2-4 second pause the listing-page loop uses between requests
  time.sleep(random.uniform(2, 4))

print(memesDict)

{'pedobear': {'origin_year': '2003', 'origin_platform': '4chan'}, 'rickroll': {'origin_year': '2006', 'origin_platform': '4chan'}, 'technoviking': {'origin_year': '2006', 'origin_platform': 'YouTube'}, 'miss teen usa south carolina': {'origin_year': '2007', 'origin_platform': 'YouTube'}, 'chocolate rain': {'origin_year': '2007', 'origin_platform': 'YouTube'}, 'crank that soulja boy': {'origin_year': '2007', 'origin_platform': 'YouTube'}, 'all your base are belong to us': {'origin_year': '1998', 'origin_platform': ''}, 'star wars kid': {'origin_year': '2002', 'origin_platform': ''}, 'o rly': {'origin_year': '2003', 'origin_platform': 'Something Awful'}, 'leave britney alone': {'origin_year': '2007', 'origin_platform': 'YouTube'}, 'i like turtles': {'origin_year': '2007', 'origin_platform': ''}, 'lolcats': {'origin_year': '2006', 'origin_platform': '4chan'}, 'dramatic chipmunk': {'origin_year': '2007', 'origin_platform': 'YouTube'}, 'montauk monster': {'origin_year': '2008', 'origin_plat

In [19]:
from os import error
from pytrends.request import TrendReq
import matplotlib.pyplot as plt

#Create the Google Trends client that interest_df uses
pytrends=TrendReq(
    hl='en-US',
    tz=360,
    timeout=(30,60),
)
#The meme names are the keys of memesDict
namesList=[*memesDict]
print(namesList)


def _trend_timeframe(origin_year):
  """Return the "start end" timeframe string used to query Google Trends for a meme.

  origin_year: anything int() accepts (the scraped value is a string such as '2007', or 0).
  Raises ValueError or TypeError when origin_year cannot be converted to int.
  """
  year=int(origin_year)
  #for some memes emerged before 2007 Google Trends does not include the data;therefore, timeframe was selected manually
  if year < 2007:
    return "2007-07-01 2013-12-30"
  #otherwise query the five years starting on 1 January of the origin year
  return f"{year}-01-01 {year+5}-01-01"


def interest_df(names_list, time_frame='all',geo=''):
  """Query Google Trends interest over time for every meme in names_list.

  names_list: meme names; the origin year of each is looked up in the module-level memesDict.
  time_frame: kept for backward compatibility; not used, because the query window
    is derived from each meme's origin year.
  geo: forwarded unchanged to pytrends.build_payload.

  Returns a dict mapping meme name to the value returned by pytrends.interest_over_time().
  A meme whose lookup or query fails is reported with print and left out of the dict.
  """
  interest_by_name={}

  for name in names_list:
    try:
      pytrends.build_payload(
          kw_list=[name],
          cat=0,
          timeframe=_trend_timeframe(memesDict[name]["origin_year"]),
          geo=geo
      )
      #keep every result instead of overwriting a single variable on each iteration
      interest_by_name[name]=pytrends.interest_over_time()

    #Exception rather than os.error (OSError): a non-numeric origin year (ValueError)
    #or a name missing from memesDict (KeyError) is not an OSError and would escape
    except Exception as e:
      print(f"There is an error in procesess. Meme: {name}. Error:\n{e}")

  return interest_by_name



['pedobear', 'rickroll', 'technoviking', 'miss teen usa south carolina', 'chocolate rain', 'crank that soulja boy', 'all your base are belong to us', 'star wars kid', 'o rly', 'leave britney alone', 'i like turtles', 'lolcats', 'dramatic chipmunk', 'montauk monster', 'this is relevant to my interests', 'bitches dont know']


'\ndef interest_df(names_list, time_frame=\'all\',geo=\'\'):\n\n  for name in names_list:\n    try:\n\n\n      if (memesList[name])\n      pytrends.build_payload(\n          kw_list=[name],\n          cat=0,\n          timeframe=time_frame,\n          geo=geo\n      )\n      interest_data=pytrends.interest_over_time()\n      return interest_data\n\n    except error as e:\n      print(f"There is an error in procesess. Meme: {name}. Error:\n{e}")\n\n'

In [14]:
#NOTE(review): data_experiment is not assigned in any cell visible here; presumably it holds the
#interest_over_time() result for 'rickroll' -- confirm it is defined before this cell runs
data_experiment

Unnamed: 0_level_0,rickroll,isPartial
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2007-07-01,7,False
2007-07-08,7,False
2007-07-15,9,False
2007-07-22,8,False
2007-07-29,7,False
...,...,...
2008-11-30,25,False
2008-12-07,19,False
2008-12-14,17,False
2008-12-21,12,False
