# **Gathering Data**

In this notebook I use HTTP requests and BeautifulSoup to scrape the origin year and origin platform of a sample of memes from KnowYourMeme.com (every 25th meme, in chronological order). I then use pytrends (an unofficial Python library for Google Trends) to collect search-interest data for each meme. Finally, all of the information is written to a single CSV file in which each row is a meme and the columns are origin year, origin platform, and the number of weeks the meme stayed trending.

In this project, a meme is considered to be trending while its search interest is above 15% of its peak interest; once its interest falls to 15% of the peak or below, it is considered to be out of trend.

In [None]:
!pip install requests==2.25.1

In [None]:
import requests
import re
from bs4 import BeautifulSoup
import pandas as pd
import random
import time
import tqdm
from urllib.parse import urljoin


#Crawl the chronological listing pages of KnowYourMeme and collect the URL of every meme entry.
#Each listing page URL is `base_url` followed by the page number, for example
#https://knowyourmeme.com/memes?kind=all&sort=oldest&page=1

base_url="https://knowyourmeme.com/memes?kind=all&sort=oldest&page="
maximum_page_number=2787 #all pages including memes on the website
listing_timeout=30 #seconds; without a timeout a single stalled connection would hang the whole crawl

memes_urls=[]
failed_pages=[] #listing pages that could not be downloaded, kept so they can be retried later

for page_num in tqdm.tqdm(range(1,maximum_page_number+1),desc="Fetching Meme Data"):

  #This loop reads every meme link on the listing page and turns it into the absolute URL of the meme's own page

  current_url=base_url+str(page_num)

  try:
    res=requests.get(current_url,timeout=listing_timeout)
    #An HTTP error page contains no meme links, so report it instead of silently parsing it
    res.raise_for_status()
  except requests.RequestException as e:
    failed_pages.append(page_num)
    tqdm.tqdm.write(f"Could not fetch listing page {page_num}: {e}")
    continue

  soup=BeautifulSoup(res.content,'html.parser')

  for a_tag in soup.select('a.item[href*="/memes/"]'):
    relative_url = a_tag.get('href')
    title=a_tag.get('data-title')
    #Only anchors that carry both a link and a data-title attribute are kept
    if relative_url and title:
      full_url = urljoin("https://knowyourmeme.com", relative_url)
      memes_urls.append(full_url)


if failed_pages:
  print(f"{len(failed_pages)} listing pages failed: {failed_pages}")

print(len(memes_urls)) #Expect around 44k memes

In [None]:
from IPython.utils import text #NOTE(review): `text` is not referenced in this cell; kept in case other cells rely on it

#Getting data from pages of each meme page
#Each Url looks like "https://knowyourmeme.com/memes/chocolate-rain"

def _first_link_text(soup,label_pattern):
  """Return the text of the first link in the <dd> that follows a matching <dt>.

  The page is searched for a <dt> whose string matches `label_pattern`; the
  stripped text of the first <a> inside its next <dd> sibling is returned.
  Returns None when the <dt>, the <dd> or the <a> is missing.
  """
  dt_tag=soup.find('dt',string=re.compile(label_pattern))
  if not dt_tag:
    return None
  dd_tag=dt_tag.find_next_sibling('dd')
  if not dd_tag:
    return None
  link_tag=dd_tag.find('a')
  if not link_tag:
    return None
  return link_tag.text.strip()


memesDict={}

for url in tqdm.tqdm(memes_urls[::25],desc="Creating Meme Dictionary"):
  #Since processing 44k meme would require very high computational work, I have chosen every 25th meme starting from the beginning
  try:
    res=requests.get(url,timeout=30) #a timeout keeps one stalled page from hanging the loop
    res.raise_for_status()
  except requests.RequestException as e:
    #Skip this meme instead of losing everything that was scraped so far
    tqdm.tqdm.write(f"Could not fetch {url}: {e}")
    continue

  soup=BeautifulSoup(res.content,'html.parser')

  #Origin year of the meme; 0 means that the page does not list one
  origin_year=_first_link_text(soup,r'Year')
  if origin_year is None:
    origin_year=0

  #Origin platform of the meme; an empty string means that the page does not list one
  origin_platform=_first_link_text(soup,r'Origin')
  if origin_platform is None:
    origin_platform=""

  #Creating a dictionary with the values we have scraped so that we can create csv easily
  #The name is the last path segment of the URL with dashes turned into spaces;
  #a trailing slash is dropped first so that the name cannot end up empty

  name=url.rstrip('/').split('/')[-1].replace("-"," ")

  memesDict[name]={
      'origin_year':origin_year,
      'origin_platform':origin_platform}

#Print a summary rather than the whole dictionary, which would flood the output
print(f"{len(memesDict)} memes scraped")

In [None]:
!pip install pytrends

In [None]:
from os import error #alias of OSError; no longer used below, kept in case other cells reference it
from pytrends.request import TrendReq
import matplotlib.pyplot as plt

#A meme counts as trending while its interest is above this fraction of its maximum interest
TREND_THRESHOLD_RATIO=0.15
#Memes with an origin year before this one are all queried over the same fixed window
EARLIEST_TRENDS_YEAR=2007
EARLY_MEME_TIMEFRAME="2007-07-01 2011-12-30"

#initialize pytrends client
pytrends=TrendReq(hl='en-US', tz=360,timeout=(30,60))
namesList = list(memesDict.keys())

for name in tqdm.tqdm(namesList,desc="Collecting interest data"):
  try:
    #A non-numeric year raises ValueError here and is reported by the handler below
    origin_year=int(memesDict[name]["origin_year"])

    #for some memes emerged before 2007 Google Trends does not include the data;therefore, timeframe was selected manually
    #An unknown year is stored as 0 by the scraping cell, so it also takes this branch
    if origin_year<EARLIEST_TRENDS_YEAR:
      timeframe=EARLY_MEME_TIMEFRAME
    else:
      #Built from a local variable: quoting a dictionary key inside the f-string is a syntax error before Python 3.12
      timeframe=f"{origin_year}-01-01 {origin_year+4}-01-01"

    pytrends.build_payload(
        kw_list=[name],
        cat=0,
        timeframe=timeframe,
        geo=""
    )
    interest_data=pytrends.interest_over_time()

    if interest_data.empty or interest_data.shape[1]<1:
      weeks_of_trends=0
    else:
      #A meme is considered trend when it is search interest is higher than 0.15 of its maximum interest rate.
      #There is a natural assumption that if a meme is on KnowYourMeme.com, it is considered to be popular enough to be on trends
      keyword_interest=interest_data.iloc[:,0] #first column holds the interest values of the queried keyword
      trend_threshold=keyword_interest.max()*TREND_THRESHOLD_RATIO
      weeks_of_trends=int((keyword_interest>trend_threshold).sum())

    memesDict[name]["weeks_of_trends"]=weeks_of_trends

  except Exception as e:
    #`os.error` is only OSError, so pytrends response/rate-limit errors and ValueError used to abort the whole loop.
    #A failed meme keeps no "weeks_of_trends" key and therefore shows up as a missing value later on.
    print(f"There is an error in procesess. Meme: {name}. Error:\n{e}")

  #modarate amount of time is waited between requests to avoid spam ban
  #Placed after the try block so that the pause also happens when a request fails
  time.sleep(random.uniform(5,10))



In [18]:
#Build a DataFrame from the scraped dictionary: one row per meme (the index is the meme name),
#one column per key of the inner dictionaries
df_memes = pd.DataFrame.from_dict(data=memesDict, orient='index')
df_memes.tail(5)

Unnamed: 0,origin_year,origin_platform,weeks_of_trends
slurmcore,2017,Tumblr,1.0
flip the camera trend,2025,,2.0
domer baby bart simpson,2025,,2.0
we are charlie kirk by spalexma,2025,,1.0
pedro alonso how life feels montage,2025,,1.0


In [20]:
#Write the DataFrame to disk as CSV; the index (meme names) is kept as the first column
df_memes.to_csv(
    'meme_lifespan_data_raw.csv',
    index=True,
    encoding='utf-8',
)