In [1]:
import pandas as pd
from typing import Dict, List

In [2]:
CAPTIONS_PATH = "meme_database"

In [3]:
def read_categories():
    with open(f"{CAPTIONS_PATH}/categories.txt", "r") as f:
        categories = [line.strip() for line in f]
    return categories

categories = read_categories()

In [4]:
def read_category_memes(category_name: str) -> pd.DataFrame:
    file_name = f"{CAPTIONS_PATH}/{category_name}.json"
    return pd.read_json(file_name, lines="series")

meme_db = {
    category_name: read_category_memes(category_name)
    for category_name in categories
}

In [5]:
meme_count = {
    category_name: len(memes)
    for category_name, memes in meme_db.items()
}

print(f"Total count: {sum(x for x in meme_count.values())}")

Total count: 41127


In [6]:
two_buttons = pd.read_json("meme_database/Two-Buttons.json", lines="series")

In [7]:
print(two_buttons.info())
two_buttons.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2021 entries, 0 to 2020
Data columns (total 2 columns):
id         2021 non-null object
caption    2021 non-null object
dtypes: object(2)
memory usage: 31.7+ KB
None


Unnamed: 0,id,caption
0,3dwvg7,play fortnight; play minecraft
1,2sga2r,2000+ PEOPLE IN FRONT OF YOU; IS THIS STILL LA...
2,33891a,other button; button
3,2ougwj,certain death; certain death
4,3akmdc,"receive $1,000,000 dollars illegally; receive ..."


In [8]:
two_buttons.drop_duplicates()

Unnamed: 0,id,caption
0,3dwvg7,play fortnight; play minecraft
1,2sga2r,2000+ PEOPLE IN FRONT OF YOU; IS THIS STILL LA...
2,33891a,other button; button
3,2ougwj,certain death; certain death
4,3akmdc,"receive $1,000,000 dollars illegally; receive ..."
...,...,...
2016,2sk3rj,or miss; Hit
2017,2po0g0,Become a furry; Meet God
2018,2qh8hk,BAN DANK MEMES; end world hunger
2019,2szn75,Watch a tree grow; Watch the super bowl halfti...


In [53]:
from nltk.tokenize import word_tokenize


def sanitize_caption(caption: str) -> str:
    """
    Make caption lowercase, remove non-alphabetical characters.
    """
    caption = caption.lower()
#     caption = "".join([letter for letter in caption if letter.isalpha() or letter == " "])
    return caption
    

def gather_vocabulary(captions: Dict[str, pd.DataFrame]) -> List[str]:
    vocabulary = {}
    for category_name, category_memes in captions.items():
        captions = category_memes["caption"].tolist()
        sanitized_captions = [sanitize_caption(caption) for caption in captions]
        tokenized_captions = [word_tokenize(caption) for caption in sanitized_captions]
        words = [word for caption in tokenized_captions for word in caption]
        
        vocabulary[category_name] = words
    return vocabulary

In [54]:
vocab = gather_vocabulary(meme_db)

In [56]:
from collections import Counter
from nltk.corpus import stopwords

english_stopwords = set(stopwords.words('english'))
for category, cat_vocab in vocab.items():
    # Remove stop words
    meaningful_words = filter(lambda word: word not in english_stopwords, cat_vocab)
    counted_words = Counter(meaningful_words)
    print(f"Top words for category: {category}")
    print(counted_words.most_common(10))
    print()

Top words for category: Woman-Yelling-At-Cat
[('!', 1637), (';', 1185), ('.', 419), (',', 406), ("'s", 251), ('?', 214), (':', 187), ("n't", 186), ('’', 169), ('said', 155)]

Top words for category: Surprised-Pikachu
[(':', 3270), (';', 2848), ('.', 343), ('!', 339), (',', 318), ('?', 296), ("n't", 216), ('mom', 182), ('teacher', 162), ('meme', 160)]

Top words for category: Two-Buttons
[(';', 2342), ('memes', 186), ('meme', 148), ('get', 133), ('make', 120), ('fortnite', 114), ('.', 103), ('!', 101), (',', 89), ('life', 83)]

Top words for category: Blank-Nut-Button
[(';', 1960), ('!', 191), (':', 184), ('button', 182), ('fortnite', 96), ('meme', 94), ('``', 94), ('mom', 86), ('memes', 85), ("''", 76)]

Top words for category: Distracted-Boyfriend
[(';', 3834), ('fortnite', 171), ('memes', 166), ('minecraft', 122), ('meme', 99), (',', 92), ("'s", 87), ('!', 78), ('people', 76), ('girl', 74)]

Top words for category: Tuxedo-Winnie-The-Pooh
[(';', 1833), (',', 329), ('.', 254), ('!', 11

In [18]:
"a".isalpha()

True

In [47]:
vocab