In [29]:
import numpy as np

import pandas as pd

from sklearn.decomposition import LatentDirichletAllocation

from sklearn.preprocessing import MultiLabelBinarizer

# Topic Modeling for Food-Drink Emojis (20 pts)

In the assignment, you will be asked to perform LDA on our food-drink emoji dataset. We have helped you vectorized the data set into a matrix and all you need to do is to run LDA and interpret your result.

Your task: run LDA on the emojis in the food-drink emoji dataset. Try different numbers of topics and select the number that you believe best explains the topic structure in the food-drink emojis.

In [30]:
tweets_df = pd.read_csv("food_drink_emoji_tweets.txt", sep="\t", header=None).head(10000)
tweets_df.columns = ['text']
tweets_df.head()

Unnamed: 0,text
0,RT @CalorieFixess: 🍗🌯🍔🍒 400 Calories https://t...
1,RT @1_F_I_R_S_T: _ 🍈¹〉Grow your account fast! ...
2,RT @LegendDeols: 👉👉👉G€T Ready to dance💃🕺🕺🕺💃💃💃 ...
3,@britch_x Hubby's friend bought us Wendy's-che...
4,RT @DAILYPUPPlES: Workout partner ☕🍌😍 https://...


In [31]:
emoji_by_category = {
    "food_fruit": "🍇🍈🍉🍊🍋🍌🍍🥭🍎🍏🍐🍑🍒🍓🥝🍅🥥",
    "food_vegetable": "🥑🍆🥔🥕🌽🌶🥒🥬🥦🍄🥜🌰",
    "food_prepared": "🍞🥐🥖🥨🥯🥞🧀🍖🍗🥩🥓🍔🍟🍕🌭🥪🌮🌯🥙🥚🍳🥘🍲🥣🥗🍿🧂🥫",
    "food_asian": "🍱🍘🍙🍚🍛🍜🍝🍠🍢🍣🍤🍥🥮🍡🥟🥠🥡",
    "food_marian": "🦀🦞🦐🦑",
    "food_sweet": "🍦🍧🍨🍩🍪🎂🍰🧁🥧🍫🍬🍭🍮🍯",
    "drink": "🍼🥛☕🍵🍶🍾🍷🍸🍹🍺🍻🥂🥃"
}

emoji_set = set()
for emoji_list in emoji_by_category.values():
    emoji_set.update(emoji_list)

In [32]:
def extract_uniq_emojis(text):
    return np.unique([chr for chr in text if chr in emoji_set])

tweets_df['emojis'] = tweets_df.text.apply(extract_uniq_emojis)

tweets_df.head()

Unnamed: 0,text,emojis
0,RT @CalorieFixess: 🍗🌯🍔🍒 400 Calories https://t...,"[🌯, 🍒, 🍔, 🍗]"
1,RT @1_F_I_R_S_T: _ 🍈¹〉Grow your account fast! ...,"[🍇, 🍈, 🍉, 🍊, 🍍, 🍓]"
2,RT @LegendDeols: 👉👉👉G€T Ready to dance💃🕺🕺🕺💃💃💃 ...,"[🍸, 🥃]"
3,@britch_x Hubby's friend bought us Wendy's-che...,"[🍔, 🍟]"
4,RT @DAILYPUPPlES: Workout partner ☕🍌😍 https://...,"[☕, 🍌]"


In [33]:
mlb = MultiLabelBinarizer()
emoji_matrix = pd.DataFrame(data=mlb.fit_transform(tweets_df.emojis),
                            index=tweets_df.index,
                            columns=mlb.classes_)
emoji_matrix.head()

Unnamed: 0,☕,🌭,🌮,🌯,🌰,🌶,🌽,🍄,🍅,🍆,...,🥭,🥮,🥯,🦀,🦐,🦑,🦞,🧀,🧁,🧂
0,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [41]:
# Implement your code below, 
# lda = ??
data = mlb.fit_transform(tweets_df.emojis)
lda = LatentDirichletAllocation(n_components=6, random_state=0)
lda.fit(data)
lda.components_

array([[4.91356730e+02, 1.66746847e-01, 1.66734014e-01, 1.66739622e-01,
        1.66934797e-01, 1.66850060e-01, 1.66750966e-01, 1.66750054e-01,
        1.66789589e-01, 1.66719619e-01, 1.93728494e-01, 1.66710824e-01,
        1.66852032e-01, 1.66770559e-01, 1.66764594e-01, 1.66721390e-01,
        1.66708046e-01, 1.66777280e-01, 1.66704374e-01, 1.66699049e-01,
        1.66751174e-01, 1.66880601e-01, 1.66799379e-01, 1.66743204e-01,
        1.66863541e-01, 1.66841053e-01, 1.66860664e-01, 1.66848771e-01,
        1.66748020e-01, 1.66760820e-01, 1.66765960e-01, 1.66829196e-01,
        1.66881303e-01, 1.66875141e-01, 1.66720696e-01, 1.66709859e-01,
        1.66857661e-01, 1.66826282e-01, 1.66700028e-01, 1.66799055e-01,
        1.66875212e-01, 1.66944760e-01, 1.67004256e-01, 1.66908539e-01,
        1.67232884e-01, 1.66938446e-01, 1.89014674e-01, 1.66807316e-01,
        1.66863983e-01, 7.08669734e-01, 1.66881955e-01, 6.21263268e+02,
        1.66769547e-01, 1.66785606e-01, 1.66970707e-01, 5.183276

You may reuse the `display_topics` function to output the emojis of the highest weight in each topics.

In [42]:
def display_topics(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic %d:" % (topic_idx))
        print(" ".join([feature_names[i]
                        for i in topic.argsort()[:-n_top_words - 1:-1]]))

In [43]:
# Uncomment the following command to output the topics.
emoji_names = mlb.classes_
display_topics(lda, emoji_names, 10)

Topic 0:
🎂 🍾 🥂 🍷 🍰 ☕ 🍸 🍹 🥧 🥃
Topic 1:
🍻 🍔 🍺 🍹 🍸 🍳 🥃 🥞 🥓 🍶
Topic 2:
🍉 🍍 🍇 🍞 🥝 🥩 🥑 🍊 🍜 🥓
Topic 3:
🍦 🍭 🍫 🍬 🍩 🍨 🍰 🍧 🍅 🥒
Topic 4:
🍟 🍔 🍕 🍗 🍪 🌭 🌮 🧀 🍯 🍝
Topic 5:
🍑 🍓 🍆 🍒 🍏 🍎 🍌 🍋 🍊 🍇


In the following cell, describe the number of topics you choose, and interpret the topics you discovered.

In [44]:
# Put your answers here.
lda.transform(data[:10])


array([[0.03333334, 0.0335744 , 0.0333637 , 0.03333334, 0.63306322,
        0.23333201],
       [0.02380953, 0.02380953, 0.47355483, 0.02380953, 0.02380953,
        0.43120706],
       [0.05592423, 0.72185352, 0.05555557, 0.05555556, 0.05555556,
        0.05555556],
       [0.05555556, 0.05621845, 0.05555556, 0.05555556, 0.72155931,
        0.05555556],
       [0.38864212, 0.05555556, 0.05557686, 0.05555624, 0.05580128,
        0.38886794],
       [0.72222221, 0.05555556, 0.05555556, 0.05555556, 0.05555556,
        0.05555556],
       [0.04166667, 0.04166667, 0.04166719, 0.29163451, 0.04169783,
        0.54166713],
       [0.05555556, 0.05555556, 0.72141082, 0.05555556, 0.05555556,
        0.05636693],
       [0.15478717, 0.01851852, 0.01851852, 0.65625466, 0.13340261,
        0.01851852],
       [0.03333334, 0.03333334, 0.03339115, 0.03335817, 0.83325065,
        0.03333334]])

In [40]:
tweets_df.emojis[0]

array(['🌯', '🍒', '🍔', '🍗'], dtype='<U1')

In [45]:
Topic 0 - Celebratory food
Topic 1 - Bar food
Topic 2 - Healthy food
Topic 3 - Sweet foods
Topic 4 - Junk food
Topic 5 - Fruits
I decided on 6 topics because each bracket had at least one topic over .4, indicating that 
the tweet related most with that topic.

SyntaxError: invalid syntax (<ipython-input-45-1e6024a27d13>, line 1)