# ILS-Z639: Social Media Mining
Paper 3: Topic Modelling and Sentiment Analysis

## Topic Modelling

---

### Importing the necessary libraries

In [1]:
import pandas as pd
import numpy as np

from bertopic import BERTopic
import bertopic.plotting as bp
from sklearn.feature_extraction.text import CountVectorizer
from sentence_transformers import SentenceTransformer
from umap import UMAP
from hdbscan import HDBSCAN
from nltk.corpus import stopwords

import itertools
from typing import List, Union

import matplotlib.pyplot as plt
import seaborn as sns

import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from umap import UMAP
from sklearn.preprocessing import MinMaxScaler

from collections import Counter




### Defining some functions

#### Document visualization

In [2]:
def visualize_documents(topic_model,
                        docs: List[str],
                        topics: List[int] = None,
                        embeddings: np.ndarray = None,
                        reduced_embeddings: np.ndarray = None,
                        sample: float = None,
                        hide_annotations: bool = False,
                        hide_document_hover: bool = False,
                        custom_labels: Union[bool, str] = False,
                        title: str = "<b>Documents and Topics</b>",
                        width: int = 1200,
                        height: int = 750):

    topic_per_doc = topic_model.topics_

    # Sample the data to optimize for visualization and dimensionality reduction
    if sample is None or sample > 1:
        sample = 1

    indices = []
    for topic in set(topic_per_doc):
        s = np.where(np.array(topic_per_doc) == topic)[0]
        size = len(s) if len(s) < 100 else int(len(s) * sample)
        indices.extend(np.random.choice(s, size=size, replace=False))
    indices = np.array(indices)

    df = pd.DataFrame({"topic": np.array(topic_per_doc)[indices]})
    df["doc"] = [docs[index] for index in indices]
    df["topic"] = [topic_per_doc[index] for index in indices]

    # Extract embeddings if not already done
    if sample is None:
        if embeddings is None and reduced_embeddings is None:
            embeddings_to_reduce = topic_model._extract_embeddings(df.doc.to_list(), method="document")
        else:
            embeddings_to_reduce = embeddings
    else:
        if embeddings is not None:
            embeddings_to_reduce = embeddings[indices]
        elif embeddings is None and reduced_embeddings is None:
            embeddings_to_reduce = topic_model._extract_embeddings(df.doc.to_list(), method="document")

    # Reduce input embeddings
    if reduced_embeddings is None:
        umap_model = UMAP(n_neighbors=10, n_components=2, min_dist=0.0, metric='cosine').fit(embeddings_to_reduce)
        embeddings_2d = umap_model.embedding_
    elif sample is not None and reduced_embeddings is not None:
        embeddings_2d = reduced_embeddings[indices]
    elif sample is None and reduced_embeddings is not None:
        embeddings_2d = reduced_embeddings

    unique_topics = set(topic_per_doc)
    if topics is None:
        topics = unique_topics

    # Combine data
    df["x"] = embeddings_2d[:, 0]
    df["y"] = embeddings_2d[:, 1]

    # Prepare text and names
    if isinstance(custom_labels, str):
        names = [[[str(topic), None]] + topic_model.topic_aspects_[custom_labels][topic] for topic in unique_topics]
        names = ["_".join([label[0] for label in labels[:4]]) for labels in names]
        names = [label if len(label) < 30 else label[:27] + "..." for label in names]
    elif topic_model.custom_labels_ is not None and custom_labels:
        names = [topic_model.custom_labels_[topic + topic_model._outliers] for topic in unique_topics]
    else:
        names = [f"{topic}_" + "_".join([word for word, value in topic_model.get_topic(topic)][:3]) for topic in unique_topics]

    # Visualize
    fig = go.Figure()

    # Outliers and non-selected topics
    non_selected_topics = set(unique_topics).difference(topics)
    if len(non_selected_topics) == 0:
        non_selected_topics = [-1]

    selection = df.loc[df.topic.isin(non_selected_topics), :]
    selection["text"] = ""
    selection.loc[len(selection), :] = [None, None, selection.x.mean(), selection.y.mean(), "Other documents"]

    fig.add_trace(
        go.Scattergl(
            x=selection.x,
            y=selection.y,
            hovertext=selection.doc if not hide_document_hover else None,
            hoverinfo="text",
            mode='markers+text',
            name="other",
            showlegend=False,
            marker=dict(color='#CFD8DC', size=5, opacity=0.5)
        )
    )

    # Selected topics
    for name, topic in zip(names, unique_topics):
        if topic in topics:
            selection = df.loc[df.topic == topic, :]
            selection["text"] = ""

            if not hide_annotations:
                selection.loc[len(selection), :] = [None, None, selection.x.mean(), selection.y.mean(), name]

            fig.add_trace(
                go.Scattergl(
                    x=selection.x,
                    y=selection.y,
                    hovertext=selection.doc if not hide_document_hover else None,
                    hoverinfo="text",
                    text=selection.text,
                    mode='markers+text',
                    name=name,
                    textfont=dict(
                        size=12,
                    ),
                    marker=dict(size=5, opacity=0.5)
                )
            )

    # Add grid in a 'plus' shape
    x_range = (df.x.min() - abs((df.x.min()) * .15), df.x.max() + abs((df.x.max()) * .15))
    y_range = (df.y.min() - abs((df.y.min()) * .15), df.y.max() + abs((df.y.max()) * .15))
    fig.add_shape(type="line",
                  x0=sum(x_range) / 2, y0=y_range[0], x1=sum(x_range) / 2, y1=y_range[1],
                  line=dict(color="#CFD8DC", width=2))
    fig.add_shape(type="line",
                  x0=x_range[0], y0=sum(y_range) / 2, x1=x_range[1], y1=sum(y_range) / 2,
                  line=dict(color="#9E9E9E", width=2))
    fig.add_annotation(x=x_range[0], y=sum(y_range) / 2, text="D1", showarrow=False, yshift=10)
    fig.add_annotation(y=y_range[1], x=sum(x_range) / 2, text="D2", showarrow=False, xshift=10)

    # Stylize layout
    fig.update_layout(
        template="simple_white",
        title={
            'text': f"{title}",
            'x': 0.5,
            'xanchor': 'center',
            'yanchor': 'top',
            'font': dict(
                size=22,
                color="Black")
        },
        width=width,
        height=height
    )

    fig.update_xaxes(visible=False)
    fig.update_yaxes(visible=False)
    return fig

#### Bar charts

In [3]:
def visualize_barchart(topic_model,
                       topics: List[int] = None,
                       top_n_topics: int = 8,
                       n_words: int = 5,
                       custom_labels: Union[bool, str] = False,
                       title: str = "<b>Topic Word Scores</b>",
                       width: int = 250,
                       height: int = 250) -> go.Figure:

    colors = itertools.cycle(["#D55E00", "#0072B2", "#CC79A7", "#E69F00", "#56B4E9", "#009E73", "#F0E442"])

    # Select topics based on top_n and topics args
    freq_df = topic_model.get_topic_freq()
    if topics is not None:
        topics = list(topics)
    elif top_n_topics is not None:
        topics = sorted(freq_df.Topic.to_list()[:top_n_topics])
    else:
        topics = sorted(freq_df.Topic.to_list()[0:6])

    # Initialize figure
    if isinstance(custom_labels, str):
        subplot_titles = [[[str(topic), None]] + topic_model.topic_aspects_[custom_labels][topic] for topic in topics]
        subplot_titles = ["_".join([label[0] for label in labels[:4]]) for labels in subplot_titles]
        subplot_titles = [label if len(label) < 30 else label[:27] + "..." for label in subplot_titles]
    elif topic_model.custom_labels_ is not None and custom_labels:
        subplot_titles = [topic_model.custom_labels_[topic + topic_model._outliers] for topic in topics]
    else:
        subplot_titles = [f"Topic {topic}" for topic in topics]
    columns = 4
    rows = int(np.ceil(len(topics) / columns))
    fig = make_subplots(rows=rows,
                        cols=columns,
                        shared_xaxes=False,
                        horizontal_spacing=.1,
                        vertical_spacing=.4 / rows if rows > 1 else 0,
                        subplot_titles=subplot_titles)

    # Add barchart for each topic
    row = 1
    column = 1
    for topic in topics:
        words = [word + "  " for word, _ in topic_model.get_topic(topic)][:n_words][::-1]
        scores = [score for _, score in topic_model.get_topic(topic)][:n_words][::-1]

        fig.add_trace(
            go.Bar(x=scores,
                   y=words,
                   orientation='h',
                   marker_color=next(colors)),
            row=row, col=column)

        if column == columns:
            column = 1
            row += 1
        else:
            column += 1

    # Stylize graph
    fig.update_layout(
        template="plotly_white",
        showlegend=False,
        title={
            'text': f"{title}",
            'x': .5,
            'xanchor': 'center',
            'yanchor': 'top',
            'font': dict(
                size=22,
                color="Black")
        },
        width=width*4,
        height=height*rows if rows > 1 else height * 1.3,
        hoverlabel=dict(
            bgcolor="white",
            font_size=16,
            font_family="Rockwell"
        ),
    )

    fig.update_xaxes(showgrid=True)
    fig.update_yaxes(showgrid=True)

    return fig

#### Topic visualization

In [4]:
def visualize_topics(topic_model,
                     topics: List[int] = None,
                     top_n_topics: int = None,
                     custom_labels: Union[bool, str] = False,
                     title: str = "<b>Intertopic Distance Map</b>",
                     width: int = 650,
                     height: int = 650) -> go.Figure:

    # Select topics based on top_n and topics args
    freq_df = topic_model.get_topic_freq()
    if topics is not None:
        topics = list(topics)
    elif top_n_topics is not None:
        topics = sorted(freq_df.Topic.to_list()[:top_n_topics])
    else:
        topics = sorted(freq_df.Topic.to_list())

    # Extract topic words and their frequencies
    topic_list = sorted(topics)
    frequencies = [topic_model.topic_sizes_[topic] for topic in topic_list]
    if isinstance(custom_labels, str):
        words = [[[str(topic), None]] + topic_model.topic_aspects_[custom_labels][topic] for topic in topic_list]
        words = ["_".join([label[0] for label in labels[:4]]) for labels in words]
        words = [label if len(label) < 30 else label[:27] + "..." for label in words]
    elif custom_labels and topic_model.custom_labels_ is not None:
        words = [topic_model.custom_labels_[topic + topic_model._outliers] for topic in topic_list]
    else:
        words = [" | ".join([word[0] for word in topic_model.get_topic(topic)[:5]]) for topic in topic_list]

    # Embed c-TF-IDF into 2D
    all_topics = sorted(list(topic_model.get_topics().keys()))
    indices = np.array([all_topics.index(topic) for topic in topics])

    if topic_model.topic_embeddings_ is not None:
        embeddings = topic_model.topic_embeddings_[indices]
        embeddings = UMAP(n_neighbors=2, n_components=2, metric='cosine', random_state=42).fit_transform(embeddings)
    else:
        embeddings = topic_model.c_tf_idf_.toarray()[indices]
        embeddings = MinMaxScaler().fit_transform(embeddings)
        embeddings = UMAP(n_neighbors=2, n_components=2, metric='hellinger', random_state=42).fit_transform(embeddings)

    # Visualize with plotly
    df = pd.DataFrame({"x": embeddings[:, 0], "y": embeddings[:, 1],
                       "Topic": topic_list, "Words": words, "Size": frequencies})
    return _plotly_topic_visualization(df, topic_list, title, width, height)


def _plotly_topic_visualization(df: pd.DataFrame,
                                topic_list: List[str],
                                title: str,
                                width: int,
                                height: int):

    def get_color(topic_selected):
        marker_color = ["red" if topic == topic_selected else "#B0BEC5" for topic in topic_list]
        return [{'marker.color': [marker_color]}]

    # Prepare figure range
    x_range = (df.x.min() - abs((df.x.min()) * .15), df.x.max() + abs((df.x.max()) * .15))
    y_range = (df.y.min() - abs((df.y.min()) * .15), df.y.max() + abs((df.y.max()) * .15))

    # Plot topics
    fig = px.scatter(df, x="x", y="y", size="Size", size_max=40, template="simple_white", labels={"x": "", "y": ""},
                     hover_data={"Topic": True, "Words": True, "Size": True, "x": False, "y": False})
    fig.update_traces(marker=dict(color="#B0BEC5", line=dict(width=2, color='DarkSlateGrey')))

    # Update hover order
    fig.update_traces(hovertemplate="<br>".join(["<b>Topic %{customdata[0]}</b>",
                                                 "%{customdata[1]}",
                                                 "Size: %{customdata[2]}"]))

    # Create a slider for topic selection
    steps = [dict(label=f"Topic {topic}", method="update", args=get_color(topic)) for topic in topic_list]
    sliders = [dict(active=0, pad={"t": 50}, steps=steps)]

    # Stylize layout
    fig.update_layout(
        title={
            'text': f"{title}",
            'y': .95,
            'x': 0.5,
            'xanchor': 'center',
            'yanchor': 'top',
            'font': dict(
                size=22,
                color="Black")
        },
        width=width,
        height=height,
        hoverlabel=dict(
            bgcolor="white",
            font_size=16,
            font_family="Rockwell"
        ),
        xaxis={"visible": False},
        yaxis={"visible": False},
        sliders=sliders
    )

    # Update axes ranges
    fig.update_xaxes(range=x_range)
    fig.update_yaxes(range=y_range)

    # Add grid in a 'plus' shape
    fig.add_shape(type="line",
                  x0=sum(x_range) / 2, y0=y_range[0], x1=sum(x_range) / 2, y1=y_range[1],
                  line=dict(color="#CFD8DC", width=2))
    fig.add_shape(type="line",
                  x0=x_range[0], y0=sum(y_range) / 2, x1=x_range[1], y1=sum(y_range) / 2,
                  line=dict(color="#9E9E9E", width=2))
    fig.add_annotation(x=x_range[0], y=sum(y_range) / 2, text="D1", showarrow=False, yshift=10)
    fig.add_annotation(y=y_range[1], x=sum(x_range) / 2, text="D2", showarrow=False, xshift=10)
    fig.data = fig.data[::-1]

    return fig

### Setting up model parameters and defining the model

In [5]:
sw = list(stopwords.words('english')) + ['http', 'https', 'amp', 'com']
vectorizer_model = CountVectorizer(ngram_range=(1, 3), stop_words=sw)

In [6]:
model = BERTopic(
    vectorizer_model=vectorizer_model,
    top_n_words=10,
    min_topic_size=10,
    nr_topics=10,
    language='english',
    calculate_probabilities=True,
    verbose=True
)

### Sweat shop data

Preparing document data

In [7]:
df = pd.read_csv('sweat_shops.csv')

docs = []
docs.extend(list(df['Title']))
docs.extend(list(df['Text']))
for i in df['Comments'].values:
    docs.append(i)
docs = [str(doc) for doc in set(docs) if doc]

len(docs)

1650

Fitting the model and making predictions

In [8]:
topics, probs = model.fit_transform(docs)

Batches:   0%|          | 0/52 [00:00<?, ?it/s]

2023-12-08 21:33:44,727 - BERTopic - Transformed documents to Embeddings
2023-12-08 21:33:55,625 - BERTopic - Reduced dimensionality
2023-12-08 21:33:55,872 - BERTopic - Clustered reduced embeddings
2023-12-08 21:33:56,659 - BERTopic - Reduced number of topics from 38 to 10


Viewing topics

In [9]:
pd.DataFrame(model.get_topic_info())

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,445,-1_clothes_like_fashion_im,"[clothes, like, fashion, im, people, fast, don...",[There's not really anything you can do to rem...
1,0,713,0_fashion_fast_fast fashion_clothes,"[fashion, fast, fast fashion, clothes, like, p...",[This is a more complicated issue than it seem...
2,1,268,1_like_looks_would_looks like,"[like, looks, would, looks like, mountain, mel...","[Kinda looks like a mountain too., Looks like ..."
3,2,82,2_books_like_read_book,"[books, like, read, book, tiktok, reading, peo...",[Some information on how Temu (a website like ...
4,3,53,3_job_work_church_would,"[job, work, church, would, take, hca, get, bac...","[Don't knowingly take a bad job, especially wi..."
5,4,23,4_poles_trade_didnt_contract,"[poles, trade, didnt, contract, deal, agent, r...",[Ok I don't want to hear anyone here complaini...
6,5,19,5_slow_lane_people_fast lane,"[slow, lane, people, fast lane, walkers, slow ...",[This shopping mall now has a fast lane in an ...
7,6,19,6_che_ghandi_racist_correct spelling,"[che, ghandi, racist, correct spelling, gandhi...",[Isn't the only evidence of Che's supposed rac...
8,7,14,7_list_grocery_shopping_even list,"[list, grocery, shopping, even list, shopping ...",[Can’t ever go without a list. I shop with my ...
9,8,14,8_eggs_chickens_get_hens,"[eggs, chickens, get, hens, roosters, eat, flo...",[No washing needed. They can stay in the coun...


In [10]:
visualize_barchart(model, n_words=10, top_n_topics=3, height=300, width=300)

In [11]:
visualize_topics(model)

### Fast fashion data

Preparing document data

In [12]:
df = pd.read_csv('fast_fashion.csv')

docs = []
docs.extend(list(df['Title']))
docs.extend(list(df['Text']))
for i in df['Comments'].values:
    docs.append(i)
docs = [str(doc) for doc in set(docs) if doc]

len(docs)

2155

Fitting the model and making predictions

In [13]:
topics, probs = model.fit_transform(docs)

Batches:   0%|          | 0/68 [00:00<?, ?it/s]

2023-12-08 21:34:41,909 - BERTopic - Transformed documents to Embeddings
2023-12-08 21:34:49,312 - BERTopic - Reduced dimensionality
2023-12-08 21:34:49,571 - BERTopic - Clustered reduced embeddings
2023-12-08 21:34:50,923 - BERTopic - Reduced number of topics from 39 to 10


Viewing topics

In [14]:
pd.DataFrame(model.get_topic_info())

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,931,-1_fashion_fast_fast fashion_clothes,"[fashion, fast, fast fashion, clothes, like, p...",[Breaking up with fast fashion :\n\n1. Its a m...
1,0,723,0_fashion_fast_fast fashion_clothes,"[fashion, fast, fast fashion, clothes, like, b...","[Over the last year, I’ve gotten much more int..."
2,1,118,1_great_video_thank_climate town,"[great, video, thank, climate town, amazing, p...","[This is stunning, great job! The composition..."
3,2,103,2_clothing_stuff_usa_great,"[clothing, stuff, usa, great, quality, good, l...",[[Kali Rose Clothing](https://www.kalirose.eu/...
4,3,98,3_shes_melusine_like_lancelot,"[shes, melusine, like, lancelot, dragon, human...",[I could only get through about 5 minutes of t...
5,4,80,4_vegans_veganism_fashion_vegan,"[vegans, veganism, fashion, vegan, think, fast...","[This is generally true with some caveats, as ..."
6,5,36,5_books_book_tiktok_read,"[books, book, tiktok, read, like, people, read...","[I feel like a lot of people, particularly peo..."
7,6,23,6_battle pass_battle_pass_skin,"[battle pass, battle, pass, skin, pass skin, b...","[Its a battle pass skin so I hope not, No its ..."
8,7,22,7_student_students_money_data,"[student, students, money, data, discounts, te...",[I turn off any marketing emails. If you actua...
9,8,21,8_plt_gemma_indiyah_fashion,"[plt, gemma, indiyah, fashion, deal, fast fash...","[Well since you brought it up… it’s true, peop..."


In [15]:
visualize_barchart(model, n_words=10, top_n_topics=3, height=300, width=300)

In [16]:
visualize_topics(model)

### All brand data

Preparing the data

In [17]:
df = pd.read_csv('all_brands.csv')

docs = []
docs.extend(list(df['Title']))
docs.extend(list(df['Text']))
for i in df['Comments'].values:
    docs.append(i)
docs = [str(doc) for doc in set(docs) if doc]

len(docs)

2155

Fitting the model and making predictions

In [18]:
topics, probs = model.fit_transform(docs)

Batches:   0%|          | 0/68 [00:00<?, ?it/s]

2023-12-08 21:35:43,735 - BERTopic - Transformed documents to Embeddings
2023-12-08 21:35:52,081 - BERTopic - Reduced dimensionality
2023-12-08 21:35:52,509 - BERTopic - Clustered reduced embeddings
2023-12-08 21:35:54,255 - BERTopic - Reduced number of topics from 42 to 10


Viewing topics

In [19]:
pd.DataFrame(model.get_topic_info())

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,881,-1_fashion_fast_clothes_fast fashion,"[fashion, fast, clothes, fast fashion, like, b...","[I live in the Midwest in America. Not rural, ..."
1,0,837,0_fashion_fast_fast fashion_like,"[fashion, fast, fast fashion, like, clothes, p...",[With anything when you get enough practice yo...
2,1,172,1_shes_like_melusine_lol,"[shes, like, melusine, lol, mario, human, love...",[I bought a blush from them after I saw severa...
3,2,71,2_good_quality_like_great,"[good, quality, like, great, brands, husky, se...","[If you have a Marshalls, Tjmaxx or Sierra nea..."
4,3,57,3_books_plt_gemma_like,"[books, plt, gemma, like, people, fashion, fas...","[Well since you brought it up… it’s true, peop..."
5,4,50,4_great_video_thank_climate town,"[great, video, thank, climate town, amazing, t...",[Aww yeah Climate Town! \n\nThis is an amazing...
6,5,23,5_battle pass_battle_pass_skin,"[battle pass, battle, pass, skin, pass skin, b...","[Its a battle pass skin so I hope not, No its ..."
7,6,22,6_student_students_money_discounts,"[student, students, money, discounts, companie...",[My girlfriend was better off as a student (mo...
8,7,21,7_eu_leading_good_environment,"[eu, leading, good, environment, fast food, fa...",[Everyday I consider moving to Europe more and...
9,8,21,8_vegans_veganism_vegan_animals,"[vegans, veganism, vegan, animals, fast, fast ...","[This is generally true with some caveats, as ..."


In [20]:
visualize_barchart(model, n_words=10, top_n_topics=3, height=300, width=300)

In [21]:
visualize_topics(model)

### Zara

Preparing the data

In [22]:
df = pd.read_csv('zara.csv')

docs = []
docs.extend(list(df['Title']))
docs.extend(list(df['Text']))
for i in df['Comments'].values:
    docs.append(i)
docs = [str(doc) for doc in set(docs) if doc]

len(docs)

616

Fitting the model and making predictions

In [23]:
topics, probs = model.fit_transform(docs)

Batches:   0%|          | 0/20 [00:00<?, ?it/s]

2023-12-08 21:36:09,038 - BERTopic - Transformed documents to Embeddings
2023-12-08 21:36:12,396 - BERTopic - Reduced dimensionality
2023-12-08 21:36:12,479 - BERTopic - Clustered reduced embeddings
2023-12-08 21:36:12,846 - BERTopic - Reduced number of topics from 14 to 10


Viewing topics

In [24]:
pd.DataFrame(model.get_topic_info())

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,203,-1_like_im_teddy_fashion,"[like, im, teddy, fashion, clothes, people, fa...",[I work in environmental sustainability 🖐️ I l...
1,0,261,0_fashion_fast_fast fashion_zara,"[fashion, fast, fast fashion, zara, clothes, b...",[I’ve thought about this as well and there’s a...
2,1,41,1_mario_thought_table_sad,"[mario, thought, table, sad, lol, mario furry,...",[And don't bring your ghost vaccum thing to th...
3,2,24,2_books_read_book_reading,"[books, read, book, reading, people, tiktok, l...",[Treating books like precious objects that hav...
4,3,22,3_vegans_veganism_vegan_fast,"[vegans, veganism, vegan, fast, animals, fast ...","[This is generally true with some caveats, as ..."
5,4,16,4_jeans_pants_pair_biker,"[jeans, pants, pair, biker, biker jeans, joint...",[Rather have the St Laurent joints. Also biker...
6,5,13,5_nazi_like_reich_looks,"[nazi, like, reich, looks, architecture, third...",[Imagine the largest flag you could buy and dy...
7,6,13,6_great_done_looks_please,"[great, done, looks, please, fill, done please...",[I’ll do yours now! You sure you have done it?...
8,7,12,7_ethan_hila_china_money,"[ethan, hila, china, money, care, information,...",[This THIS IS WHY IM WATCHING.\n\nBut seriousl...
9,8,11,8_girls_dont_day_climate change,"[girls, dont, day, climate change, like, made,...",[I feel like criticizing these girls is low ha...


In [25]:
visualize_barchart(model, n_words=10, top_n_topics=3, height=300, width=300)

In [26]:
visualize_topics(model)

### Nike

Preparing the data

In [27]:
df = pd.read_csv('nike.csv')

docs = []
docs.extend(list(df['Title']))
docs.extend(list(df['Text']))
for i in df['Comments'].values:
    docs.append(i)
docs = [str(doc) for doc in set(docs) if doc]

len(docs)

959

Fitting the model and making predictions

In [28]:
topics, probs = model.fit_transform(docs)

Batches:   0%|          | 0/30 [00:00<?, ?it/s]

2023-12-08 21:36:39,896 - BERTopic - Transformed documents to Embeddings
2023-12-08 21:36:43,852 - BERTopic - Reduced dimensionality
2023-12-08 21:36:43,970 - BERTopic - Clustered reduced embeddings
2023-12-08 21:36:44,533 - BERTopic - Reduced number of topics from 23 to 10


Viewing topics

In [29]:
pd.DataFrame(model.get_topic_info())

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,390,-1_clothes_fashion_like_fast,"[clothes, fashion, like, fast, fast fashion, p...",[I'm in a very similar boat as you! I try to d...
1,0,232,0_fashion_fast_fast fashion_clothes,"[fashion, fast, fast fashion, clothes, people,...",[This is a more complicated issue than it seem...
2,1,137,1_nike_shoes_batch_brand,"[nike, shoes, batch, brand, vt, 199 nike, 199,...",[ **VT batch ¥199 Nike Air Max Plus 36-47** ...
3,2,46,2_melusine_lancelot_white_temperature,"[melusine, lancelot, white, temperature, drago...",[Is rapi and red hood the same person? From th...
4,3,45,3_thanks_great_thank_great writer,"[thanks, great, thank, great writer, youre gre...","[This was fascinating, you’re a great writer, ..."
5,4,39,4_girls_product_cars_things,"[girls, product, cars, things, waste, companie...",[Why are you mad at these young women? People ...
6,5,23,5_books_read_reading_book,"[books, read, reading, book, people, tiktok, l...",[Treating books like precious objects that hav...
7,6,19,6_gundam_gunpla_iron blooded orphans_orphans,"[gundam, gunpla, iron blooded orphans, orphans...",[Iron blooded Orphans is up there in my top 3 ...
8,7,15,7_nazi_flag_reich_flags,"[nazi, flag, reich, flags, like, raise, third ...",[Imagine the largest flag you could buy and dy...
9,8,13,8_kyrie_kanye_adidas_jewish,"[kyrie, kanye, adidas, jewish, cut, antisemiti...",[One thing I've been reminded in all this dram...


In [30]:
visualize_barchart(model, n_words=10, top_n_topics=3, height=300, width=300)

In [31]:
visualize_topics(model)

### H&M

Preparing the data

In [32]:
df = pd.read_csv('hm.csv')

docs = []
docs.extend(list(df['Title']))
docs.extend(list(df['Text']))
for i in df['Comments'].values:
    docs.append(i)
docs = [str(doc) for doc in set(docs) if doc]

len(docs)

480

Fitting the model and making predictions

In [33]:
topics, probs = model.fit_transform(docs)

Batches:   0%|          | 0/15 [00:00<?, ?it/s]

2023-12-08 21:36:55,500 - BERTopic - Transformed documents to Embeddings
2023-12-08 21:36:58,288 - BERTopic - Reduced dimensionality
2023-12-08 21:36:58,365 - BERTopic - Clustered reduced embeddings
2023-12-08 21:36:58,513 - BERTopic - Reduced number of topics from 9 to 9


Viewing topics

In [34]:
pd.DataFrame(model.get_topic_info())

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,79,-1_hm_like_mario_fashion,"[hm, like, mario, fashion, table, shirt, long ...",[We asked my SIL yesterday what her 3 year old...
1,0,28,0_books_read_reading_tiktok,"[books, read, reading, tiktok, book, people, l...",[Treating books like precious objects that hav...
2,1,23,1_sims_still_pack_remember,"[sims, still, pack, remember, one, cc, free, r...",[Yes! I bought every single expansion for Sims...
3,2,201,2_fashion_fast_fast fashion_people,"[fashion, fast, fast fashion, people, clothes,...",[I think you are missing the point of criticis...
4,3,15,3_sheep_look_painted_beach,"[sheep, look, painted, beach, scottish sheep, ...",[They look so serious with those painted sheep...
5,4,50,4_looks like_like_looks_guy,"[looks like, like, looks, guy, sunny, always s...",[Came here to say Hassan Minhaj called it out ...
6,5,16,5_hm_fashion_winter_clothes,"[hm, fashion, winter, clothes, day, stuff, use...","[I love her, but H&M is such a garbage company..."
7,6,44,6_like_hoodie_looks_wig,"[like, hoodie, looks, wig, bimini, looks like,...",[ like the outfit but the wig looks like it’s ...
8,7,24,7_gender_clothing_gender neutral_neutral,"[gender, clothing, gender neutral, neutral, wo...",[> But a better solution is doing away with ca...


In [35]:
visualize_barchart(model, n_words=10, top_n_topics=3, height=300, width=300)

In [36]:
visualize_topics(model)

### Shein

Preparing the data

In [37]:
df = pd.read_csv('shein.csv')

docs = []
docs.extend(list(df['Title']))
docs.extend(list(df['Text']))
for i in df['Comments'].values:
    docs.append(i)
docs = [str(doc) for doc in set(docs) if doc]

len(docs)

656

Fitting the model and making predictions

In [38]:
topics, probs = model.fit_transform(docs)

Batches:   0%|          | 0/21 [00:00<?, ?it/s]

2023-12-08 21:37:15,788 - BERTopic - Transformed documents to Embeddings
2023-12-08 21:37:18,893 - BERTopic - Reduced dimensionality
2023-12-08 21:37:18,988 - BERTopic - Clustered reduced embeddings
2023-12-08 21:37:19,350 - BERTopic - Reduced number of topics from 16 to 10


Viewing topics

In [39]:
pd.DataFrame(model.get_topic_info())

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,259,-1_clothes_fashion_fast_fast fashion,"[clothes, fashion, fast, fast fashion, shein, ...",[I’ve thought about this as well and there’s a...
1,0,180,0_fashion_fast_shein_fast fashion,"[fashion, fast, shein, fast fashion, like, qua...","[Shein - Fast Fashion, Unfortunately SHEIN is ..."
2,1,48,1_garbage_funny_cat_sad,"[garbage, funny, cat, sad, pump, oh, thought, ...","[Sheint nu ook een fysieke winkel te zijn, I w..."
3,2,43,2_sustainable_buy_think_like,"[sustainable, buy, think, like, people, even, ...",[Would highly recommend the book [Fashionopoli...
4,3,34,3_shein_chinese_us_public,"[shein, chinese, us, public, stock, china, ipo...","[Goldman Sachs, JPMorgan Chase, and Morgan Sta..."
5,4,29,4_lettuce_tops_old navy_navy,"[lettuce, tops, old navy, navy, old, like, shi...",[I need influencers to start saying Old Navy i...
6,5,21,5_books_read_reading_tiktok,"[books, read, reading, tiktok, people, book, l...",[Treating books like precious objects that hav...
7,6,16,6_crochet_machine_crocheted_know,"[crochet, machine, crocheted, know, crochet it...","[If it makes you feel better, they are most li..."
8,7,15,7_nazi_reich_like_third reich,"[nazi, reich, like, third reich, flags, fascis...",[At first I was like “ok it’s an unlucky coinc...
9,8,11,8_girls_women_get_selling,"[girls, women, get, selling, roasties, poverty...",[I feel like criticizing these girls is low ha...


In [40]:
visualize_barchart(model, n_words=10,top_n_topics=3,  height=300, width=300)

In [41]:
visualize_topics(model)

### Adidas

Preparing the data

In [42]:
df = pd.read_csv('adidas.csv')

docs = []
docs.extend(list(df['Title']))
docs.extend(list(df['Text']))
for i in df['Comments'].values:
    docs.append(i)
docs = [str(doc) for doc in set(docs) if doc]

len(docs)

938

Fitting the model and making predictions

In [43]:
topics, probs = model.fit_transform(docs)

Batches:   0%|          | 0/30 [00:00<?, ?it/s]

2023-12-08 21:37:42,868 - BERTopic - Transformed documents to Embeddings
2023-12-08 21:37:47,783 - BERTopic - Reduced dimensionality
2023-12-08 21:37:47,896 - BERTopic - Clustered reduced embeddings
2023-12-08 21:37:48,457 - BERTopic - Reduced number of topics from 16 to 10


Viewing topics

In [44]:
pd.DataFrame(model.get_topic_info())

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,291,-1_like_fashion_fast fashion_fast,"[like, fashion, fast fashion, fast, brands, do...","[>Almost all of them are from Guangzhou, China..."
1,0,359,0_fashion_clothes_fast_fast fashion,"[fashion, clothes, fast, fast fashion, like, p...",[I'm in a very similar boat as you! I try to d...
2,1,76,1_adidas_kanye_shoes_kyrie,"[adidas, kanye, shoes, kyrie, ye, yeezy, line,...",[This what was said during the call. People wi...
3,2,54,2_melusine_mario_lancelot_dragon,"[melusine, mario, lancelot, dragon, temperatur...",[I still hate that we live in a timeline where...
4,3,41,3_watch_watches_fashion watches_looks,"[watch, watches, fashion watches, looks, timex...",[I love your choice! It looks incredibly cool....
5,4,33,4_vegan_vegans_veganism_fast,"[vegan, vegans, veganism, fast, fast fashion, ...","[This is generally true with some caveats, as ..."
6,5,29,5_football_factory_ted_lasso,"[football, factory, ted, lasso, ted lasso, fil...",[Ted Lasso. Not strictly /r/britishtv as it's ...
7,6,24,6_books_read_reading_book,"[books, read, reading, book, tiktok, people, l...",[Treating books like precious objects that hav...
8,7,18,7_jersey_team_fisherman_another,"[jersey, team, fisherman, another, back, islan...",[That new Bridgeport jersey is straight fire. ...
9,8,13,8_nazi_reich_flag_like,"[nazi, reich, flag, like, vibes, third reich, ...",[At first I was like “ok it’s an unlucky coinc...


In [45]:
visualize_barchart(model, n_words=10, top_n_topics=3, height=300, width=300)

In [46]:
visualize_topics(model)

---