In [3]:
import pandas as pd
import numpy as np

import nltk
from nltk.corpus import stopwords

import gensim
from gensim.utils import simple_preprocess

from bertopic import BERTopic

  @numba.jit()
  @numba.jit()
  @numba.jit()
  from .autonotebook import tqdm as notebook_tqdm
  @numba.jit()


In [4]:
# Module-level NLTK helpers used by lemmatize_text below.
lemmatizer = nltk.stem.WordNetLemmatizer()
w_tokenizer = nltk.tokenize.WhitespaceTokenizer()


def lemmatize_text(text):
    """Tokenize ``text`` on whitespace and lemmatize each token.

    Args:
        text: The string to process.

    Returns:
        list: The lemmatized tokens, in their original order.
    """
    tokens = w_tokenizer.tokenize(text)
    return list(map(lemmatizer.lemmatize, tokens))

def sent_to_words(sentences):
    """Lazily tokenize each item with gensim's ``simple_preprocess``.

    Every item is converted with ``str()`` before tokenization, and
    ``deacc=True`` is forwarded to ``simple_preprocess``.

    Args:
        sentences: Iterable of items to tokenize.

    Yields:
        The ``simple_preprocess`` result for each item, in input order.
    """
    yield from (
        gensim.utils.simple_preprocess(str(item), deacc=True)
        for item in sentences
    )

def remove_stopwords(texts):
    """Tokenize each document and drop the tokens listed in ``stop_words``.

    Each item of ``texts`` is converted with ``str()`` and tokenized by
    ``simple_preprocess``. The module-level ``stop_words`` collection is
    read when the function is called, so it must be defined by then.

    Args:
        texts: Iterable of documents.

    Returns:
        list: One list of remaining tokens per input document, in order.
    """
    # Build the lookup once per call: set membership avoids rescanning the
    # whole stop-word list for every single token.
    blocked = frozenset(stop_words)
    return [
        [word for word in simple_preprocess(str(doc)) if word not in blocked]
        for doc in texts
    ]

In [6]:
# Data directory, relative to the notebook. The trailing slash matters here
# because the file name below is appended to it directly.
path = '../data/'
shein_df = pd.read_csv(f'{path}shein_data_clean.csv')

In [7]:
# Display the column labels of the loaded frame.
shein_df.columns

Index(['link', 'price', 'title', 'photo_id', 'page_n', 'clothing_type'], dtype='object')

In [8]:
# Number of rows per clothing category.
shein_df['clothing_type'].value_counts()

clothing_type
costumes            360
tops                228
dresses             228
bottoms             228
two_piece_outfit    228
outwear             228
denim               228
beachwear           228
intimates           228
wedding             228
sweatshirts         227
activewear          226
maternity           217
sleep_and_lounge    216
Name: count, dtype: int64

In [15]:
# Convert the titles to lowercase (vectorised string accessor instead of a
# per-row Python lambda).
shein_df['title'] = shein_df['title'].str.lower()

# Lemmatization: each title becomes a list of lemmatized tokens.
# One-time setup on a fresh machine (kept commented out, as before):
# nltk.download('wordnet')
# nltk.download('omw-1.4')
shein_df['title_lemmatized'] = shein_df['title'].apply(lemmatize_text)

# Stop-word list: NLTK's English list plus the brand name "shein".
# Kept as a list under the name `stop_words` because remove_stopwords reads
# that global.
nltk.download('stopwords')
stop_words = stopwords.words('english')
stop_words.append("shein")

# Join each token list back into a plain string before tokenizing.
# sent_to_words calls str() on every item, and str() of a list is its repr:
# non-printable characters would then surface as escape text (e.g. a soft
# hyphen becomes the letters "xad") and leak into the tokens.
data = [" ".join(tokens) for tokens in shein_df['title_lemmatized']]
data_words = list(sent_to_words(data))
# remove stop words
data_words = remove_stopwords(data_words)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/anahitkhachatryan/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [16]:
# Topic model configuration; all three values are passed straight to BERTopic.
# NOTE(review): no random seed is set in this cell, so a refit may not
# reproduce the same topics — confirm whether that matters for this analysis.
model = BERTopic(
    min_topic_size=20,
    n_gram_range=(1, 4),
    verbose=True,
)

In [17]:
# Model input: one space-joined string per tokenized title.
docs = np.array([" ".join(tokens) for tokens in data_words])

In [18]:
# Fit the topic model on the documents; the two returned values are
# unpacked as `labels` and `probs`.
labels, probs = model.fit_transform(docs)

Batches: 100%|██████████| 104/104 [00:05<00:00, 20.79it/s]
2023-05-27 13:04:14,311 - BERTopic - Transformed documents to Embeddings
2023-05-27 13:04:23,110 - BERTopic - Reduced dimensionality
2023-05-27 13:04:23,167 - BERTopic - Clustered reduced embeddings


In [19]:
# Store the model's labels as a new `topic` column.
# NOTE(review): assumes `labels` is in the same row order as shein_df — confirm.
shein_df['topic'] = labels

In [27]:
# Echo the model object (displays its repr).
model

<bertopic._bertopic.BERTopic at 0x2a8cb6b50>

In [20]:
# Bar-chart view of the model's topics, requested with top_n_topics=12.
model.visualize_barchart(top_n_topics=12)

In [34]:
# Fetch the model's topics and display them. Judging by the displayed output,
# this is a dict keyed by topic id whose values are lists of (term, score)
# pairs, so despite the variable name it holds more than just names.
topic_names = model.get_topics()
topic_names

{-1: [('waist', 0.015601653667768793),
  ('slogan', 0.013038235526959112),
  ('set', 0.012987316353404091),
  ('short', 0.012983564260524405),
  ('mesh', 0.0129441767686192),
  ('drawstring', 0.012691523541122159),
  ('graphic', 0.011932961102646506),
  ('drawstring waist', 0.011684379467393449),
  ('without', 0.011404656849961371),
  ('thong', 0.011033347232606986)],
 0: [('dress', 0.030746625136066668),
  ('top', 0.02478894107287063),
  ('skirt', 0.019558267887184542),
  ('cami', 0.01750885017969808),
  ('hem', 0.016500607437802118),
  ('print', 0.016235621688495246),
  ('bodycon', 0.015819031652244214),
  ('split', 0.01562937414994512),
  ('backless', 0.01480895123595177),
  ('bridesmaid', 0.014225337961991899)],
 1: [('costume', 0.07898052944770996),
  ('pack', 0.05205097646133429),
  ('costume set', 0.04590661038739877),
  ('costume dress', 0.031602172754677166),
  ('set', 0.03129334438121195),
  ('costume set pack', 0.027618189854196135),
  ('set pack', 0.026562096054323515),
  (

In [22]:
# Clothing-category breakdown of the rows assigned to topic 25.
shein_df.loc[shein_df['topic'] == 25, 'clothing_type'].value_counts()

clothing_type
beachwear    23
costumes      1
Name: count, dtype: int64

In [23]:
# Number of rows assigned to each topic id.
shein_df['topic'].value_counts()

topic
 0     796
-1     306
 1     276
 2     190
 3     177
 4     151
 5     142
 6     119
 7     112
 8     109
 9      94
 10     89
 11     87
 12     83
 13     76
 14     67
 15     55
 16     54
 17     53
 18     51
 19     39
 20     37
 21     30
 22     28
 23     27
 24     26
 25     24
Name: count, dtype: int64

In [24]:
# Write the frame, now including the topic column, next to the input data.
# `path` already ends with '/', so the file name is appended directly; the
# earlier f'{path}/...' form produced a doubled slash ('../data//...').
shein_df.to_csv(path + 'shein_data_with_topics.csv', index=False)

In [26]:
# Save the topic model to disk; save_embedding_model=False is passed through
# to BERTopic's save method.
model.save(
    "../models/topic_model",
    save_embedding_model=False,
)