In [108]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

import numpy as np
import random

import plotly.express as px

In [109]:
# Directory (relative to the notebook) that holds the input data.
path = './data/'
# Read the product table from the CSV stored in that directory.
shein_df = pd.read_csv(f'{path}shein_data_clean.csv')

In [110]:
# Normalise product titles before vectorisation: lower-case them, then drop the
# literal brand token 'shein'. Done as one vectorised `.str` chain so the cell
# is a single idempotent assignment and missing titles propagate as NaN instead
# of raising AttributeError from a Python-level `.replace` on a float.
shein_df['title_preprocessed'] = (
    shein_df['title']
    .str.lower()
    # regex=False: plain substring removal, independent of the pandas default.
    .str.replace('shein', '', regex=False)
)

In [111]:
# Bag-of-words vectoriser for the preprocessed titles:
#   max_df=0.90 -> drop terms that occur in more than 90% of the documents
#   min_df=2    -> drop terms that occur in fewer than 2 documents
#   stop_words  -> remove scikit-learn's built-in English stop words
cv = CountVectorizer(max_df=0.90, min_df=2, stop_words='english')

In [112]:
# Learn the vocabulary from the preprocessed titles and build the
# document-term count matrix (one row per title, one column per kept term).
dtm = cv.fit_transform(shein_df['title_preprocessed'])

In [118]:
# Number of topics for the LDA model.
# NOTE(review): nothing visible here explains why 26 was chosen - document it.
n_components = 26
# random_state is fixed so repeated fits produce the same topics.
LDA = LatentDirichletAllocation(n_components=n_components,random_state=42)

In [119]:
# Fit the topic model on the document-term matrix; the fitted estimator is
# the cell's last expression, so its repr is shown as the output.
LDA.fit(dtm)

LatentDirichletAllocation(n_components=26, random_state=42)

In [120]:
# Size of the fitted vocabulary (number of columns in the document-term matrix).
# `vocabulary_` maps each kept term to its column index, so its length equals
# the feature count. Unlike `get_feature_names()` (deprecated in scikit-learn
# 1.0, removed in 1.2) it is available on every scikit-learn version.
len(cv.vocabulary_)

589

In [121]:
# Build a long-format summary of the strongest terms per topic: one row per
# (topic, term) pair with the term's weight taken from LDA.components_.
n_important_words = 8

# Resolve the vocabulary once, outside the loop. `get_feature_names()` was
# deprecated in scikit-learn 1.0 and removed in 1.2, so prefer
# `get_feature_names_out()` when it exists and fall back on older releases.
_get_feature_names = getattr(cv, 'get_feature_names_out', None) or cv.get_feature_names
feature_names = np.asarray(_get_feature_names(), dtype=object)

topic_frames = []
for i in range(n_components):
    weights = LDA.components_[i]

    # argsort is ascending, so the trailing slice holds the positions of the
    # highest-weighted terms, ordered from least to most important.
    index = weights.argsort()[-n_important_words:]

    topic_frames.append(pd.DataFrame({
        # Index the weights with the same positions instead of sorting twice.
        'importance': weights[index],
        # Scalar is broadcast to every row of this topic's frame.
        'topic': i,
        'feature': feature_names[index],
    }))

# Concatenate once at the end rather than growing the frame on every iteration.
lda_summary = pd.concat(topic_frames, ignore_index=True)

In [122]:
# Horizontal bar charts of the top terms for topics 0-8, one facet per topic
# laid out three facets per row.
first_nine_topics = lda_summary[lda_summary['topic'].isin(range(9))]
fig = px.bar(
    first_nine_topics,
    x='importance',
    y='feature',
    color='topic',
    facet_col='topic',
    facet_col_wrap=3,
    facet_col_spacing=0.1,
    height=700,
)

# Give every facet its own independent axes with visible tick labels and no
# axis title.
axis_options = dict(matches=None, showticklabels=True, title='')
fig.update_yaxes(**axis_options)
fig.update_xaxes(**axis_options)

# Hide the colour bar; the last expression returns the figure for display.
fig.update_layout(coloraxis_showscale=False)

In [123]:
# Project every title onto the fitted topics: one row per document,
# one column per topic.
topic_results = LDA.transform(dtm)

In [99]:
# Label each product with the index of its highest-scoring topic
# (argmax across the topic columns of each row).
# NOTE(review): in the saved notebook this cell's execution count is lower than
# the model cells above it - Restart & Run All so the labels match the model.
shein_df['topic_lda'] = topic_results.argmax(axis=1)

In [100]:
# Number of products assigned to each topic, most populated topic first.
shein_df['topic_lda'].value_counts()

6     236
19    231
12    207
16    201
22    197
7     185
2     151
8     150
11    141
3     138
23    136
15    131
13    126
21    121
10    116
1     108
5      94
20     88
18     86
24     82
17     81
25     76
4      74
9      55
14     46
0      41
Name: topic_lda, dtype: int64