In [49]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.manifold import TSNE

from nltk.stem.porter import *
from nltk.stem import WordNetLemmatizer
import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')
from nltk.corpus import wordnet 

import pyLDAvis
import pyLDAvis.sklearn
pyLDAvis.enable_notebook()
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

import gensim


[nltk_data] Downloading package wordnet to /home/traffic/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/traffic/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


# Bag Of Words

### Preprocessing 

In [22]:
gensim.utils.simple_preprocess('you are a fucking licking verfickter . motherfucker')

['you', 'are', 'fucking', 'licking', 'verfickter', 'motherfucker']

Index(['date_post', 'date_extraction', 'influencer', 'post_type', 'post_url',
       'platform', 'comment', 'comment_likecount', 'replies', 'BU;;;;;;;;;'],
      dtype='object')

In [30]:
df = pd.read_csv('analisis_comments_tiktok.csv')# Parameters tuning using Grid Search

def lemmatize_stemming(text):
    stemmer = PorterStemmer()
    lemmatizer = WordNetLemmatizer()
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))
# Tokenize and lemmatize
def preprocess(text):
    result=[]
    for token in gensim.utils.simple_preprocess(text) :
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(lemmatize_stemming(token))
            
    return result

comments = df.comment.tolist()
comments_prep = [preprocess(str(comment)) for comment in comments]
comments_dict = gensim.corpora.Dictionary(comments_prep)
bow_corpus = [comments_dict.doc2bow(doc) for doc in comments_prep]

In [35]:
'''
Checking dictionary created
'''
count = 0
for k, v in comments_dict.iteritems():
    print(k, v)
    count += 1
    if count > 10:
        break

0 appar
1 chegg
2 good
3 rat
4 student
5 audri
6 differ
7 hope
8 look
9 absolut
10 love


### Training the model

In [41]:
lda_model =  gensim.models.LdaMulticore(bow_corpus, 
                                   num_topics = 4, 
                                   id2word = comments_dict,                                    
                                   passes = 10,
                                   workers = 2)
for idx, topic in lda_model.print_topics(-1):
    print("Topic: {} \nWords: {}".format(idx, topic ))
    print("\n")

Topic: 0 
Words: 0.043*"video" + 0.032*"love" + 0.012*"watch" + 0.012*"amaz" + 0.011*"repli" + 0.010*"like" + 0.010*"year" + 0.008*"hiii" + 0.008*"food" + 0.008*"youtub"


Topic: 1 
Words: 0.020*"chegg" + 0.017*"sydney" + 0.017*"love" + 0.017*"help" + 0.010*"look" + 0.010*"healthi" + 0.009*"colleg" + 0.009*"happi" + 0.009*"time" + 0.008*"pay"


Topic: 2 
Words: 0.025*"school" + 0.023*"chegg" + 0.017*"like" + 0.013*"think" + 0.012*"colleg" + 0.012*"help" + 0.011*"free" + 0.010*"person" + 0.009*"great" + 0.008*"guy"


Topic: 3 
Words: 0.042*"thank" + 0.042*"earli" + 0.040*"love" + 0.019*"know" + 0.011*"liter" + 0.010*"class" + 0.009*"question" + 0.009*"math" + 0.008*"nice" + 0.008*"colleg"




### Testing the model

In [46]:
unseen_document = 'I hate Chegg. What a failure!'# Data preprocessing step for the unseen document
bow_vector = comments_dict.doc2bow(preprocess(unseen_document))

for index, score in sorted(lda_model[bow_vector], key=lambda tup: -1*tup[1]):
    print(f"Topic: {index} Score: {score}\t, Topic detail:{lda_model.print_topic(index, 5)}")

Topic: 2 Score: 0.7456414103507996	, Topic detail:0.025*"school" + 0.023*"chegg" + 0.017*"like" + 0.013*"think" + 0.012*"colleg"
Topic: 1 Score: 0.0861017256975174	, Topic detail:0.020*"chegg" + 0.017*"sydney" + 0.017*"love" + 0.017*"help" + 0.010*"look"
Topic: 0 Score: 0.0841335654258728	, Topic detail:0.043*"video" + 0.032*"love" + 0.012*"watch" + 0.012*"amaz" + 0.011*"repli"
Topic: 3 Score: 0.08412332832813263	, Topic detail:0.042*"thank" + 0.042*"earli" + 0.040*"love" + 0.019*"know" + 0.011*"liter"


# Vectorized Words 

### Preprocessing

In [54]:
docs_raw = df.comment.fillna('').tolist()
print(len(docs_raw))
tf_vectorizer = CountVectorizer(strip_accents = 'unicode',
                                stop_words = 'english',
                                lowercase = True,
                                token_pattern = r'\b[a-zA-Z]{3,}\b',
                                max_df = 0.5, 
                                min_df = 10)
dtm_tf = tf_vectorizer.fit_transform(docs_raw)
tfidf_vectorizer = TfidfVectorizer(**tf_vectorizer.get_params())
dtm_tfidf = tfidf_vectorizer.fit_transform(docs_raw)

1096




### Training The Model 

In [59]:
# for TF DTM
lda_tf = LatentDirichletAllocation(n_components=6, random_state=0)
lda_tf.fit(dtm_tf)
# for TFIDF DTM
lda_tfidf = LatentDirichletAllocation(n_components=6, random_state=0)
lda_tfidf.fit(dtm_tfidf)

LatentDirichletAllocation(n_components=6, random_state=0)

### Visualizing The Result 

In [60]:
pyLDAvis.sklearn.prepare(lda_tf, dtm_tf, tf_vectorizer)

  default_term_info = default_term_info.sort_values(
