In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.manifold import TSNE

from nltk.stem.porter import *
from nltk.stem import WordNetLemmatizer
import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')
from nltk.corpus import wordnet 

import pyLDAvis
import pyLDAvis.sklearn
pyLDAvis.enable_notebook()
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

import gensim


[nltk_data] Downloading package wordnet to /home/traffic/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/traffic/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


# Bag Of Words

### Preprocessing 

In [8]:
df = pd.read_csv('analisis_comments_tiktok.csv')# Parameters tuning using Grid Search

def lemmatize_stemming(text):
    stemmer = PorterStemmer()
    lemmatizer = WordNetLemmatizer()
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))
# Tokenize and lemmatize
def preprocess(text):
    result=[]
    for token in gensim.utils.simple_preprocess(text) :
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(lemmatize_stemming(token))
            
    return result

comments = df.comment.tolist()
comments_prep = [preprocess(str(comment)) for comment in comments]
comments_dict = gensim.corpora.Dictionary(comments_prep)
bow_corpus = [comments_dict.doc2bow(doc) for doc in comments_prep]

In [9]:
comments

[nan,
 'it’s not good chegg apparently rats out on students',
 nan,
 'I hope Audri looks different at that age',
 'I absolutely love it!',
 'so this is what Audri will look like as a Freshman in College? 🤣🤣🤣🤣🤣',
 'Oh my I hope not 😂',
 'Math is hard 😳😳😳',
 'This was me. I cannot MATH',
 'I can history all day long but I cannot math or science… and sometimes I can’t English 😂 what about you?!?',
 'This is so neat! @Chegg for the win!',
 "That is a 7th grade problem😑 Won't work on my maths.",
 'Why you holding a charger up to your mouth',
 'I thought you were a nun',
 'same oh my gosh is this an actual sponsor thing or a joke',
 'wait how am i so early',
 'GET 5 DOLLARS OFF your first month of Chegg Study Pack using the L1NK IN MY B1O!!!!📚🌙✨',
 'Early',
 'I thought this was a fake add-',
 'I can never tell if your joking or not HELP-',
 nan,
 '🥰💝🇺🇸 no 🇺🇸🇬🇧🇨🇦😄😱🥰',
 'YI殺Where are you guys❤️',
 'chegg is terrible. they log your ip and send it to schools. so many kids have been honor coded c

In [10]:
'''
Checking dictionary created
'''
count = 0
for k, v in comments_dict.iteritems():
    print(k, v)
    count += 1
    if count > 10:
        break

0 appar
1 chegg
2 good
3 rat
4 student
5 audri
6 differ
7 hope
8 look
9 absolut
10 love


### Training the model

In [11]:
lda_model =  gensim.models.LdaMulticore(bow_corpus, 
                                   num_topics = 4, 
                                   id2word = comments_dict,                                    
                                   passes = 10,
                                   workers = 2)
for idx, topic in lda_model.print_topics(-1):
    print("Topic: {} \nWords: {}".format(idx, topic ))
    print("\n")

Topic: 0 
Words: 0.055*"love" + 0.048*"video" + 0.023*"like" + 0.018*"look" + 0.017*"good" + 0.016*"colleg" + 0.012*"great" + 0.011*"sydney" + 0.011*"watch" + 0.009*"ayyyi"


Topic: 1 
Words: 0.029*"chegg" + 0.020*"love" + 0.018*"think" + 0.014*"school" + 0.010*"happi" + 0.009*"amaz" + 0.008*"like" + 0.008*"know" + 0.007*"class" + 0.007*"kind"


Topic: 2 
Words: 0.019*"help" + 0.016*"chegg" + 0.016*"math" + 0.010*"liter" + 0.010*"free" + 0.010*"love" + 0.009*"year" + 0.009*"want" + 0.008*"friend" + 0.008*"peopl"


Topic: 3 
Words: 0.045*"earli" + 0.045*"thank" + 0.013*"repli" + 0.013*"know" + 0.009*"work" + 0.009*"come" + 0.009*"get" + 0.009*"beauti" + 0.008*"colleg" + 0.008*"thing"




### Testing the model

In [12]:
unseen_document = 'I hate Chegg. What a failure!'# Data preprocessing step for the unseen document
bow_vector = comments_dict.doc2bow(preprocess(unseen_document))

for index, score in sorted(lda_model[bow_vector], key=lambda tup: -1*tup[1]):
    print(f"Topic: {index} Score: {score}\t, Topic detail:{lda_model.print_topic(index, 5)}")

Topic: 1 Score: 0.7441675662994385	, Topic detail:0.029*"chegg" + 0.020*"love" + 0.018*"think" + 0.014*"school" + 0.010*"happi"
Topic: 2 Score: 0.08892345428466797	, Topic detail:0.019*"help" + 0.016*"chegg" + 0.016*"math" + 0.010*"liter" + 0.010*"free"
Topic: 3 Score: 0.08350317925214767	, Topic detail:0.045*"earli" + 0.045*"thank" + 0.013*"repli" + 0.013*"know" + 0.009*"work"
Topic: 0 Score: 0.08340579271316528	, Topic detail:0.055*"love" + 0.048*"video" + 0.023*"like" + 0.018*"look" + 0.017*"good"


# TFIDF 

### Preprocessing

In [13]:
docs_raw = df.comment.fillna('').tolist()
print(len(docs_raw))
tf_vectorizer = CountVectorizer(strip_accents = 'unicode',
                                stop_words = 'english',
                                lowercase = True,
                                token_pattern = r'\b[a-zA-Z]{3,}\b',
                                max_df = 0.5, 
                                min_df = 10)
dtm_tf = tf_vectorizer.fit_transform(docs_raw)
tfidf_vectorizer = TfidfVectorizer(**tf_vectorizer.get_params())
dtm_tfidf = tfidf_vectorizer.fit_transform(docs_raw)

1096




In [14]:
dtm_tfidf[0]

<1x41 sparse matrix of type '<class 'numpy.float64'>'
	with 0 stored elements in Compressed Sparse Row format>

### Training The Model 

In [15]:
# for TF DTM
lda_tf = LatentDirichletAllocation(n_components=5, random_state=0)
lda_tf.fit(dtm_tf)
# for TFIDF DTM
lda_tfidf = LatentDirichletAllocation(n_components=5, random_state=0)
lda_tfidf.fit(dtm_tfidf)

LatentDirichletAllocation(n_components=5, random_state=0)

### Visualizing The Result 

In [16]:
pyLDAvis.sklearn.prepare(lda_tf, dtm_tf, tf_vectorizer)

  default_term_info = default_term_info.sort_values(


In [17]:
unseen_document = 'I hate Chegg. What a failure!'# Data preprocessing step for the unseen document
bow_vector = comments_dict.doc2bow(preprocess(unseen_document))

for index, score in sorted(lda_model[bow_vector], key=lambda tup: -1*tup[1]):
    print(f"Topic: {index} Score: {score}\t, Topic detail:{lda_model.print_topic(index, 5)}")

Topic: 1 Score: 0.7441678047180176	, Topic detail:0.029*"chegg" + 0.020*"love" + 0.018*"think" + 0.014*"school" + 0.010*"happi"
Topic: 2 Score: 0.08892323821783066	, Topic detail:0.019*"help" + 0.016*"chegg" + 0.016*"math" + 0.010*"liter" + 0.010*"free"
Topic: 3 Score: 0.08350318670272827	, Topic detail:0.045*"earli" + 0.045*"thank" + 0.013*"repli" + 0.013*"know" + 0.009*"work"
Topic: 0 Score: 0.08340580016374588	, Topic detail:0.055*"love" + 0.048*"video" + 0.023*"like" + 0.018*"look" + 0.017*"good"
