In [1]:
import csv
import pandas as pd
import re
from nltk.corpus import stopwords  
from nltk.tokenize import word_tokenize
from gensim.test.utils import common_texts
from gensim.corpora.dictionary import Dictionary
from gensim.models.ldamodel import LdaModel
import pyLDAvis
import pyLDAvis.gensim_models

In [2]:
df = pd.read_csv('id_df_1000_new_tranlated.csv', sep='\001', quoting=csv.QUOTE_NONE, encoding='utf-8')
df.head()

Unnamed: 0,uid,tweetid,txt,translation
0,13660,1135112490587574272,RT @onlymi97: 현장에 커피는 돌릴 수 없지만 알티 이벤트는 할 수 있습니...,RT @onlymi97: Coffee cannot be turned around o...
1,13660,1136278004517052416,RT @midampic: BLOOMIES!! WE MADE IT!!! ❤️❤️#프로...,RT @midampic: BLOOMIES!! WE MADE IT!!! ❤️❤️ #P...
2,13660,1132672414519705601,"""RT @9JuMoS2: ❤️RT 부탁드려요❤️스타쉽 구정모 연습생이 최종 데뷔가 ...","""RT @9JuMoS2: ❤️Please ask RT If❤️the final de..."
3,13660,1128679693119438849,RT @pooh_haha_: 💗도저히 모르겠는 이미담 = 도모담 타래💗,"RT @pooh_haha_: 💗I don't know, I don't know, I..."
4,13660,1134599718976118784,"""RT @Rown1102: 더 높게 올라갈만한데더 올라가야되는데미담이 잘했는데엄청 ...","""RT @Rown1102: I'm going to go up higher, but ..."


In [3]:
twitter_trans = df['translation']
# twitter_trans[3559]  

In [5]:
# preprocess
def preprocess_data(trans):
    twitter_list = []
    stopwords_english = stopwords.words('english') 
    for tweet in twitter_trans:  
        tweet = re.sub(r'@[^\s\n\r]+', '', tweet) 
        tweet = re.sub(r'[Hh]ttps?://[^\s\n\r]+', '', tweet) 
        tweet = re.sub(r'[-+,.()""/:]*', '', tweet) 
        tweet = re.sub(r'RT[\s]+', '', tweet)  
        tweet = tweet.split()
        tokens = [w for w in tweet if not w.lower() in stopwords_english]
        twitter_list.append(tokens)
    return twitter_list

twitter_list = preprocess_data(twitter_trans)
twitter_list

[['Coffee',
  'cannot',
  'turned',
  'around',
  'site',
  'alti',
  'events',
  'done',
  'AAPY',
  'Please',
  'remember',
  'trainees',
  'already',
  'announced',
  'winners',
  'spared',
  'small',
  'vote',
  'second',
  'launch',
  'day'],
 ['BLOOMIES!!',
  'MADE',
  'IT!!!',
  '❤️❤️',
  '#ProduceX101',
  '#Produce_X_101',
  '#이미담',
  '#LEEMIDAM',
  '#ProduceXMidam'],
 ['❤️Please',
  'ask',
  'If❤️the',
  'final',
  'debut',
  'confirmed',
  'give',
  'one',
  'alti',
  'AirPod',
  'gift❌',
  'deserves',
  'make',
  'debut'],
 ['💗I', 'know', 'know', "don't💗", 'know'],
 ["I'm",
  'going',
  'go',
  'higher',
  "I'm",
  'good',
  "I'm",
  'trainee',
  "who's",
  '36th',
  'place',
  'based',
  'students',
  "I'm",
  'laughing',
  'low',
  'ranking',
  'good'],
 ["I'm", 'today', 'to🤣', 'say', 'most~~', '😢'],
 ['Genius', 'among', 'trainees??', 'Yimyam~~I', 'already', 'know'],
 ['💍',
  '#비주얼센터',
  'Trainee',
  'Poll💍🔻Results🔻🥇',
  '#김우석',
  '#KIMWOOSEOK🥈',
  '#남도현',
  '#NAMDOHYUN🥉',

In [6]:
lda_topics = 10

# Train an LDA model 
common_dictionary = Dictionary(twitter_list)
common_corpus = [common_dictionary.doc2bow(text) for text in twitter_list]

# Train the model on the corpus.
lda = LdaModel(common_corpus, num_topics=lda_topics,id2word = common_dictionary)
# Print 10 topics and first few(num_words) words.
lda.print_topics(num_topics=lda_topics,num_words =10)

[(0,
  '0.010*"place" + 0.010*"day" + 0.009*"real" + 0.008*"|" + 0.008*"get" + 0.007*"see" + 0.007*"Seoul" + 0.007*"great" + 0.006*"room" + 0.006*"Music"'),
 (1,
  '0.022*"Thank" + 0.012*"de" + 0.010*"life" + 0.010*"que" + 0.008*"Bulletproof" + 0.007*"love" + 0.007*"coming" + 0.006*"Yoon" + 0.006*"always" + 0.006*"thank"'),
 (2,
  '0.016*"want" + 0.010*"put" + 0.009*"2018" + 0.009*"One" + 0.009*"little" + 0.007*"run" + 0.007*"year" + 0.006*"Korea" + 0.006*"head" + 0.006*"see"'),
 (3,
  '0.013*"#BTS" + 0.009*"#방탄소년단" + 0.006*"comes" + 0.006*"ago" + 0.006*"little" + 0.006*"na" + 0.005*"Book" + 0.005*"#NCT" + 0.005*"used" + 0.005*"#JIMIN"'),
 (4,
  '0.084*"haha" + 0.077*"ha" + 0.056*"I\'m" + 0.029*"going" + 0.018*"like" + 0.013*"good" + 0.011*"see" + 0.011*"want" + 0.011*"go" + 0.010*"know"'),
 (5,
  '0.014*"@" + 0.011*"#EXO" + 0.011*"eat" + 0.007*"image" + 0.007*"thing" + 0.007*"#엑소" + 0.006*"1000" + 0.006*"Love" + 0.005*"black" + 0.005*"body"'),
 (6,
  '0.013*"Kim" + 0.012*"one" + 0.011

In [7]:
# Calculate the probabilities.
twitter_cop = [common_dictionary.doc2bow(text) for text in twitter_list]
topic_ev = lda.get_document_topics(twitter_cop,minimum_probability=0)
probabilities = [ [entry[1] for entry in doc] for doc in topic_ev ]
twitter_lda_score = {}
for i in range(lda_topics):  
    score = 0
    for docno in probabilities:
        score += docno[i]
    twitter_lda_score[i] = score/len(probabilities)
twitter_lda_order=sorted(twitter_lda_score.items(),key=lambda x:x[1],reverse=True) 
twitter_lda_order

[(4, 0.20012340143920743),
 (6, 0.12525427140945722),
 (2, 0.10388033616874966),
 (1, 0.10296261490998623),
 (0, 0.10290947791057853),
 (7, 0.10106021776809086),
 (3, 0.07843656607518339),
 (5, 0.06720109709921905),
 (9, 0.06335474623339254),
 (8, 0.054817273724489766)]

In [8]:
# Visualize the topics.
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(lda, common_corpus, common_dictionary)
vis

  by='saliency', ascending=False).head(R).drop('saliency', 1)
