# Finding Themes in Indonesian Twitter with LSA and LDA

In [1]:
import pandas as pd
data = pd.read_csv('tweets.csv')
print(data.head())

                                           isi_tweet  sentimen
0  tidak setuju jokowi jadi cawapres capres jokow...         1
1  capres jokowi wacapres abraham samad gubernur ...         1
2  capres prabowo dan cawapres jokowi dan gubdki ...         1
3  jadi skenarionya gini 2014 biar prabowo jadi p...         1
4  sby mantan tni dan calon presiden prabowo subi...         1


In [2]:
data_text = data[['isi_tweet']]
data_text['index'] = data_text.index
tweets = data_text
print(tweets.head())
print("\n{} tweets in dataset".format(len(tweets)))

                                           isi_tweet  index
0  tidak setuju jokowi jadi cawapres capres jokow...      0
1  capres jokowi wacapres abraham samad gubernur ...      1
2  capres prabowo dan cawapres jokowi dan gubdki ...      2
3  jadi skenarionya gini 2014 biar prabowo jadi p...      3
4  sby mantan tni dan calon presiden prabowo subi...      4

1846 tweets in dataset


In [3]:
import csv
import re
import gensim
from gensim import corpora, models
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
import numpy as np
import Sastrawi
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
np.random.seed(45)

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/brianfrieerich/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [7]:
factory = StemmerFactory()
indoStemmer = factory.create_stemmer()

def indoStem(text):
    stemmed = indoStemmer.stem(text)
    return stemmed

sentences = ["Mereka meniru-nirukannya", "Saya pembantu", "Dia dipanggil oleh wanita tercantik"]

for sentence in sentences:
    print("-----\nOriginal: {}".format(sentence))
    output = indoStem(sentence)
    print("\nStemmed: {}".format(output))

-----
Original: Mereka meniru-nirukannya

Stemmed: mereka tiru
-----
Original: Saya pembantu

Stemmed: saya bantu
-----
Original: Dia dipanggil oleh wanita tercantik

Stemmed: dia panggil oleh wanita cantik


In [8]:
stopwords_list = []
with open('stopwords.csv', 'r') as f:
    reader = csv.reader(f)
    stopwords_list = list(reader) 
    flat_stoplist = [item for sublist in stopwords_list for item in sublist]

def preprocess(text):
    result=[]
    text = re.sub("[^a-zA-Z]+", " ", text)
    stemmed = indoStem(text)
    for word in stemmed.split(' '):
        if word not in flat_stoplist:
            result.append(word)
    return result

In [9]:
processed_tweets = list(map(preprocess, tweets['isi_tweet']))
processed_tweets[:5]

[['tuju', 'jokowi', 'cawapres', 'capres', 'jokowi', 'harga', 'mati'],
 ['capres',
  'jokowi',
  'wacapres',
  'abraham',
  'samad',
  'gubernur',
  'ahok',
  'koruptor',
  'abissss'],
 ['capres',
  'prabowo',
  'cawapres',
  'jokowi',
  'gubdki',
  'ahok',
  'mantap',
  'presiden',
  'sby',
  'bubar',
  'fpi'],
 ['skenario',
  'gin',
  'biar',
  'prabowo',
  'presiden',
  'jokowi',
  'tetepgubernur',
  'jakarta',
  'hasil',
  'nunggu',
  'gantiin',
  'prabowo'],
 ['sby',
  'mantan',
  'tni',
  'calon',
  'presiden',
  'prabowo',
  'subianto',
  'mantan',
  'kopassus',
  'anggoto',
  'tni',
  'disiplin',
  'smw',
  'presiden']]

In [10]:
dictionary = gensim.corpora.Dictionary(processed_tweets)

In [11]:
dictionary.filter_extremes(no_below = 15, no_above = 0.1)

In [12]:
bow_corpus = [dictionary.doc2bow(twt) for twt in processed_tweets]
bow_corpus[15]

[(2, 1), (3, 1), (20, 2), (22, 1)]

In [13]:
bow_tweet_14 = bow_corpus[15]
for i in range(len(bow_tweet_14)):
    print("Word {} (\"{}\") appears {} time.".format(bow_tweet_14[i][0], 
                                                     dictionary[bow_tweet_14[i][0]], 
                                                     bow_tweet_14[i][1]))

Word 2 ("ahok") appears 1 time.
Word 3 ("gubernur") appears 1 time.
Word 20 ("ya") appears 2 time.
Word 22 ("indonesia") appears 1 time.


In [14]:
tfidf = models.TfidfModel(bow_corpus)
corpus_tfidf = tfidf[bow_corpus]
from pprint import pprint
for doc in corpus_tfidf:
    pprint(doc)
    break

[(0, 0.5104769343386324), (1, 0.8598914463513587)]


In [16]:
num_topics = 3

In [18]:
lsa_model = gensim.models.LsiModel(corpus_tfidf, 
                                   num_topics = num_topics, 
                                   id2word=dictionary)
for idx, topic in lsa_model.print_topics(-1):
    print("Topic: {} \nWords: {}".format(idx, topic))
    print("\n")

Topic: 0 
Words: 0.381*"jk" + 0.374*"hatta" + 0.259*"gubernur" + 0.225*"nyata" + 0.217*"tokoh" + 0.212*"cawapres" + 0.203*"sosok" + 0.200*"populer" + 0.200*"bincang" + 0.197*"ketua"


Topic: 1 
Words: -0.420*"hatta" + 0.319*"gubernur" + 0.297*"nyata" + 0.286*"sosok" + 0.283*"populer" + 0.282*"bincang" + 0.282*"tokoh" + -0.214*"buka" + -0.211*"pan" + -0.207*"ketua"


Topic: 2 
Words: -0.644*"jk" + 0.227*"hatta" + -0.215*"dahlan" + -0.205*"arb" + 0.188*"sosok" + 0.187*"gubernur" + 0.186*"populer" + 0.185*"bincang" + 0.169*"tokoh" + 0.168*"nyata"




In [19]:
lda_model = gensim.models.LdaMulticore(bow_corpus, 
                                       num_topics = num_topics, 
                                       id2word = dictionary, 
                                       passes = 2, 
                                       workers = 4)
for idx, topic in lda_model.print_topics(-1):
    print("Topic: {} \nWords: {}".format(idx, topic))
    print("\n")

Topic: 0 
Words: 0.078*"dahlan" + 0.061*"arb" + 0.046*"pilih" + 0.038*"iskan" + 0.035*"rakyat" + 0.030*"wiranto" + 0.028*"dukung" + 0.027*"jk" + 0.024*"ya" + 0.020*"konvensi"


Topic: 1 
Words: 0.071*"jk" + 0.045*"gubernur" + 0.044*"calon" + 0.031*"tokoh" + 0.031*"nyata" + 0.030*"mahfud" + 0.028*"indonesia" + 0.027*"pdip" + 0.026*"sosok" + 0.025*"populer"


Topic: 2 
Words: 0.112*"hatta" + 0.052*"ketua" + 0.051*"cawapres" + 0.044*"pan" + 0.044*"buka" + 0.039*"evaluasi" + 0.039*"pencapresan" + 0.038*"radjasa" + 0.026*"survey" + 0.020*"tweet"




In [20]:
lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, 
                                             num_topics = num_topics, 
                                             id2word = dictionary, 
                                             passes = 2, 
                                             workers=4)
                                             
for idx, topic in lda_model_tfidf.print_topics(-1):
    print("Topic: {} \nWords: {}".format(idx, topic))
    print("\n")

Topic: 0 
Words: 0.068*"jk" + 0.041*"dukung" + 0.038*"wiranto" + 0.032*"mahfud" + 0.032*"pdip" + 0.026*"mega" + 0.022*"md" + 0.022*"kalah" + 0.020*"ical" + 0.020*"menang"


Topic: 1 
Words: 0.046*"hatta" + 0.038*"calon" + 0.037*"indonesia" + 0.033*"pilih" + 0.026*"wapres" + 0.026*"jakarta" + 0.025*"ketua" + 0.023*"ya" + 0.022*"buka" + 0.021*"tweet"


Topic: 2 
Words: 0.051*"dahlan" + 0.043*"arb" + 0.033*"rakyat" + 0.030*"nyata" + 0.029*"iskan" + 0.028*"gubernur" + 0.026*"sby" + 0.026*"sosok" + 0.025*"tokoh" + 0.025*"maju"




In [22]:
document_num = 13
print(data.iloc[document_num, 0])

print(processed_tweets[document_num])





prabowo subianto vs joko widodo no prabowo subianto presiden jokowi wapres yes aminin yaaaaa allah
['prabowo', 'subianto', 'vs', 'joko', 'widodo', 'no', 'prabowo', 'subianto', 'presiden', 'jokowi', 'wapres', 'yes', 'aminin', 'yaaaaa', 'allah']


In [23]:
for index, score in sorted(lsa_model[corpus_tfidf[document_num]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lsa_model.print_topic(index, 10)))


Score: 0.037299727710397364	 
Topic: 0.381*"jk" + 0.374*"hatta" + 0.259*"gubernur" + 0.225*"nyata" + 0.217*"tokoh" + 0.212*"cawapres" + 0.203*"sosok" + 0.200*"populer" + 0.200*"bincang" + 0.197*"ketua"

Score: -0.0010628898567282214	 
Topic: -0.420*"hatta" + 0.319*"gubernur" + 0.297*"nyata" + 0.286*"sosok" + 0.283*"populer" + 0.282*"bincang" + 0.282*"tokoh" + -0.214*"buka" + -0.211*"pan" + -0.207*"ketua"

Score: -0.03145815454014468	 
Topic: -0.644*"jk" + 0.227*"hatta" + -0.215*"dahlan" + -0.205*"arb" + 0.188*"sosok" + 0.187*"gubernur" + 0.186*"populer" + 0.185*"bincang" + 0.169*"tokoh" + 0.168*"nyata"


In [24]:
for index, score in sorted(lda_model[bow_corpus[document_num]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model.print_topic(index, 10)))


Score: 0.8833066821098328	 
Topic: 0.112*"hatta" + 0.052*"ketua" + 0.051*"cawapres" + 0.044*"pan" + 0.044*"buka" + 0.039*"evaluasi" + 0.039*"pencapresan" + 0.038*"radjasa" + 0.026*"survey" + 0.020*"tweet"

Score: 0.06438399851322174	 
Topic: 0.071*"jk" + 0.045*"gubernur" + 0.044*"calon" + 0.031*"tokoh" + 0.031*"nyata" + 0.030*"mahfud" + 0.028*"indonesia" + 0.027*"pdip" + 0.026*"sosok" + 0.025*"populer"

Score: 0.052309323102235794	 
Topic: 0.078*"dahlan" + 0.061*"arb" + 0.046*"pilih" + 0.038*"iskan" + 0.035*"rakyat" + 0.030*"wiranto" + 0.028*"dukung" + 0.027*"jk" + 0.024*"ya" + 0.020*"konvensi"


In [25]:
for index, score in sorted(lda_model_tfidf[corpus_tfidf[document_num]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model_tfidf.print_topic(index, 10)))


Score: 0.4878930151462555	 
Topic: 0.046*"hatta" + 0.038*"calon" + 0.037*"indonesia" + 0.033*"pilih" + 0.026*"wapres" + 0.026*"jakarta" + 0.025*"ketua" + 0.023*"ya" + 0.022*"buka" + 0.021*"tweet"

Score: 0.4027964174747467	 
Topic: 0.051*"dahlan" + 0.043*"arb" + 0.033*"rakyat" + 0.030*"nyata" + 0.029*"iskan" + 0.028*"gubernur" + 0.026*"sby" + 0.026*"sosok" + 0.025*"tokoh" + 0.025*"maju"

Score: 0.10931061953306198	 
Topic: 0.068*"jk" + 0.041*"dukung" + 0.038*"wiranto" + 0.032*"mahfud" + 0.032*"pdip" + 0.026*"mega" + 0.022*"md" + 0.022*"kalah" + 0.020*"ical" + 0.020*"menang"


In [28]:
fake_tweet = "Saya mendukung JK dan Kalla! PDI-P selamanya!"
bow_vector = dictionary.doc2bow(preprocess(fake_tweet))

for index, score in sorted(lda_model_tfidf[bow_vector], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model_tfidf.print_topic(index, 10)))


Score: 0.8642910718917847	 
Topic: 0.068*"jk" + 0.041*"dukung" + 0.038*"wiranto" + 0.032*"mahfud" + 0.032*"pdip" + 0.026*"mega" + 0.022*"md" + 0.022*"kalah" + 0.020*"ical" + 0.020*"menang"

Score: 0.06800001859664917	 
Topic: 0.051*"dahlan" + 0.043*"arb" + 0.033*"rakyat" + 0.030*"nyata" + 0.029*"iskan" + 0.028*"gubernur" + 0.026*"sby" + 0.026*"sosok" + 0.025*"tokoh" + 0.025*"maju"

Score: 0.06770885735750198	 
Topic: 0.046*"hatta" + 0.038*"calon" + 0.037*"indonesia" + 0.033*"pilih" + 0.026*"wapres" + 0.026*"jakarta" + 0.025*"ketua" + 0.023*"ya" + 0.022*"buka" + 0.021*"tweet"
