# Data Preparation

In [None]:
import numpy as np
import pandas as pd

import re, string, unicodedata #regular expression
import nltk
from nltk import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

In [None]:
#load the slang lookup table and turn it into a plain dict
#keys come from the 'slang' column, values from the 'formal' column
slang_dictionary = pd.read_csv('slang_dictionary.csv')
slang_dict = dict(zip(slang_dictionary['slang'], slang_dictionary['formal']))

In [None]:
#load the raw data set into a DataFrame
#(the 'cerita' column is the text that gets preprocessed later in this file)
df = pd.read_csv('data.csv')

# Preprocessing

In [None]:
#NLTK's Indonesian stopword list, stored as a set for fast membership tests
stop_words = set(stopwords.words('indonesian'))

In [None]:
#preprocessing functions

#transform slang words and abbreviations
def Slangwords(text, mapping=None):
    """Replace slang words and abbreviations in *text* with their formal form.

    Only whole whitespace-delimited tokens are replaced, so a slang entry
    that happens to occur inside a longer word is left untouched. The
    whitespace between tokens is preserved as-is.

    Args:
        text: the string to normalise.
        mapping: optional dict of slang token -> replacement. Defaults to
            the module-level ``slang_dict``.

    Returns:
        The text with every token found in the mapping replaced.
    """
    if mapping is None:
        mapping = slang_dict

    #substitute token by token; str.replace() would also rewrite substrings
    #of unrelated words and could re-replace text inserted earlier
    return re.sub(r'\S+', lambda m: mapping.get(m.group(0), m.group(0)), text)

#Stemming
#build one Sastrawi stemmer up front; it is reused for every row below
factory = StemmerFactory()
stemmer = factory.create_stemmer()

#remove stopwords
def RemoveStopwords(text):
    """Tokenize *text* and return it without the tokens found in ``stop_words``.

    The surviving tokens are joined back together with single spaces.
    """
    kept = []
    for token in word_tokenize(text):
        if token in stop_words:
            continue
        kept.append(token)
    return ' '.join(kept)

#clean text from special characters, digits, and turn them into lowercase
def Clean(text):
    """Normalise raw text: ASCII only, no punctuation, no digits, lowercase.

    Args:
        text: the string to clean.

    Returns:
        The cleaned string, with tokens separated by single spaces and no
        leading or trailing whitespace.
    """
    #remove non-ascii: decompose accented characters, then drop whatever
    #cannot be encoded as ASCII
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')

    #remove punctuations (every non-word character, plus the underscore)
    text = re.sub(r'[^\w]|_', ' ', text)

    #remove digits: drop every token that contains at least one digit
    #(this also covers purely numeric tokens, so no second pass is needed)
    text = re.sub(r'\S*\d\S*', '', text).strip()

    #turn all words to lowercase
    text = text.lower()

    #remove additional white spaces
    text = re.sub(r'\s+', ' ', text)

    return text

In [None]:
#new column for processed text data
#run the whole pipeline on every story: clean -> slang -> stem -> stopwords
#the column is assigned in one step instead of df['processed'][i] = ...,
#because that chained assignment may write to a temporary copy and does
#nothing at all under pandas copy-on-write
df['processed'] = [
    RemoveStopwords(stemmer.stem(Slangwords(Clean(cerita))))
    for cerita in df['cerita']
]

# SK-LEARN Process

In [None]:
import pandas as pd

from nltk import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from string import punctuation
from sklearn.feature_extraction.text import CountVectorizer
import matplotlib.pyplot as plt

#the preprocessed text column is the input of every vectorizer below
df_text = df['processed']

In [None]:
#bar chart of the 15 top words returned by get_top_n_words
#NOTE(review): final_stop_words and get_top_n_words are not defined in the
#visible part of this file (only stop_words is) - confirm where they come from
count_vectorizer = CountVectorizer(stop_words=final_stop_words, min_df=5)
words, word_values = get_top_n_words(n_top_words=15,
                                     count_vectorizer=count_vectorizer, 
                                     text_data=df_text)

#one bar per word, labelled vertically on the x axis
#NOTE(review): the title mentions a "headlines dataset" while the text comes
#from the 'cerita' column - confirm the wording is intended
fig, ax = plt.subplots(figsize=(16,8))
ax.bar(range(len(words)), word_values);
ax.set_xticks(range(len(words)));
ax.set_xticklabels(words, rotation='vertical');
ax.set_title('Top words in headlines dataset (excluding stop words)');
ax.set_xlabel('Word');
ax.set_ylabel('Number of occurences');
plt.show()

# Bow Method

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
#bag-of-words counts over unigrams and bigrams, tokenized with NLTK
#NOTE(review): final_stop_words is not defined in the visible part of this file
#NOTE(review): min_df=7 together with max_df=0.02 presumably needs a few
#hundred documents to be compatible - confirm against the data set size
bow = CountVectorizer(ngram_range=(1,2), tokenizer=word_tokenize, stop_words=final_stop_words, min_df=7, max_df=0.02)
bow_matrix = bow.fit_transform(df_text)

In [None]:
#vocabulary of the bag-of-words model, indexed by feature column
#get_feature_names() was removed in scikit-learn 1.2, so prefer its
#replacement and fall back only on old versions; list() keeps a plain list
if hasattr(bow, 'get_feature_names_out'):
    vocab_bow = list(bow.get_feature_names_out())
else:
    vocab_bow = bow.get_feature_names()
#number of words printed per topic by display_topics
no_top_words = 10

# TFIDF Method

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
#TF-IDF weights, built with the same settings as the bag-of-words vectorizer
#NOTE(review): final_stop_words is not defined in the visible part of this file
tfidf = TfidfVectorizer(ngram_range=(1,2), tokenizer=word_tokenize, stop_words=final_stop_words, min_df=7, max_df=0.02)
tfidf_matrix = tfidf.fit_transform(df_text)

In [None]:
#vocabulary of the TF-IDF model, indexed by feature column
#get_feature_names() was removed in scikit-learn 1.2, so prefer its
#replacement and fall back only on old versions; list() keeps a plain list
if hasattr(tfidf, 'get_feature_names_out'):
    vocab_tfidf = list(tfidf.get_feature_names_out())
else:
    vocab_tfidf = tfidf.get_feature_names()
#number of words printed per topic by display_topics
no_top_words = 10

# Modeling

In [None]:
def display_topics(model, vocab, no_top_words):
    """Print each topic of *model* followed by its highest-weighted words.

    Args:
        model: fitted decomposition exposing ``components_`` (one row of
            weights per topic).
        vocab: sequence mapping a feature index to its word.
        no_top_words: how many words to print per topic.
    """
    for number, weights in enumerate(model.components_):
        #feature indices ordered from the largest weight to the smallest
        ranked = weights.argsort()[::-1][:no_top_words]
        words = [vocab[position] for position in ranked]
        print("Topic %d:" % (number))
        print(" ".join(words))

In [None]:
def get_topic(model, vocab=None, n_top=6):
    """Return the top single-word terms of every topic in *model*.

    Args:
        model: fitted decomposition exposing ``components_`` (one row of
            weights per topic).
        vocab: sequence mapping a feature index to its term. Defaults to
            the module-level ``vocab_bow``; pass the matching vocabulary
            when the model was fit on a different vectorizer.
        n_top: number of highest-weighted features inspected per topic.

    Returns:
        One list per topic with the inspected terms, ordered by decreasing
        weight. Terms that are not purely alphanumeric (for example
        bigrams, which contain a space) are left out, so a list can hold
        fewer than ``n_top`` entries.
    """
    if vocab is None:
        vocab = vocab_bow

    return [
        [vocab[idx] for idx in comp.argsort()[::-1][:n_top] if vocab[idx].isalnum()]
        for comp in model.components_
    ]

# Latent Semantic Analysis (LSA)

In [None]:
#split data to 6 topics
from sklearn.decomposition import TruncatedSVD

#LSA: 6-component truncated SVD fitted on the bag-of-words matrix
#lsa_matrix is the fit_transform output for the same matrix
lsa = TruncatedSVD(n_components=6, random_state=42)
lsa_matrix = lsa.fit_transform(bow_matrix)

In [None]:
print(bow_matrix.shape) #input: bag-of-words matrix built from df_text
print(lsa_matrix.shape) #fit_transform output of the 6-component LSA model
print(lsa.components_.shape) #feature/topic matrix of the fitted model

In [None]:
#show the top single-word terms of each LSA topic (looked up in vocab_bow)
get_topic(lsa)

In [None]:
#print the no_top_words highest-weighted terms of each LSA topic
display_topics(lsa, vocab_bow, no_top_words)

# Latent Dirichlet Allocation (LDA)

In [None]:
#split data to 6 topics
from sklearn.decomposition import LatentDirichletAllocation

#LDA with 6 topics fitted on the bag-of-words matrix
#lda_matrix is the fit_transform output for the same matrix
lda = LatentDirichletAllocation(n_components=6, random_state=42)
lda_matrix = lda.fit_transform(bow_matrix)

In [None]:
#show the top single-word terms of each LDA topic (looked up in vocab_bow)
get_topic(lda)

In [None]:
#print the no_top_words highest-weighted terms of each LDA topic
display_topics(lda, vocab_bow, no_top_words)

# Non-negative Matrix Factorization (NMF)

In [None]:
#split data to 6 topics
from sklearn.decomposition import NMF

#NMF with 6 components, fitted on the TF-IDF matrix (not the bag-of-words one)
#nfm_matrix is the fit_transform output for the same matrix
nfm = NMF(n_components=6, random_state=42)
nfm_matrix = nfm.fit_transform(tfidf_matrix)

In [None]:
#show the top single-word terms of each NMF topic
#NOTE(review): get_topic looks terms up in vocab_bow, while nfm was fitted on
#tfidf_matrix - confirm both vectorizers produce the same vocabulary order
get_topic(nfm)

In [None]:
#print the no_top_words highest-weighted terms of each NMF topic
display_topics(nfm, vocab_tfidf, no_top_words)