In [1]:
# Gensim library
# memakai gensim karena sudah support FastText
from gensim.models import FastText
from gensim.corpora import Dictionary
from gensim.models.keyedvectors import KeyedVectors
from numba import cuda

# NLTK library
import nltk
from nltk.util import ngrams
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Common packages
import pandas as pd
import numpy as np
import ast
import re

import csv


In [2]:
# DEFINE CONSTANT
top_1_column = ['Rank 1']
top_5_column = ['Rank 1', 'Rank 2', 'Rank 3', 'Rank 4', 'Rank 5']
top_10_column = ['Rank 1', 'Rank 2', 'Rank 3', 'Rank 4', 'Rank 5', 'Rank 6', 'Rank 7', 'Rank 8', 'Rank 9', 'Rank 10']

In [3]:
class BuildFasttext:
    def __init__(self, corpus):
        self.corpus = corpus

    def build_fastText(self):
        """_summary_
        Returns:
            _model(bin)_: _model fasttext_
        """
        print('====== BUILDING FASTTEXT MODEL IN PROCESS ! ======')
        model = FastText(sentences=self.corpus['unigram-bigram'])  # instantiate
        print('====== BUILDING FASTTEXT MODEL DONE ! ======')

        return model

    def build_rank_column(self, df_sim, rank):
        """_summary_

        Args:
            df_sim (_array_): _list dari similarity_
            rank (_int_): _rank yang dibutuhkan_

        Returns:
            _dataframe_: _data yang berisi kolum baru sesuai permintaan rank_
        """
        match rank:
            case 1:
                for column in top_1_column:
                    df_sim[column] = ''
                return df_sim
            case 5:
                for column in top_5_column:
                    df_sim[column] = ''
                return df_sim
            case 10:
                for column in top_10_column:
                    df_sim[column] = ''
                return df_sim

    def build_n_rank(self, df_sim_keys, model_input, rank):
        """_summary_
        Args:
            df_sim_keys (_array_): _list dari similarity_
            model_input (_file(bin)_): _model fasttext yang disimpan_
            rank (_int_): _definisi rank_
        Returns:
            _type_: _list dari rank words_
        """
        try:
            data_similarity = model_input.most_similar(df_sim_keys, topn=rank)
            word = [data_similarity[i][0] for i in range(len(data_similarity))]
            return word
        except:
            word = []
            return word

    def save_to_csv(self, df_sim, filename):
        df_sim.to_csv(filename, index=False)
        
    def save_model(self, model, name):
        model.wv.save(name)



In [4]:
def create_ngrams(dataset, n):
    if n == 3:
        dataset[f'unigram-bigram-trigram'] = ''
        ngramtokendf = dataset[f'unigram-bigram-trigram']
    if n == 2:
        dataset[f'unigram-bigram'] = ''
        ngramtokendf = dataset[f'unigram-bigram']
    if n == 1:
        dataset[f'unigram'] = ''
        ngramtokendf = dataset[f'unigram']
    indices_to_drop = []  # List to store indices of rows to be dropped

    for idx, text in enumerate(dataset['preprocess_final']):
        try:
            tokens = nltk.word_tokenize(text)
            ngrams_list = list(ngrams(tokens, n))
            if n == 1:
                ngrams_joined = [' '.join(gram) for gram in ngrams_list]
            elif n == 2:
                ngrams_joined = [' '.join(gram) for gram in ngrams_list]
                unigrams = tokens
                bigrams = [' '.join(gram) for gram in list(ngrams(tokens, 2))]
                ngrams_joined = unigrams + bigrams
            elif n == 3:
                ngrams_joined = [' '.join(gram) for gram in ngrams_list]
                unigrams = tokens
                bigrams = [' '.join(gram) for gram in list(ngrams(tokens, 2))]
                trigrams = [' '.join(gram) for gram in list(ngrams(tokens, 3))]
                ngrams_joined = unigrams + bigrams + trigrams
            else:
                ngrams_joined = [' '.join(gram) for gram in ngrams_list]
            ngramtokendf.iloc[idx] = ngrams_joined
        except TypeError:
                indices_to_drop = []  # List to store indices of rows to be dropped
    dataset = dataset.drop(indices_to_drop)
    return dataset

## BUILDING CORPUS BERITA

In [5]:
corpus_berita = pd.read_csv('../data/data_preprocessed/corpus_fasttext/corpus_berita_clean_final.csv')
corpus_berita = create_ngrams(corpus_berita, 2)

In [6]:
corpus_berita['unigram-bigram']

0         [wakil, gubernur, dki, djarot, syaiful, hidaya...
1         [badan, awas, milu, dki, tunggu, lapor, anggap...
2         [wakil, ketua, komisi, dpr, saleh, partaonan, ...
3         [pasang, calon, nomor, urut, anies, baswedan, ...
4         [rumah, partai, golkar, guncang, ujung, februa...
                                ...                        
140293    [tuhan, cipta, bangsa, maju, lawan, bohong, el...
140294    [laku, impi, dalam, berani, jenius, kuat, ajai...
140295    [juang, sejati, nilai, mula, bagamana, selesai...
140296    [jatuh, ndash, benar, sikap, kartini, jatuh nd...
140297    [suka, jujur, percaya, cinta, hormat, ali, bin...
Name: unigram-bigram, Length: 140298, dtype: object

In [7]:
constuctFastText = BuildFasttext(corpus_berita)
models = constuctFastText.build_fastText()



In [8]:
models_berita = models.wv

### TESTING CORPUS BERITA FASTTEXT MODEL AND BUILD TOPN SPREADSHEET

In [9]:
# testing corpus load model
models_berita.most_similar("lgbt")

[('lgbtq', 0.9788029193878174),
 ('lgbtiq', 0.9276920557022095),
 ('gay biseksual', 0.8281980752944946),
 ('lgbt lesbian', 0.819657564163208),
 ('lgv', 0.8174654841423035),
 ('biseksual', 0.8110087513923645),
 ('kaum lesbian', 0.8075315952301025),
 ('isu lesbian', 0.7828415036201477),
 ('kaum homoseksual', 0.7606260180473328),
 ('aseksual', 0.7514986395835876)]

In [10]:
# Get the vocabulary as a set
vocab = set(models_berita.key_to_index.keys())
print(len(vocab))
# Get the list of keys
keys = models_berita.index_to_key
# print(keys)

478665


In [11]:
words = list(keys)
df_similarity_berita = pd.DataFrame({'Words': words})
print(f'jumlah kata : {len(df_similarity_berita)}')
print(f'data frame : {df_similarity_berita.shape}')
df_similarity_berita.head(3)

jumlah kata : 478665
data frame : (478665, 1)


Unnamed: 0,Words
0,dalam
1,laku
2,indonesia


In [12]:
# make copy of df_similarity
df_similarity_berita_top10 = df_similarity_berita.copy()
df_similarity_berita_top10 = constuctFastText.build_rank_column(df_similarity_berita_top10, 10)

print(f'df similarity top10 : {df_similarity_berita_top10.columns}')

df similarity top10 : Index(['Words', 'Rank 1', 'Rank 2', 'Rank 3', 'Rank 4', 'Rank 5', 'Rank 6',
       'Rank 7', 'Rank 8', 'Rank 9', 'Rank 10'],
      dtype='object')


In [13]:
for j in range(len(df_similarity_berita_top10['Words'])):
    word = constuctFastText.build_n_rank(df_similarity_berita_top10['Words'][j], models_berita, 10)
    df_similarity_berita_top10.loc[j, 'Rank 1'] = word[0]
    df_similarity_berita_top10.loc[j, 'Rank 2'] = word[1]
    df_similarity_berita_top10.loc[j, 'Rank 3'] = word[2]
    df_similarity_berita_top10.loc[j, 'Rank 4'] = word[3]
    df_similarity_berita_top10.loc[j, 'Rank 5'] = word[4]
    df_similarity_berita_top10.loc[j, 'Rank 6'] = word[5]
    df_similarity_berita_top10.loc[j, 'Rank 7'] = word[6]
    df_similarity_berita_top10.loc[j, 'Rank 8'] = word[7]
    df_similarity_berita_top10.loc[j, 'Rank 9'] = word[8]
    df_similarity_berita_top10.loc[j, 'Rank 10'] = word[9]

In [14]:
#  save each data to dataset
constuctFastText.save_to_csv(df_similarity_berita_top10,'../data/data_preprocessed/corpus_fasttext_topnrank/berita/df_similarity_top10_unigram_bigram.csv')

print(f'done!')

done!


## BUILDING CORPUS TWITTER-BERITA

In [5]:
corpus_berita = pd.read_csv('../data/data_preprocessed/corpus_fasttext/corpus_berita_clean_final.csv')
corpus_twitter = pd.read_csv('../data/data_preprocessed/dataset/DatasetHateSpeech_Final_TA2023.csv')

corpus_tweetberita = pd.concat([corpus_berita, corpus_twitter], ignore_index=True)
# read csv
create_ngrams(corpus_tweetberita, 2)

Unnamed: 0,topik,sumber,url,isi,preprocess_clean,preprocess_case_folding,preprocess_stemmer,preprocess_normalization,preprocess_token,preprocess_final,tweet,label_fase_1,label_fase_2,label_fase_3,label_final,unigram-bigram
0,politik,cnnindonesia.com,http://cnnindonesia.com/kursipanasdki1/2017030...,"Jakarta, Wakil Gubernur DKI Jakarta Djarot Sy...",Jakarta Wakil Gubernur DKI Djarot Syaiful Hida...,jakarta wakil gubernur dki djarot syaiful hida...,jakarta wakil gubernur dki djarot syaiful hida...,jakarta wakil gubernur dki djarot syaiful hida...,"['wakil', 'gubernur', 'dki', 'djarot', 'syaifu...",wakil gubernur dki djarot syaiful hidayat ban ...,,,,,,"[wakil, gubernur, dki, djarot, syaiful, hidaya..."
1,politik,cnnindonesia.com,http://cnnindonesia.com/kursipanasdki1/2017030...,"Jakarta, Badan Pengawas Pemilu DKI Jakarta me...",Jakarta Badan Pengawas Pemilu DKI menunggu lap...,jakarta badan pengawas pemilu dki menunggu lap...,jakarta badan awas milu dki tunggu lapor dari ...,jakarta badan awas milu dki tunggu lapor dari ...,"['badan', 'awas', 'milu', 'dki', 'tunggu', 'la...",badan awas milu dki tunggu lapor anggap kartu ...,,,,,,"[badan, awas, milu, dki, tunggu, lapor, anggap..."
2,politik,cnnindonesia.com,http://cnnindonesia.com/politik/20170301132408...,"Jakarta, Wakil Ketua Komisi IX DPR, Saleh Par...",Jakarta Wakil Ketua Komisi DPR Saleh Partaonan...,jakarta wakil ketua komisi dpr saleh partaonan...,jakarta wakil ketua komisi dpr saleh partaonan...,jakarta wakil ketua komisi dpr saleh partaonan...,"['wakil', 'ketua', 'komisi', 'dpr', 'saleh', '...",wakil ketua komisi dpr saleh partaonan dulay p...,,,,,,"[wakil, ketua, komisi, dpr, saleh, partaonan, ..."
3,politik,cnnindonesia.com,http://cnnindonesia.com/kursipanasdki1/2017030...,"Jakarta, Pasangan calon nomor urut tiga, Anie...",Jakarta Pasangan calon nomor urut tiga Anies B...,jakarta pasangan calon nomor urut tiga anies b...,jakarta pasang calon nomor urut tiga anies bas...,jakarta pasang calon nomor urut tiga anies bas...,"['pasang', 'calon', 'nomor', 'urut', 'anies', ...",pasang calon nomor urut anies baswedan sandiag...,,,,,,"[pasang, calon, nomor, urut, anies, baswedan, ..."
4,politik,cnnindonesia.com,http://cnnindonesia.com/kursipanasdki1/2017022...,"Jakarta, Rumah Partai Golkar sedikit tergunca...",Jakarta Rumah Partai Golkar sedikit terguncang...,jakarta rumah partai golkar sedikit terguncang...,jakarta rumah partai golkar sedikit guncang uj...,jakarta rumah partai golkar sedikit guncang uj...,"['rumah', 'partai', 'golkar', 'guncang', 'ujun...",rumah partai golkar guncang ujung februari pic...,,,,,,"[rumah, partai, golkar, guncang, ujung, februa..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
190134,,,,,gue tolol bgt kalo dipikir pikir,gue tolol bgt kalo dipikir pikir,gue tolol bgt kalo pikir pikir,saya tolol banget kalau pikir pikir,"['tolol', 'banget', 'pikir', 'pikir']",tolol banget pikir pikir,gue tolol bgt kalo dipikir pikir,HS,HS,HS,HS,"[tolol, banget, pikir, pikir, tolol banget, ba..."
190135,,,,,Lagu ganyambung sama event euro tolol,lagu ganyambung sama event euro tolol,lagu ganyambung sama event euro tolol,lagu ganyambung sama event euro tolol,"['lagu', 'ganyambung', 'event', 'euro', 'tolol']",lagu ganyambung event euro tolol,@TeamBTS14938305 @OmarKevin17 @Reel_good Lagu ...,HS,HS,HS,HS,"[lagu, ganyambung, event, euro, tolol, lagu ga..."
190136,,,,,PADAHAL UDH NGETWEET GINI MULU TAPI GUE TETEP ...,padahal udh ngetweet gini mulu tapi gue tetep ...,padahal udh ngetweet gin mulu tapi gue tetep m...,padahal sudah ngetweet gin melulu tapi saya te...,"['ngetweet', 'gin', 'melulu', 'tetep', 'minum'...",ngetweet gin melulu tetep minum kopi tolol,PADAHAL UDH NGETWEET GINI MULU TAPI GUE TETEP ...,HS,HS,HS,HS,"[ngetweet, gin, melulu, tetep, minum, kopi, to..."
190137,,,,,orang tolol megang pengorengan panas,orang tolol megang pengorengan panas,orang tolol megang oreng panas,orang tolol megang oreng panas,"['tolol', 'megang', 'oreng', 'panas']",tolol megang oreng panas,&lt;---- orang tolol megang pengorengan panas,HS,HS,HS,HS,"[tolol, megang, oreng, panas, tolol megang, me..."


In [6]:
constuctFastText = BuildFasttext(corpus_tweetberita)
model = constuctFastText.build_fastText()



In [7]:
model_tweetberita = model.wv

### TESTING CORPUS TWEETBERITA FASTTEXT MODEL AND BUILD TOPN SPREADSHEET

In [8]:
model_tweetberita.most_similar("tidur")

[('mitos tidur', 0.8141387104988098),
 ('tidur istirahat', 0.8001329302787781),
 ('tidur tidur', 0.7955479621887207),
 ('tidk', 0.7874872088432312),
 ('capek tidur', 0.7735072374343872),
 ('mimpi tidur', 0.7689397931098938),
 ('istirahat', 0.7585166096687317),
 ('tid', 0.7552476525306702),
 ('tidur nyenyak', 0.7464496493339539),
 ('thinkstock tidur', 0.7463703155517578)]

In [9]:
# Get the vocabulary as a set
vocab = set(model_tweetberita.key_to_index.keys())
print(len(vocab))
# Get the list of keys
keys = model_tweetberita.index_to_key
# print(keys)

494160


In [10]:
words = list(keys)
df_similarity_tweetberita = pd.DataFrame({'Words': words})
print(f'jumlah kata : {len(df_similarity_tweetberita)}')
print(f'data frame : {df_similarity_tweetberita.shape}')
df_similarity_tweetberita.head(3)

jumlah kata : 494160
data frame : (494160, 1)


Unnamed: 0,Words
0,dalam
1,laku
2,indonesia


In [11]:
df_similarity_tweetberita_top10 = df_similarity_tweetberita.copy()
df_similarity_tweetberita_top10 = constuctFastText.build_rank_column(df_similarity_tweetberita_top10, 10)

# test
print(f'df similarity top10 : {df_similarity_tweetberita_top10.columns}')

df similarity top10 : Index(['Words', 'Rank 1', 'Rank 2', 'Rank 3', 'Rank 4', 'Rank 5', 'Rank 6',
       'Rank 7', 'Rank 8', 'Rank 9', 'Rank 10'],
      dtype='object')


In [12]:
df_similarity_tweetberita_top10.head(3)

Unnamed: 0,Words,Rank 1,Rank 2,Rank 3,Rank 4,Rank 5,Rank 6,Rank 7,Rank 8,Rank 9,Rank 10
0,dalam,,,,,,,,,,
1,laku,,,,,,,,,,
2,indonesia,,,,,,,,,,


In [13]:
for j in range(len(df_similarity_tweetberita_top10['Words'])):
    word = constuctFastText.build_n_rank(df_similarity_tweetberita_top10['Words'][j], model_tweetberita, 10)
    df_similarity_tweetberita_top10.loc[j, 'Rank 1'] = word[0]
    df_similarity_tweetberita_top10.loc[j, 'Rank 2'] = word[1]
    df_similarity_tweetberita_top10.loc[j, 'Rank 3'] = word[2]
    df_similarity_tweetberita_top10.loc[j, 'Rank 4'] = word[3]
    df_similarity_tweetberita_top10.loc[j, 'Rank 5'] = word[4]
    df_similarity_tweetberita_top10.loc[j, 'Rank 6'] = word[5]
    df_similarity_tweetberita_top10.loc[j, 'Rank 7'] = word[6]
    df_similarity_tweetberita_top10.loc[j, 'Rank 8'] = word[7]
    df_similarity_tweetberita_top10.loc[j, 'Rank 9'] = word[8]
    df_similarity_tweetberita_top10.loc[j, 'Rank 10'] = word[9]

In [14]:
df_similarity_tweetberita_top10.head(3)

Unnamed: 0,Words,Rank 1,Rank 2,Rank 3,Rank 4,Rank 5,Rank 6,Rank 7,Rank 8,Rank 9,Rank 10
0,dalam,pengalamn,tadalafil,adalagi,afdal,adala,amygdala,amdal,tidal,adal,kadaluwarsa
1,laku,melakukkan,imelakukan,melakukaan,dillakukan,dilakuin,dilakulan,dilakuakan,mlakukan,melakulan,melakuakan
2,indonesia,indonesi,indonesianis,indonesian,indonesien,indonesias,indoneia,indonesianisme,indoneaia,indonesie,indonesiancloud


In [15]:
constuctFastText.save_to_csv(df_similarity_tweetberita_top10, '../data/data_preprocessed/corpus_fasttext_topnrank/tweet_berita/df_similartiy_top10_unigram_bigram.csv')
print(f'done!')

done!


## BUILDING CORPUS TWEET

In [16]:
corpus_twitter = pd.read_csv('../data/data_preprocessed/dataset/DatasetHateSpeech_Final_TA2023.csv')
corpus_twitter = create_ngrams(corpus_berita, 2)
corpus_twitter['unigram-bigram']

0         [wakil, gubernur, dki, djarot, syaiful, hidaya...
1         [badan, awas, milu, dki, tunggu, lapor, anggap...
2         [wakil, ketua, komisi, dpr, saleh, partaonan, ...
3         [pasang, calon, nomor, urut, anies, baswedan, ...
4         [rumah, partai, golkar, guncang, ujung, februa...
                                ...                        
140293    [tuhan, cipta, bangsa, maju, lawan, bohong, el...
140294    [laku, impi, dalam, berani, jenius, kuat, ajai...
140295    [juang, sejati, nilai, mula, bagamana, selesai...
140296    [jatuh, ndash, benar, sikap, kartini, jatuh nd...
140297    [suka, jujur, percaya, cinta, hormat, ali, bin...
Name: unigram-bigram, Length: 140298, dtype: object

In [17]:
constuctFastText = BuildFasttext(corpus_twitter)
model = constuctFastText.build_fastText()



In [18]:
model_tweet = model.wv

In [19]:
model_tweet.most_similar("ensiklopedi")

[('ortopedi', 0.7949845790863037),
 ('hipertensi', 0.7728633880615234),
 ('kardiologi', 0.7554982304573059),
 ('fisiologi', 0.7548489570617676),
 ('indepedensi', 0.7518232464790344),
 ('forensik', 0.7473798990249634),
 ('efesiensi', 0.7465602159500122),
 ('prolog', 0.745785117149353),
 ('audiensi', 0.7455964088439941),
 ('hedi', 0.7445394396781921)]

In [20]:
# Get the vocabulary as a set
vocab = set(model_tweet.key_to_index.keys())
print(len(vocab))
# Get the list of keys
keys = model_tweet.index_to_key
# print(keys)

478665


In [21]:
words = list(keys)
df_similarity_tweet = pd.DataFrame({'Words': words})
print(f'jumlah kata : {len(df_similarity_tweet)}')
print(f'data frame : {df_similarity_tweet.shape}')
df_similarity_tweet.head(3)

jumlah kata : 478665
data frame : (478665, 1)


Unnamed: 0,Words
0,dalam
1,laku
2,indonesia


In [22]:
df_similarity_tweettop10 = df_similarity_tweet.copy()
df_similarity_tweettop10 = constuctFastText.build_rank_column(df_similarity_tweettop10, 10)

In [23]:
print(f'df similarity top10 : {df_similarity_tweettop10.columns}')
df_similarity_tweettop10.head(3)

df similarity top10 : Index(['Words', 'Rank 1', 'Rank 2', 'Rank 3', 'Rank 4', 'Rank 5', 'Rank 6',
       'Rank 7', 'Rank 8', 'Rank 9', 'Rank 10'],
      dtype='object')


Unnamed: 0,Words,Rank 1,Rank 2,Rank 3,Rank 4,Rank 5,Rank 6,Rank 7,Rank 8,Rank 9,Rank 10
0,dalam,,,,,,,,,,
1,laku,,,,,,,,,,
2,indonesia,,,,,,,,,,


In [None]:
for j in range(len(df_similarity_tweettop10['Words'])):
    word = constuctFastText.build_n_rank(df_similarity_tweettop10['Words'][j], model_tweet, 10)
    df_similarity_tweettop10.loc[j, 'Rank 1'] = word[0]
    df_similarity_tweettop10.loc[j, 'Rank 2'] = word[1]
    df_similarity_tweettop10.loc[j, 'Rank 3'] = word[2]
    df_similarity_tweettop10.loc[j, 'Rank 4'] = word[3]
    df_similarity_tweettop10.loc[j, 'Rank 5'] = word[4]
    df_similarity_tweettop10.loc[j, 'Rank 6'] = word[5]
    df_similarity_tweettop10.loc[j, 'Rank 7'] = word[6]
    df_similarity_tweettop10.loc[j, 'Rank 8'] = word[7]
    df_similarity_tweettop10.loc[j, 'Rank 9'] = word[8]
    df_similarity_tweettop10.loc[j, 'Rank 10'] = word[9]

In [None]:
df_similarity_tweettop10.head(3)

In [None]:
constuctFastText.save_to_csv(df_similarity_tweettop10, '../data/data_preprocessed/corpus_fasttext_topnrank/tweet/df_similartiy_top10_unigram_bigram.csv')

In [27]:
print("a")

a
