In [1]:
# Gensim library
# memakai gensim karena sudah support FastText
from gensim.models import FastText
from gensim.corpora import Dictionary
from gensim.models.keyedvectors import KeyedVectors
from numba import cuda

# NLTK library
import nltk
from nltk.util import ngrams
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
# nltk.download('punkt')


# Common packages
import pandas as pd
import numpy as np
import ast
import re

import csv


In [2]:
# DEFINE CONSTANT
top_1_column = ['Rank 1']
top_5_column = ['Rank 1', 'Rank 2', 'Rank 3', 'Rank 4', 'Rank 5']
top_10_column = ['Rank 1', 'Rank 2', 'Rank 3', 'Rank 4', 'Rank 5', 'Rank 6', 'Rank 7', 'Rank 8', 'Rank 9', 'Rank 10']

In [3]:
class BuildFasttext:
    def __init__(self, corpus, column):
        self.corpus = corpus
        self.column = column

    def build_fastText(self):
        """_summary_
        Returns:
            _model(bin)_: _model fasttext_
        """
        print('====== BUILDING FASTTEXT MODEL IN PROCESS ! ======')
        model = FastText(sentences=self.corpus[self.column])  # instantiate
        print('====== BUILDING FASTTEXT MODEL DONE ! ======')

        return model

    def build_rank_column(self, df_sim, rank):
        """_summary_

        Args:
            df_sim (_array_): _list dari similarity_
            rank (_int_): _rank yang dibutuhkan_

        Returns:
            _dataframe_: _data yang berisi kolum baru sesuai permintaan rank_
        """
        if rank == 1:
            for column in top_1_column:
                    df_sim[column] = ''
            return df_sim
        
        elif rank == 5:
            for column in top_5_column:
                    df_sim[column] = ''
            return df_sim
        
        elif rank == 10:
            for column in top_10_column:
                    df_sim[column] = ''
            return df_sim

    def build_n_rank(self, df_sim_keys, model_input, rank):
        """_summary_
        Args:
            df_sim_keys (_array_): _list dari similarity_
            model_input (_file(bin)_): _model fasttext yang disimpan_
            rank (_int_): _definisi rank_
        Returns:
            _type_: _list dari rank words_
        """
        try:
            data_similarity = model_input.most_similar(df_sim_keys, topn=rank)
            word = [data_similarity[i][0] for i in range(len(data_similarity))]
            return word
        except:
            word = []
            return word

    def save_to_csv(self, df_sim, filename):
        df_sim.to_csv(filename, index=False)
        
    def save_model(self, model, name):
        model.wv.save(name)



In [4]:
def create_ngrams(dataset, n):
    if n == 3:
        dataset[f'unigram-bigram-trigram'] = ''
        ngramtokendf = dataset[f'unigram-bigram-trigram']
    if n == 2:
        dataset[f'unigram-bigram'] = ''
        ngramtokendf = dataset[f'unigram-bigram']
    if n == 1:
        dataset[f'unigram'] = ''
        ngramtokendf = dataset[f'unigram']
    indices_to_drop = []  # List to store indices of rows to be dropped

    for idx, text in enumerate(dataset['preprocess_final']):
        try:
            tokens = nltk.word_tokenize(text)
            ngrams_list = list(ngrams(tokens, n))
            if n == 1:
                ngrams_joined = [' '.join(gram) for gram in ngrams_list]
            elif n == 2:
                ngrams_joined = [' '.join(gram) for gram in ngrams_list]
                unigrams = tokens
                bigrams = [' '.join(gram) for gram in list(ngrams(tokens, 2))]
                ngrams_joined = unigrams + bigrams
            elif n == 3:
                ngrams_joined = [' '.join(gram) for gram in ngrams_list]
                unigrams = tokens
                bigrams = [' '.join(gram) for gram in list(ngrams(tokens, 2))]
                trigrams = [' '.join(gram) for gram in list(ngrams(tokens, 3))]
                ngrams_joined = unigrams + bigrams + trigrams
            else:
                ngrams_joined = [' '.join(gram) for gram in ngrams_list]
            ngramtokendf.iloc[idx] = ngrams_joined
        except TypeError:
                indices_to_drop = []  # List to store indices of rows to be dropped
    dataset = dataset.drop(indices_to_drop)
    return dataset

## BUILDING CORPUS BERITA

In [5]:
corpus_berita = pd.read_csv('../data/data_preprocessed/corpus_fasttext/corpus_berita_clean_final.csv')
corpus_berita = create_ngrams(corpus_berita, 3)

In [6]:
constuctFastText = BuildFasttext(corpus_berita, 'unigram-bigram-trigram')
models = constuctFastText.build_fastText()



In [7]:
models_berita = models.wv

### TESTING CORPUS BERITA FASTTEXT MODEL AND BUILD TOPN SPREADSHEET

In [8]:
# testing corpus load model
models_berita.most_similar("lgbt")

[('lgbtq', 0.9830506443977356),
 ('lgbtiq', 0.9227940440177917),
 ('gay biseksual', 0.848452627658844),
 ('biseksual', 0.832297682762146),
 ('lesbi gay biseksual', 0.8255335092544556),
 ('lgbt lesbian', 0.8174042701721191),
 ('lgv', 0.8102040886878967),
 ('gay biseks', 0.7882265448570251),
 ('kaum homoseksual', 0.7808271050453186),
 ('kaum lesbian', 0.7790321111679077)]

In [9]:
# Get the vocabulary as a set
vocab = set(models_berita.key_to_index.keys())
print(len(vocab))
# Get the list of keys
keys = models_berita.index_to_key
# print(keys)

617766


In [10]:
words = list(keys)
df_similarity_berita = pd.DataFrame({'Words': words})
print(f'jumlah kata : {len(df_similarity_berita)}')
print(f'data frame : {df_similarity_berita.shape}')
df_similarity_berita.head(3)

jumlah kata : 617766
data frame : (617766, 1)


Unnamed: 0,Words
0,dalam
1,laku
2,indonesia


In [11]:
# make copy of df_similarity
df_similarity_berita_top10 = df_similarity_berita.copy()
df_similarity_berita_top10 = constuctFastText.build_rank_column(df_similarity_berita_top10, 10)

print(f'df similarity top10 : {df_similarity_berita_top10.columns}')

df similarity top10 : Index(['Words', 'Rank 1', 'Rank 2', 'Rank 3', 'Rank 4', 'Rank 5', 'Rank 6',
       'Rank 7', 'Rank 8', 'Rank 9', 'Rank 10'],
      dtype='object')


In [12]:
for j in range(len(df_similarity_berita_top10['Words'])):
    word = constuctFastText.build_n_rank(df_similarity_berita_top10['Words'][j], models_berita, 10)
    df_similarity_berita_top10.loc[j, 'Rank 1'] = word[0]
    df_similarity_berita_top10.loc[j, 'Rank 2'] = word[1]
    df_similarity_berita_top10.loc[j, 'Rank 3'] = word[2]
    df_similarity_berita_top10.loc[j, 'Rank 4'] = word[3]
    df_similarity_berita_top10.loc[j, 'Rank 5'] = word[4]
    df_similarity_berita_top10.loc[j, 'Rank 6'] = word[5]
    df_similarity_berita_top10.loc[j, 'Rank 7'] = word[6]
    df_similarity_berita_top10.loc[j, 'Rank 8'] = word[7]
    df_similarity_berita_top10.loc[j, 'Rank 9'] = word[8]
    df_similarity_berita_top10.loc[j, 'Rank 10'] = word[9]

In [13]:
#  save each data to dataset
constuctFastText.save_to_csv(df_similarity_berita_top10,'../data/data_preprocessed/corpus_fasttext_topnrank/berita/df_similarity_top10_unigram_bigram_trigram.csv')

print(f'done!')

done!


## BUILDING CORPUS TWITTER-BERITA

In [14]:
corpus_berita = pd.read_csv('../data/data_preprocessed/corpus_fasttext/corpus_berita_clean_final.csv')
corpus_twitter = pd.read_csv('../data/data_preprocessed/dataset/DatasetHateSpeech_Final_TA2023.csv')

corpus_tweetberita = pd.concat([corpus_berita, corpus_twitter], ignore_index=True)
# read csv
corpus_tweetberita = create_ngrams(corpus_tweetberita, 3)

In [15]:
constuctFastText = BuildFasttext(corpus_tweetberita, 'unigram-bigram-trigram')
model = constuctFastText.build_fastText()



In [16]:
model_tweetberita = model.wv

### TESTING CORPUS TWEETBERITA FASTTEXT MODEL AND BUILD TOPN SPREADSHEET

In [17]:
model_tweetberita.most_similar("tidur")

[('tidur tidur', 0.811089813709259),
 ('mitos tidur', 0.8039990067481995),
 ('mimpi tidur', 0.7906509637832642),
 ('capek tidur', 0.7803537845611572),
 ('dur', 0.7792797684669495),
 ('tidk', 0.7790694236755371),
 ('tidur lelah', 0.7751440405845642),
 ('tidur istirahat', 0.773717999458313),
 ('thinkstock tidur', 0.7692169547080994),
 ('tid', 0.7564488649368286)]

In [18]:
# Get the vocabulary as a set
vocab = set(model_tweetberita.key_to_index.keys())
print(len(vocab))
# Get the list of keys
keys = model_tweetberita.index_to_key
# print(keys)

635336


In [19]:
words = list(keys)
df_similarity_tweetberita = pd.DataFrame({'Words': words})
print(f'jumlah kata : {len(df_similarity_tweetberita)}')
print(f'data frame : {df_similarity_tweetberita.shape}')
df_similarity_tweetberita.head(3)

jumlah kata : 635336
data frame : (635336, 1)


Unnamed: 0,Words
0,dalam
1,laku
2,indonesia


In [20]:
df_similarity_tweetberita_top10 = df_similarity_tweetberita.copy()
df_similarity_tweetberita_top10 = constuctFastText.build_rank_column(df_similarity_tweetberita_top10, 10)

# test
print(f'df similarity top10 : {df_similarity_tweetberita_top10.columns}')

df similarity top10 : Index(['Words', 'Rank 1', 'Rank 2', 'Rank 3', 'Rank 4', 'Rank 5', 'Rank 6',
       'Rank 7', 'Rank 8', 'Rank 9', 'Rank 10'],
      dtype='object')


In [21]:
df_similarity_tweetberita_top10.head(3)

Unnamed: 0,Words,Rank 1,Rank 2,Rank 3,Rank 4,Rank 5,Rank 6,Rank 7,Rank 8,Rank 9,Rank 10
0,dalam,,,,,,,,,,
1,laku,,,,,,,,,,
2,indonesia,,,,,,,,,,


In [22]:
for j in range(len(df_similarity_tweetberita_top10['Words'])):
    word = constuctFastText.build_n_rank(df_similarity_tweetberita_top10['Words'][j], model_tweetberita, 10)
    df_similarity_tweetberita_top10.loc[j, 'Rank 1'] = word[0]
    df_similarity_tweetberita_top10.loc[j, 'Rank 2'] = word[1]
    df_similarity_tweetberita_top10.loc[j, 'Rank 3'] = word[2]
    df_similarity_tweetberita_top10.loc[j, 'Rank 4'] = word[3]
    df_similarity_tweetberita_top10.loc[j, 'Rank 5'] = word[4]
    df_similarity_tweetberita_top10.loc[j, 'Rank 6'] = word[5]
    df_similarity_tweetberita_top10.loc[j, 'Rank 7'] = word[6]
    df_similarity_tweetberita_top10.loc[j, 'Rank 8'] = word[7]
    df_similarity_tweetberita_top10.loc[j, 'Rank 9'] = word[8]
    df_similarity_tweetberita_top10.loc[j, 'Rank 10'] = word[9]

In [23]:
df_similarity_tweetberita_top10.head(3)

Unnamed: 0,Words,Rank 1,Rank 2,Rank 3,Rank 4,Rank 5,Rank 6,Rank 7,Rank 8,Rank 9,Rank 10
0,dalam,tadalafil,pengalamn,adalagi,adala,amygdala,psi,kognisi,apsi,kspsi,vsi
1,laku,melakukkan,melakukaan,imelakukan,dillakukan,dilakuin,melakulan,melakuakan,melakuka,dilakulan,kelakuanya
2,indonesia,indonesianis,indonesi,indonesian,indonesianisme,indonesias,indonesien,nesoindonesia,indonesiancloud,indonesiabermutu,indoneia


In [24]:
constuctFastText.save_to_csv(df_similarity_tweetberita_top10, '../data/data_preprocessed/corpus_fasttext_topnrank/tweet_berita/df_similartiy_top10_unigram_bigram_trigram.csv')
print(f'done!')

done!


## BUILDING CORPUS TWEET

In [25]:
corpus_twitter = pd.read_csv('../data/data_preprocessed/dataset/DatasetHateSpeech_Final_TA2023.csv')
corpus_twitter = create_ngrams(corpus_berita, 3)

In [26]:
constuctFastText = BuildFasttext(corpus_twitter, 'unigram-bigram-trigram')
model = constuctFastText.build_fastText()



In [27]:
model_tweet = model.wv

In [28]:
model_tweet.most_similar("ensiklopedi")

[('ortopedi', 0.7615773677825928),
 ('hipertensi', 0.7476847767829895),
 ('defensif', 0.7394968271255493),
 ('suspensi', 0.737450897693634),
 ('forensik', 0.7295366525650024),
 ('subspesialis', 0.7289777398109436),
 ('ofensif', 0.728118896484375),
 ('efesien', 0.7235876321792603),
 ('efesiensi', 0.7227770090103149),
 ('ensiklopedia', 0.7211304306983948)]

In [29]:
# Get the vocabulary as a set
vocab = set(model_tweet.key_to_index.keys())
print(len(vocab))
# Get the list of keys
keys = model_tweet.index_to_key
# print(keys)

617766


In [30]:
words = list(keys)
df_similarity_tweet = pd.DataFrame({'Words': words})
print(f'jumlah kata : {len(df_similarity_tweet)}')
print(f'data frame : {df_similarity_tweet.shape}')
df_similarity_tweet.head(3)

jumlah kata : 617766
data frame : (617766, 1)


Unnamed: 0,Words
0,dalam
1,laku
2,indonesia


In [31]:
df_similarity_tweettop10 = df_similarity_tweet.copy()
df_similarity_tweettop10 = constuctFastText.build_rank_column(df_similarity_tweettop10, 10)

In [32]:
print(f'df similarity top10 : {df_similarity_tweettop10.columns}')
df_similarity_tweettop10.head(3)

df similarity top10 : Index(['Words', 'Rank 1', 'Rank 2', 'Rank 3', 'Rank 4', 'Rank 5', 'Rank 6',
       'Rank 7', 'Rank 8', 'Rank 9', 'Rank 10'],
      dtype='object')


Unnamed: 0,Words,Rank 1,Rank 2,Rank 3,Rank 4,Rank 5,Rank 6,Rank 7,Rank 8,Rank 9,Rank 10
0,dalam,,,,,,,,,,
1,laku,,,,,,,,,,
2,indonesia,,,,,,,,,,


In [33]:
for j in range(len(df_similarity_tweettop10['Words'])):
    word = constuctFastText.build_n_rank(df_similarity_tweettop10['Words'][j], model_tweet, 10)
    df_similarity_tweettop10.loc[j, 'Rank 1'] = word[0]
    df_similarity_tweettop10.loc[j, 'Rank 2'] = word[1]
    df_similarity_tweettop10.loc[j, 'Rank 3'] = word[2]
    df_similarity_tweettop10.loc[j, 'Rank 4'] = word[3]
    df_similarity_tweettop10.loc[j, 'Rank 5'] = word[4]
    df_similarity_tweettop10.loc[j, 'Rank 6'] = word[5]
    df_similarity_tweettop10.loc[j, 'Rank 7'] = word[6]
    df_similarity_tweettop10.loc[j, 'Rank 8'] = word[7]
    df_similarity_tweettop10.loc[j, 'Rank 9'] = word[8]
    df_similarity_tweettop10.loc[j, 'Rank 10'] = word[9]

In [34]:
df_similarity_tweettop10.head(3)

Unnamed: 0,Words,Rank 1,Rank 2,Rank 3,Rank 4,Rank 5,Rank 6,Rank 7,Rank 8,Rank 9,Rank 10
0,dalam,amygdala,tadalafil,adala,afdal,amdal,tidal,andalas,apsi,sil,adalagi
1,laku,melakukkan,melakukaan,dillakukan,imelakukan,dilakuka,dilakuakan,dilakulan,melakuka,melakuakan,melakuakn
2,indonesia,indonesianis,indonesi,indonesian,indonesien,indonesias,indonesianisme,nesoindonesia,indonesiancloud,indoneia,indonesiabermutu


In [35]:
constuctFastText.save_to_csv(df_similarity_tweettop10, '../data/data_preprocessed/corpus_fasttext_topnrank/tweet/df_similartiy_top10_unigram_bigram_trigram.csv')