In [1]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from scipy.sparse import csr_matrix
import nltk


## FEATURE EXPANSION WITH TF-IDF
Copyright @I Gde Bagus Janardana Abasan
Telkom University
1301190061

In [2]:
class FeatureExpansion:
    def __init__(self, dataset, corpus_fasttext_nrank):
        self.features_v_tfidf = None
        self.vector_tfidf = None
        self.tf_idf_data = None
        self.dataset = dataset
        self.vector = []
        self.features_v = []
        self.data_list_tokenized = list(self.dataset['preprocess_token'])
        self.corpus_fasttext_nrank = corpus_fasttext_nrank
        self.dictionary_combined_fasttext = None
        self.fe_vec = []
        self.tf_idf_df = pd.DataFrame()
        self.tf_idf_vec = []
        self.binary_vectorizer = None
        self.binary_vectorizer_dataframe = []
        self.binary_vectorizer_dataframe_fe = []
        self.fe_BINARYVEC_df = pd.DataFrame()
        self.tf_idf_vec = []

    def datasetToBinaryVector(self):
        """
        """
        vectorizer = CountVectorizer(binary=True, ngram_range=(1,3), max_features=15000)
        self.binary_vectorizer = vectorizer.fit_transform(self.dataset["preprocess_final"])
        self.binary_vectorizer_dataframe  = pd.DataFrame(self.binary_vectorizer.toarray(),columns=vectorizer.get_feature_names_out())
        self.binary_vectorizer_dataframe_fe = self.binary_vectorizer_dataframe.copy()
        self.features_v = vectorizer.get_feature_names_out()
        self.vector = self.binary_vectorizer_dataframe.values.tolist()
        self.fe_vec = self.vector.copy()
        self.dataset['Binary_vec'] = self.vector

        return self.dataset

    def binaryVectorToTFIDF(self, filename):

        #define tf-idf
        self.tf_idf_vec = TfidfTransformer()
        binary = self.fe_BINARYVEC_df.values
        matrix = csr_matrix(binary)
        self.tf_idf_data = self.tf_idf_vec.fit_transform(matrix)

        #create dataframe
        self.tf_idf_df  = pd.DataFrame(self.tf_idf_data.toarray(), columns=self.features_v)
        self.tf_idf_df['label final'] = self.dataset['label_final']
        self.tf_idf_df.to_csv(filename, index=False)

        return self.tf_idf_df

    def buildCombinedCorpusDictionary(self, rank):
        """
        Untuk pembuatan combined corpus fasttext dictionary yang berisi kata-kata yang ada di corpus fasttext
        :param rank:
        """
        if rank == 1:
            self.corpus_fasttext_nrank['combined_top1'] = self.corpus_fasttext_nrank.apply(lambda x: list([x['Rank 1']]),axis=1)
            self.dictionary_combined_fasttext = dict((Words, combined) for Words, combined in zip(self.corpus_fasttext_nrank.Words, self.corpus_fasttext_nrank.combined_top1))
        if rank == 5:
            self.corpus_fasttext_nrank['combined_top5'] = self.corpus_fasttext_nrank.apply(lambda x: list([x['Rank 1'], x['Rank 2'], x['Rank 3'], x['Rank 4'], x['Rank 5']]),axis=1)
            self.dictionary_combined_fasttext = dict((Words, combined) for Words, combined in zip(self.corpus_fasttext_nrank.Words, self.corpus_fasttext_nrank.combined_top5))
        if rank == 10:
            self.corpus_fasttext_nrank['combined_top10'] = self.corpus_fasttext_nrank.apply(lambda x: list([x['Rank 1'], x['Rank 2'], x['Rank 3'], x['Rank 4'], x['Rank 5'],  x['Rank 6'], x['Rank 7'], x['Rank 8'], x['Rank 9'], x['Rank 10']]),axis=1)
            self.dictionary_combined_fasttext = dict((Words, combined_top10) for Words, combined_top10 in zip(self.corpus_fasttext_nrank.Words, self.corpus_fasttext_nrank.combined_top10))


    def checkWords(self, i, j):
        """
        Untuk pengecekan apakah kata-kata yang ada di corpus fasttext ada di corpus T
        :param i adalah index dari self.vector:
        :param j adalah index dari self.vector[i] ibaratnya vector[i][j]:
        :return:
        """
        try:
            # pengecekan if one word in W appears in corpus T fasttext
            a = [t for t in self.dictionary_combined_fasttext[self.features_v[j]] if t in self.data_list_tokenized[i]]
            return a
        except:
            return []

    def weightCheck(self, w, i, j):
        """
        Untuk pengecekan weight tf-idf dari kata-kata yang ada di corpus fasttext
        :param w:
        :param i:
        :param j:
        :return:
        """

        try:
            if w != "":
                idx = np.where(self.features_v == w)[0]
                if len(idx) > 0:
                    idx = idx[0]
                    if self.vector[i][idx] != 0:
                        print(f'kata {self.features_v[j]} memiliki arti identik dengan kata {w} sehingga nilai vektor diubah menjadi {self.vector[i][idx]}')
                    return self.vector[i][idx]
                else:
                    # Tindakan yang diambil jika idx tidak ditemukan
                    # Misalnya, mengembalikan nilai default atau melakukan operasi khusus
                    return 0  # Mengembalikan nilai 0 jika idx tidak ditemukan
            else:
                return self.vector[i][j]
        except:
            return self.vector[i][j]


    def callFeatureExpansion(self):
        print()
        print('============ FEATURE EXPANSION IS PROCESSING ============')
        binary_matrix = self.binary_vectorizer.toarray()
        for i in range(len(self.vector)):
            for j in range(len(self.vector[i])):
                if self.vector[i][j] == 1:
                    word_list = self.checkWords(i,j)
                    for value_word in word_list:
                        expanded_value = [self.weightCheck(value_word, i, j)]
                        idx = np.where(self.features_v == value_word)[0]
                        if len(idx) > 0:
                            self.fe_vec[i][idx[0]] = expanded_value[0]
                        else:
                            pass
        self.dataset['BinaryVector_FE_VEC'] = self.fe_vec
        self.fe_BINARYVEC_df = pd.DataFrame(self.fe_vec, columns=self.features_v)
        return self.fe_BINARYVEC_df

## BERITA

In [3]:
# LOAD CORPUS
corpus_similarity_berita = pd.read_csv('../data/data_preprocessed/corpus_fasttext_topnrank/berita/df_similarity_top10_unigram_bigram_trigram.csv')
print(f'shape berita :', corpus_similarity_berita.shape)

shape berita : (617766, 11)


In [4]:
# LOAD DATASET
dataset_hatespeech = pd.read_csv('../data/data_preprocessed/dataset/DatasetHateSpeech_Final_TA2023.csv')
print(f'shape dataset :', dataset_hatespeech.shape)

shape dataset : (49841, 11)


In [5]:
expansion = FeatureExpansion(dataset_hatespeech, corpus_similarity_berita)
# lakukan TF-IDF vectorize terhadap dataset
df_countVectorizer = expansion.datasetToBinaryVector()
df_countVectorizer['Binary_vec']

0        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
1        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
2        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
3        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
4        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
                               ...                        
49836    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
49837    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
49838    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
49839    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
49840    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
Name: Binary_vec, Length: 49841, dtype: object

In [6]:
#build corpus fasttext dictionary top1
expansion.buildCombinedCorpusDictionary(1)

In [7]:
# lakukan feature expansion
df_fe = expansion.callFeatureExpansion()
df_fe


kata alqur memiliki arti identik dengan kata qur sehingga nilai vektor diubah menjadi 1
kata bukti memiliki arti identik dengan kata buktiin sehingga nilai vektor diubah menjadi 1
kata buktiin memiliki arti identik dengan kata bukti sehingga nilai vektor diubah menjadi 1
kata teror memiliki arti identik dengan kata teroris sehingga nilai vektor diubah menjadi 1
kata teror memiliki arti identik dengan kata teroris sehingga nilai vektor diubah menjadi 1
kata membicarakan memiliki arti identik dengan kata bicara sehingga nilai vektor diubah menjadi 1
kata nabi ibrahim memiliki arti identik dengan kata ibrahim sehingga nilai vektor diubah menjadi 1
kata nenek moyang memiliki arti identik dengan kata nenek sehingga nilai vektor diubah menjadi 1
kata masakan memiliki arti identik dengan kata makan sehingga nilai vektor diubah menjadi 1
kata nonmuslim memiliki arti identik dengan kata muslim sehingga nilai vektor diubah menjadi 1
kata agam memiliki arti identik dengan kata ragam sehingga nil

MemoryError: Unable to allocate 5.57 GiB for an array with shape (49841, 15000) and data type object

In [None]:
# transfor to tf-idf
df_tfidf = expansion.binaryVectorToTFIDF('../data/data_preprocessed/hasil_ekspansi/berita/DatasetHatespeech_UnigramBigramTrigram_Top1_ExpandedBerita_15000.csv')
df_tfidf

## TWEET

In [None]:
# LOAD CORPUS
corpus_similarity_tweet = pd.read_csv('../data/data_preprocessed/corpus_fasttext_topnrank/tweet/df_similarity_top10_unigram_bigram_trigram.csv')
print(f'shape berita :', corpus_similarity_tweet.shape)

In [None]:
# LOAD DATASET
dataset_hatespeech = pd.read_csv('../data/data_preprocessed/dataset/DatasetHateSpeech_Final_TA2023.csv')
print(f'shape dataset :', dataset_hatespeech.shape)

In [None]:
expansion = FeatureExpansion(dataset_hatespeech, corpus_similarity_tweet)
# lakukan TF-IDF vectorize terhadap dataset
df_countVectorizer = expansion.datasetToBinaryVector()
df_countVectorizer['Binary_vec']

In [None]:
#build corpus fasttext dictionary top1
expansion.buildCombinedCorpusDictionary(1)

In [None]:
# lakukan feature expansion
df_fe = expansion.callFeatureExpansion()
df_fe

In [None]:
# transfor to tf-idf
df_tfidf = expansion.binaryVectorToTFIDF('../data/data_preprocessed/hasil_ekspansi/twitter/DatasetHatespeech_UnigramBigramTrigram_Top1_ExpandedTwitter_15000.csv')
df_tfidf

## TWEET-BERITA

In [None]:
# LOAD CORPUS
corpus_similarity_berita = pd.read_csv('../data/data_preprocessed/corpus_fasttext_topnrank/berita/df_similarity_top10_unigram_bigram_trigram.csv')
corpus_similarity_tweet = pd.read_csv('../data/data_preprocessed/corpus_fasttext_topnrank/tweet/df_similarity_top10_unigram_bigram_trigram.csv')

corpus_similarity_merged = pd.concat([corpus_similarity_berita,corpus_similarity_tweet])

In [None]:
# LOAD DATASET
dataset_hatespeech = pd.read_csv('../data/data_preprocessed/dataset/DatasetHateSpeech_Final_TA2023.csv')
print(f'shape dataset :', dataset_hatespeech.shape)

In [None]:
expansion = FeatureExpansion(dataset_hatespeech, corpus_similarity_merged)
# lakukan TF-IDF vectorize terhadap dataset
df_countVectorizer = expansion.datasetToBinaryVector()
df_countVectorizer['Binary_vec']

In [None]:
#build corpus fasttext dictionary top1
expansion.buildCombinedCorpusDictionary(1)

In [None]:
# lakukan feature expansion
df_fe = expansion.callFeatureExpansion()
df_fe

In [None]:
# transfor to tf-idf
df_tfidf = expansion.binaryVectorToTFIDF('../data/data_preprocessed/hasil_ekspansi/twitterberita/DatasetHatespeech_UnigramBigramTrigram_Top1_ExpandedBeritaTwitter_15000.csv')
df_tfidf