## **1. Setup**

### **1.1. Install Packages**

In [None]:
!pip install black[jupyter]
!pip install imbalanced-learn
!pip install nltk
!pip install PySastrawi
!pip install fasttext
!pip install -U gensim
!wget https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.id.300.bin.gz
!gunzip cc.id.300.bin.gz
!pip install -U symspellpy
!wget https://dumps.wikimedia.org/idwiki/latest/idwiki-latest-pages-articles.xml.bx2
!pip install Sastrawi
!wget https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.id.300.vec.gz
!gunzip cc.id.300.vec.gz
!pip install wordcloud
!pip install bpemb
!pip install keras

### **1.2. Import Libraries**

In [None]:
# Scraping Data
import requests as req
from bs4 import BeautifulSoup as bs
from datetime import datetime
import csv
from google.colab import data_table, drive
import pandas as pd
# Browser-like User-Agent request header; the scraper functions below read it through the global name `hades`.
hades = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36'}

# Import Dataset
import os
import pickle
import random
import numpy as np
import pandas as pd
import seaborn as sns
from google.colab import data_table, drive

# Text Preprocessing
import re
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
import matplotlib.pyplot as plt
from wordcloud import WordCloud

# Word Embedding
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import gzip
from urllib.request import urlopen

# Oversampling
from imblearn.over_sampling import BorderlineSMOTE

# Pemodelan Bi-LSTM
import torch
import torch.nn.functional as F
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from keras.models import Sequential
from keras.layers import Embedding, Bidirectional, LSTM, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from keras.callbacks import EarlyStopping
import tensorflow as tf
from sklearn.metrics import confusion_matrix, classification_report
from keras.models import load_model

### **1.3. Colab Configuration**

In [None]:
# Attach Google Drive to the Colab filesystem
drive_mount_point = "/content/drive"
drive.mount(drive_mount_point)

Mounted at /content/drive


### **1.4. Using GPU for Training**

In [None]:
# Report the framework versions in use
print("TensorFlow version: " + tf.__version__)
print("PyTorch version: " + torch.__version__)

# Ask TensorFlow which GPU device (if any) it can see
device_name = tf.test.gpu_device_name()

# Abort early unless the first GPU device is reported
gpu_found = device_name == "/device:GPU:0"
if not gpu_found:
    print(
        "\n\nThis error most likely means that this notebook is not "
        "configured to use a GPU.  Change this in Notebook Settings via the "
        "command palette (cmd/ctrl-shift-P) or the Edit menu.\n\n"
    )
    raise SystemError("GPU device not found")
print("Found GPU at: {}".format(device_name))

## **2. Scraping Data**

### **2.1. CNN**

In [None]:
def scrape_cnn(start_page, end_page):
    """Scrape CNN Indonesia election index pages and append articles to ``cnn_pemilu.csv``.

    For every index page in ``start_page..end_page`` (inclusive) each listed
    article is fetched and one CSV row ``[headline, date, link, content]`` is
    appended per article body found.

    Args:
        start_page: First index page number to scrape.
        end_page: Last index page number to scrape (inclusive).

    Notes:
        Reads the module-level ``hades`` dict and sends it as HTTP headers.
    """
    row_number = 1
    for page in range(start_page, end_page + 1):
        url = f'https://www.cnnindonesia.com/pemilu/indeks/616/{page}'
        # `headers=` is required: the second positional argument of requests.get is `params`
        index_soup = bs(req.get(url, headers=hades).text, 'lxml')
        listing = index_soup.find('div', class_='flex flex-col gap-5')
        if listing is None:
            # Page has no article list (layout change or page out of range): skip it
            print(f'skip page {page}: article list not found')
            continue

        for article in listing.find_all('article', class_='flex-grow'):
            anchor = article.find('a')
            link = anchor['href']
            headline = anchor.find('h2').text

            article_soup = bs(req.get(link, headers=hades).text, 'lxml')
            date_tag = article_soup.find('div', class_='text-cnn_grey text-sm mb-4')
            # Store the visible date text rather than the tag's HTML markup
            date = date_tag.get_text(strip=True) if date_tag is not None else ''

            bodies = article_soup.find_all('div', class_='detail-text text-cnn_black text-sm grow min-w-0')
            for body in bodies:
                paragraphs = [p.text for p in body.find_all('p')]
                content_ = ''.join(paragraphs).replace('\n', '').replace('ADVERTISEMENT', '').replace(
                    'SCROLL TO CONTINUE WITH CONTENT', '')
                print(f'done[{row_number}] > {headline[0:10]}')
                row_number += 1
                # newline='' avoids blank rows on some platforms; UTF-8 keeps non-ASCII text intact
                with open('cnn_pemilu.csv', 'a', newline='', encoding='utf-8') as file:
                    csv.writer(file, delimiter=',').writerow([headline, date, link, content_])

In [None]:
# Adjust the first and last index page to scrape as needed
scrape_cnn(start_page=131, end_page=340)

### **2.2. Kompas**

In [None]:
def scrape_kompas(start_page, end_page):
    """Scrape Kompas election news pages and save the articles to a CSV file.

    Args:
        start_page: First listing page number to scrape.
        end_page: Last listing page number to scrape (inclusive).

    Notes:
        Reads the module-level ``hades`` (HTTP headers) and ``dataset_path``
        (output directory prefix); both must be defined before calling.
        A failure while processing a listing page is printed and that page is
        skipped, so the run continues with the next page.
    """
    article_number = 1
    data = []

    for page in range(start_page, end_page + 1):
        url = f'https://pemilu.kompas.com/news?page={page}'

        try:
            # `headers=` is required: the second positional argument of requests.get is `params`
            index_soup = bs(req.get(url, headers=hades).text, 'lxml')
            listing = index_soup.find('div', class_='list')
            anchors = listing.find_all('a', class_='listLink display-flex')

            for anchor in anchors:
                link = anchor.get('href', '')
                if not link or link.startswith('#'):
                    # Skip empty or in-page links
                    continue

                article_soup = bs(req.get(link, headers=hades).text, 'lxml')
                headline = article_soup.find('h1', class_='read__title').text.strip()
                date = article_soup.find('div', class_='read__time').text.replace('WIB', '').replace('Kompas.com - ', '').strip()

                filtered_content = []
                for block in article_soup.find_all('div', class_='clearfix'):
                    for p in block.find_all('p'):
                        if 'ADVERTISEMENT' not in p.text and 'SCROLL TO RESUME CONTENT' not in p.text:
                            # Keep only the paragraph's direct text nodes. BeautifulSoup text
                            # nodes are `str` subclasses while nested tags are not, so this
                            # needs no extra import.
                            clean_text = ' '.join(str(item) for item in p.contents if isinstance(item, str))
                            filtered_content.append(clean_text)

                content_text = ''.join(filtered_content).replace('\n', '')

                data.append({'Title': headline, 'Date': date, 'Link': link, 'Content': content_text})
                print(f'done[{article_number}] > {headline[0:20]}')
                article_number += 1

        except Exception as e:
            # Best-effort scraping: report the problem and move on to the next page
            print(f"Error: {e}")
            continue

    df = pd.DataFrame(data)
    csv_filename = dataset_path + 'kompas_pemilu2.csv'
    df.to_csv(csv_filename, index=False)
    print(f"Data berhasil disimpan ke dalam {csv_filename}.")

In [None]:
# Adjust the first and last listing page to scrape as needed
scrape_kompas(start_page=701, end_page=999)

### **2.3. Turnbackhoax**

In [None]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

def scrape_turnbackhoax(keyword, start_page=1, end_page=float('inf'),
                        csv_filename='turnbackhoax_kampanye2.csv', timeout=30):
    """Scrape turnbackhoax.id search results for ``keyword`` and save them to CSV.

    Args:
        keyword: Search term sent as the ``s`` query parameter.
        start_page: First result page to scrape.
        end_page: Last result page to scrape (inclusive); unbounded by default.
        csv_filename: Output CSV path.
        timeout: Per-request timeout in seconds so a stalled connection
            cannot hang the notebook.

    Notes:
        Scraping stops at the first result page that lists no articles.
        Articles missing a title, date or content element are skipped.
    """
    records = []

    page_num = start_page
    while page_num <= end_page:
        # Let requests build and escape the query string
        listing = requests.get(
            f"https://turnbackhoax.id/page/{page_num}/",
            params={"s": keyword},
            timeout=timeout,
        )
        soup = BeautifulSoup(listing.text, "html.parser")

        subpage_links = [a['href'] for a in soup.select('.mh-loop-content h3 > a')]
        if not subpage_links:
            break  # no more result pages

        for subpage_link in subpage_links:
            subpage = requests.get(subpage_link, timeout=timeout)
            subpage_soup = BeautifulSoup(subpage.text, "html.parser")

            title = subpage_soup.select_one('.entry-title')
            date = subpage_soup.select_one('.entry-meta-date.updated')
            content = subpage_soup.select_one('.entry-content')

            if title and content and date:
                records.append({
                    "Title": title.text.strip(),
                    "Date": date.text.strip(),
                    "Content": content.text.strip(),
                })

        page_num += 1

    # Explicit columns keep the CSV header stable even when nothing was scraped
    df = pd.DataFrame(records, columns=["Title", "Date", "Content"])
    df.to_csv(csv_filename, index=False)
    print(f"Data berhasil disimpan ke dalam {csv_filename}.")

In [None]:
# Adjust the keyword and the first/last result page to scrape as needed
scrape_turnbackhoax(keyword="pemilu", start_page=5, end_page=8)

## **3. Load Dataset**

### **3.1. Import Dataset**

In [None]:
# Folder on Google Drive that holds the dataset files
dataset_path = '/content/drive/MyDrive/Data-TA/data-final/FIX/'

# Read the combined news CSV and discard the "Unnamed: 0" column
all_news_csv = dataset_path + 'ALL_NEWS2.csv'
df_news = pd.read_csv(all_news_csv, encoding='ISO-8859-1').drop(columns="Unnamed: 0")
print(df_news)

### **3.2. Show Data on Chart**

#### Histogram

In [None]:
# Count rows per label once and reuse the result for the chart and the table.
# Series.value_counts replaces the deprecated top-level pd.value_counts.
label_counts = df_news["Label"].value_counts()

label_counts.plot.bar()
plt.title("Label Comparison")
plt.xlabel("Label")
plt.ylabel("Count")

# if you want to save this chart, uncomment line below
# plt.savefig('label_histogram', dpi=300)

label_counts

## **4. Text Preprocessing**

In [None]:
# Load an untouched copy of the dataset for before/after comparisons:
# drop the "Unnamed: 0" column and expose the text as "Raw_Content".
df_raw = (
    pd.read_csv(dataset_path + 'ALL_NEWS2.csv', encoding='ISO-8859-1')
    .drop(columns="Unnamed: 0")
    .rename(columns={"Content": "Raw_Content"})
)
print(df_raw)

### **4.1. Data Cleaning**

In [None]:
def data_cleaning(text):
    """Strip boilerplate, URLs, mentions and every non-letter character from ``text``.

    Args:
        text: Raw article text (``str``).

    Returns:
        The text with only ASCII letters left, each run of removed characters
        collapsed to a single space. Leading/trailing spaces are not trimmed.
    """
    # Raw string, and no empty alternative: the previous pattern contained "||",
    # which matched the empty string and made its last branch unreachable.
    url_regex = r"(?:www\.\S+)|(?:https?://\S+)"
    # Escaped so that "(", ")" and "." are matched literally
    kompas_footer = re.escape(
        "Copyright 2008 - 2023 PT. Kompas Cyber Media (Kompas Gramedia Digital Group). All Rights Reserved."
    )

    for marker in ("REFERENSI", "referensi", "Referensi"):
        text = text.replace(marker, "")  # remove the reference heading
    text = re.sub(kompas_footer, "", text)  # remove the Kompas copyright footer
    text = re.sub(url_regex, "", text)  # remove every url
    # Mentions must go before punctuation is stripped, otherwise the "@" is already gone
    text = re.sub(r"@[\w\-]+", " ", text)
    text = re.sub(r"[^A-Za-z0-9\s]", " ", text)  # keep only letters, digits and whitespace
    text = re.sub(r"\n", " ", text)  # remove every new line
    text = re.sub(" URL", " ", text)  # remove the word URL
    text = re.sub(" url", " ", text)  # remove the word url
    text = re.sub(r"[^a-zA-Z]+", " ", text)  # drop digits and collapse whitespace runs

    return text

df_news["Cleaning"] = df_news["Content"].apply(data_cleaning)
# Assign through item access: the previous `df_news.Conten = ...` only set a
# Python attribute with a misspelled name and never updated the column.
df_news["Content"] = df_news["Content"].str.strip()

# Side-by-side view of the raw text and the cleaned text
df1 = df_raw[["Raw_Content"]]
df2 = df_news[["Cleaning"]]

df_compare = pd.concat([df1, df2], axis=1)
df_compare.head()

### **4.2. Case Folding**

In [None]:
# Case Folding
def case_folding(text):
    """Return ``text`` converted to lowercase."""
    return text.lower()

# Lowercase the cleaned text into its own column
df_news["Casefold"] = df_news["Cleaning"].apply(case_folding)

# Side-by-side view of the text before and after case folding
df1, df2 = df_news[["Cleaning"]], df_news[["Casefold"]]
df_compare = pd.concat([df1, df2], axis=1)
df_compare.head()

### **4.3. Tokenizing**

In [None]:
# Tokenizing
nltk.download('punkt')  # Punkt tokenizer models
# Recent NLTK releases load the tokenizer tables from 'punkt_tab' instead of 'punkt'
nltk.download('punkt_tab')
df_news["Token"] = df_news["Casefold"].apply(nltk.word_tokenize)

# Side-by-side view of the text before and after tokenizing
df1 = df_news[["Casefold"]]
df2 = df_news[["Token"]]

df_compare = pd.concat([df1, df2], axis=1)

# Show full cell contents instead of truncating long columns
pd.set_option('display.max_colwidth', None)

# Display only the first row of df_compare
df_compare_first_row = df_compare.head(1)
print(df_compare_first_row)

### **4.3. Normalization**

In [None]:
# Load the slang-normalization dictionary: column 0 is the original token,
# column 1 is its replacement.
key_norm = pd.read_csv(
    "https://raw.githubusercontent.com/okkyibrohim/id-multi-label-hate-speech-and-abusive-language-detection/master/new_kamusalay.csv",
    encoding="latin-1",
    header=None,
).rename(columns={0: "original", 1: "replacement"})

print("Number of Data: ", len(key_norm))
key_norm.head()

In [None]:
# Lookup table: original token -> replacement token
key_norm_map = dict(zip(key_norm["original"], key_norm["replacement"]))

def normalize_token(token):
    """Return the replacement for ``token``, or ``token`` itself when it has no entry."""
    if token in key_norm_map:
        return key_norm_map[token]
    return token

def normalize_tokens(tokens):
    """Normalize every token in ``tokens`` and return the results as a list."""
    return list(map(normalize_token, tokens))

# Replace slang tokens using the normalization dictionary
df_news["Normalize"] = df_news["Token"].apply(normalize_tokens)

# Side-by-side view of the tokens before and after normalization
df1, df2 = df_news[["Token"]], df_news[["Normalize"]]
df_compare = pd.concat([df1, df2], axis=1)

# Show full cell contents instead of truncating long columns
pd.set_option('display.max_colwidth', None)

# Print just the first comparison row
df_compare_first_row = df_compare.head(1)
print(df_compare_first_row)

### **4.4. Filtering**

In [None]:
# Download the NLTK resources used for stopword filtering
nltk.download("stopwords")
nltk.download("punkt")

# Built once at cell level so the stopword list is not reloaded for every row
NLTK_STOPWORDS_ID = frozenset(stopwords.words("indonesian"))

def remove_nltk_stopword(tokens):
    """Drop Indonesian NLTK stopwords from ``tokens``.

    Args:
        tokens: Iterable of string tokens.

    Returns:
        List of the remaining tokens, each with repeated spaces collapsed
        and surrounding whitespace stripped.
    """
    kept_tokens = [word for word in tokens if word not in NLTK_STOPWORDS_ID]
    return [re.sub("  +", " ", token).strip() for token in kept_tokens if token]

# Remove stopwords from the normalized tokens
df_news["Filter"] = df_news["Normalize"].apply(remove_nltk_stopword)

# Side-by-side view of the tokens before and after filtering
df1, df2 = df_news["Normalize"], df_news["Filter"]
df_compare = pd.concat([df1, df2], axis=1)

# Show full cell contents instead of truncating long columns
pd.set_option('display.max_colwidth', None)

# Print just the first comparison row
df_compare_first_row = df_compare.head(1)
print(df_compare_first_row)

### **4.5. Stemming**

In [None]:
# Build the Sastrawi stemmer once and reuse it for every document
factory = StemmerFactory()
stemmer = factory.create_stemmer()

def stemming(tokens):
    """Return a list with the stemmed form of every token in ``tokens``."""
    return list(map(stemmer.stem, tokens))

# Stem the filtered tokens
df_news["Stem"] = df_news["Filter"].apply(stemming)

# Side-by-side view of the tokens before and after stemming
df1, df2 = df_news[["Filter"]], df_news[["Stem"]]
df_compare = pd.concat([df1, df2], axis=1)
df_compare.head()

In [None]:
# Rebuild one space-separated string per document from its stemmed tokens
df_news["Preprocess"] = df_news["Stem"].apply(' '.join)
print(df_news["Preprocess"])

In [None]:
# Persist every preprocessing stage so later sections can start from this file
preprocessed_csv = dataset_path + 'dataset_preprocessed_result.csv'
df_news.to_csv(preprocessed_csv, index=False)
print("Data berhasil disimpan ke dalam CSV.")

## **5. Finalisasi Preprocessing**

### **5.1. Preview After Preprocessing**

In [None]:
dataset_path = '/content/drive/MyDrive/Data-TA/data-final/FIX/'
# df_news = df_news[["Content", "Label"]]
# The file was written by DataFrame.to_csv with its default UTF-8 encoding, so it is
# read back as UTF-8; decoding it as ISO-8859-1 would garble non-ASCII characters.
df_news = pd.read_csv(dataset_path + 'dataset_preprocessed_result.csv', encoding='utf-8')

# Columns to keep
selected_columns = ["Sumber", "Title", "Date", "Preprocess", "Label"]

# Keep only the selected columns
df_news = df_news[selected_columns]

# Show the reduced DataFrame
df_news.head()

### **5.2. Label Classification**

In [None]:
def label_classification(news):
    """Map a numeric label to its category name: 1 is "Non_Hoax", anything else is "Hoax"."""
    return "Non_Hoax" if int(news) == 1 else "Hoax"

# Add a 'Category' column derived from 'Label' as the right-most column and rename
# 'Sumber' to 'Source'. Building a new frame with assign/rename keeps the cell
# re-runnable; DataFrame.insert raised when 'Category' already existed.
df_news = (
    df_news
    .assign(Category=df_news['Label'].apply(label_classification))
    .rename(columns={"Sumber": "Source"})
)
df_news.head()

In [None]:
# Save the labelled, preprocessed dataset
labelled_csv = dataset_path + 'dataset_preprocess_label.csv'
df_news.to_csv(labelled_csv, index=False)

### **5.3. Exploratory Data Analysis**

In [None]:
dataset_path = '/content/drive/MyDrive/Data-TA/data-final/FIX/'
# Written by DataFrame.to_csv (UTF-8 by default), so read it back as UTF-8 rather than ISO-8859-1
df_news = pd.read_csv(dataset_path + 'dataset_preprocess_label.csv', encoding='utf-8')
# df_news.head()

In [None]:
# Make sure the NLTK stopword lists are available
nltk.download('stopwords')

# Merge every preprocessed document into one string for the word cloud
text = ' '.join(df_news['Preprocess'].astype(str).values)
english_stopwords = set(stopwords.words("english"))
wordcloud = WordCloud(
    width=3000,
    height=2000,
    background_color='white',
    stopwords=english_stopwords,
).generate(text)

# Render the word cloud without axes or padding
fig = plt.figure(figsize=(10, 7.5), facecolor='k', edgecolor='k')
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.tight_layout(pad=0)
plt.show()

In [None]:
# Show basic information about the dataset.
# DataFrame.info() prints its report itself and returns None, so it is not wrapped in print().
print("Info Dataset:")
df_news.info()

In [None]:
# Plot the distribution of text lengths.
# Empty texts come back from the CSV as NaN; treat them as length 0 instead of
# letting len() fail on a float.
df_news['Content_length'] = df_news['Preprocess'].fillna('').astype(str).str.len()
plt.figure(figsize=(6, 4))
sns.histplot(data=df_news, x='Content_length', bins=30)
plt.title('Distribusi Panjang Teks')
plt.xlabel('Panjang Teks')
plt.ylabel('Jumlah')
plt.show()

In [None]:
# Show the number of rows per class
print("Number of Data: ", df_news.shape)

# Turn the per-label counts into a two-column table with readable headers
value_counts = df_news["Label"].value_counts()
df_value_counts = value_counts.rename_axis("Label").reset_index(name="Counts of Label")
df_value_counts

In [None]:
# Show the number of rows per news source, largest first
source_value_counts = df_news["Source"].value_counts()

df_source_counts = (
    source_value_counts
    .rename_axis("Source")
    .reset_index(name="Counts of Source")
    .sort_values(by="Counts of Source", ascending=False)
)

print("Number of Data: ", df_news.shape[0])
print("\nDistribution of Data based on 'Source':")
print(df_source_counts)

### **5.3. Remove Duplicate**

In [None]:
# Start from the labelled dataset saved in section 5.2. The previous input,
# 'dataset_preprocess_clear.csv', is only written at the end of this section,
# so reading it here fails (or picks up a stale file) on a fresh run.
df_before = pd.read_csv(dataset_path + 'dataset_preprocess_label.csv')
# print(df_combined)

In [None]:
# Drop rows whose 'Preprocess' text duplicates an earlier row
rows_before = len(df_before)
df_news = df_before.drop_duplicates(subset=['Preprocess'])

# Report the row counts before and after removing duplicates
print("Jumlah baris sebelum menghapus duplikat:", rows_before)
print("Jumlah baris setelah menghapus duplikat:", len(df_news))

Jumlah baris sebelum menghapus duplikat: 6074
Jumlah baris setelah menghapus duplikat: 6035


### **5.4. Remove Missing Values**

In [None]:
# Count missing values per column
df_indexed = df_news
missing_per_column = df_indexed.isna().sum()
print(missing_per_column)

In [None]:
# Drop rows with a missing text or label, then reset the index.
# The columns in this dataset are 'Preprocess' and 'Label'; the previous
# subset=['text', 'label'] referred to columns that do not exist and raised KeyError.
df_indexed = df_indexed.dropna(subset=['Preprocess', 'Label']).reset_index(drop=True)

# Show the remaining missing values per column
print("\nJumlah missing value setelah dihapus:")
print(df_indexed.isnull().sum())

In [None]:
# Write the de-duplicated, complete rows to CSV
clear_csv = dataset_path + 'dataset_preprocess_clear.csv'
df_indexed.to_csv(clear_csv, index=False)
print("Preprocessed Dataset has been downloaded")

## **6. Split Up Dataset**

### **6.1. Import Preprocessed Dataset**

In [None]:
# Load the cleaned dataset produced by the preprocessing sections
dataset_path = '/content/drive/MyDrive/Data-TA/data-final/FIX/'
clear_csv = dataset_path + 'dataset_preprocess_clear.csv'
df = pd.read_csv(clear_csv)
print("Dataset successfully imported")

### **6.2. Hold-out Validation**

In [None]:
# Features are the preprocessed text, targets are the numeric labels
X, y = df["Preprocess"], df["Label"]

random_state = 42

# Stratified 80:10:10 split: carve off 80% for training, then halve the remainder
X_train, X_rem, y_train, y_rem = train_test_split(
    X, y, train_size=0.8, stratify=y, random_state=random_state
)
X_valid, X_test, y_valid, y_test = train_test_split(
    X_rem, y_rem, test_size=0.5, stratify=y_rem, random_state=random_state
)

# Report the class distribution of each split
for _heading, _split_labels in (
    ("Number of Train Dataset: ", y_train),
    ("\nNumber of Valid Dataset: ", y_valid),
    ("\nNumber of Test Dataset: ", y_test),
):
    print(_heading)
    print(_split_labels.value_counts())

# Recombine text and label into one frame per split
df_train = pd.concat([X_train, y_train], axis=1)
df_valid = pd.concat([X_valid, y_valid], axis=1)
df_test = pd.concat([X_test, y_test], axis=1)

In [None]:
# Write each split to its own CSV file
for _split_frame, _split_file in (
    (df_train, 'dataset_training.csv'),
    (df_valid, 'dataset_validation.csv'),
    (df_test, 'dataset_testing.csv'),
):
    _split_frame.to_csv(dataset_path + _split_file, index=False)

## **7. Tokenization & Embedding**

### **7.2. Tokenization**

In [None]:
# Reload the splits written in section 6.2. The training split is stored as
# 'dataset_training.csv'; no cell in this notebook writes
# 'dataset_training_resampled.csv', which the previous code tried to read.
df_train = pd.read_csv(dataset_path + 'dataset_training.csv')
df_test = pd.read_csv(dataset_path + 'dataset_testing.csv')
df_valid = pd.read_csv(dataset_path + 'dataset_validation.csv')

In [None]:
# Fit a Keras Tokenizer on the training text, capped at `features` words
features = 16670
tokenizer = Tokenizer(num_words=features)
tokenizer.fit_on_texts(df_train["Preprocess"])

# Every word the tokenizer has seen, mapped to its integer index
word_index = tokenizer.word_index
unique_token_count = len(word_index)
print('Found %s unique tokens.' % unique_token_count)

In [None]:
# `json` is not imported anywhere above, so bring it into scope here
import json

# Save the fitted Tokenizer as a pickle file
with open(dataset_path + 'tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

# Save word_index as a JSON file
with open(dataset_path + 'word_index.json', 'w') as file:
    json.dump(word_index, file)

In [None]:
# Convert each split's text to integer sequences and pad them to one common length.
# NOTE(review): no earlier cell defines X_train_padded / X_valid_padded / X_test_padded,
# so this step is reconstructed here. Padding to the longest training sequence is an
# assumption — TODO confirm the sequence length used in the original experiments.
train_sequences = tokenizer.texts_to_sequences(df_train["Preprocess"].astype(str))
max_sequence_length = max(len(sequence) for sequence in train_sequences)

X_train_padded = pad_sequences(train_sequences, maxlen=max_sequence_length)
X_valid_padded = pad_sequences(
    tokenizer.texts_to_sequences(df_valid["Preprocess"].astype(str)),
    maxlen=max_sequence_length,
)
X_test_padded = pad_sequences(
    tokenizer.texts_to_sequences(df_test["Preprocess"].astype(str)),
    maxlen=max_sequence_length,
)

# Take the labels from the same frames as the sequences so rows stay aligned
y_train = df_train["Label"]
y_valid = df_valid["Label"]
y_test = df_test["Label"]

# Save padded sequences
np.save(dataset_path + 'X_train_padded.npy', X_train_padded)
np.save(dataset_path + 'X_valid_padded.npy', X_valid_padded)
np.save(dataset_path + 'X_test_padded.npy', X_test_padded)

# Save labels
np.save(dataset_path + 'y_train.npy', y_train.values)
np.save(dataset_path + 'y_valid.npy', y_valid.values)
np.save(dataset_path + 'y_test.npy', y_test.values)

### **7.3. FastText Embedding**

In [None]:
# Stream the pre-trained fastText vectors and keep only the words the tokenizer knows.
# Holding all vectors in a dict is unnecessary because only `word_index` words are
# looked up when the embedding matrix is built.
embedding_dim = 300
wanted_words = set(word_index)
vocab_and_vectors = {}  # word -> float32 vector

with urlopen('https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.id.300.vec.gz') as response:
    with gzip.open(response) as file:
        for line in file:
            values = line.split()
            # Skip the header line and any line that is not "<word> <300 numbers>"
            if len(values) != embedding_dim + 1:
                continue
            word = values[0].decode('utf-8')
            if word not in wanted_words:
                continue
            vocab_and_vectors[word] = np.asarray(values[1:], dtype='float32')

In [None]:
# Build the embedding matrix: row i holds the pre-trained vector of the word with index i
embedding_matrix = np.zeros((len(word_index) + 1, 300))
for word, i in word_index.items():
    embedding_vector = vocab_and_vectors.get(word)
    if embedding_vector is None:
        # No pre-trained vector for this word: its row stays all zeros
        continue
    embedding_matrix[i] = embedding_vector

In [None]:
# Show the embedding row of one word as an example (the second word in the index)
known_words = list(tokenizer.word_index.keys())
sample_word = known_words[1]
sample_row = tokenizer.word_index[sample_word]
print(f"Embedding matrix for word '{sample_word}': \n")
print(embedding_matrix[sample_row])

In [None]:
# Save the embedding matrix.
# NOTE(review): dataset_path is switched to the 'C4V3/' subfolder here, while the
# tokenizer and padded arrays above were saved under the parent folder — confirm
# the intended location.
dataset_path = '/content/drive/MyDrive/Data-TA/data-final/FIX/C4V3/'
embedding_matrix_file = dataset_path + 'embedding_matrix.npy'
np.save(embedding_matrix_file, embedding_matrix)

## **8. Balancing Data with Borderline-SMOTE (only training)**

In [None]:
random_state = 42

# Oversample the training split only, using Borderline-SMOTE.
# NOTE(review): X_train_padded presumably holds integer token ids; confirm that
# interpolating such ids yields the intended synthetic samples.
smote = BorderlineSMOTE(random_state=random_state)
resampled = smote.fit_resample(X_train_padded, y_train)
X_train_resampled, y_train_resampled = resampled

In [None]:
# Report the class balance of the training set before and after oversampling
for _title, _class_labels in (
    ("Pembagian Kelas Sebelum Oversampling (Training Set):", y_train),
    ("Pembagian Kelas Setelah Oversampling (Training Set):", y_train_resampled),
):
    print(_title)
    print("Counts of '0': {}".format(sum(_class_labels == 0)))
    print("Counts of '1': {}".format(sum(_class_labels == 1)))
    print("Total: {}".format(len(_class_labels)))

Pembagian Kelas Sebelum Oversampling (Training Set):
Counts of '0': 162
Counts of '1': 4666
Total: 4828
Pembagian Kelas Setelah Oversampling (Training Set):
Counts of '0': 4666
Counts of '1': 4666
Total: 9332


In [None]:
# Save the oversampled training features and labels
for _resampled_array, _resampled_file in (
    (X_train_resampled, 'X_train_resampled.npy'),
    (y_train_resampled, 'y_train_resampled.npy'),
):
    np.save(dataset_path + _resampled_file, _resampled_array)

## **9. Pemodelan**

### **9.1 Load Data**

In [None]:
# `json` is not imported anywhere above, so bring it into scope here
import json

# Load word_index from its JSON file
dataset_path = '/content/drive/MyDrive/Data-TA/data-final/FIX/C4V3/'
with open(dataset_path + 'word_index.json', 'r') as file:
    word_index = json.load(file)

In [None]:
# Folder that holds the saved arrays
dataset_path = '/content/drive/MyDrive/Data-TA/data-final/FIX/C4V3/'

def _load_array(file_name):
    """Load one .npy file from ``dataset_path``."""
    return np.load(dataset_path + file_name)

# Padded input sequences for each split
X_train_padded = _load_array('X_train_padded.npy')
X_valid_padded = _load_array('X_valid_padded.npy')
X_test_padded = _load_array('X_test_padded.npy')

# Labels for each split, wrapped as pandas Series
y_train = pd.Series(_load_array('y_train.npy'))
y_valid = pd.Series(_load_array('y_valid.npy'))
y_test = pd.Series(_load_array('y_test.npy'))

In [None]:
# Load the saved embedding matrix
embedding_matrix_file = dataset_path + 'embedding_matrix.npy'
embedding_matrix = np.load(embedding_matrix_file)

In [None]:
# Load the oversampled training features and labels
X_train_resampled, y_train_resampled = (
    np.load(dataset_path + 'X_train_resampled.npy'),
    np.load(dataset_path + 'y_train_resampled.npy'),
)

### **9.1 Hyperparameter Tuning**

#### *Learning Rate*

In [None]:
# Learning rates to evaluate
learning_rates = [0.01, 0.001, 0.0001]

# Best result seen so far (selected by lowest validation loss)
best_accuracy = 0
best_loss = float('inf')
best_learning_rate = None

# Train one Bi-LSTM model per learning rate and compare them on the validation set
for learning_rate in learning_rates:
    print(f"\nTraining model with learning rate: {learning_rate}")

    # Frozen pre-trained embedding -> Bi-LSTM -> dropout -> sigmoid output
    model = Sequential([
        Embedding(len(word_index) + 1, 300,
                  input_length=X_train_padded.shape[1],
                  weights=[embedding_matrix],
                  trainable=False),
        Bidirectional(LSTM(64)),
        Dropout(0.5),
        Dense(1, activation='sigmoid'),
    ])

    optimizer = Adam(learning_rate=learning_rate)
    model.compile(loss='binary_crossentropy',
                  optimizer=optimizer,
                  metrics=['accuracy'])

    # Stop once val_loss has not improved for 10 epochs and restore the best weights
    es = EarlyStopping(restore_best_weights=True,
                       monitor='val_loss', mode='min',
                       verbose=1, patience=10)

    # summary() prints the table itself and returns None, so it is not wrapped in print()
    model.summary()

    history = model.fit(X_train_resampled, y_train_resampled,
                        epochs=100,
                        batch_size=32, verbose=1,
                        validation_data=(X_valid_padded, y_valid),
                        callbacks=[es])

    # Evaluate model on validation data
    val_loss, val_accuracy = model.evaluate(X_valid_padded, y_valid)
    print(f'Validation Accuracy with learning rate {learning_rate}: {val_accuracy}')

    # Classification report on validation data (probabilities thresholded at 0.5)
    y_val_pred_prob = model.predict(X_valid_padded)
    y_val_pred = (y_val_pred_prob > 0.5).astype(int)
    print('\nClassification Report (Validation) :')
    print(classification_report(y_valid, y_val_pred, digits=4))

    # Training vs validation curves: accuracy on the left, loss on the right
    fig, (ax_acc, ax_loss) = plt.subplots(1, 2, figsize=(16, 5))
    ax_acc.plot(history.history['accuracy'])
    ax_acc.plot(history.history['val_accuracy'])
    ax_acc.set_xlabel("Epochs")
    ax_acc.set_ylabel("Accuracy")
    ax_acc.legend(['accuracy', 'val_accuracy'])
    ax_acc.set_title(f"Accuracy with learning rate {learning_rate}")

    ax_loss.plot(history.history['loss'])
    ax_loss.plot(history.history['val_loss'])
    ax_loss.set_xlabel("Epochs")
    ax_loss.set_ylabel("Loss")
    ax_loss.legend(['loss', 'val_loss'])
    ax_loss.set_title(f"Loss with learning rate {learning_rate}")

    plt.show()

    # Keep this configuration if it beats the best validation loss so far
    if val_loss < best_loss:
        best_loss = val_loss
        best_accuracy = val_accuracy
        best_learning_rate = learning_rate

# Print the best hyperparameters
print("\nBest Hyperparameters:")
print(f"Best Learning Rate: {best_learning_rate}")
print(f"Best Validation Accuracy: {best_accuracy}")
print(f"Best Validation Loss: {best_loss}")

#### *Batch Size*

In [None]:
# Batch sizes to evaluate
batch_sizes = [16, 32, 64]

# Best result seen so far (selected by lowest validation loss)
best_accuracy = 0
best_loss = float('inf')
best_batch_size = None

# Train one Bi-LSTM model per batch size and compare them on the validation set
for batch_size in batch_sizes:
    print(f"\nTraining model with Batch Size: {batch_size}")

    # Frozen pre-trained embedding -> Bi-LSTM -> dropout -> sigmoid output
    model = Sequential([
        Embedding(len(word_index) + 1, 300,
                  input_length=X_train_padded.shape[1],
                  weights=[embedding_matrix],
                  trainable=False),
        Bidirectional(LSTM(64)),
        Dropout(0.5),
        Dense(1, activation='sigmoid'),
    ])

    optimizer = Adam(learning_rate=0.0001)  # fixed learning rate; adjust if needed
    model.compile(loss='binary_crossentropy',
                  optimizer=optimizer,
                  metrics=['accuracy'])

    # Stop once val_loss has not improved for 10 epochs and restore the best weights
    es = EarlyStopping(restore_best_weights=True,
                       monitor='val_loss', mode='min',
                       verbose=1, patience=10)

    # summary() prints the table itself and returns None, so it is not wrapped in print()
    model.summary()

    history = model.fit(X_train_resampled, y_train_resampled,
                        epochs=100,
                        batch_size=batch_size, verbose=1,
                        validation_data=(X_valid_padded, y_valid),
                        callbacks=[es])

    # Evaluate model on validation data
    val_loss, val_accuracy = model.evaluate(X_valid_padded, y_valid)
    print(f'Validation Accuracy with Batch Size {batch_size}: {val_accuracy}')

    # Classification report on validation data (probabilities thresholded at 0.5)
    y_val_pred_prob = model.predict(X_valid_padded)
    y_val_pred = (y_val_pred_prob > 0.5).astype(int)
    print('\nClassification Report (Validation) :')
    print(classification_report(y_valid, y_val_pred, digits=4))

    # Training vs validation curves: accuracy on the left, loss on the right
    fig, (ax_acc, ax_loss) = plt.subplots(1, 2, figsize=(16, 5))
    ax_acc.plot(history.history['accuracy'])
    ax_acc.plot(history.history['val_accuracy'])
    ax_acc.set_xlabel("Epochs")
    ax_acc.set_ylabel("Accuracy")
    ax_acc.legend(['accuracy', 'val_accuracy'])
    ax_acc.set_title(f"Accuracy with Batch Size {batch_size}")

    ax_loss.plot(history.history['loss'])
    ax_loss.plot(history.history['val_loss'])
    ax_loss.set_xlabel("Epochs")
    ax_loss.set_ylabel("Loss")
    ax_loss.legend(['loss', 'val_loss'])
    ax_loss.set_title(f"Loss with Batch Size {batch_size}")

    plt.show()

    # Keep this configuration if it beats the best validation loss so far
    if val_loss < best_loss:
        best_loss = val_loss
        best_accuracy = val_accuracy
        best_batch_size = batch_size

# Print the best hyperparameters
print("\nBest Hyperparameters:")
print(f"Best Batch Size: {best_batch_size}")
print(f"Best Validation Accuracy: {best_accuracy}")
print(f"Best Validation Loss: {best_loss}")

#### *Unit BiLSTM*

In [None]:
# Numbers of BiLSTM units to evaluate
units_list = [32, 64, 128]

# Best result seen so far (selected by lowest validation loss)
best_accuracy = 0
best_loss = float('inf')
best_units = None

# Train one Bi-LSTM model per unit count and compare them on the validation set
for units in units_list:
    print(f"\nTraining model with BiLSTM units: {units}")

    # Frozen pre-trained embedding -> Bi-LSTM -> dropout -> sigmoid output
    model = Sequential([
        Embedding(len(word_index) + 1, 300,
                  input_length=X_train_padded.shape[1],
                  weights=[embedding_matrix],
                  trainable=False),
        Bidirectional(LSTM(units)),
        Dropout(0.5),
        Dense(1, activation='sigmoid'),
    ])

    optimizer = Adam(learning_rate=0.0001)
    model.compile(loss='binary_crossentropy',
                  optimizer=optimizer,
                  metrics=['accuracy'])

    # Stop once val_loss has not improved for 10 epochs and restore the best weights
    es = EarlyStopping(restore_best_weights=True,
                       monitor='val_loss', mode='min',
                       verbose=1, patience=10)

    # summary() prints the table itself and returns None, so it is not wrapped in print()
    model.summary()

    history = model.fit(X_train_resampled, y_train_resampled,
                        epochs=100,
                        batch_size=16, verbose=1,
                        validation_data=(X_valid_padded, y_valid),
                        callbacks=[es])

    # Evaluate model on validation data
    val_loss, val_accuracy = model.evaluate(X_valid_padded, y_valid)
    print(f'Validation Accuracy with {units} units: {val_accuracy}')

    # Classification report on validation data (probabilities thresholded at 0.5)
    y_val_pred_prob = model.predict(X_valid_padded)
    y_val_pred = (y_val_pred_prob > 0.5).astype(int)
    print('\nClassification Report (Validation) :')
    print(classification_report(y_valid, y_val_pred, digits=4))

    # Training vs validation curves: accuracy on the left, loss on the right
    fig, (ax_acc, ax_loss) = plt.subplots(1, 2, figsize=(16, 5))
    ax_acc.plot(history.history['accuracy'])
    ax_acc.plot(history.history['val_accuracy'])
    ax_acc.set_xlabel("Epochs")
    ax_acc.set_ylabel("Accuracy")
    ax_acc.legend(['accuracy', 'val_accuracy'])
    ax_acc.set_title(f"Accuracy with {units} units")

    ax_loss.plot(history.history['loss'])
    ax_loss.plot(history.history['val_loss'])
    ax_loss.set_xlabel("Epochs")
    ax_loss.set_ylabel("Loss")
    ax_loss.legend(['loss', 'val_loss'])
    ax_loss.set_title(f"Loss with {units} units")

    plt.show()

    # Keep this configuration if it beats the best validation loss so far
    if val_loss < best_loss:
        best_loss = val_loss
        best_accuracy = val_accuracy
        best_units = units

# Print the best hyperparameters
print("\nBest Hyperparameters:")
print(f"Best Number of BiLSTM Units: {best_units}")
print(f"Best Validation Accuracy: {best_accuracy}")
print(f"Best Validation Loss: {best_loss}")

#### *Dropout*

In [None]:
# Candidate dropout rates to evaluate
dropout_list = [0.3, 0.4, 0.5]

# Best result seen so far; a candidate wins when its validation loss is lower
best_accuracy = 0
best_loss = float('inf')
best_dropout = None

# Hyperparameter search: build, train and evaluate one model per dropout rate
for dropout_rate in dropout_list:
    print(f"\nTraining model with Dropout rate: {dropout_rate}")

    # Reset Keras' global state before building the next model so that the
    # graphs/layers of earlier iterations do not pile up in memory.
    tf.keras.backend.clear_session()

    # Embedding is initialised from embedding_matrix and kept frozen
    # (trainable=False); only the BiLSTM and the output layer are trained.
    model = Sequential()
    model.add(Embedding(len(word_index) + 1, 300,
                        input_length=X_train_padded.shape[1],
                        weights=[embedding_matrix],
                        trainable=False))
    model.add(Bidirectional(LSTM(128)))
    model.add(Dropout(dropout_rate))
    model.add(Dense(1, activation='sigmoid'))

    optimizer = Adam(learning_rate=0.0001)  # Fixed learning rate; adjust if needed
    model.compile(loss='binary_crossentropy',
                  optimizer=optimizer,
                  metrics=['accuracy'])

    # Stop once val_loss has not improved for 10 epochs and roll back to the
    # weights of the best epoch.
    es = EarlyStopping(restore_best_weights=True,
                       monitor='val_loss', mode='min',
                       verbose=1, patience=10)

    # summary() prints the table itself and returns None, so it is called
    # directly; wrapping it in print() would emit a stray "None" line.
    model.summary()

    # Train on the resampled training data, validate on the validation split
    history = model.fit(X_train_resampled, y_train_resampled,
                        epochs=100,
                        batch_size=16, verbose=1,
                        validation_data=(X_valid_padded, y_valid),
                        callbacks=[es])

    # Evaluate model on validation data
    val_loss, val_accuracy = model.evaluate(X_valid_padded, y_valid)

    # Print validation accuracy
    print(f'Validation Accuracy with Dropout rate {dropout_rate}: {val_accuracy}')

    # Classification report on validation data
    # (sigmoid outputs are thresholded at 0.5 to obtain 0/1 predictions)
    y_val_pred_prob = model.predict(X_valid_padded)
    y_val_pred = (y_val_pred_prob > 0.5).astype(int)
    print('\nClassification Report (Validation):')
    print(classification_report(y_valid, y_val_pred, digits=4))

    # Plot training vs validation accuracy
    plt.figure(figsize=(16, 5))
    plt.subplot(1, 2, 1)
    plt.plot(history.history['accuracy'])
    plt.plot(history.history['val_accuracy'])
    plt.xlabel("Epochs")
    plt.ylabel("Accuracy")
    plt.legend(['accuracy', 'val_accuracy'])
    plt.title(f"Accuracy with Dropout rate {dropout_rate}")

    # Plot training vs validation loss
    plt.subplot(1, 2, 2)
    plt.plot(history.history['loss'])
    plt.plot(history.history['val_loss'])
    plt.xlabel("Epochs")
    plt.ylabel("Loss")
    plt.legend(['loss', 'val_loss'])
    plt.title(f"Loss with Dropout rate {dropout_rate}")

    plt.show()

    # Keep this candidate if it beats the best validation loss so far
    if val_loss < best_loss:
        best_loss = val_loss
        best_accuracy = val_accuracy
        best_dropout = dropout_rate

# Print the best hyperparameters
print("\nBest Hyperparameters:")
print(f"Best Dropout Rate: {best_dropout}")
print(f"Best Validation Accuracy: {best_accuracy}")
print(f"Best Validation Loss: {best_loss}")

### **9.2. Training Model**

In [None]:
# Model Definition
# Frozen embedding initialised from embedding_matrix -> BiLSTM(128) ->
# Dropout(0.4) -> single sigmoid output for binary classification.
model = Sequential()
model.add(Embedding(len(word_index) + 1, 300,
                    input_length=X_train_padded.shape[1], weights=[embedding_matrix],
                    trainable=False))
model.add(Bidirectional(LSTM(128)))
model.add(Dropout(0.4))
model.add(Dense(1, activation='sigmoid'))

# Compile Model
# Uses the imported Adam class, like the tuning cells, instead of spelling out
# tf.keras.optimizers.Adam (same optimizer, consistent style).
model.compile(loss='binary_crossentropy',
              optimizer=Adam(learning_rate=0.0001),
              metrics=['accuracy'])

# Define Callbacks
# Stop once val_loss has not improved for 10 epochs and roll back to the
# weights of the best epoch.
es = EarlyStopping(restore_best_weights=True,
                   monitor='val_loss', mode='min',
                   verbose=1, patience=10)

# summary() prints the table itself and returns None, so it is called
# directly; wrapping it in print() would emit a stray "None" line.
model.summary()

# Fitting Model
# Train on the resampled training data, validate on the validation split
history = model.fit(X_train_resampled, y_train_resampled,
                    epochs=100, batch_size=16, verbose=1,
                    validation_data=(X_valid_padded, y_valid),
                    callbacks=[es])

### **9.3. Evaluasi Model**

#### *on Validation Data*

In [None]:
# Loss and accuracy of the trained model on the validation set
score = model.evaluate(X_valid_padded, y_valid, verbose=1)
loss_name, acc_name = model.metrics_names[0], model.metrics_names[1]
print('Loss and Accuracy Validation: ')
# The accuracy value is multiplied by 100 so it reads as a percentage
print(f"{loss_name!s}: {score[0]!s} | {acc_name!s}: {score[1] * 100!s}")

In [None]:
# Classification report on the validation set
# Keep the raw sigmoid outputs under their own name, then threshold them at
# 0.5 to obtain the hard 0/1 predictions stored in y_pred.
y_valid_pred_prob = model.predict(X_valid_padded)
y_pred = (y_valid_pred_prob > 0.5).astype(int)

# Build the text report once, keep it in `report`, and print that same string
# (previously the identical report was computed twice).
report = classification_report(y_valid, y_pred,
                               digits=4, output_dict=False)
print(report)

In [None]:
# Side-by-side training curves: accuracy on the left, loss on the right.
# Each panel overlays the training metric and its validation counterpart.
fig, axes = plt.subplots(1, 2, figsize=(16, 5))
panels = (("accuracy", "Accuracy"), ("loss", "Loss"))

for ax, (metric, label) in zip(axes, panels):
    val_metric = f"val_{metric}"
    ax.plot(history.history[metric])
    ax.plot(history.history[val_metric])
    ax.set_xlabel("Epochs")
    ax.set_ylabel(label)
    ax.legend([metric, val_metric])
    ax.set_title(f"Plot Training vs Validation {label}")

plt.show()

#### *on Testing Data*

In [None]:
# Loss and accuracy of the trained model on the test set
score_test = model.evaluate(X_test_padded, y_test, verbose=1)
loss_name, acc_name = model.metrics_names[0], model.metrics_names[1]
print('Loss and Accuracy Testing: ')
# The accuracy value is multiplied by 100 so it reads as a percentage
print(f"{loss_name!s}: {score_test[0]!s} | {acc_name!s}: {score_test[1] * 100!s}")

In [None]:
# Predict on the test set and threshold the sigmoid outputs at 0.5
# to obtain hard 0/1 predictions
y_test_pred_prob = model.predict(X_test_padded)
y_test_pred = (y_test_pred_prob > 0.5).astype(int)

# Text classification report (4 decimal places) for the test set
report_test = classification_report(
    y_test,
    y_test_pred,
    digits=4,
    output_dict=False,
)

# Show the report under its heading
print('Classification Report - Test Set:', report_test, sep='\n')

In [None]:
# Confusion matrix on the test set (rows: actual, columns: predicted)
cm_test = confusion_matrix(y_test, y_test_pred)
print(cm_test)

# Heatmap of the confusion matrix
# NOTE(review): the tick labels assume the first class in sorted label order
# is "hoax" and the second is "non-hoax" — confirm against the label encoding.
class_names = ['Berita Hoax', 'Berita Non-Hoax']
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(cm_test, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_names, yticklabels=class_names,
            ax=ax)
ax.set_title('Confusion Matrix - Test Set')
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
plt.show()

### **9.4. Save Model**

In [None]:
# Persist the trained model twice under dataset_path: once with a .h5
# extension and once with a .hdf5 extension.
h5_path = dataset_path + 'hoax_classifier_FIXEDsmotetrain_p10_model.h5'
hdf5_path = dataset_path + 'hoax_classifier_FIXEDsmotetrain_p10_model.hdf5'

model.save(h5_path)
tf.keras.models.save_model(model, hdf5_path)