# 1. Persiapan Data

In [15]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
import emoji
import warnings
warnings.filterwarnings('ignore')

ModuleNotFoundError: No module named 'tensorflow'

In [None]:
# Fetch the NLTK resources this notebook relies on: the Punkt sentence
# tokenizer models (needed by word_tokenize) and the stop-word corpus.
nltk.download('punkt')
# NLTK 3.9+ loads Punkt from the 'punkt_tab' package instead of the pickled
# 'punkt' one; without it word_tokenize raises LookupError on recent versions.
nltk.download('punkt_tab')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [None]:
# Load the labelled news dataset (columns: label, text) from the local CSV file
data = pd.read_csv(filepath_or_buffer='Tugas 1_NIM Genap_Modul 4.csv')

In [None]:

# Preview the first rows of the dataset.
print(data.head())
# DataFrame.info() writes its summary to stdout itself and returns None, so
# wrapping it in print() would append a stray "None" line after the report.
data.info()

  label                                               text
0  REAL  Payal has accused filmmaker Anurag Kashyap of ...
1  FAKE  A four-minute-long video of a woman criticisin...
2  FAKE  Republic Poll, a fake Twitter account imitatin...
3  REAL  Delhi teen finds place on UN green list, turns...
4  REAL  Delhi: A high-level meeting underway at reside...
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3729 entries, 0 to 3728
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   label   3729 non-null   object
 1   text    3721 non-null   object
dtypes: object(2)
memory usage: 58.4+ KB
None


# 2. Data Cleaning

In [None]:
# Lazily-built cache of the English stop words, shared by all clean_text calls.
_STOP_WORDS = None


def _english_stop_words():
    """Return the NLTK English stop-word set, reading the corpus only once."""
    global _STOP_WORDS
    if _STOP_WORDS is None:
        _STOP_WORDS = frozenset(stopwords.words('english'))
    return _STOP_WORDS


# Text-cleaning function
def clean_text(text):
    """Normalize a raw text into a space-separated string of content tokens.

    Steps: lowercase, remove URLs, remove emoji, turn hyphens into spaces,
    drop every character that is not an ASCII letter, digit or whitespace,
    drop digits, tokenize, remove English stop words and re-join the tokens.

    Args:
        text: The raw text. Non-string values are converted with ``str()``;
            ``None`` and float NaN are treated as missing.

    Returns:
        The cleaned text, or an empty string when the input is missing.
    """
    # Missing values arrive as None or float NaN (NaN != NaN); passing them
    # through str() would create the bogus token "nan".
    if text is None or (isinstance(text, float) and text != text):
        return ''

    # Convert to lowercase
    text = str(text).lower()

    # Remove URLs first, while "://" and "." are still present. Doing this
    # after the punctuation/hyphen stripping only removes part of the URL and
    # leaves fragments of it behind as tokens.
    text = re.sub(r'https?://\S+|www\.\S+', '', text)

    # Remove emoji
    text = emoji.replace_emoji(text, replace='')

    # Replace hyphens with spaces so hyphenated words split into two tokens
    text = text.replace('-', ' ')

    # Remove every character that is not alphanumeric or whitespace
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)

    # Remove digits
    text = re.sub(r'\d+', '', text)

    # Tokenize and drop stop words
    stop_words = _english_stop_words()
    tokens = [word for word in word_tokenize(text) if word not in stop_words]

    # Re-join the tokens and collapse any leftover whitespace
    text = ' '.join(tokens)
    text = re.sub(r'\s+', ' ', text).strip()

    return text

# Clean every document of the raw text column and store it in a new column
data['cleaned_text'] = data['text'].map(clean_text)

# Compare raw and cleaned text on the first rows as a sanity check
preview = data[['text', 'cleaned_text']].head()
print(preview)

# 3. Train Test Split

In [None]:
# Encode the string labels as integers (REAL=1, FAKE=0)
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(data['label'])
data['label_encoded'] = encoded_labels

# Features are the cleaned texts, targets are the encoded labels
X, y = data['cleaned_text'], data['label_encoded']

# 80:20 train/test split, stratified on the label, with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Report split sizes and the per-class counts in each split
print(f"Ukuran X_train: {X_train.shape}")
print(f"Ukuran X_test: {X_test.shape}")
print(f"Distribusi label di y_train: {np.bincount(y_train)}")
print(f"Distribusi label di y_test: {np.bincount(y_test)}")

# 4. Vektorisasi Teks

In [None]:
# TF-IDF over unigrams and bigrams, capped at 5000 features
tfidf = TfidfVectorizer(
    ngram_range=(1, 2),
    max_features=5000,
)

# Learn the vocabulary on the training texts only, then reuse it unchanged
# to transform the test texts
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# Report the shapes of the resulting matrices
print(f"Bentuk X_train_tfidf: {X_train_tfidf.shape}")
print(f"Bentuk X_test_tfidf: {X_test_tfidf.shape}")

# 5. Model Naive Bayes

In [None]:
# Create the Naive Bayes classifier and train it on the TF-IDF features
nb_model = MultinomialNB().fit(X_train_tfidf, y_train)

# Predict the labels of the test set
y_pred_nb = nb_model.predict(X_test_tfidf)

# Compute the evaluation artefacts first, then print them
accuracy_nb = accuracy_score(y_test, y_pred_nb)
nb_report = classification_report(
    y_test, y_pred_nb, target_names=label_encoder.classes_
)
nb_confusion = confusion_matrix(y_test, y_pred_nb)

print(f"Akurasi Naive Bayes: {accuracy_nb:.4f}")
print("\nClassification Report Naive Bayes:")
print(nb_report)
print("\nConfusion Matrix Naive Bayes:")
print(nb_confusion)

# 6. Model ANN

In [None]:
# Seed the Python, NumPy and TensorFlow random generators so that weight
# initialisation, dropout and batch shuffling are repeatable between runs,
# in line with the fixed random_state used for the train/test split.
tf.keras.utils.set_random_seed(42)

# Convert the sparse TF-IDF matrices to dense arrays for the ANN
X_train_dense = X_train_tfidf.toarray()
X_test_dense = X_test_tfidf.toarray()

# Build the ANN: two hidden ReLU layers with dropout, and a single sigmoid
# output unit for the binary decision
ann_model = Sequential([
    Dense(128, activation='relu', input_shape=(X_train_tfidf.shape[1],)),
    Dropout(0.3),
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(1, activation='sigmoid')
])

# Compile the model
ann_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
# NOTE(review): the test split doubles as validation data here, so the
# per-epoch val_* metrics are test metrics; use a separate validation split
# if they are ever used for tuning or early stopping.
history = ann_model.fit(X_train_dense, y_train, epochs=10, batch_size=32,
                        validation_data=(X_test_dense, y_test), verbose=1)

# Evaluate the model on the test set
loss, accuracy_ann = ann_model.evaluate(X_test_dense, y_test)
print(f"Akurasi ANN: {accuracy_ann:.4f}")

# Threshold the sigmoid outputs at 0.5 and flatten the (n, 1) column vector
# so the predictions have the same 1-D shape as y_test
y_pred_ann = (ann_model.predict(X_test_dense) > 0.5).astype(int).ravel()
print("\nClassification Report ANN:")
print(classification_report(y_test, y_pred_ann, target_names=label_encoder.classes_))
print("\nConfusion Matrix ANN:")
print(confusion_matrix(y_test, y_pred_ann))

# 7. Contoh Prediksi

In [None]:
# Free-form text to classify
sample_text = "A viral video claims that the government is hiding the truth about a new disease outbreak in the capital."

# Apply the same cleaning and TF-IDF transformation used for the training data
cleaned_sample = clean_text(sample_text)
sample_tfidf = tfidf.transform([cleaned_sample])

# Classify with the Naive Bayes model and map the numeric class back to its
# original string label
prediction = nb_model.predict(sample_tfidf)
predicted_label = label_encoder.inverse_transform(prediction)[0]

print(f"Teks: {sample_text}", f"Prediksi: {predicted_label}", sep="\n")