In [1]:
# Mount Google Drive into the Colab runtime at /content/drive so that the
# dataset and model paths used by later cells resolve.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import pandas as pd
import numpy as np  # Jika diperlukan
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
import joblib

In [4]:
# Load the combined dataset from Google Drive into a DataFrame.
dataset_path = '/content/drive/MyDrive/dataset/Combined_Dataset.csv'  # Replace with your own path on Google Drive
data = pd.read_csv(dataset_path)

In [5]:
# Preview the first rows to sanity-check the columns that were loaded.
print(data.head())

# Discard every row that lacks either the article text or its label.
# NOTE(review): the preview also lists Indonesian-named columns (judul,
# narasi); rows that only carry their text there would be dropped by this
# filter -- confirm that this is intended.
required_columns = ['text', 'label']
data.dropna(subset=required_columns, inplace=True)

# Expose the raw text and the labels under the conventional X / y names.
X, y = data['text'], data['label']


                                               title  \
0   Donald Trump Sends Out Embarrassing New Year’...   
1   Drunk Bragging Trump Staffer Started Russian ...   
2   Sheriff David Clarke Becomes An Internet Joke...   
3   Trump Is So Obsessed He Even Has Obama’s Name...   
4   Pope Francis Just Called Out Donald Trump Dur...   

                                                text subject  \
0  Donald Trump just couldn t wish all Americans ...    News   
1  House Intelligence Committee Chairman Devin Nu...    News   
2  On Friday, it was revealed that former Milwauk...    News   
3  On Christmas day, Donald Trump announced that ...    News   
4  Pope Francis used his annual Christmas Day mes...    News   

                date  label  ID tanggal judul narasi nama file gambar  
0  December 31, 2017      1 NaN     NaN   NaN    NaN              NaN  
1  December 31, 2017      1 NaN     NaN   NaN    NaN              NaN  
2  December 30, 2017      1 NaN     NaN   NaN    NaN          

Cek distribusi label (1 dan 0)

In [6]:
# Count how many rows carry each label value, then show the tally.
label_counts = data['label'].value_counts()
print(label_counts)


label
1    23481
0    21417
Name: count, dtype: int64


Pembersihan teks (cleaning)

In [7]:
import re
from nltk.corpus import stopwords
import nltk

# Fetch the NLTK stopword corpus (only needs to happen once per environment).
nltk.download('stopwords')

# Build one stopword set per supported language: Indonesian, then English.
stop_words_id, stop_words_en = (
    set(stopwords.words(corpus_language))
    for corpus_language in ('indonesian', 'english')
)

# Text normalisation helper used before vectorisation.
def clean_text(text, language='en'):
    """Normalise a raw document into a space-separated string of content words.

    Steps: lowercase, drop digit runs, turn every punctuation character into a
    space, then remove stopwords of the requested language.

    Args:
        text: Raw document. Anything that is not a ``str`` (e.g. ``None`` or a
            float NaN from pandas) is treated as an empty document.
        language: ``'id'`` selects the Indonesian stopword set; any other value
            selects the English one.

    Returns:
        The cleaned text with tokens joined by single spaces ('' if nothing
        is left or the input was not a string).
    """
    if not isinstance(text, str):
        return ''

    text = text.lower()
    text = re.sub(r'\d+', '', text)
    # Replace punctuation with a space instead of deleting it. Deleting fuses
    # "don't" into "dont" and "well-known" into "wellknown"; the fused
    # contraction is no longer in the stopword list, whereas the split pieces
    # ("don", "t") are filtered as intended.
    text = re.sub(r'[^\w\s]', ' ', text)

    stop_words = stop_words_id if language == 'id' else stop_words_en
    # str.split() with no argument already collapses whitespace runs and
    # ignores leading/trailing blanks, so no extra normalisation is needed.
    return ' '.join(word for word in text.split() if word not in stop_words)

def detect_language(text):
    """Guess whether a document is Indonesian ('id') or English ('en').

    The guess is a simple vote: count how many alphabetic tokens fall in each
    language's stopword set and pick the language with more hits. Ties and
    non-string input fall back to 'en'.
    """
    if not isinstance(text, str):
        return 'en'
    tokens = re.findall(r'[a-z]+', text.lower())
    hits_id = sum(token in stop_words_id for token in tokens)
    hits_en = sum(token in stop_words_en for token in tokens)
    return 'id' if hits_id > hits_en else 'en'


# Clean every document with the stopword list of its detected language.
# Testing for the substring "indonesian" in the body is not a language signal
# (it only fires on articles that mention the word), so the language is
# inferred from stopword frequency instead.
data['cleaned_text'] = data['text'].apply(
    lambda text: clean_text(text, detect_language(text))
)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Membagi data menjadi train dan test

In [8]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the cleaned documents for testing; the seed fixes the split.
split_parts = train_test_split(
    data['cleaned_text'],
    data['label'],
    test_size=0.25,
    random_state=42,
)
X_train, X_test, y_train, y_test = split_parts

 TF-IDF Vectorizer

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Cap on the number of terms the TF-IDF vocabulary keeps.
MAX_TFIDF_FEATURES = 5000

# Learn the vocabulary on the training split only, then reuse the fitted
# vectorizer for the test split.
tfidf_vectorizer = TfidfVectorizer(max_features=MAX_TFIDF_FEATURES)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

Melatih Model

In [10]:
from sklearn.linear_model import PassiveAggressiveClassifier

# Passive Aggressive Classifier trained on the TF-IDF features.
# random_state is pinned (to the same seed as the train/test split) because
# the estimator shuffles the training data between epochs by default; without
# a seed, repeated runs give slightly different models and metrics.
model = PassiveAggressiveClassifier(max_iter=50, random_state=42)
model.fit(X_train_tfidf, y_train)


 Evaluasi Model

In [11]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

# Predict labels for the held-out TF-IDF vectors.
y_pred = model.predict(X_test_tfidf)

# Compute the headline metrics up front so the reporting section below only
# formats and prints.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Per-class report first, then the four summary numbers.
print("Classification Report:")
print(classification_report(y_test, y_pred))
print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")


Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      5330
           1       0.99      1.00      0.99      5895

    accuracy                           0.99     11225
   macro avg       0.99      0.99      0.99     11225
weighted avg       0.99      0.99      0.99     11225

Accuracy: 0.99
Precision: 0.99
Recall: 1.00
F1 Score: 0.99


Menyimpan model dan vectorizer

In [12]:
import joblib

from pathlib import Path

# Persist the trained model and the fitted vectorizer to Google Drive.
MODEL_DIR = '/content/drive/MyDrive/Model'

# Create the target folder if it is missing so the run does not fail at the
# very last step. Only the leaf folder is created (no parents): if Drive is
# not mounted this still raises instead of silently writing to local disk.
Path(MODEL_DIR).mkdir(exist_ok=True)

joblib.dump(model, f'{MODEL_DIR}/model.sav')
joblib.dump(tfidf_vectorizer, f'{MODEL_DIR}/vectorizer.sav')
print("Model dan vectorizer berhasil disimpan.")

Model dan vectorizer berhasil disimpan.
