**Filtering Dataset**

In [1]:
!pip install pandas langdetect

Collecting langdetect
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m15.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: langdetect
  Building wheel for langdetect (setup.py) ... [?25l[?25hdone
  Created wheel for langdetect: filename=langdetect-1.0.9-py3-none-any.whl size=993223 sha256=5f53dd46efbfb28b82266fca695a0ff22c0e54a907df20dd267cb4e3c83d9ab9
  Stored in directory: /root/.cache/pip/wheels/c1/67/88/e844b5b022812e15a52e4eaa38a1e709e99f06f6639d7e3ba7
Successfully built langdetect
Installing collected packages: langdetect
Successfully installed langdetect-1.0.9


In [2]:
# Mount Google Drive so the files under /content/drive can be read and written
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
import pandas as pd

# Read the raw tweet export and show which columns it provides
df = pd.read_csv(
    '/content/drive/MyDrive/DataMining/Dataset_Kinerja Pemerintah.csv'
)
print(list(df.columns))

['created_at', 'full_text', 'tweet_url', 'user_id_str', 'username']


In [None]:
import pandas as pd
from langdetect import detect, DetectorFactory
from datetime import datetime
import os

# Fix the seed so language detection gives consistent results between runs
DetectorFactory.seed = 0

# --- Configuration ---
input_file = '/content/drive/MyDrive/DataMining/Dataset_Kinerja Pemerintah.csv'
output_file = '/content/drive/MyDrive/DataMining/Dataset_Kinerja Pemerintah_filtered.csv'
# Single leading slash: the previous '//content/...' spelling relied on the OS
# treating a doubled root slash like a single one.
log_file = '/content/drive/MyDrive/DataMining/Datasetfiltering_log_batch1.txt'

# --- Load data ---
df = pd.read_csv(input_file)

# The tweet text is expected in a column named 'full_text'; stop early otherwise
has_text_column = 'full_text' in df.columns
if not has_text_column:
    raise ValueError("Pastikan kolom teks bernama 'full_text' ada di CSV")

# Row count before any filtering, reported in the log
total_awal = df.shape[0]

# --- 1. Drop rows whose tweet text is an exact duplicate ---
df = df.drop_duplicates(subset=['full_text'])
setelah_duplikat = df.shape[0]

# --- 2. Drop tweets with fewer than three whitespace-separated words ---
df = df[df['full_text'].astype(str).str.split().str.len() >= 3]
setelah_pendek = df.shape[0]

# --- 3. Hapus tweet non-Bahasa Indonesia ---
def is_indonesian(text):
    """Return True when ``detect`` classifies ``text`` as Indonesian ('id').

    Non-string or blank input is reported as not Indonesian without calling
    the detector. Any ordinary error raised by the detector is also treated
    as "not Indonesian" so one odd tweet cannot abort the whole filter.
    """
    if not isinstance(text, str) or not text.strip():
        return False
    try:
        return detect(text) == 'id'
    except Exception:
        # Deliberately best-effort, but unlike a bare ``except:`` this lets
        # KeyboardInterrupt / SystemExit through so a long run can be stopped.
        return False

# Keep only the rows whose text is detected as Indonesian
df = df[df['full_text'].map(is_indonesian)]
setelah_bahasa = df.shape[0]

# --- Write the filtered dataset ---
df.to_csv(output_file, index=False)

# --- Write the filtering log: one line per stage with the remaining row count ---
log_lines = [
    f"Filtering Data Batch 1 - {datetime.now()}\n",
    f"Total awal: {total_awal}\n",
    f"Setelah hapus duplikat: {setelah_duplikat}\n",
    f"Setelah hapus tweet pendek: {setelah_pendek}\n",
    f"Setelah hapus non-Bahasa Indonesia: {setelah_bahasa}\n",
    f"Total akhir: {len(df)}\n",
    f"File hasil: {output_file}\n",
]
with open(log_file, 'w', encoding='utf-8') as f:
    f.writelines(log_lines)

# Short summary in the cell output
print("Filtering selesai.")
print(f"Data awal: {total_awal}, Data akhir: {len(df)}")


In [None]:
import pandas as pd

# Load the fully filtered dataset
df = pd.read_csv('/content/drive/MyDrive/DataMining/Dataset_Kinerja Pemerintah_filtered.csv')

# Keep only the columns needed for labeling and append an empty 'label' column
df_label = df.loc[:, ['full_text', 'username', 'created_at']].assign(label='')

# Write the ready-to-label file
df_label.to_csv('/content/drive/MyDrive/DataMining/Dataset_labeling.csv', index=False)

print("File siap-labeling (dengan created_at) sudah dibuat.")


Final_Label

In [None]:
import pandas as pd
from collections import Counter

# Change this path to wherever the labelled file is stored on your Drive
path = '/content/drive/MyDrive/DataMining/Dataset_labeling - Dataset_labeling.csv.csv'

# Load the labelled file; majority_vote reads its label1/label2/label3 columns
df = pd.read_csv(path)

def majority_vote(row):
    """Return the most frequent label among ``label1``, ``label2``, ``label3``.

    String labels are stripped of surrounding whitespace before counting so
    that ``'Positif'`` and ``'Positif '`` count as the same vote (letter case
    is left untouched). Missing votes (None, NaN, blank strings) are ignored.
    On a tie the label that appears first in column order wins. If every vote
    is missing, NaN is returned.

    Parameters
    ----------
    row : mapping
        Anything indexable by the three column names, e.g. a DataFrame row.
    """
    votes = []
    for column in ('label1', 'label2', 'label3'):
        value = row[column]
        if isinstance(value, str):
            value = value.strip()
            if not value:
                continue
        elif value is None or value != value:  # NaN is the only value != itself
            continue
        votes.append(value)

    if not votes:
        return float('nan')
    # Counter keeps first-seen order among equal counts, so ties resolve to
    # the earliest column.
    return Counter(votes).most_common(1)[0][0]

# Compute the consensus label for every row
df = df.assign(final_label=df.apply(majority_vote, axis=1))

# Save the result to Drive and preview the first rows
df.to_csv('/content/drive/MyDrive/DataMining/Dataset_Final_labeling.csv', index=False)
print(df.head())


In [None]:
import pandas as pd

# Load the file with the consensus labels (adjust the path if needed)
df = pd.read_csv("/content/drive/MyDrive/DataMining/Dataset_Final_labeling.csv")

# Normalise the label column: cast to str, trim whitespace, lowercase
df['final_label'] = (
    df['final_label']
    .astype(str)
    .str.strip()
    .str.lower()
)

# Absolute count per label
label_counts = df['final_label'].value_counts()
print("Jumlah masing-masing label:", label_counts, sep="\n")

# (optional) share of each label in percent, rounded to two decimals
print(
    "\nPersentase masing-masing label:",
    label_counts.div(label_counts.sum()).mul(100).round(2),
    sep="\n",
)


EDA

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import seaborn as sns

In [None]:
# Load the labelled dataset for exploration and preview the first rows
df = pd.read_csv("/content/drive/MyDrive/DataMining/Dataset_Final_labeling.csv")
df.head()

In [None]:
# df.info() prints by itself. The other two are plain expressions, and a cell
# only renders its last expression, so display each one explicitly; otherwise
# the describe() table is computed and silently discarded.
df.info()
display(df.describe())
display(df['final_label'].value_counts())

In [None]:
# Bar chart with the number of rows per final label
fig, ax = plt.subplots(figsize=(6,4))
sns.countplot(x=df['final_label'], ax=ax)
ax.set_title('Distribusi Kategori (Positif, Negatif)')
ax.set_xlabel('final_label')
ax.set_ylabel('Jumlah Data')
fig.tight_layout()
plt.show()

In [None]:
# Word Cloud Keseluruhan
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Join every tweet of the final dataset into one string
all_text = " ".join(df['full_text'].astype(str))

# Configure the word cloud (white background), then build it from the text
wc = WordCloud(width=1000, height=600, background_color='white', max_words=300)
wc.generate(all_text)

# Render it without axes
fig, ax = plt.subplots(figsize=(12,7))
ax.imshow(wc, interpolation='bilinear')
ax.set_title("Word Cloud Keseluruhan Dataset (Semua Label)", fontsize=16)
ax.axis('off')
plt.show()

In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Collect the text of each sentiment class. astype(str) keeps join() from
# raising a TypeError if a row has a missing (NaN) text value.
text_Positif = " ".join(df.loc[df['final_label'] == "Positif", 'full_text'].astype(str))
text_Negatif = " ".join(df.loc[df['final_label'] == "Negatif", 'full_text'].astype(str))

# Shared word cloud settings (white background)
wc_settings = {
    "width": 800,
    "height": 400,
    "background_color": "white",
}

wc_Positif = WordCloud(**wc_settings).generate(text_Positif)
wc_Negatif = WordCloud(**wc_settings).generate(text_Negatif)

# Two panels side by side. The grid is 1x2 because only two clouds are drawn;
# a 1x3 grid would leave the right third of the figure empty.
plt.figure(figsize=(20,6))

plt.subplot(1,2,1)
plt.imshow(wc_Positif, interpolation='bilinear')
plt.title("Word Cloud - Positif")
plt.axis('off')

plt.subplot(1,2,2)
plt.imshow(wc_Negatif, interpolation='bilinear')
plt.title("Word Cloud - Negatif")
plt.axis('off')

plt.tight_layout()
plt.show()


In [None]:
# Agregasi Tweet per Bulan

import pandas as pd
import matplotlib.pyplot as plt

# Parse the timestamp column (named 'created_at'); unparseable values become NaT
df['created_at'] = pd.to_datetime(df['created_at'], errors='coerce')

# Drop rows without a valid date. The explicit copy makes the result an
# independent frame, so adding a column below cannot trigger pandas'
# SettingWithCopyWarning.
df = df.dropna(subset=['created_at']).copy()

# Month bucket used for the aggregation
df['year_month'] = df['created_at'].dt.to_period('M')

# Number of tweets per month
monthly_count = df.groupby('year_month').size().reset_index(name='count')

# Convert the period back to a timestamp so it can be plotted on a date axis
monthly_count['year_month'] = monthly_count['year_month'].dt.to_timestamp()

# Line chart of the monthly tweet volume
fig, ax = plt.subplots(figsize=(12,5))
ax.plot(monthly_count['year_month'], monthly_count['count'], linewidth=3)
ax.set_title("Tren Jumlah Tweet tentang kinerja pemerintah")
ax.set_xlabel("Bulan")
ax.set_ylabel("Jumlah Tweet")
ax.grid(True)
plt.xticks(rotation=45)
fig.tight_layout()
plt.show()

Preprocessing

In [None]:
!pip install Sastrawi wordcloud

import pandas as pd
import re
import string
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from collections import Counter

In [None]:
import pandas as pd

# Replace with the path copied from Google Drive
file_path = '/content/drive/MyDrive/DataMining/Dataset_Final_labeling.csv'

# Load the labelled dataset and preview the first rows
df = pd.read_csv(file_path)
df.head()


In [None]:
# Use the first column whose name contains one of these hints as the text column
text_hints = ("text", "tweet", "komentar", "full")
text_col = next(
    (col for col in df.columns if any(hint in col.lower() for hint in text_hints)),
    None,
)

# No name matched: fall back to the first object-dtype column
if text_col is None:
    text_col = df.select_dtypes(include=["object"]).columns[0]

print("Kolom teks terpakai:", text_col)

# Non-missing texts as strings; rows with a missing text are left out here
raw_text = df[text_col].dropna().astype(str)

In [None]:
!pip install Sastrawi

In [None]:
import re
import string
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory

# Sastrawi's stop-word list, held as a set; remove_stopwords tests membership against it
stopwords = set(StopWordRemoverFactory().get_stop_words())

def cleaning(text):
    """Normalise one raw tweet for bag-of-words style features.

    Steps, in order: lowercase, remove URLs, remove @mentions, remove
    #hashtags (the whole tag), replace punctuation with spaces, remove digits,
    collapse repeated whitespace and trim.

    Parameters
    ----------
    text : str
        Raw tweet text. ``None`` and NaN are treated as an empty tweet.

    Returns
    -------
    str
        The cleaned text; may be an empty string.
    """
    if text is None or text != text:  # NaN is the only value != itself
        return ""
    text = str(text).lower()
    # 'http\S+' already covers https links, so no separate 'https' branch is needed
    text = re.sub(r"http\S+|www\S+", "", text)
    text = re.sub(r"@\w+", "", text)  # mentions
    text = re.sub(r"#\w+", "", text)  # hashtags
    # Punctuation becomes a space (not '') so "kata,kata" does not fuse into one token
    text = re.sub(f"[{re.escape(string.punctuation)}]", " ", text)
    text = re.sub(r"\d+", "", text)  # digits
    return re.sub(r"\s+", " ", text).strip()

# raw_text leaves out rows with a missing text, so after index alignment those
# rows would hold NaN in clean_text and break the later text.split() call.
# Give them an empty string instead.
df["clean_text"] = raw_text.apply(cleaning)
df["clean_text"] = df["clean_text"].fillna("")

# Hapus stopwords
def remove_stopwords(text):
    """Return ``text`` with every whitespace-separated token found in ``stopwords`` removed.

    The remaining tokens are joined back with single spaces.
    """
    kept = []
    for token in text.split():
        if token in stopwords:
            continue
        kept.append(token)
    return " ".join(kept)

# Strip stop words from the cleaned text; the result overwrites clean_text
df["clean_text"] = df["clean_text"].apply(remove_stopwords)


In [4]:
#WorldCloud Setelah Preproses

from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Concatenate every cleaned tweet into one corpus string
text_cleaned = " ".join(df["clean_text"])

# Build the word cloud on a white canvas
wc = WordCloud(width=2000, height=1000, background_color="white")
wc.generate(text_cleaned)

fig, ax = plt.subplots(figsize=(14,7))
ax.imshow(wc, interpolation='bilinear')
ax.set_title("WordCloud Setelah Preprocessing", fontsize=16)
ax.axis("off")
plt.show()

# Also save a PNG copy in the working directory
wc.to_file("wordcloud_after_preprocessing.png")
print("WordCloud after save:", "wordcloud_after_preprocessing.png")

KeyError: 'clean_text'

In [None]:
from collections import Counter
import matplotlib.pyplot as plt
# Token frequencies over the whole cleaned corpus
tokens = text_cleaned.split()
counter = Counter(tokens)
top20 = counter.most_common(20)

# Split the (word, count) pairs into two parallel tuples
words, counts = zip(*top20)

# Horizontal bars, reversed so the most frequent word ends up at the top
fig, ax = plt.subplots(figsize=(10,7))
ax.barh(tuple(reversed(words)), tuple(reversed(counts)))
ax.set_title("Top 20 Kata Setelah Preprocessing")
ax.set_xlabel("Frekuensi Kata")
plt.show()


In [None]:
# Write the preprocessed dataset to the Drive folder the modelling section
# reads it from (it used to be written to the working directory only), and
# report the file name that is actually written.
preprocessed_path = "/content/drive/MyDrive/DataMining/Dataset_after_preproses_fix.csv"
df.to_csv(preprocessed_path, index=False, encoding="utf-8")
print(f"Dataset cleaned berhasil disimpan → {preprocessed_path}")

In [None]:
from sklearn.model_selection import train_test_split

# Features are the cleaned text; the target is the consensus label (the column must exist)
X, y = df["clean_text"], df["final_label"]

# 80/20 split with a fixed seed, stratified on the label
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Bag-of-words counts: fitted on the training split, then applied to the test split
bow = CountVectorizer()
X_train_bow, X_test_bow = bow.fit_transform(X_train), bow.transform(X_test)

# TF-IDF weights: same fit-on-train / transform-on-test pattern
tfidf = TfidfVectorizer()
X_train_tfidf, X_test_tfidf = tfidf.fit_transform(X_train), tfidf.transform(X_test)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

# One estimator per scenario. Estimators that use randomness get the same
# fixed seed as the train/test split so their scores are reproducible.
models = {
    "Skenario 1 - Logistic Regression (TF-IDF)" : LogisticRegression(max_iter=200),
    "Skenario 2 - SVM Linear (TF-IDF)" : LinearSVC(random_state=42),
    "Skenario 3 - Multinomial NB (TF-IDF)" : MultinomialNB(),
    "Skenario 4 - Random Forest (BoW)" : RandomForestClassifier(random_state=42),
    "Skenario 5 - Decision Tree (BoW)" : DecisionTreeClassifier(random_state=42)
}

Skenario

In [None]:
!pip install scikit-learn pandas numpy matplotlib seaborn

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt


In [None]:
# ganti path ini dengan path Google Drive Anda
df = pd.read_csv("/content/drive/MyDrive/DataMining/Dataset_after_preproses_fix.csv")

df.head()

In [None]:
# A text that ended up empty after cleaning is read back from the CSV as NaN,
# which TfidfVectorizer rejects; treat such rows as empty documents.
X = df['clean_text'].fillna('').astype(str)
y = df['final_label']

# Learn the vocabulary and IDF weights from the training rows only, so nothing
# from the test rows leaks into the features. This split uses the same
# test_size / random_state / stratify as the train_test_split applied to
# X_tfidf, so it selects the same training rows.
X_train_text = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)[0]

tfidf = TfidfVectorizer(max_features=5000)
tfidf.fit(X_train_text)
X_tfidf = tfidf.transform(X)


In [None]:
# 80/20 split of the TF-IDF matrix, stratified on the label, with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X_tfidf, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
from sklearn.linear_model import LogisticRegression

# Class-weighted logistic regression, fitted on the training split
model_lr = LogisticRegression(class_weight='balanced', max_iter=2000).fit(X_train, y_train)

# Predict the held-out rows and report per-class metrics
pred_lr = model_lr.predict(X_test)
print("=== Logistic Regression ===", classification_report(y_test, pred_lr), sep="\n")

In [None]:
from sklearn.svm import LinearSVC

# Class-weighted linear SVM. LinearSVC's solver uses a random number
# generator, so fix the seed (same value as the split) for reproducible scores.
model_svm = LinearSVC(class_weight='balanced', random_state=42)
model_svm.fit(X_train, y_train)

# Predict the held-out rows and report per-class metrics
pred_svm = model_svm.predict(X_test)

print("=== SVM ===")
print(classification_report(y_test, pred_svm))


In [None]:
from sklearn.ensemble import RandomForestClassifier

# Class-weighted random forest with 300 trees. The forest is built with
# randomness, so fix the seed (same value as the split) for reproducible scores.
model_rf = RandomForestClassifier(class_weight='balanced', n_estimators=300, random_state=42)
model_rf.fit(X_train, y_train)

# Predict the held-out rows and report per-class metrics
pred_rf = model_rf.predict(X_test)

print("=== Random Forest ===")
print(classification_report(y_test, pred_rf))


In [None]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial naive Bayes with default settings, fitted on the training split
model_nb = MultinomialNB().fit(X_train, y_train)

# Predict the held-out rows and report per-class metrics
pred_nb = model_nb.predict(X_test)
print("=== Naive Bayes ===", classification_report(y_test, pred_nb), sep="\n")


In [None]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours classifier with k = 7, fitted on the training split
model_knn = KNeighborsClassifier(n_neighbors=7).fit(X_train, y_train)

# Predict the held-out rows and report per-class metrics
pred_knn = model_knn.predict(X_test)
print("=== KNN ===", classification_report(y_test, pred_knn), sep="\n")


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
import pandas as pd

In [None]:
# A text that ended up empty after cleaning is read back from the CSV as NaN,
# which the vectorizer rejects; treat such rows as empty documents.
X = df["clean_text"].fillna("").astype(str)
y = df["final_label"]


In [None]:
# Missing texts (NaN) are treated as empty documents so the vectorizer accepts them
X_text = X.fillna("").astype(str)

# Split the raw text first, then fit TF-IDF on the training rows only. Fitting
# on the full dataset would let the test rows influence the vocabulary and the
# IDF weights (data leakage) and make the reported scores optimistic.
X_train_text, X_test_text, y_train, y_test = train_test_split(
    X_text, y, test_size=0.2, random_state=42, stratify=y
)

tfidf = TfidfVectorizer(max_features=5000)
X_train = tfidf.fit_transform(X_train_text)
X_test = tfidf.transform(X_test_text)

# Full matrix kept under its previous name for any code that still refers to it
X_tfidf = tfidf.transform(X_text)

In [None]:
# Candidate models for the comparison. Estimators that use randomness get the
# same fixed seed as the train/test split so the ranking is reproducible.
models = {
    "Logistic Regression": LogisticRegression(max_iter=2000),
    "SVM (LinearSVC)": LinearSVC(random_state=42),
    "Random Forest": RandomForestClassifier(n_estimators=300, random_state=42),
    "Naive Bayes": MultinomialNB(),
    "KNN": KNeighborsClassifier(n_neighbors=5)
}

In [None]:
# One [name, accuracy, precision, recall, f1] row per model
results = []

for name, model in models.items():
    print(f"\n=== Training {name} ===")
    pred = model.fit(X_train, y_train).predict(X_test)

    # Precision/recall/F1 are weighted averages over the classes; a class with
    # an undefined ratio contributes 0.
    weighted = dict(average="weighted", zero_division=0)
    results.append([
        name,
        accuracy_score(y_test, pred),
        precision_score(y_test, pred, **weighted),
        recall_score(y_test, pred, **weighted),
        f1_score(y_test, pred, **weighted),
    ])

    print(classification_report(y_test, pred))

In [None]:
# Tabulate the per-model metrics and rank them by F1, best first
results_df = pd.DataFrame(
    data=results,
    columns=["Model", "Accuracy", "Precision", "Recall", "F1"],
)
results_df_sorted = results_df.sort_values("F1", ascending=False)

print(
    "\n\n=== HASIL URUT PERFORMA TERBAIK (BERDASARKAN F1) ===",
    results_df_sorted,
    sep="\n",
)

In [None]:
# The first row of the ranked table is the best model
best_model_name, best_f1 = results_df_sorted.iloc[0][["Model", "F1"]]

print(
    "\n\n=== MODEL TERBAIK ===",
    f"Model terbaik: {best_model_name} (F1-score: {best_f1:.4f})",
    sep="\n",
)

In [5]:
# Set the global git author name and e-mail
!git config --global user.name "aryaduta662"
!git config --global user.email "aryaduta662@gmail.com"

In [30]:
!git clone https://ghp_816438@github.com/aryaduta662/final_project_streamlit.git

Cloning into 'final_project_streamlit'...
remote: Enumerating objects: 5, done.[K
remote: Counting objects: 100% (5/5), done.[K
remote: Compressing objects: 100% (4/4), done.[K
remote: Total 5 (delta 1), reused 5 (delta 1), pack-reused 0 (from 0)[K
Receiving objects: 100% (5/5), 27.66 KiB | 1.20 MiB/s, done.
Resolving deltas: 100% (1/1), done.


In [31]:
!git remote set-url origin https://ghp_816438@github.com/aryaduta662/final_project_streamlit.git


In [39]:
!git remote -v

origin	https://<REDACTED>@github.com/aryaduta662/final_project_streamlit.git (fetch)
origin	https://<REDACTED>@github.com/aryaduta662/final_project_streamlit.git (push)


In [32]:
%cd final_project_streamlit

/content/final_project_streamlit/final_project_streamlit/final_project_streamlit/final_project_streamlit


In [33]:
# List the contents of /content
!ls /content

drive  final_project_streamlit	sample_data


In [34]:
# Copy the notebook from Drive into the current directory
!cp "/content/drive/MyDrive/Colab Notebooks/FPDatmin.ipynb" .

In [35]:
# List the contents of the current directory
ls

app.py  Dataset_after_preproses_fix.csv  datasetlabel.csv  FPDatmin.ipynb


In [36]:
# Search Drive for the notebook by file name
!find /content/drive/MyDrive -name "FPDatmin.ipynb"

/content/drive/MyDrive/Colab Notebooks/FPDatmin.ipynb


In [37]:
# Push the main branch to the origin remote
!git push origin main


fatal: could not read Password for 'https://<REDACTED>@github.com': No such device or address


In [38]:
# Show the working-tree state, stage the notebook, commit it, then push main to origin
!git status
!git add FPDatmin.ipynb
!git commit -m "add notebook FPDatmin"
!git push origin main

On branch main
Your branch is up to date with 'origin/main'.

Untracked files:
  (use "git add <file>..." to include in what will be committed)
	[31mFPDatmin.ipynb[m

nothing added to commit but untracked files present (use "git add" to track)
[main 2cfda60] add notebook FPDatmin
 1 file changed, 1 insertion(+)
 create mode 100644 FPDatmin.ipynb
fatal: could not read Password for 'https://<REDACTED>@github.com': No such device or address
