In [15]:
# === Step X: Merge Label Berdasarkan Kesamaan Teks (text_clean vs TEXT) ===
import pandas as pd
from rapidfuzz import process, fuzz


In [18]:

# === File configuration ===
# All inputs and outputs live in a single directory; edit DATA_DIR to relocate them.
DATA_DIR = '/Users/ahmadzaki/Downloads/Skripsi Zaki/CodeSkripsi/01.Cleaning/data'
PRE_FILE = f'{DATA_DIR}/kai_preprocessed_1434row_stepwise.csv'
LABEL_FILE = f'{DATA_DIR}/kai_full_label_FIB.xlsx'
OUTPUT_FILE = f'{DATA_DIR}/FINALBGT_kai_matched_by_textlabel.csv'

# Columns (after lower-casing the headers) that the rest of the notebook reads.
PRE_COLUMNS = ['id_str', 'created_at', 'user_id_str', 'conversation_id_str', 'full_text', 'text_clean']
LABEL_COLUMNS = ['text', 'label']


def _normalize_text(series):
    """Return `series` with whitespace runs collapsed, ends trimmed and text lower-cased.

    Missing values become the empty string. Without the `fillna`, `astype(str)`
    would turn them into the literal text "nan", which can then be fuzzy-matched
    against other missing rows as if it were real content.
    """
    return (
        series.fillna('')
        .astype(str)
        .str.replace(r'\s+', ' ', regex=True)
        .str.strip()
        .str.lower()
    )


def _select_columns(frame, columns, source):
    """Return a copy of `frame[columns]`, raising a descriptive KeyError if any are absent."""
    missing = [c for c in columns if c not in frame.columns]
    if missing:
        raise KeyError(f"{source} is missing required column(s): {missing}")
    return frame[columns].copy()


# === Step 1: Load data ===
# Everything is read as text so long numeric IDs (id_str, user_id_str) keep every digit.
pre_df = pd.read_csv(PRE_FILE, dtype=str)
label_df = pd.read_excel(LABEL_FILE, dtype=str)

# Normalise column names; str() guards against non-string headers coming from Excel.
label_df.columns = [str(c).strip().lower() for c in label_df.columns]
pre_df.columns = [str(c).strip().lower() for c in pre_df.columns]

# Keep only the columns needed downstream.
pre_sub = _select_columns(pre_df, PRE_COLUMNS, PRE_FILE)
label_sub = _select_columns(label_df, LABEL_COLUMNS, LABEL_FILE)

# Clean the text used for matching (collapse repeated whitespace, lowercase).
pre_sub['text_clean_norm'] = _normalize_text(pre_sub['text_clean'])
label_sub['text_norm'] = _normalize_text(label_sub['text'])

print(f"Jumlah data preprocessed: {len(pre_sub)}")
print(f"Jumlah data label: {len(label_sub)}")

Jumlah data preprocessed: 1434
Jumlah data label: 1434


In [19]:
# === Step 2: Match labels by text similarity ===
# Minimum token_sort_ratio (0-100) for a label row to count as the same tweet.
MIN_MATCH_SCORE = 90

# Build plain Python lists of the label rows that actually have text.
# Passing a pandas Series to rapidfuzz makes it behave like a mapping, so the
# returned key is the index *label* rather than a position; lists make the
# returned index unambiguously positional and independent of label_sub's index.
label_has_text = label_sub['text'].notna() & (label_sub['text_norm'] != '')
choices = label_sub.loc[label_has_text, 'text_norm'].tolist()
choice_labels = label_sub.loc[label_has_text, 'label'].tolist()

# Rows whose source text is missing must never be matched (a stringified NaN
# would otherwise score 100 against another stringified NaN).
pre_text_missing = pre_sub['text_clean'].isna()

matches = []
for i, (text, is_missing) in enumerate(zip(pre_sub['text_clean_norm'], pre_text_missing)):
    if i % 200 == 0:
        print(f"Proses baris ke-{i}/{len(pre_sub)} ...")

    label_value = None
    if not is_missing and text:
        # score_cutoff makes extractOne return None when nothing reaches the
        # threshold and lets rapidfuzz abandon weak candidates early.
        match = process.extractOne(
            text,
            choices,
            scorer=fuzz.token_sort_ratio,
            score_cutoff=MIN_MATCH_SCORE,
        )
        if match is not None:
            _matched_text, _score, idx = match
            label_value = choice_labels[idx]
    matches.append(label_value)

# Attach the matched labels to the preprocessed dataset.
pre_sub['label'] = matches

# === Step 3: Save output ===
matched_count = pre_sub['label'].notna().sum()
total_count = len(pre_sub)
# Avoid ZeroDivisionError when the input file is empty.
matched_pct = matched_count / total_count * 100 if total_count else 0.0
print(f"\nTweet berhasil dicocokkan label: {matched_count} / {total_count} ({matched_pct:.2f}%)")

output_cols = ['id_str', 'created_at', 'user_id_str', 'conversation_id_str', 'full_text', 'label']
# utf-8-sig writes a BOM at the start of the CSV.
pre_sub[output_cols].to_csv(OUTPUT_FILE, index=False, encoding='utf-8-sig')

print(f"\n✅ File output disimpan ke: {OUTPUT_FILE}")
pre_sub[output_cols].head(10)

Proses baris ke-0/1434 ...
Proses baris ke-200/1434 ...
Proses baris ke-400/1434 ...
Proses baris ke-600/1434 ...
Proses baris ke-800/1434 ...
Proses baris ke-1000/1434 ...
Proses baris ke-1200/1434 ...
Proses baris ke-1400/1434 ...

Tweet berhasil dicocokkan label: 1434 / 1434 (100.00%)

✅ File output disimpan ke: /Users/ahmadzaki/Downloads/Skripsi Zaki/CodeSkripsi/01.Cleaning/data/FINALBGT_kai_matched_by_textlabel.csv


Unnamed: 0,id_str,created_at,user_id_str,conversation_id_str,full_text,label
0,1608764220451213313,Fri Dec 30 09:57:34 +0000 2022,230609460,1608764220451213313,udah beli tiket di aplikasi kaiaccess dan saat...,NEGATIF
1,1608631716184489986,Fri Dec 30 01:11:02 +0000 2022,1158718283870138368,1608631716184489986,@KAI121 kenapa gabisa pake ovo buat pembayaran...,NEGATIF
2,1608263340564058116,Thu Dec 29 00:47:15 +0000 2022,1250731212471074816,1608263340564058116,@KAI121 min saya mau ubah no. Hp yang ada di k...,NETRAL
3,1608235238366535680,Wed Dec 28 22:55:35 +0000 2022,120352418,1608227080503955458,Waktunya menikmati pemandangan sepanjang perja...,POSITIF
4,1608049638111850496,Wed Dec 28 10:38:04 +0000 2022,967360502652223489,1608049638111850496,@KAI121 kenapa KAIAccess dari kemaren susah di...,NEGATIF
5,1607954768332492802,Wed Dec 28 04:21:05 +0000 2022,1010877687932440577,1607954768332492802,@KAI121 halo saya abis beli tiket kereta di ap...,NEGATIF
6,1607943108934914049,Wed Dec 28 03:34:45 +0000 2022,71733078,1607943108934914049,@KAI121 min aplikasi kaiaccess lagi ngadat kah...,NEGATIF
7,1607879185586061312,Tue Dec 27 23:20:45 +0000 2022,438072886,1607879185586061312,ada yang tau jam buka loket stasiun sidoarjo b...,NEGATIF
8,1607755461062950912,Tue Dec 27 15:09:07 +0000 2022,414602951,1607755461062950912,Halo @KAI121 apps nya lagi under constructions...,NEGATIF
9,1607603636284325890,Tue Dec 27 05:05:49 +0000 2022,359810843,1607603636284325890,@KAI121 Hallo @KAI121 kenapaa app KAIaccess ti...,NEGATIF
