## PRO PREPROCESSING

In [4]:
import pandas as pd
import re
import string
import nltk
import unicodedata
import emoji
from collections import Counter
from tqdm import tqdm
from nltk.corpus import stopwords, wordnet, sentiwordnet as swn
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory

# Fetch the NLTK resources used further down: the stopword lists, the
# SentiWordNet scores, WordNet itself and the Open Multilingual WordNet data
# (presumably what enables the lang='ind' lookups — confirm).
nltk.download('stopwords')
nltk.download('sentiwordnet')
nltk.download('wordnet')
nltk.download('omw-1.4')

# Load Data
FILE_PATH = "DatasetShopee.csv"
df = pd.read_csv(FILE_PATH)

# Select the row range to process: positional rows START_ROW up to (but not
# including) END_ROW, then drop the rows whose 'comment' value is missing.
START_ROW = 2
END_ROW = 502
df = df.iloc[START_ROW:END_ROW].dropna(subset=['comment'])

# Repeated-letter normalisation backed by a small dictionary lookup.
# Known elongated spellings mapped to their standard form; consulted before
# the generic collapsing rule.
kamus_kata_baku = {"mantaaaap": "mantap", "gacoooor": "gacor", "bagusss": "bagus"}

# Words that normalize_repeated_chars returns untouched.
EXCEPTION_WORDS = {"saya", "kakak", "kuku", "lulus", "aplikasi", "mantap"}

# A character repeated three or more times in a row, ignoring case. Doubled
# letters are left alone because they are legitimate spelling in many words
# ("maaf", "saat", "tunggu").
_ELONGATION_RE = re.compile(r'(.)\1{2,}', re.IGNORECASE)


def normalize_repeated_chars(text):
    """Collapse elongated spellings in *text*, word by word.

    For each whitespace-separated word (compared case-insensitively):

    * a word in ``EXCEPTION_WORDS`` is kept as is;
    * a word in ``kamus_kata_baku`` is replaced by its standard form;
    * otherwise every run of three or more identical characters is reduced
      to a single character.

    Args:
        text: The string to normalise.

    Returns:
        The processed words joined by single spaces. The case of words that
        are not replaced from the dictionary is preserved.
    """
    normalized_words = []
    for word in text.split():
        key = word.lower()
        if key in EXCEPTION_WORDS:
            normalized_words.append(word)
        elif key in kamus_kata_baku:
            normalized_words.append(kamus_kata_baku[key])
        else:
            normalized_words.append(_ELONGATION_RE.sub(r'\1', word))
    return " ".join(normalized_words)

# Translation table that turns every ASCII punctuation mark into a space, so
# that words separated only by punctuation ("bagus,cepat") are not glued
# together when the punctuation is removed.
_PUNCT_TO_SPACE = str.maketrans(string.punctuation, " " * len(string.punctuation))


# Text-cleaning function
def preprocess_text(text):
    """Clean one raw comment.

    Steps, in order: NFKD Unicode normalisation, emoji removal, removal of
    @mentions / #hashtags / http(s) URLs, punctuation removal, digit removal,
    whitespace collapsing, repeated-letter normalisation and lower-casing.
    Emoji and punctuation are replaced by a space (not deleted) so the words
    around them stay separate tokens.

    Args:
        text: The raw comment. Missing values (as judged by ``pd.isna``)
            yield an empty string; any other non-string value is converted
            with ``str`` first.

    Returns:
        The cleaned, lower-cased comment.
    """
    if pd.isna(text):
        return ""
    # str() guards against non-string cells (e.g. a purely numeric comment)
    text = unicodedata.normalize("NFKD", str(text))
    text = emoji.replace_emoji(text, replace=" ")
    text = re.sub(r'[@#][A-Za-z0-9_]+|https?:\/\/\S+', ' ', text)
    text = text.translate(_PUNCT_TO_SPACE)
    text = re.sub(r'\d+', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    text = normalize_repeated_chars(text)
    return text.lower()

# Apply the preprocessing (tqdm.pandas() registers progress_apply on pandas)
tqdm.pandas()
df['comment_clean'] = df['comment'].progress_apply(preprocess_text)

# Stemming with Sastrawi
stemmer = StemmerFactory().create_stemmer()
df['comment_clean'] = df['comment_clean'].progress_apply(lambda x: stemmer.stem(x))

# Stopword removal, with negation words exempted
nltk_stopwords = set(stopwords.words('indonesian'))
sastrawi_stopwords = set(StopWordRemoverFactory().get_stop_words())
additional_stopwords = {"sebuah", "pada", "pun", "bahkan", "oleh", "hanya", "tentang", "ke", "dari", "yang", "ini", "itu", "dengan", "seperti"}

# Negation words that must not be removed (classify_sentiment relies on them)
negative_words = {"tidak", "bukan", "belum", "jangan", "tanpa"}

# Merge all stopword sources, then take the negation words back out
all_stopwords = (nltk_stopwords | sastrawi_stopwords | additional_stopwords) - negative_words

def remove_stopwords(text):
    """Return *text* without the tokens listed in ``all_stopwords``.

    The text is split on whitespace; surviving tokens are re-joined with a
    single space.
    """
    kept = []
    for token in text.split():
        if token in all_stopwords:
            continue
        kept.append(token)
    return " ".join(kept)

# Strip stopwords; comments that are already blank at this point become the
# placeholder "tidak ada komentar" instead.
# NOTE(review): a comment made up entirely of stopwords ends up as "" rather
# than the placeholder, because the blank check runs before the removal.
df['comment_clean'] = df['comment_clean'].progress_apply(lambda x: remove_stopwords(x) if x.strip() else "tidak ada komentar")

# Lexicon-based sentiment with negation handling
sentiment_lexicon = {
    'bagus': 1, 'baik': 1, 'puas': 1, 'senang': 1, 'suka': 1,
    'buruk': -1, 'jelek': -1, 'kecewa': -1, 'tidak puas': -1, 'gacor': 1, 'enak': 1,
    'mantap': 1, 'luar biasa': 1, 'mengecewakan': -1, 'parah': -1, 'sampah': -1, 'terbaik': 1
}

# Length, in words, of the longest lexicon entry; bounds the phrase lookup.
_MAX_PHRASE_WORDS = max(len(entry.split()) for entry in sentiment_lexicon)


def _match_phrase(words, start):
    """Return (score, length) of the longest multi-word lexicon entry that
    begins at ``words[start]``, or ``(0, 0)`` when there is none."""
    longest = min(_MAX_PHRASE_WORDS, len(words) - start)
    for size in range(longest, 1, -1):
        value = sentiment_lexicon.get(" ".join(words[start:start + size]))
        if value is not None:
            return value, size
    return 0, 0


def classify_sentiment(text):
    """Classify *text* as "Positive", "Negative" or "Neutral" using the lexicon.

    The text is scanned left to right. Multi-word lexicon entries (such as
    'luar biasa') are matched first, longest first; otherwise the single word
    is looked up. A word from ``negative_words`` raises a negation flag that
    inverts the next sentiment-bearing match and is then cleared. The label
    is the sign of the summed scores.

    Args:
        text: A cleaned comment; the placeholder "tidak ada komentar" is
            always "Neutral".

    Returns:
        "Positive", "Negative" or "Neutral".
    """
    if text == "tidak ada komentar":
        return "Neutral"

    words = text.split()
    score = 0
    negated = False
    i = 0
    while i < len(words):
        sentiment, consumed = _match_phrase(words, i)
        if consumed:
            # A phrase that itself starts with a negation word ('tidak puas')
            # already encodes the negation: drop any pending flag instead of
            # flipping the phrase a second time.
            if negated and words[i] in negative_words:
                negated = False
        elif words[i] in negative_words:
            negated = True
            i += 1
            continue
        else:
            sentiment, consumed = sentiment_lexicon.get(words[i], 0), 1

        if negated and sentiment != 0:
            sentiment *= -1  # Invert the polarity after a preceding negation
            negated = False
        score += sentiment
        i += consumed

    return "Positive" if score > 0 else "Negative" if score < 0 else "Neutral"

# Label every cleaned comment with the lexicon-based classifier
df['sentimen_lexicon'] = df['comment_clean'].progress_apply(classify_sentiment)

# Sentiment with SentiWordNet
def translate_to_english(indonesian_word):
    """Map an Indonesian word to a lemma name via WordNet.

    Looks the word up with ``lang='ind'`` and returns the name of the first
    lemma of the first synset found. When no synset is found the input word
    is returned unchanged.
    """
    matches = wordnet.synsets(indonesian_word, lang='ind')
    if not matches:
        return indonesian_word
    first_lemma = matches[0].lemmas()[0]
    return first_lemma.name()

def get_sentiwordnet_score(word):
    """Return the SentiWordNet polarity (positive minus negative) of *word*.

    The word is first mapped through ``translate_to_english``; the first
    WordNet synset of the result is then scored with SentiWordNet.

    Returns 0 when:

    * ``translate_to_english`` gives the word back unchanged (treated as
      "no translation found");
    * WordNet has no synset for the translated word;
    * SentiWordNet has no entry for that synset.

    NOTE(review): a word spelled identically in both languages is also
    scored 0 by the first rule — confirm this is acceptable.
    """
    english_word = translate_to_english(word)
    if english_word == word:
        return 0

    synsets = wordnet.synsets(english_word)
    if not synsets:
        return 0

    senti_synset = swn.senti_synset(synsets[0].name())
    # senti_synset() yields None for a synset missing from SentiWordNet;
    # treat that as neutral instead of failing on the attribute access.
    if senti_synset is None:
        return 0
    return senti_synset.pos_score() - senti_synset.neg_score()

def classify_sentiment_swn(text):
    """Classify *text* as "Positive", "Negative" or "Neutral" via SentiWordNet.

    Sums ``get_sentiwordnet_score`` over the whitespace-separated words and
    labels the text by the sign of the total.

    Args:
        text: A cleaned comment. The placeholder "tidak ada komentar" marks
            an empty comment and is always "Neutral" (the same rule
            ``classify_sentiment`` applies), rather than being scored word
            by word.

    Returns:
        "Positive", "Negative" or "Neutral".
    """
    if text == "tidak ada komentar":
        return "Neutral"

    score = sum(get_sentiwordnet_score(word) for word in text.split())
    return "Positive" if score > 0 else "Negative" if score < 0 else "Neutral"

# Label every cleaned comment with the SentiWordNet-based classifier
df['sentimen_swn'] = df['comment_clean'].progress_apply(classify_sentiment_swn)

# Save the results
OUTPUT_CSV = "DatasetShopee_Bersih.csv"
OUTPUT_PARQUET = "DatasetShopee_Bersih.parquet"

# The same column selection is written to both formats, without the index
df[['username', 'rating', 'comment_clean', 'sentimen_lexicon', 'sentimen_swn']].to_csv(OUTPUT_CSV, index=False)
df[['username', 'rating', 'comment_clean', 'sentimen_lexicon', 'sentimen_swn']].to_parquet(OUTPUT_PARQUET, index=False)

print(f"✅ Preprocessing selesai! Data tersimpan di {OUTPUT_CSV} & {OUTPUT_PARQUET}")

[nltk_data] Downloading package stopwords to C:\Users\M S
[nltk_data]     I\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package sentiwordnet to C:\Users\M S
[nltk_data]     I\AppData\Roaming\nltk_data...
[nltk_data]   Package sentiwordnet is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\M S
[nltk_data]     I\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to C:\Users\M S
[nltk_data]     I\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
100%|██████████| 500/500 [00:00<00:00, 9014.97it/s]
100%|██████████| 500/500 [01:00<00:00,  8.30it/s]
100%|██████████| 500/500 [00:00<00:00, 329637.22it/s]
100%|██████████| 500/500 [00:00<00:00, 420861.33it/s]
100%|██████████| 500/500 [00:04<00:00, 121.58it/s]


✅ Preprocessing selesai! Data tersimpan di DatasetShopee_Bersih.csv & DatasetShopee_Bersih.parquet


## BIGDATA PREPROCESS OPTIMAL

In [None]:
import os
os.environ["MODIN_ENGINE"] = "dask"

import modin.pandas as pd
import re
import string
import unicodedata
import emoji
import nltk
import symspellpy
from tqdm import tqdm
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from transformers import pipeline
import torch

# Fetch the NLTK stopword lists used for stopword removal below
nltk.download("stopwords")

# Load Data
FILE_PATH = "DatasetShopee.csv"
df = pd.read_csv(FILE_PATH)

# Select the row range to process: positional rows START_ROW up to (but not
# including) END_ROW, drop rows whose 'comment' is missing, and copy the
# result before new columns are assigned to it.
START_ROW, END_ROW = 2, 502
df = df.iloc[START_ROW:END_ROW].dropna(subset=['comment']).copy()

# Initialise SymSpell for spelling correction
_SYMSPELL_DICTIONARY = "indonesian_words.txt"
sym_spell = symspellpy.SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
# load_dictionary signals a missing file by returning False rather than
# raising. Fail loudly here so the pipeline never runs against an empty
# dictionary, which would silently disable every correction.
if not sym_spell.load_dictionary(_SYMSPELL_DICTIONARY, term_index=0, count_index=1):
    raise FileNotFoundError(f"SymSpell dictionary not found: {_SYMSPELL_DICTIONARY}")

# Standard-form dictionary and exception words
kamus_kata_baku = {"mantaaaap": "mantap", "gacoooor": "gacor", "bagusss": "bagus"}
EXCEPTION_WORDS = {"saya", "kakak", "kuku", "lulus", "aplikasi", "mantap"}

def normalize_repeated_chars(text):
    """Normalise elongated and misspelled words in *text*, word by word.

    Each whitespace-separated word is compared case-insensitively and handled
    by the first rule that applies:

    1. a word in ``EXCEPTION_WORDS`` is kept as is;
    2. a word in ``kamus_kata_baku`` is replaced by its standard form (the
       raw word is checked, so the elongated keys actually match);
    3. runs of three or more identical characters are collapsed to one; if
       the collapsed word is in ``kamus_kata_baku`` that form is used;
    4. otherwise the closest SymSpell suggestion within edit distance 2 is
       used, or the collapsed word when SymSpell has none.

    Words are lower-cased before the SymSpell lookup.
    NOTE(review): this assumes the loaded dictionary holds lower-case terms —
    confirm against the dictionary file.

    Returns:
        The processed words joined by single spaces.
    """
    normalized_words = []
    for word in text.split():
        key = word.lower()
        if key in EXCEPTION_WORDS:
            normalized_words.append(word)
            continue
        if key in kamus_kata_baku:
            normalized_words.append(kamus_kata_baku[key])
            continue

        cleaned_word = re.sub(r'(.)\1{2,}', r'\1', key)  # Collapse repeated letters
        if cleaned_word in kamus_kata_baku:
            normalized_words.append(kamus_kata_baku[cleaned_word])
            continue

        # Only reach for the (comparatively costly) spell checker when the
        # cheap rules above did not settle the word.
        lookup_result = sym_spell.lookup(cleaned_word, symspellpy.Verbosity.CLOSEST, max_edit_distance=2)
        normalized_words.append(lookup_result[0].term if lookup_result else cleaned_word)
    return " ".join(normalized_words)

# Translation table that turns every ASCII punctuation mark into a space, so
# that words separated only by punctuation ("bagus,cepat") are not glued
# together when the punctuation is removed.
_PUNCT_TO_SPACE = str.maketrans(string.punctuation, " " * len(string.punctuation))


def preprocess_text(text):
    """Clean one raw comment.

    Steps, in order: NFKD Unicode normalisation, emoji removal, removal of
    @mentions / #hashtags / http(s) URLs, punctuation removal, digit removal,
    whitespace collapsing, repeated-letter normalisation and lower-casing.
    Emoji and punctuation are replaced by a space (not deleted) so the words
    around them stay separate tokens.

    Args:
        text: The raw comment. Missing values (as judged by ``pd.isna``)
            yield an empty string; any other non-string value is converted
            with ``str`` first.

    Returns:
        The cleaned, lower-cased comment.
    """
    if pd.isna(text):
        return ""
    # str() guards against non-string cells (e.g. a purely numeric comment)
    text = unicodedata.normalize("NFKD", str(text))
    text = emoji.replace_emoji(text, replace=" ")
    text = re.sub(r'[@#][A-Za-z0-9_]+|https?:\/\/\S+', ' ', text)
    text = text.translate(_PUNCT_TO_SPACE)
    text = re.sub(r'\d+', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return normalize_repeated_chars(text).lower()

# Register progress_apply, then clean every comment.
# NOTE(review): tqdm.pandas() patches pandas objects, while df here comes from
# modin.pandas — confirm progress_apply is actually available on it.
tqdm.pandas()
df['comment_clean'] = df['comment'].progress_apply(preprocess_text)

# Stemming
stemmer = StemmerFactory().create_stemmer()
df['comment_clean'] = df['comment_clean'].progress_apply(stemmer.stem)

# Stopword Removal: merge the NLTK, Sastrawi and hand-picked lists, then take
# the negation words back out so they survive the removal
nltk_stopwords = set(nltk.corpus.stopwords.words('indonesian'))
sastrawi_stopwords = set(StopWordRemoverFactory().get_stop_words())
additional_stopwords = {"sebuah", "pada", "pun", "bahkan", "oleh", "hanya", "tentang", "ke", "dari", "yang", "ini", "itu", "dengan", "seperti"}
negative_words = {"tidak", "bukan", "belum", "jangan", "tanpa"}
all_stopwords = (nltk_stopwords | sastrawi_stopwords | additional_stopwords) - negative_words

def remove_stopwords(text):
    """Drop every token of *text* that appears in ``all_stopwords``.

    Tokens are the whitespace-separated pieces of the text; the ones kept
    are joined back together with single spaces.
    """
    tokens = text.split()
    survivors = filter(lambda token: token not in all_stopwords, tokens)
    return " ".join(survivors)

# Strip stopwords; comments that are already blank at this point become the
# placeholder "tidak ada komentar" instead.
df['comment_clean'] = df['comment_clean'].progress_apply(lambda x: remove_stopwords(x) if x.strip() else "tidak ada komentar")

# Sentiment with IndoBERT (batch-processing optimisation)
# Use the first GPU when CUDA is available, otherwise the CPU (device=-1).
# NOTE(review): the checkpoint name suggests a base pretrained model rather
# than one fine-tuned for sentiment — confirm that its classification head
# and label names are meaningful before trusting the output.
device = "cuda" if torch.cuda.is_available() else "cpu"
sentiment_pipeline = pipeline("sentiment-analysis", model="indobenchmark/indobert-lite-base-p1", device=0 if device == "cuda" else -1)

def batch_sentiment(texts, batch_size=64):
    """Return the predicted sentiment label for every text, in input order.

    The texts are fed to ``sentiment_pipeline`` in slices of *batch_size*.
    The same *batch_size* is passed to the pipeline so each slice is run
    through the model as a real batch, and ``truncation=True`` is passed so
    inputs longer than the model's maximum length are cut rather than
    causing a failure.

    Args:
        texts: A sliceable sequence of strings (for example a list).
        batch_size: Number of texts per pipeline call; must be at least 1.

    Returns:
        A list with one ``'label'`` value per input text.

    Raises:
        ValueError: If *batch_size* is smaller than 1.
    """
    if batch_size < 1:
        # A negative step would make range() empty and silently label nothing
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    results = []
    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]
        with torch.no_grad():
            batch_results = sentiment_pipeline(batch, batch_size=batch_size, truncation=True)
        results.extend(res['label'] for res in batch_results)
    return results

# Label every cleaned comment with the transformer pipeline
df['sentimen_bert'] = batch_sentiment(df['comment_clean'].tolist())

# Save the results in compact columnar formats
OUTPUT_PARQUET = "DatasetShopee_Bersih.parquet"
OUTPUT_FEATHER = "DatasetShopee_Bersih.feather"
# Feather needs a default 0..n-1 index, but the positional slice and dropna
# applied when loading leave the index offset and possibly gapped. Reset it
# once and write the same frame to both files.
result = df[['username', 'rating', 'comment_clean', 'sentimen_bert']].reset_index(drop=True)
result.to_parquet(OUTPUT_PARQUET, compression="snappy", index=False)
result.to_feather(OUTPUT_FEATHER)

print(f"✅ Preprocessing selesai! Data tersimpan di {OUTPUT_PARQUET} & {OUTPUT_FEATHER}")


[nltk_data] Downloading package stopwords to C:\Users\M S
[nltk_data]     I\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
