In [None]:
%pip install pandas
%pip install openpyxl
%pip install nltk
%pip install Sastrawi
%pip install swifter

In [None]:
import pandas as pd

# Read the raw dataset from the Excel workbook and preview its first rows.
dataset_path = 'hasil/dataset.xlsx'
data = pd.read_excel(dataset_path)
data.head()

Case Folding

In [None]:
# Case folding: lower-case every text, then delete every character that is
# not an ASCII letter, a digit or whitespace.
# Fix: the character class used the range `A-z`, which also spans the ASCII
# symbols between `Z` and `a` ([ \ ] ^ _ `), so those were never removed.
# NOTE(review): this step also deletes `@`, `#`, `:` and `/`, so the
# mention / hashtag / URL patterns inside `clean_tweet` (next cell) can no
# longer match on the `cf` column -- confirm whether the symbol stripping
# should run after that step instead.
data['cf'] = data['text'].str.lower()
data['cf'] = data['cf'].astype(str).replace(r'[^a-zA-Z0-9\s]', '', regex=True)

data.head()

Cleaning 

In [None]:
import re

def clean_tweet(text):
    """Strip tweet noise from one text and return the cleaned string.

    Steps, in order:
      1. replace mentions/hashtags (``@name`` / ``#tag``), URLs and runs of
         non-ASCII characters with a space;
      2. delete digits;
      3. delete stand-alone single ASCII letters;
      4. collapse every whitespace run to one space and trim both ends.

    Args:
        text: The text to clean (a ``str``).

    Returns:
        The cleaned text, with no leading, trailing or repeated whitespace.
    """
    # Raw strings avoid the invalid-escape warnings that "\w", "\S" and "\s"
    # trigger in ordinary string literals. `_` is part of the mention/hashtag
    # class so that "@user_name" is removed as a whole instead of leaving
    # "_name" behind.
    text = re.sub(r"([@#][A-Za-z0-9_]+)|(\w+://\S+)|([^\x00-\x7F]+)", " ", text)
    text = re.sub(r"\d+", "", text)
    # Single letters are removed BEFORE whitespace is normalised; doing it
    # afterwards (as the code previously did) left double spaces and
    # leading/trailing blanks where the letters used to be.
    text = re.sub(r"\b[a-zA-Z]\b", "", text)
    text = re.sub(r"\s+", " ", text)
    return text.strip()

# Run the cleaner over every case-folded text and store the result in `clean`.
cleaned_texts = data['cf'].apply(clean_tweet)
data['clean'] = cleaned_texts
data.head()


Tokenisasi

In [None]:
import nltk
from nltk.tokenize import word_tokenize

# `word_tokenize` needs the Punkt tokenizer data. Newer NLTK releases (3.9
# and later) look up the `punkt_tab` resource instead of the pickled `punkt`
# one, so both are fetched to keep this cell working with whichever version
# the unpinned `%pip install nltk` resolves to.
nltk.download('punkt')
nltk.download('punkt_tab')


def word_tokenize_wrapper(text):
    """Split one cleaned text into a list of word tokens with NLTK."""
    return word_tokenize(text)


# Tokenise every cleaned text and preview the result.
data['token'] = data['clean'].apply(word_tokenize_wrapper)
data.head()

Normalisasi

In [None]:

# Load the normalisation lookup table. The dictionary maps the value in the
# first column (used as the lookup key) to the value in the second column.
normalizad_word = pd.read_excel("dataset/normalisasi.xlsx")
# Fix: `row[0]` / `row[1]` indexed a label-indexed row by integer position,
# which pandas 2.x deprecates (FutureWarning). Selecting the two columns
# positionally with `.iloc` keeps the same mapping without that fallback;
# as before, a later duplicate key overrides an earlier one.
normalizad_word_dict = dict(
    zip(normalizad_word.iloc[:, 0], normalizad_word.iloc[:, 1])
)

# Normalise the terms of a single document
def normalized_term(document, word_dict=None):
    """Replace each token with its normalised form when one is known.

    Args:
        document: Iterable of tokens belonging to one text.
        word_dict: Optional mapping from a raw term to its normalised form.
            When omitted, the notebook-level ``normalizad_word_dict`` is
            used, so existing single-argument calls keep working.

    Returns:
        A ``(normalized, failed_terms)`` tuple. ``normalized`` has one entry
        per input token: the mapped value, or the token itself when it has
        no entry in the mapping. ``failed_terms`` lists the tokens without an
        entry, in order of appearance.
    """
    # `is None` (not truthiness) so that an explicitly passed empty mapping
    # is honoured instead of silently falling back to the global one.
    if word_dict is None:
        word_dict = normalizad_word_dict

    normalized = []
    failed_terms = []
    for term in document:
        if term in word_dict:
            normalized.append(word_dict[term])
        else:
            # Unknown terms are kept unchanged and reported as failures.
            failed_terms.append(term)
            normalized.append(term)
    return normalized, failed_terms

# Normalise every token list; each result is a (normalised, failed) pair that
# is spread over two new columns.
normalisasi_pairs = data['token'].apply(normalized_term).tolist()
data[['normalisasi', 'gagal_normalisasi']] = pd.DataFrame(normalisasi_pairs, index=data.index)

# Percentage of texts that contain at least one term missing from the table.
jumlah_gagal = data['gagal_normalisasi'].apply(lambda gagal: int(len(gagal) > 0)).sum()
total_data = len(data)
presentase_gagal = (jumlah_gagal / total_data) * 100
presentase_kegagalan = f"{presentase_gagal:.2f} %"

print("Total kegagalan:", jumlah_gagal)
print("Total data:", total_data)
print("Presentase kegagalan:", presentase_kegagalan)
data.head()

In [None]:
import os

# Collect every distinct term that could not be normalised.
gagal_normalisasi = set()

for value in data['gagal_normalisasi']:
    if isinstance(value, list) and value:
        gagal_normalisasi.update(value)

# Fix: a set has no stable iteration order, so the saved file used to differ
# between runs. Sorting (by string form, so a stray non-string entry cannot
# break the comparison) makes the output reproducible.
filtered_df = pd.DataFrame(sorted(gagal_normalisasi, key=str), columns=['gagal_normalisasi'])

# Save the terms that failed normalisation. The folder is created first so
# the write does not fail when it does not exist yet.
os.makedirs('gagal_proses', exist_ok=True)
filtered_df.to_excel('gagal_proses/gagal_normalisasi.xlsx', index=False)

# The helper column is no longer needed once the failures are saved.
data.drop('gagal_normalisasi', axis=1, inplace=True)

Stopwords

In [None]:
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')
list_stopwords = stopwords.words('indonesian')
# Fix: every token used to be checked against the list, which scans it
# element by element. A set gives constant-time membership tests; the list
# itself is kept under its original name.
stopwords_set = set(list_stopwords)


def stopwords_removal(words):
    """Return the tokens of ``words`` that are not Indonesian stopwords."""
    return [word for word in words if word not in stopwords_set]


# Filter the stopwords out of every normalised token list.
data['stopwords'] = data['normalisasi'].apply(stopwords_removal)
data.head()


Stemming

In [None]:
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
import pandas as pd

# Build the Sastrawi stemmer once; the `stem` helper below reuses it.
try:
    factory = StemmerFactory()
    stemmer = factory.create_stemmer()
except FileNotFoundError:
    print("File dataset untuk proses stemming tidak ditemukan.")
    # Fix: the error used to be swallowed here, leaving `stemmer` undefined
    # (or stale from an earlier run), so the notebook only failed later with
    # an unrelated error. Re-raise so the real cause is reported.
    raise

def stem(teks):
    """Stem every token of one document.

    Returns a ``(stemmed, failed)`` pair. On success ``stemmed`` holds the
    stemmed tokens and ``failed`` is empty. If stemming any token raises
    ``AttributeError``, the whole document is reported as failed:
    ``stemmed`` is empty and ``failed`` is the original ``teks``.
    """
    stemmed_words = []
    try:
        for word in teks:
            stemmed_words.append(stemmer.stem(word))
    except AttributeError:
        return [], teks
    return stemmed_words, []

# Stem every stopword-filtered token list; each result is a
# (stemmed, failed) pair that is spread over two new columns.
stemming_pairs = data['stopwords'].apply(stem).tolist()
data[['stemming', 'gagal_stemming']] = pd.DataFrame(stemming_pairs, index=data.index)

# Percentage of texts whose failure list is not empty.
panjang_gagal = data['gagal_stemming'].apply(len)
jumlah_gagal_stemming = (panjang_gagal > 0).sum()
total_data_stemming = len(data)
presentase_gagal_stemming = (jumlah_gagal_stemming / total_data_stemming) * 100
presentase_kegagalan = f"{presentase_gagal_stemming:.2f} %"

print("Total kegagalan stemming:", jumlah_gagal_stemming)
print("Total data stemming:", total_data_stemming)
print("Presentase kegagalan:", presentase_kegagalan)
data.head()


In [None]:
import os

# Collect every distinct token from the documents whose stemming failed.
gagal_stemming = set()

for value in data['gagal_stemming']:
    if isinstance(value, list) and value:
        gagal_stemming.update(value)

# Fix: a set has no stable iteration order, so the saved file used to differ
# between runs. Sorting by string form makes the output reproducible and
# tolerates non-string tokens, which a plain sort could not compare.
filtered_df = pd.DataFrame(sorted(gagal_stemming, key=str), columns=['gagal_stemming'])

# Save the tokens that failed stemming. The folder is created first so the
# write does not fail when it does not exist yet.
os.makedirs('gagal_proses', exist_ok=True)
filtered_df.to_excel('gagal_proses/gagal_stemming.xlsx', index=False)

# The helper column is no longer needed once the failures are saved.
data.drop('gagal_stemming', axis=1, inplace=True)

In [None]:
import ast


def join_text_list(texts):
    """Join the tokens of one document into a space-separated string.

    Args:
        texts: The token list itself or, for backward compatibility, the
            string representation of such a list (parsed with
            ``ast.literal_eval``).

    Returns:
        The tokens joined by single spaces.
    """
    if isinstance(texts, str):
        texts = ast.literal_eval(texts)
    return ' '.join(texts)


# Fix: the stemmed lists used to be converted to their string representation
# and parsed back with `ast.literal_eval` only to be joined. Joining the
# lists directly gives the same text without the round trip.
data['text_join'] = data['stemming'].apply(join_text_list)

# Write the fully preprocessed dataset to disk.
data.to_excel('hasil/hasil_preposesing.xlsx', index=False)
