# Preprocessing Text  
Terdapat 8 tahapan dalam preprocessing text yakni:  
1. Case Folding  
2. Remove Special Characters
3. Menghapus Angka
4. Menghapus Tanda Baca
5. Typo Correction
6. Tokenizing 
7. Stopword
8. Lemmatization

## Import Library

In [1]:
import re
import string
import pandas as pd
from collections import Counter
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from spellchecker import SpellChecker
import emoji

# Download the NLTK resources: 'punkt', 'stopwords' and 'wordnet'
# (presumably for word_tokenize, the stopword list and WordNetLemmatizer used below)
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ANDIK\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ANDIK\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ANDIK\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
# Load the dataset from CSV and print a summary of its columns
data = pd.read_csv('data/data.csv')
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15000 entries, 0 to 14999
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   content  15000 non-null  object
 1   score    15000 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 234.5+ KB


In [2]:
# Initialize the SpellChecker (default settings; intended for English text)
spell = SpellChecker()

# Function for spell checking using pyspellchecker
def correct_text_spellchecker(text):
    """Return *text* with every whitespace-separated word spell-corrected.

    Each word is passed to ``correction`` on the module-level ``spell``
    object. pyspellchecker may return ``None`` when it has no candidate for
    a word; in that case the original word is kept, so the final join never
    receives a non-string.

    Args:
        text: The string to correct.

    Returns:
        The corrected words joined by single spaces.
    """
    corrected_words = []
    for word in text.split():
        suggestion = spell.correction(word)
        # Fall back to the input word when no correction is available.
        corrected_words.append(word if suggestion is None else suggestion)
    return ' '.join(corrected_words)

# Function for normalization using a custom dictionary
def normalize_custom_kamus(text, kamus_dict):
    """Replace each whitespace-separated word of *text* found in *kamus_dict*.

    Lookup is an exact match on the word; words that are not keys of the
    dictionary are kept unchanged. The result is joined with single spaces.
    """
    replaced = []
    for token in text.split():
        replaced.append(kamus_dict.get(token, token))
    return ' '.join(replaced)

# Function for removing emoji
def remove_emoji(text):
    """Return *text* with every emoji replaced by the empty string."""
    without_emoji = emoji.replace_emoji(text, replace='')
    return without_emoji


# --- Full preprocessing pipeline ---
def preprocess(text, kamus_custom):
    """Clean a single text string and return its lemmas joined by spaces.

    The steps run in the order numbered below. The custom dictionary is
    applied twice: once on the raw text and once after cleaning.

    Args:
        text: The raw string to clean.
        kamus_custom: Dict mapping a word to its replacement, as used by
            ``normalize_custom_kamus``.

    Returns:
        The processed tokens joined by single spaces.
    """
    # 1. Word normalization on the raw text. The dictionary lookup is an
    #    exact match, so keys containing upper-case letters can only match
    #    here, before case folding.
    text = normalize_custom_kamus(text, kamus_custom)

    # 2. Case folding
    text = text.lower()

    # 3. Remove emoji
    text = remove_emoji(text)

    # 4. Replace special characters (anything that is not a word character
    #    or whitespace) with a space
    text = re.sub(r'[^\w\s]', ' ', text)

    # 5. Remove digits
    text = re.sub(r'\d+', '', text)

    # 6. Remove punctuation. After step 4 the only character of
    #    string.punctuation that can remain is '_' (it counts as \w), so this
    #    pass effectively deletes underscores.
    text = text.translate(str.maketrans('', '', string.punctuation))

    # 7. Word normalization again, now on the lower-cased, cleaned text
    text = normalize_custom_kamus(text, kamus_custom)

    # 8. Spell checking
    text = correct_text_spellchecker(text)

    # 9. Tokenizing
    tokens = word_tokenize(text)

    # 10. Stopword removal; a set gives constant-time membership tests
    #     instead of scanning the whole stopword list for every token
    stopword_set = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stopword_set]

    # 11. Lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]

    return ' '.join(tokens)

# Function for loading the custom dictionary from a file
def load_custom_dictionary(filepath):
    """Read ``key=value`` lines from a UTF-8 text file into a dict.

    Lines without an ``=`` are ignored. Each remaining line is split at its
    first ``=``; surrounding whitespace is stripped from both key and value.
    A key that appears more than once keeps the value of its last line.
    """
    mapping = {}
    with open(filepath, encoding='utf-8') as handle:
        for raw_line in handle:
            if '=' not in raw_line:
                continue
            left, _, right = raw_line.strip().partition('=')
            mapping[left.strip()] = right.strip()
    return mapping

# Load the custom dictionary from 'kamus_custom_en.txt'
kamus_custom = load_custom_dictionary('kamus_custom_en.txt')

In [3]:
# Sample review mixing slang and different letter cases, used to try out preprocess()
ulasan = "this game is amazing pls fix the lag pls lol LoL Lol LOL thx Thx THX"
# ulasan = correct_text_spellchecker(ulasan)
# print(ulasan)
processed_text = preprocess(ulasan, kamus_custom)
print(processed_text)

game amazing please fix lag please laughing loud league legend laughing loud laughing loud thanks thanks thanks


In [None]:
# Preprocess the first 10 rows of the 'content' column
data_sample = data.iloc[:10].copy()  # Copy the first 10 rows for preprocessing
# preprocess() requires the custom dictionary as its second positional
# argument; Series.apply forwards it through `args`.
data_sample['processed_content'] = data_sample['content'].apply(
    preprocess, args=(kamus_custom,)
)

# Display the processed content one by one
for index, row in data_sample.iterrows():
    print(f"Row {index}: {row['processed_content']}")

In [None]:
# Read the CSV file
data = pd.read_csv('data/data.csv')

# Loop over the 'content' column and print every entry with its position
for index, content in enumerate(data['content']):
    print(f"Row {index}: {content}")