# Latihan 1: Tokenisasi Teks Indonesia dengan Korpus IndoNLU

In [14]:
# Download dataset IndoNLU SMSA (sentiment analysis)
!wget -P ../data/raw https://raw.githubusercontent.com/IndoNLP/indonlu/refs/heads/master/dataset/smsa_doc-sentiment-prosa/train_preprocess.tsv
!wget -P ../data/raw https://raw.githubusercontent.com/IndoNLP/indonlu/refs/heads/master/dataset/smsa_doc-sentiment-prosa/test_preprocess.tsv
!wget -P ../data/raw https://raw.githubusercontent.com/IndoNLP/indonlu/refs/heads/master/dataset/smsa_doc-sentiment-prosa/valid_preprocess.tsv

--2025-09-09 02:21:17--  https://raw.githubusercontent.com/IndoNLP/indonlu/refs/heads/master/dataset/smsa_doc-sentiment-prosa/train_preprocess.tsv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2186718 (2.1M) [text/plain]
Saving to: ‘../data/raw/train_preprocess.tsv.1’


2025-09-09 02:21:17 (28.2 MB/s) - ‘../data/raw/train_preprocess.tsv.1’ saved [2186718/2186718]

--2025-09-09 02:21:17--  https://raw.githubusercontent.com/IndoNLP/indonlu/refs/heads/master/dataset/smsa_doc-sentiment-prosa/test_preprocess.tsv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting 

In [20]:
# Install library yang diperlukan
!pip install --upgrade --no-cache-dir nltk PySastrawi

import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from collections import Counter

# Download tokenizer punkt
nltk.download('punkt')
nltk.download("punkt_tab")



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [21]:
# Load the three SMSA splits with pandas. The files are read without a header
# row (header=None), so the column names are supplied here.
train_df = pd.read_csv("../data/raw/train_preprocess.tsv", sep="\t", header=None, names=["text", "label"])
valid_df = pd.read_csv("../data/raw/valid_preprocess.tsv", sep="\t", header=None, names=["text", "label"])
test_df  = pd.read_csv("../data/raw/test_preprocess.tsv",  sep="\t", header=None, names=["text", "label"])

print("Jumlah data train:", len(train_df))
print("Jumlah data valid:", len(valid_df))
print("Jumlah data test :", len(test_df))

# Inspect the columns and the first few rows.
print("\nKolom dataset:", train_df.columns.tolist())
print(train_df.head())

# Use the first three training rows (text and matching label) as worked examples.
sample_texts = train_df['text'][:3].tolist()
labels = train_df['label'][:3].tolist()

# Initialise the Sastrawi stemmer once; it is reused for every sample.
stemmer = StemmerFactory().create_stemmer()

# Tokenise and analyse each sample. zip() pairs every text with its own label
# instead of relying on a shared list index.
for sample_no, (text, label) in enumerate(zip(sample_texts, labels), start=1):
    print(f"\nContoh Teks {sample_no} (Label: {label}):")
    print("Teks Asli:", text)

    # Tokenisation of the raw text.
    tokens = word_tokenize(text)
    print("Token:", tokens)
    print("Jumlah Token:", len(tokens))
    print("Token Unik:", len(set(tokens)))

    # Stemming is applied to the whole text first, then the result is tokenised.
    stemmed_text = stemmer.stem(text)
    stemmed_tokens = word_tokenize(stemmed_text)
    print("Token setelah Stemming:", stemmed_tokens)
    print("Jumlah Token setelah Stemming:", len(stemmed_tokens))

    # Save the (unstemmed) tokens, one per line, in the working directory.
    token_path = f'tokens_sample_{sample_no}.txt'
    with open(token_path, 'w', encoding='utf-8') as f:
        f.write('\n'.join(tokens))
    print(f"Token disimpan ke {token_path}")

# Extra analysis: word frequency across the sample texts (lower-cased).
all_tokens = []
for text in sample_texts:
    all_tokens.extend(word_tokenize(text.lower()))

# Keep only tokens containing at least one letter or digit, so punctuation
# tokens such as "." and "," are not reported among the top *words*.
# all_tokens itself is left unfiltered.
word_tokens = [tok for tok in all_tokens if any(ch.isalnum() for ch in tok)]

word_freq = Counter(word_tokens).most_common(10)
print("\n10 Kata Teratas:", word_freq)

Jumlah data train: 11000
Jumlah data valid: 1260
Jumlah data test : 500

Kolom dataset: ['text', 'label']
                                                text     label
0  warung ini dimiliki oleh pengusaha pabrik tahu...  positive
1  mohon ulama lurus dan k212 mmbri hujjah partai...   neutral
2  lokasi strategis di jalan sumatera bandung . t...  positive
3  betapa bahagia nya diri ini saat unboxing pake...  positive
4  duh . jadi mahasiswa jangan sombong dong . kas...  negative

Contoh Teks 1 (Label: positive):
Teks Asli: warung ini dimiliki oleh pengusaha pabrik tahu yang sudah puluhan tahun terkenal membuat tahu putih di bandung . tahu berkualitas , dipadu keahlian memasak , dipadu kretivitas , jadilah warung yang menyajikan menu utama berbahan tahu , ditambah menu umum lain seperti ayam . semuanya selera indonesia . harga cukup terjangkau . jangan lewatkan tahu bletoka nya , tidak kalah dengan yang asli dari tegal !
Token: ['warung', 'ini', 'dimiliki', 'oleh', 'pengusaha', 'pabrik'

# Latihan 2: Direct access to a prebuilt corpus via NLTK

In [22]:
import nltk
from nltk.tokenize import word_tokenize

# Download the NLTK data this cell needs. "punkt_tab" is requested alongside
# "punkt", matching the tokenizer downloads in the setup cell, so this cell
# does not depend on that earlier cell having fetched it.
for resource in ("gutenberg", "punkt", "punkt_tab"):
    nltk.download(resource)

# Number of leading characters of the novel to analyse.
SAMPLE_CHARS = 1000

# Load a sample from the Gutenberg corpus: the start of Jane Austen's "Emma".
text = nltk.corpus.gutenberg.raw('austen-emma.txt')[:SAMPLE_CHARS]

# Tokenize the text
tokens = word_tokenize(text)

# Print first 20 tokens
print("First 20 tokens:", tokens[:20])

# Basic statistics: total token count and number of distinct tokens
print("Total tokens:", len(tokens))
print("Unique tokens:", len(set(tokens)))

First 20 tokens: ['[', 'Emma', 'by', 'Jane', 'Austen', '1816', ']', 'VOLUME', 'I', 'CHAPTER', 'I', 'Emma', 'Woodhouse', ',', 'handsome', ',', 'clever', ',', 'and', 'rich']
Total tokens: 198
Unique tokens: 114


[nltk_data] Downloading package gutenberg to /root/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
