<a href="https://colab.research.google.com/github/asrilyusufharahap/Portofolio-Asril/blob/main/UTS_NLP_ASRIL_YUSUF_HARAHAP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
import nltk
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer, PorterStemmer, WordNetLemmatizer

import spacy

# Fetch the NLTK resources used below (this step can be commented out once
# the downloads have succeeded). 'punkt_tab' was added to download a resource
# that was reported as missing.
for nltk_package in ('punkt', 'stopwords', 'wordnet', 'omw-1.4', 'punkt_tab'):
    nltk.download(nltk_package)

# For spaCy, make sure this model has already been downloaded from a terminal:
# python -m spacy download en_core_web_sm


# ======================
# 1. TEXT PREPROCESSING
# ======================

print("==== NOMOR 1: TEXT PREPROCESSING ====\n")

text = "Pendidikan adalah kunci utama menuju kesuksesan, meskipun tantangan selalu datang."

# a. Case folding: lowercase the sentence before any further processing.
text_lower = text.lower()
print("1.a Case Folding:", text_lower, "", sep="\n")

# b. Tokenization of the lowercased sentence with NLTK.
tokens = word_tokenize(text_lower)
print("1.b Tokenization:", tokens, "", sep="\n")

# c. Stopword removal (Indonesian): keep only purely alphabetic tokens that
#    are not in NLTK's Indonesian stopword list.
stop_words = set(stopwords.words('indonesian'))
tokens_no_sw = list(
    filter(lambda tok: tok.isalpha() and tok not in stop_words, tokens)
)
print("1.c Setelah Stopword Removal:", tokens_no_sw, "", sep="\n")

# d. Stemming (Indonesian) is intentionally not executed here:
#    SnowballStemmer does not support 'indonesian', so the planned
#    SnowballStemmer("indonesian") step over tokens_no_sw stays disabled.
#    For Indonesian stemming, consider a library such as Sastrawi.
print("\n\n")


# ============================================
# 2. TOKENIZATION USING THE spaCy LIBRARY
# ============================================

print("==== NOMOR 2: TOKENIZATION DENGAN spaCy ====\n")

# A blank multi-language pipeline is used; it is enough for tokenization.
nlp_tokenizer = spacy.blank("xx")

text2 = "Pendidikan adalah kunci utama menuju kesuksesan, meskipun tantangan selalu datang."
doc2 = nlp_tokenizer(text2)

# Collect the surface text of every token in the parsed document.
tokens_spacy = [tok.text for tok in doc2]

print("2. Hasil Tokenization spaCy:")
# One token per line.
print(*tokens_spacy, sep="\n")
print("\n\n")


# ============================================
# 3. POS TAGGING WITH spaCy + TAG OUTPUT
# ============================================

print("==== NOMOR 3: POS TAGGING ====\n")

# Load the English model, which is used here for POS tagging.
nlp_pos = spacy.load("en_core_web_sm")

text3 = "Natural language processing helps computers understand human language clearly."
doc3 = nlp_pos(text3)

print("3. Hasil POS Tagging (kata, POS, tag):")
for token in doc3:
    # Left-align each column to a fixed width: word (12), POS (8), tag (8).
    columns = (
        token.text.ljust(12),
        token.pos_.ljust(8),
        token.tag_.ljust(8),
    )
    print(" ".join(columns))
print("\n\n")


# =========================================================
# 4. NAMED ENTITY RECOGNITION (NER) ON A SHORT NEWS TEXT
# =========================================================

print("==== NOMOR 4: NAMED ENTITY RECOGNITION (NER) ====\n")

# Reuse the same pipeline object (en_core_web_sm) that was loaded for POS tagging.
nlp_ner = nlp_pos

news_text = (
    "Indonesia's central bank kept interest rates steady on Tuesday "
    "as inflation stayed within the target range, Governor Budi said in Jakarta."
)

doc4 = nlp_ner(news_text)

print("4. Teks berita:", news_text, sep="\n")
print("\n4. Hasil NER (Entitas dan Jenisnya):")
for ent in doc4.ents:
    # Entity text is padded to 20 characters so the labels line up.
    print("Entitas: {:20}  Label: {}".format(ent.text, ent.label_))
print("\n\n")


# ====================================================
# 5. COMPARISON OF STEMMING AND LEMMATIZATION (PYTHON)
# ====================================================

print("==== NOMOR 5: STEMMING vs LEMMATIZATION ====\n")

words = ["studies", "studying", "better", "runs", "children"]

# WordNet part-of-speech hint for each sample word ("n" noun, "v" verb,
# "a" adjective). WordNetLemmatizer.lemmatize() assumes a noun when no POS is
# given, so verb and adjective forms would not be reduced to their base form
# and the comparison with stemming would be misleading.
word_pos = {
    "studies": "v",
    "studying": "v",
    "better": "a",
    "runs": "v",
    "children": "n",
}

porter = PorterStemmer()
lemmatizer = WordNetLemmatizer()

# Table header, followed by one row per word: original, Porter stem, lemma.
print(f"{'Kata':12} {'Stem (Porter)':15} {'Lemma':12}")
for w in words:
    stem = porter.stem(w)
    # Fall back to the noun default for any word without an explicit hint.
    lemma = lemmatizer.lemmatize(w, pos=word_pos.get(w, "n"))
    print(f"{w:12} {stem:15} {lemma:12}")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


==== NOMOR 1: TEXT PREPROCESSING ====

1.a Case Folding:
pendidikan adalah kunci utama menuju kesuksesan, meskipun tantangan selalu datang.

1.b Tokenization:
['pendidikan', 'adalah', 'kunci', 'utama', 'menuju', 'kesuksesan', ',', 'meskipun', 'tantangan', 'selalu', 'datang', '.']

1.c Setelah Stopword Removal:
['pendidikan', 'kunci', 'utama', 'kesuksesan', 'tantangan']




==== NOMOR 2: TOKENIZATION DENGAN spaCy ====

2. Hasil Tokenization spaCy:
Pendidikan
adalah
kunci
utama
menuju
kesuksesan
,
meskipun
tantangan
selalu
datang
.



==== NOMOR 3: POS TAGGING ====

3. Hasil POS Tagging (kata, POS, tag):
Natural      ADJ      JJ      
language     NOUN     NN      
processing   NOUN     NN      
helps        VERB     VBZ     
computers    NOUN     NNS     
understand   VERB     VB      
human        ADJ      JJ      
language     NOUN     NN      
clearly      ADV      RB      
.            PUNCT    .       



==== NOMOR 4: NAMED ENTITY RECOGNITION (NER) ====

4. Teks berita:
Indonesia'