065122109 - Muhammad Alif Fadillah

In [63]:
#!pip install nltk
#!python -m nltk.downloader all

In [64]:
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
import numpy as np
from nltk.corpus import wordnet

In [65]:
def read_text_files(file_paths):
    documents = []
    for file_path in file_paths:
        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()
            documents.append(content)
    return documents

In [66]:
file_paths = ["cerita1.txt", "cerita2.txt", "cerita3.txt"]
documents = read_text_files(file_paths)

In [67]:
if not documents or all(doc.strip() == "" for doc in documents):
    raise ValueError("Tidak ada dokumen yang valid")

In [68]:
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(documents)
features_names = vectorizer.get_feature_names_out()
tfidf_scores = X.toarray()

In [69]:
print("Kata-kata unik dalam indeks:", vectorizer.get_feature_names_out())

Kata-kata unik dalam indeks: ['2090' 'access' 'act' 'ai' 'aksa' 'approached' 'artificial' 'ask' 'asked'
 'assist' 'avatars' 'beauty' 'began' 'belonging' 'beneath' 'best' 'birds'
 'bizarre' 'breeze' 'brilliant' 'built' 'called' 'calm' 'center' 'changed'
 'child' 'children' 'choices' 'choose' 'city' 'classrooms' 'code'
 'command' 'composed' 'conflicts' 'control' 'controlled' 'cool' 'core'
 'created' 'crimes' 'day' 'dear' 'decision' 'decisions' 'demanded'
 'determined' 'diary' 'digital' 'discovered' 'displayed' 'doors' 'dr'
 'driven' 'echoed' 'economies' 'efficient' 'emergency' 'emotions'
 'encouraging' 'ensuring' 'entire' 'entirely' 'era' 'exhaled' 'exist'
 'existed' 'exit' 'experience' 'eyes' 'feel' 'feet' 'felt' 'final'
 'finally' 'flash' 'followed' 'fresh' 'friends' 'friendships' 'future'
 'gaia' 'global' 'glowing' 'goal' 'good' 'grandmother' 'grass' 'hands'
 'happened' 'heard' 'heart' 'heavy' 'hidden' 'holographic' 'human'
 'humanity' 'humans' 'imperfect' 'individuals' 'inefficient' 

In [70]:
print("Matriks TF-IDF:\n", X.toarray())

Matriks TF-IDF:
 [[0.         0.         0.         0.04771158 0.31367498 0.062735
  0.04771158 0.062735   0.062735   0.062735   0.         0.
  0.         0.         0.         0.062735   0.         0.
  0.         0.         0.062735   0.062735   0.062735   0.
  0.         0.062735   0.         0.12546999 0.062735   0.
  0.         0.04771158 0.         0.062735   0.062735   0.04771158
  0.         0.         0.062735   0.09542316 0.         0.062735
  0.062735   0.12546999 0.04771158 0.         0.         0.
  0.         0.         0.         0.         0.18820499 0.
  0.062735   0.062735   0.         0.         0.062735   0.
  0.         0.         0.062735   0.         0.         0.062735
  0.         0.         0.         0.062735   0.062735   0.
  0.         0.062735   0.12546999 0.062735   0.062735   0.
  0.         0.         0.062735   0.         0.062735   0.062735
  0.062735   0.062735   0.         0.         0.062735   0.04771158
  0.         0.062735   0.062735   0.062735

In [71]:
#Filtering
threshold = 0.1
filtered_words = [word for i, word in enumerate(features_names) if np.max(tfidf_scores[:, i]) > threshold]

#Expansion
def get_synonym(word):
    synonyms = set()
    for syn in wordnet.synsets(word):
        for lemma in syn.lemmas():
            synonyms.add(lemma.name())
    return list(synonyms)

expanded_index = set(filtered_words)
for word in filtered_words:
    expanded_index.update(get_synonym(word))

print("Daftar kata setelah index optimization: ", expanded_index)

Daftar kata setelah index optimization:  {'reality', 'mankind', 'vora', 'meet', 'lock_up', 'dally', 'throw', 'name', 'cosmos', 'nominate', 'extraneous', 'subsist', 'eventually', 'shut_up', 'crap', 'choices', 'be', 'make_up', 'exterior', 'block_out', 'have', 'seduce', 'silver_screen', 'dr', 'universe', 'yr', 'digital', 'old', 'year', 'human_being', 'fix', 'give', 'honest-to-goodness', 'test', 'realise', 'one-time', 'outdoor', 'pull_in', 'live', 'clear', 'man', 'make_water', 'contain', 'screens', 'existed', 'emergency_brake', 'operate', 'make_for', 'shut_away', 'felt_up', 'alfresco', 'realness', 'form', 'relieve_oneself', 'matt-up', 'away', 'run', 'close', 'lock_in', 'decision', 'human', 'sieve', 'verify', 'experience', 'ca-ca', 'outdoors', 'covert', 'planetary', 'worldwide', 'emergency', 'work', 'bring', 'twelvemonth', 'humans', 'see_to_it', 'manipulate', 'hand_brake', 'cook', 'take_a_shit', 'urinate', 'filmdom', 'lock_away', 'mastery', 'matte_up', 'Gaea', 'spend_a_penny', 'ascendence',