In [None]:
# Sel 1: Persiapan, Impor, dan Definisi Teks Sumber
import nltk
import string
from nltk.tokenize import sent_tokenize, word_tokenize 
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# Pastikan data NLTK yang diperlukan terunduh
try:
    nltk.data.find('tokenizers/punkt')
    nltk.data.find('corpora/stopwords')
except nltk.downloader.DownloadError:
    print("Mengunduh data NLTK yang diperlukan...")
    nltk.download('punkt')
    nltk.download('stopwords')

# a. Bentuklah sebuah variabel dengan nama contoh_raw
contoh_raw = """Python is an interpreted high-level general-purpose programming language. Its design philosophy emphasizes code readability with its use of significant indentation.
Its language constructs as well as its object-oriented approach aim to help programmers write clear, logical code for small and large-scale projects"""

print("Teks sumber berhasil dimuat.")

Teks sumber berhasil dimuat.


In [12]:
stopWords = set(stopwords.words("english"))
ps = PorterStemmer() 
frequency_matrix = {}

sentences = sent_tokenize(contoh_raw)

for sent in sentences:
    freq_table = {}
    words = word_tokenize(sent)
    
    sent_key = sent[:15]
    
    for word in words:
        word = word.lower()
        
        if all(char in string.punctuation for char in word):
            continue
        
        stemmed_word = ps.stem(word)
        
        if stemmed_word in stopWords:
            continue
        
        freq_table[stemmed_word] = freq_table.get(stemmed_word, 0) + 1
    
    frequency_matrix[sent_key] = freq_table

print("\n--- Hasil Matriks Frekuensi Kata per Kalimat ---")
for key, value in frequency_matrix.items():
    print(f"\nKalimat ({key}...):")
    print(value)


--- Hasil Matriks Frekuensi Kata per Kalimat ---

Kalimat (Python is an in...):
{'python': 1, 'interpret': 1, 'high-level': 1, 'general-purpos': 1, 'program': 1, 'languag': 1}

Kalimat (Its design phil...):
{'design': 1, 'philosophi': 1, 'emphas': 1, 'code': 1, 'readabl': 1, 'use': 1, 'signific': 1, 'indent': 1}

Kalimat (Its language co...):
{'languag': 1, 'construct': 1, 'well': 1, 'object-ori': 1, 'approach': 1, 'aim': 1, 'help': 1, 'programm': 1, 'write': 1, 'clear': 1, 'logic': 1, 'code': 1, 'small': 1, 'large-scal': 1, 'project': 1}


In [15]:
import math
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# --- A. DATA ---
contoh_raw = """Python is an interpreted high-level general-purpose programming language. Its design philosophy emphasizes code readability with its use of significant indentation.
Its language constructs as well as its object-oriented approach aim to help programmers write clear, logical code for small and large-scale projects"""

sentences = sent_tokenize(contoh_raw)
total_documents = len(sentences)

# --- B. FUNGSI-FUNGSI ---

def _create_frequency_matrix(sentences):
    freq_matrix = {}
    stopWords = set(stopwords.words("english"))
    ps = PorterStemmer()
    for sent in sentences:
        freq_table = {}
        words = word_tokenize(sent)
        for word in words:
            word = word.lower()
            stemmed = ps.stem(word)
            if stemmed in stopWords: continue
            freq_table[stemmed] = freq_table.get(stemmed, 0) + 1
        freq_matrix[sent[:15]] = freq_table
    return freq_matrix

def _create_tf_matrix(freq_matrix):
    tf_matrix = {}
    for sent, f_table in freq_matrix.items():
        tf_table = {}
        count_words = len(f_table) # Sesuai request gambar Anda (menggunakan len)
        for word, count in f_table.items():
            tf_table[word] = count / count_words
        tf_matrix[sent] = tf_table
    return tf_matrix

def _create_idf_matrix(freq_matrix, count_doc_per_words, total_documents):
    idf_matrix = {}
    for sent, f_table in freq_matrix.items():
        idf_table = {}
        for word in f_table.keys():
            idf_table[word] = math.log10(total_documents / float(count_doc_per_words[word]))
        idf_matrix[sent] = idf_table
    return idf_matrix

def _create_tf_idf_matrix(tf_matrix, idf_matrix):
    tf_idf_matrix = {}
    for sent, tf_table in tf_matrix.items():
        tf_idf_table = {}
        idf_table = idf_matrix.get(sent, {})
        for word, tf_val in tf_table.items():
            idf_val = idf_table.get(word, 0.0)
            tf_idf_table[word] = tf_val * idf_val
        tf_idf_matrix[sent] = tf_idf_table
    return tf_idf_matrix

def _score_sentences(tf_idf_matrix):
    sentenceValue = {}
    for sent, f_table in tf_idf_matrix.items():
        total_score = sum(f_table.values())
        count_words = len(f_table)
        if count_words == 0: continue
        sentenceValue[sent] = total_score / count_words
    return sentenceValue

def _find_average_score(sentenceValue):
    if not sentenceValue: return 0
    return sum(sentenceValue.values()) / len(sentenceValue)

# --- C. EKSEKUSI ---

# 1. Frequency Matrix
freq_matrix = _create_frequency_matrix(sentences)

# 2. Document Frequency (Helper kecil)
doc_per_words = {}
for sent, f_table in freq_matrix.items():
    for word in f_table.keys():
        doc_per_words[word] = doc_per_words.get(word, 0) + 1

# 3. TF Matrix
tf_matrix = _create_tf_matrix(freq_matrix)

# 4. IDF Matrix
idf_matrix = _create_idf_matrix(freq_matrix, doc_per_words, total_documents)

# 5. TF-IDF Matrix
tf_idf_matrix = _create_tf_idf_matrix(tf_matrix, idf_matrix)

# 6. Score & Threshold
sentence_scores = _score_sentences(tf_idf_matrix)
threshold = _find_average_score(sentence_scores)

print("Threshold Score:", threshold)
print("\nScores:", sentence_scores)

Threshold Score: 0.04297405185783626

Scores: {'Python is an in': 0.05587324064713622, 'Its design phil': 0.04558063334751851, 'Its language co': 0.02746828157885405}
