# **Text Rank untuk mencari kata kunci data berita**

## **Ambil data berita**

In [None]:
import os

%cd /content/
os.mkdir('data_berita')

import gdown
# download data

nama_data = '/content/data_berita/data_berita.csv'
gdown.download(f'https://drive.google.com/uc?id=1vu67IeDIAWTCH9hk4C39DrH9JKXHpUVi', nama_data, quiet=False)

In [None]:
import pandas as pd

# Load the downloaded news CSV into a DataFrame; the bare name on the last
# line makes the notebook render it.
data_berita = pd.read_csv('/content/data_berita/data_berita.csv')
data_berita

## **Normalisasi**

In [None]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string
import pandas as pd
import numpy as np
import re
import warnings
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize

# Fetch the NLTK resources used below: the stop-word lists and the 'punkt'
# data needed by the tokenizers.
nltk.download('stopwords')
nltk.download('punkt')
# Install an 'ignore' filter so Python warnings are suppressed from here on.
warnings.filterwarnings('ignore')

### **Cek data kosong**

In [None]:
# Count the missing (null) values in each column.
data_berita.isnull().sum()

### **Punctuation**

In [None]:
# Remove every character that is not a word character, whitespace or one of
# , . ? ! and then lower-case the text.
berita_tanpa_simbol = data_berita['Berita'].str.replace(r'[^\w\s,.?!]', '', regex=True)
data_berita['berita_clean'] = berita_tanpa_simbol.str.lower()

In [None]:
# Drop every run of digits. The pattern is a raw string: in a plain literal
# '\d' is an invalid escape sequence, which Python warns about.
data_berita['berita_clean'] = data_berita['berita_clean'].str.replace(r'\d+', '', regex=True)

### **Stopword**

In [None]:
# Indonesian stop-word set from NLTK, built once at module level so the
# function below only does set membership tests.
stop_words = set(stopwords.words('indonesian'))


def remove_stopwords(text):
    """Return *text* without the words found in ``stop_words``.

    The text is split on whitespace and the surviving words are re-joined
    with single spaces. Any non-string input yields an empty string.
    """
    if not isinstance(text, str):
        return ''
    kept = [kata for kata in text.split() if kata not in stop_words]
    return ' '.join(kept)


data_berita['berita_Stopword'] = data_berita['berita_clean'].apply(remove_stopwords)
data_berita

### **Tokenisasi**

In [None]:
# Split each stop-word-filtered article into a list of sentences using
# NLTK's sent_tokenize.
data_berita['berita_token'] = data_berita['berita_Stopword'].apply(sent_tokenize)
data_berita

### **Gabung Hasil Tokenizing**

In [None]:
# Glue each article's sentence list back into one space-separated string.
data_berita['full_text'] = data_berita['berita_token'].apply(' '.join)
data_berita

## **Term Freq**

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Build one term-frequency table per article: one row per sentence, one
# column per vocabulary term of that article.
# Iterating the column's values (instead of looking rows up by label with
# [i]) keeps this working when the index is not 0..n-1, e.g. after rows
# have been dropped or filtered.
tf_result = []
for kalimat_berita in data_berita['berita_token']:
    tf_vectorizer = CountVectorizer()
    tf_matrix = tf_vectorizer.fit_transform(kalimat_berita).toarray()
    terms = tf_vectorizer.get_feature_names_out()
    tf_result.append(pd.DataFrame(tf_matrix, columns=terms))

# Preview the tables of the first ten articles.
for i, tf_show in enumerate(tf_result[:10]):
    print(f"=====================Berita ke-{i}======================")
    display(tf_show)

## **Co-Occurrence Matrix**

In [None]:
# Per-article term co-occurrence. For a sentence-by-term count table X, the
# product X^T X holds, for each pair of terms, the sum over sentences of the
# product of their counts.
co_occurrence_df_all = []
co_occurrence_matrix_all = []
for tf_table in tf_result:
    co_occurrence_matrix = np.dot(tf_table.T, tf_table)
    # Zero the diagonal so a term is not counted as co-occurring with itself.
    np.fill_diagonal(co_occurrence_matrix, 0)
    terms = tf_table.columns
    co_occurrence_df = pd.DataFrame(co_occurrence_matrix, index=terms, columns=terms)
    co_occurrence_matrix_all.append(co_occurrence_matrix)
    co_occurrence_df_all.append(co_occurrence_df)


# Preview the co-occurrence tables of the first ten articles.
for i, co_occurrence_show in enumerate(co_occurrence_df_all[:10]):
    print(f"=====================CO-Occurence ke-{i}======================")
    display(co_occurrence_show)

### **Membuat Graph Co-Occurrence**

In [None]:
import networkx as nx
import matplotlib.pyplot as plt

In [None]:
# Build one undirected graph per article directly from its co-occurrence
# DataFrame, which networkx reads as an adjacency matrix.
graf_results = [
    nx.from_pandas_adjacency(co_occurrence)
    for co_occurrence in co_occurrence_df_all
]

# Draw the first three graphs.
for i, G in enumerate(graf_results[:3]):
    plt.figure(figsize=(12, 12))
    pos = nx.spring_layout(G)
    nx.draw(G, pos, with_labels=True, font_size=8, font_color='black', node_size=700, node_color='skyblue', edge_color='gray', linewidths=0.1)
    plt.title(f'Graf {i}')
    plt.show()
    print('\n')

In [None]:
# Build one weighted, undirected graph per article. Only the upper triangle
# of the co-occurrence matrix is visited, and an edge is added only when its
# weight exceeds the threshold.
graf_results = []
treshold = 0
for co_occurrence, matrix_coocurrence in zip(co_occurrence_df_all, co_occurrence_matrix_all):
    graf = nx.Graph()
    koloms = co_occurrence.columns  # term labels, in matrix order

    for baris, kata_a in enumerate(koloms):
        for kolom in range(baris + 1, len(koloms)):
            bobot = matrix_coocurrence[baris, kolom]
            if bobot > treshold:
                graf.add_edge(kata_a, koloms[kolom], weight=bobot)

    graf_results.append(graf)


In [None]:
# Draw the first three graphs together with their edge weights.
for i, graf_dokumen in enumerate(graf_results[:3]):
    # Node coordinates and the weight attribute of every edge.
    pos = nx.spring_layout(graf_dokumen)
    labels = nx.get_edge_attributes(graf_dokumen, 'weight')

    # Large canvas so the node and edge labels stay readable.
    plt.figure(figsize=(50, 30))

    nx.draw(graf_dokumen, pos, with_labels=True, node_size=2000, node_color='skyblue')
    nx.draw_networkx_edge_labels(graf_dokumen, pos, edge_labels=labels, font_color='red')
    print(f"====== Dokumen ke - {i} ===========")
    plt.show()


## **Centrality**

### **Closeness Centrality**

In [None]:
# Closeness centrality of every node, one dict per document graph.
closeness_centrality_centrality_result = [
    nx.closeness_centrality(graf_dokumen) for graf_dokumen in graf_results
]

In [None]:
# Show the closeness scores of the first five documents.
for i, skor_dokumen in enumerate(closeness_centrality_centrality_result[:5]):
    print(f"================== Dokumen ke - {i} ==================")
    print(f"Closeness Centrality score : {skor_dokumen}")

#### merangking kata kunci

In [None]:
# Re-order each document's scores from highest to lowest.
sorted_closeness_all = [
    dict(sorted(skor_dokumen.items(), key=lambda pasangan: pasangan[1], reverse=True))
    for skor_dokumen in closeness_centrality_centrality_result
]

sorted_closeness_all[0]

In [None]:
# Top-5 keywords per document (fewer when the document has fewer nodes).
# sorted() is stable, so words with equal scores keep their dict order —
# the same words, in the same order, as repeatedly scanning for the
# highest-scoring word not yet picked.
kalimat_tertinggi_all = [
    sorted(skor_dokumen, key=skor_dokumen.get, reverse=True)[:5]
    for skor_dokumen in sorted_closeness_all
]

In [None]:
# Print the keyword list of every document.
for i, kata_kunci in enumerate(kalimat_tertinggi_all):
    print(f'Kata Kunci Dokumen ke-{i} adalah {kata_kunci}')

#### **Meranking node dari centrality**

In [None]:
# Look up the closeness-centrality score of every word of every sentence.
# result_skor[d][s] maps word -> score for sentence s of document d; words
# that are not nodes of the document's graph score 0.
#
# Words are extracted with the default CountVectorizer token pattern (runs
# of two or more word characters), i.e. the same tokenisation that produced
# the graph nodes. A plain str.split() leaves ',', '.', '?' and '!' attached
# to the neighbouring word, so those words never match a node and always
# score 0.
result_skor = []
for kalimat_dokumen, pin_centrality in zip(
    data_berita['berita_token'], closeness_centrality_centrality_result
):
    current_skor = []
    for sentences in kalimat_dokumen:
        words = re.findall(r'\b\w\w+\b', sentences)
        skor_kata = {word: pin_centrality.get(word, 0) for word in words}
        current_skor.append(skor_kata)
    result_skor.append(current_skor)

In [None]:
# Weight of a sentence = sum of the centrality scores of its words.
# One dict per document, keyed by the sentence text.
result_bobot_kalimat = []
for i, skor_dokumen in enumerate(result_skor):
    kalimat_dokumen = data_berita['berita_token'][i]
    current_bobot = {
        kalimat: sum(skor_kalimat.values())
        for kalimat, skor_kalimat in zip(kalimat_dokumen, skor_dokumen)
    }
    result_bobot_kalimat.append(current_bobot)


In [None]:
# Inspect the sentence list stored under index label 1.
data_berita['berita_token'][1]

In [None]:
# Inspect the per-sentence weights of the second document.
result_bobot_kalimat[1]

#### **Menentukan Kata penting berdasarkan ranking**

In [None]:
# For every document keep the five highest-weighted sentences (highest
# first) and join them into one summary string.
kalimat_tertinggi = []
for bobot_dokumen in result_bobot_kalimat:
    peringkat = sorted(bobot_dokumen.items(), key=lambda pasangan: pasangan[1], reverse=True)
    lima_teratas = [pasangan[0] for pasangan in peringkat[:5]]
    kalimat_tertinggi.append(' '.join(lima_teratas))

# Show every summary.
for i, ringkasan in enumerate(kalimat_tertinggi):
    print(f'===== Dokumen {i} ======')
    print(ringkasan)
    print('\n')

In [None]:
# Pair every summary with the label of its source article.
data_berita_ringkasan = [
    {'Summary': ringkasan, 'Label': data_berita['Label'][i]}
    for i, ringkasan in enumerate(kalimat_tertinggi)
]

In [None]:
import csv

# Write the summary/label records to a CSV file in the working directory.
csv_filename = 'data_berita_closeness_centrality.csv'
fieldnames = ['Summary', 'Label']
with open(csv_filename, 'w', newline='', encoding='utf-8') as csv_file:
    csv_writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
    csv_writer.writeheader()
    csv_writer.writerows(data_berita_ringkasan)

print(f"Data telah disimpan dalam file {csv_filename}")

### **Page Rank**

tugas tambahan perhitungan manual dengan page rank

In [None]:
# PageRank score of every node, one dict per document graph.
page_rank_centrality_result = [
    nx.pagerank(graf_dokumen) for graf_dokumen in graf_results
]

In [None]:
# Show the PageRank scores of the first five documents.
# The label says "Page Rank": these values come from nx.pagerank, so the
# "Closeness Centrality" caption was mislabelling the output.
for i, skor_dokumen in enumerate(page_rank_centrality_result[:5]):
    print(f"================== Dokumen ke - {i} ==================")
    print(f"Page Rank score : {skor_dokumen}")

#### meranking kata kunci

In [None]:
# Re-order each document's PageRank scores from highest to lowest.
sorted_page_rank_all = [
    dict(sorted(skor_dokumen.items(), key=lambda pasangan: pasangan[1], reverse=True))
    for skor_dokumen in page_rank_centrality_result
]

sorted_page_rank_all[0]

In [None]:
# Top-5 keywords per document by PageRank (fewer when the document has
# fewer nodes). sorted() is stable, so words with equal scores keep their
# dict order — the same words, in the same order, as repeatedly scanning
# for the highest-scoring word not yet picked.
kalimat_tertinggi_all = [
    sorted(skor_dokumen, key=skor_dokumen.get, reverse=True)[:5]
    for skor_dokumen in sorted_page_rank_all
]

In [None]:
# Print the keyword list of every document.
for i, kata_kunci in enumerate(kalimat_tertinggi_all):
    print(f'Kata Kunci Dokumen ke-{i} adalah {kata_kunci}')

#### **Merangking Node dari Page Rank**

In [None]:
# Look up the PageRank score of every word of every sentence.
# result_skor[d][s] maps word -> score for sentence s of document d; words
# that are not nodes of the document's graph score 0.
#
# Words are extracted with the default CountVectorizer token pattern (runs
# of two or more word characters), i.e. the same tokenisation that produced
# the graph nodes. A plain str.split() leaves ',', '.', '?' and '!' attached
# to the neighbouring word, so those words never match a node and always
# score 0.
result_skor = []
for kalimat_dokumen, pin_centrality in zip(
    data_berita['berita_token'], page_rank_centrality_result
):
    current_skor = []
    for sentences in kalimat_dokumen:
        words = re.findall(r'\b\w\w+\b', sentences)
        skor_kata = {word: pin_centrality.get(word, 0) for word in words}
        current_skor.append(skor_kata)
    result_skor.append(current_skor)

In [None]:
# Weight of a sentence = sum of the PageRank scores of its words.
# One dict per document, keyed by the sentence text.
result_bobot_kalimat = []
for i, skor_dokumen in enumerate(result_skor):
    kalimat_dokumen = data_berita['berita_token'][i]
    current_bobot = {
        kalimat: sum(skor_kalimat.values())
        for kalimat, skor_kalimat in zip(kalimat_dokumen, skor_dokumen)
    }
    result_bobot_kalimat.append(current_bobot)

#### **Menentukan Kata Penting Berdasarkan Ranking**

In [None]:
# For every document keep the five highest-weighted sentences (highest
# first) and join them into one summary string.
kalimat_tertinggi = []
for bobot_dokumen in result_bobot_kalimat:
    peringkat = sorted(bobot_dokumen.items(), key=lambda pasangan: pasangan[1], reverse=True)
    lima_teratas = [pasangan[0] for pasangan in peringkat[:5]]
    kalimat_tertinggi.append(' '.join(lima_teratas))

# Show every summary.
for i, ringkasan in enumerate(kalimat_tertinggi):
    print(f'===== Dokumen {i} ======')
    print(ringkasan)
    print('\n')

In [None]:
# Pair every summary with the label of its source article.
data_berita_ringkasan = [
    {'Summary': ringkasan, 'Label': data_berita['Label'][i]}
    for i, ringkasan in enumerate(kalimat_tertinggi)
]

In [None]:
import csv

# Write the summary/label records to a CSV file in the working directory.
csv_filename = 'data_berita_page_rank.csv'
fieldnames = ['Summary', 'Label']
with open(csv_filename, 'w', newline='', encoding='utf-8') as csv_file:
    csv_writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
    csv_writer.writeheader()
    csv_writer.writerows(data_berita_ringkasan)

print(f"Data telah disimpan dalam file {csv_filename}")

## **Preprocessing data ringkasan**

In [None]:
import os

import gdown

# Make sure the target directory exists, so this section also works when it
# is run without the first download cell; exist_ok=True makes the call a
# no-op when the directory is already there.
os.makedirs('/content/data_berita', exist_ok=True)

# Download the PageRank summary dataset from Google Drive.
nama_data = '/content/data_berita/data_berita_page_rank.csv'
gdown.download('https://drive.google.com/uc?id=1U-1ynzXT0ScSnIc1XKv8lXQveFRvoV9V', nama_data, quiet=False)

In [None]:
import pandas as pd

# Load the downloaded summary CSV into a DataFrame and render it.
data_page_rank = pd.read_csv('/content/data_berita/data_berita_page_rank.csv')
data_page_rank

### **Cleaning data**

In [None]:
# Count the missing (null) values in each column.
data_page_rank.isnull().sum()

## Punctuation

In [None]:
import string

# Remove every character that is neither a word character nor whitespace,
# then lower-case the text.
data_page_rank['ringkasan_punctuation'] = data_page_rank['Summary'].str.replace(r'[^\w\s]', '', regex=True).str.lower()

# Remove digits from the 'ringkasan_punctuation' column. The pattern is a
# raw string: in a plain literal '\d' is an invalid escape sequence, which
# Python warns about.
data_page_rank['ringkasan_punctuation'] = data_page_rank['ringkasan_punctuation'].str.replace(r'\d+', '', regex=True)

data_page_rank

## Stopword

In [None]:
# Import NLTK and download the 'punkt' and 'stopwords' resources.
import nltk
from nltk.tokenize import sent_tokenize
nltk.download('punkt')
nltk.download('stopwords')

In [None]:
from nltk.corpus import stopwords
nltk.download('stopwords')

# Indonesian stop-word set from NLTK, built once at module level.
stop_words = set(stopwords.words('indonesian'))


def remove_stopwords(text):
    """Return *text* without the words found in ``stop_words``.

    The text is split on whitespace and the surviving words are re-joined
    with single spaces. Any non-string input yields an empty string.
    """
    if isinstance(text, str):
        return ' '.join(filter(lambda kata: kata not in stop_words, text.split()))
    return ''


data_page_rank['ringkasan_Stopword'] = data_page_rank['ringkasan_punctuation'].apply(remove_stopwords)
data_page_rank

## Tokenizer

In [None]:
# Split each summary into a list of sentences with sent_tokenize.
# NOTE(review): the punctuation step earlier in this section strips every
# non-word, non-whitespace character, so there is probably no
# sentence-ending punctuation left and each list likely holds a single
# item — confirm.
data_page_rank["tokenizing"] = data_page_rank['ringkasan_Stopword'].apply(sent_tokenize)

In [None]:
# Render the DataFrame with the new 'tokenizing' column.
data_page_rank

## TF-IDF DATA

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# One space-joined string per summary, used as the TF-IDF corpus.
data_page_rank_text = data_page_rank['tokenizing'].apply(' '.join)

# Fit TF-IDF on the whole corpus and densify the result.
tfidf_vectorizer = TfidfVectorizer()
tfidf_sparse = tfidf_vectorizer.fit_transform(data_page_rank_text)
tfidf_matrix = tfidf_sparse.toarray()
terms = tfidf_vectorizer.get_feature_names_out()

# One column per term, with the source text inserted as the first column.
data_page_rank_tfidf = pd.DataFrame(tfidf_matrix, columns=terms)
data_page_rank_tfidf.insert(0, 'Ringkasan', data_page_rank_text)

data_page_rank_tfidf

## Menggabung TF-IDF dengan Label

In [None]:
# Append the 'Label' column to the right of the TF-IDF table (column-wise
# concat), so the label ends up as the last column.
tfidf_label = pd.concat([data_page_rank_tfidf,data_page_rank['Label']],axis=1)
tfidf_label

## Modeling

### Naive Bayes

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report

In [None]:
# Features: every column between the first (the summary text) and the last;
# target: the last column (the label).
# NOTE(review): the TF-IDF vectorizer was fitted on the full corpus before
# this split, so the test rows influenced the features — confirm whether
# that is acceptable for this evaluation.
y = tfidf_label.iloc[:, -1]
x = tfidf_label.iloc[:, 1:-1]

X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)

# Fit Gaussian Naive Bayes and score it on the held-out 20%.
naive_bayes = GaussianNB()
naive_bayes.fit(X_train, y_train)
y_pred = naive_bayes.predict(X_test)
accuracy_naiveBayes = accuracy_score(y_test, y_pred)

print(f'Akurasi model Naive Bayes: {accuracy_naiveBayes}')

### KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# k-nearest-neighbours with the classifier's default settings: fit on the
# training split, predict the held-out split, report accuracy.
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
accuracy_knn = accuracy_score(y_test, y_pred)

print(f'Akurasi model KNN: {accuracy_knn}')

### Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Initialise the Random Forest model. random_state is pinned to the same
# seed as the train/test split so repeated runs report the same accuracy;
# without it the forest is built from a fresh random state on every run.
rf = RandomForestClassifier(random_state=42)

# Train on the training split.
rf.fit(X_train, y_train)

# Predict the labels of the held-out split.
y_pred = rf.predict(X_test)

accuracy_rf = accuracy_score(y_test, y_pred)
print(f'Akurasi model Random Forest: {accuracy_rf}')

## Evaluasi Model

In [None]:
import matplotlib.pyplot as plt

# Bar chart comparing the test accuracy of the three classifiers.
hasil_akurasi = {
    'Naive Bayes': accuracy_naiveBayes,
    'KNN': accuracy_knn,
    'Random Forest': accuracy_rf,
}
models = list(hasil_akurasi)
accuracies = list(hasil_akurasi.values())

plt.bar(models, accuracies, color=['blue', 'green', 'red'])
plt.title('Akurasi Model Machine Learning')
plt.ylabel('Akurasi Model')
plt.ylim(0, 1)  # accuracy lies in the 0-1 range
plt.show()