# **Ekstraksi Kata Kunci Berita**

## Import Library

In [None]:
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from IPython.display import display
import networkx as nx
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
import pandas as pd
import numpy as np
import nltk
import re

# Fetch the NLTK resources this notebook relies on: the stopword lists and the
# Punkt sentence tokenizer used by sent_tokenize. Recent NLTK releases load the
# tokenizer from the separate 'punkt_tab' package, so it is requested as well.
# NOTE(review): on an NLTK installation that does not know 'punkt_tab' the
# download is expected to just report a failure and continue - confirm.
for nltk_resource in ('stopwords', 'punkt', 'punkt_tab'):
  nltk.download(nltk_resource)

# NOTE(review): this silences every warning for the rest of the session,
# including deprecation notices from the libraries imported above.
warnings.filterwarnings('ignore')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Load Dataset

In [None]:
# Location of the CNBC news CSV (the path points into a Google Drive mount).
DATASET_PATH = '/content/drive/MyDrive/Kuliah /Semester 7/Pencarian Dan Penambangan Web/ppw/Dataset/Data Berita CNBC.csv'

# Read the CSV into a DataFrame and show it as the cell output.
df = pd.read_csv(DATASET_PATH)
df

Unnamed: 0,Url,Judul,Tanggal,Author,Artikel,Category
0,https://www.cnbcindonesia.com/tech/20231106180...,Astronom Yakin Alien Sudah Tahu di Bumi Ada Ma...,06 November 2023 19:50,"Intan Rakhmayanti Dewi, CNBC Indonesia","Jakarta, CNBC Indonesia - Astronom yakin alien...",Tech
1,https://www.cnbcindonesia.com/tech/20231106175...,Scarlett Johansson Ngamuk Wajah dan Suaranya D...,06 November 2023 19:00,"Intan Rakhmayanti Dewi, CNBC Indonesia","Jakarta, CNBC Indonesia - Aktrisnominasi Oscar...",Tech
2,https://www.cnbcindonesia.com/tech/20231106164...,"Dulu Bikin Ghozali Kaya Raya, Startup Ini Peca...",06 November 2023 18:20,"Novina Putri Bestari, CNBC Indonesia","Jakarta, CNBC Indonesia - Masih ingat Ghozali ...",Tech
3,https://www.cnbcindonesia.com/tech/20231106162...,"Jeff Bezos Pindah ke Rumah Rp 2,3 T Demi Pacar...",06 November 2023 17:40,"Novina Putri Bestari, CNBC Indonesia","Jakarta, CNBC Indonesia - Jeff Bezos memutuska...",Tech
4,https://www.cnbcindonesia.com/tech/20231106172...,BRIBRAIN Sukses Raih Penghargaan Ini di IDC Aw...,06 November 2023 17:33,"Khoirul Anam, CNBC Indonesia","Jakarta, CNBC Indonesia- PT Bank Rakyat Indone...",Tech
...,...,...,...,...,...,...
1495,https://www.cnbcindonesia.com/entrepreneur/202...,"Bisnis Kopi Meluas, Kompetisi Barista dan Latt...",03 April 2023 12:41,"Zefanya Aprilia, CNBC Indonesia","Jakarta, CNBCIndonesia -Roadshow Bezzera Latte...",Entrepreneur
1496,https://www.cnbcindonesia.com/entrepreneur/202...,"Rahusna, Alumni ITB Penemu Mesin Pemilah Sampa...",03 April 2023 12:33,"Mentari Puspadini, CNBC Indonesia","Jakarta, CNBC Indonesia - Hingga saat ini, per...",Entrepreneur
1497,https://www.cnbcindonesia.com/entrepreneur/202...,Pameran Barang Mewah Terbesar akan Hadir di RI...,31 March 2023 12:45,"Zefanya Aprilia, CNBC Indonesia","Jakarta, CNBCIndonesia -Indonesia akan menyele...",Entrepreneur
1498,https://www.cnbcindonesia.com/entrepreneur/202...,SETC & INOTEK Kerja Sama Gelar Pelatihan UMKM ...,31 March 2023 09:30,"Teti Purwanti, CNBC Indonesia","Jakarta, CNBC Indonesia - Sampoerna Entreprene...",Entrepreneur


## Cek NULL Value

In [None]:
# Count the missing values in each column (isna is the same check as isnull).
df.isna().sum()

Url         0
Judul       0
Tanggal     0
Author      0
Artikel     0
Category    0
dtype: int64

## Cleaning Data

In [None]:
def cleaning(text):
  """Normalise one article string.

  Drops every character that is not a word character, whitespace or one of
  . ? ! , then trims the surrounding whitespace and lower-cases the result.
  Removed characters are deleted outright, not replaced by a space.
  """
  unwanted_chars = re.compile(r'[^\w\s.?!,]')
  kept = unwanted_chars.sub('', text)
  trimmed = kept.strip()
  return trimmed.lower()

# Run the cleaner over every article body, store the result in a new column
# and show that column as the cell output.
df['Cleaning'] = df['Artikel'].map(cleaning)
df['Cleaning']

0       jakarta, cnbc indonesia  astronom yakin alien ...
1       jakarta, cnbc indonesia  aktrisnominasi oscar,...
2       jakarta, cnbc indonesia  masih ingat ghozali y...
3       jakarta, cnbc indonesia  jeff bezos memutuskan...
4       jakarta, cnbc indonesia pt bank rakyat indones...
                              ...                        
1495    jakarta, cnbcindonesia roadshow bezzera latte ...
1496    jakarta, cnbc indonesia  hingga saat ini, perm...
1497    jakarta, cnbcindonesia indonesia akan menyelen...
1498    jakarta, cnbc indonesia  sampoerna entrepreneu...
1499    jakarta, cnbc indonesia  masalah postur tubuh ...
Name: Cleaning, Length: 1500, dtype: object

## Stopword Removal

In [None]:
# Indonesian stopword list shipped with NLTK (kept as a list under its
# original name).
corpus = stopwords.words('indonesian')

# The membership test below runs once per word of every article, so look the
# words up in a hashed set instead of scanning the list each time.
_stopword_lookup = frozenset(corpus)

def stopwordText(text):
  """Return `text` with every stopword token removed.

  The text is split on whitespace, tokens found in the stopword list are
  dropped, and the survivors are re-joined with single spaces. Tokens are
  compared exactly as they appear, so a stopword with punctuation attached
  (for example "ini,") is not recognised and stays in the text.
  """
  return ' '.join(word for word in text.split() if word not in _stopword_lookup)

# Apply the filter to every cleaned article.
df['Stopword Removal'] = df['Cleaning'].apply(stopwordText)

## Tokenizing

In [None]:
def tokenizer(text):
  """Lower-case `text` and split it into a list of sentences with NLTK."""
  lowered = text.lower()
  sentences = sent_tokenize(lowered)
  return sentences

# Sentence-split every stopword-filtered article and show the new column.
df['Tokenizing'] = df['Stopword Removal'].apply(tokenizer)
df['Tokenizing']

0       [jakarta, cnbc indonesia astronom alien bumi k...
1       [jakarta, cnbc indonesia aktrisnominasi oscar,...
2       [jakarta, cnbc indonesia ghozali mendadak kaya...
3       [jakarta, cnbc indonesia jeff bezos memutuskan...
4       [jakarta, cnbc indonesia pt bank rakyat indone...
                              ...                        
1495    [jakarta, cnbcindonesia roadshow bezzera latte...
1496    [jakarta, cnbc indonesia ini, permasalahan sam...
1497    [jakarta, cnbcindonesia indonesia menyelenggar...
1498    [jakarta, cnbc indonesia sampoerna entrepreneu...
1499    [jakarta, cnbc indonesia postur tubuh tulang b...
Name: Tokenizing, Length: 1500, dtype: object

## Fungsi Plot Graph

In [None]:
def plot_graph(G, figsize=(35, 30), node_size=700, node_color='skyblue', seed=None):
  """Draw graph `G` with node labels and red edge-weight labels.

  Args:
    G: networkx graph; the 'weight' attribute of its edges is used as the
      edge label.
    figsize: (width, height) of the matplotlib figure.
    node_size: node marker size forwarded to `nx.draw`.
    node_color: node colour forwarded to `nx.draw`.
    seed: optional seed for the spring layout; pass an int to get the same
      node placement on every run.
  """
  # Create the canvas with the requested size before anything is drawn.
  plt.figure(figsize=figsize)

  # Node positions and the weight label of every edge.
  pos = nx.spring_layout(G, seed=seed)
  edge_weights = nx.get_edge_attributes(G, 'weight')

  # Honour the caller's styling arguments instead of fixed literals.
  nx.draw(G, pos, with_labels=True, node_size=node_size, node_color=node_color)
  nx.draw_networkx_edge_labels(G, pos, edge_labels=edge_weights, font_color='red')
  plt.show()

## Fungsi Membangun Graph dengan Co-occurrence Matriks

In [None]:
def graph_co_occurrence(x, threshold=0, show_matrics = False):
    """Build a word co-occurrence graph from a list of sentences.

    Each sentence becomes a bag-of-words count vector. Multiplying the
    transposed count matrix by the count matrix gives, for every pair of
    words, the sum over sentences of the product of their counts.

    Args:
      x: iterable of sentence strings.
      threshold: a word pair becomes an edge only when its co-occurrence
        value is strictly greater than this.
      show_matrics: when True, display the co-occurrence matrix as a
        DataFrame labelled with the words on both axes.

    Returns:
      nx.DiGraph whose nodes are words. Every retained pair is linked in both
      directions, each edge carrying the co-occurrence value as 'weight'.
    """
    vectorizer = CountVectorizer()
    count_matrix = vectorizer.fit_transform(x)
    co_occurrence_matrix = count_matrix.T.dot(count_matrix).toarray()

    # Fetch the vocabulary once; it labels the matrix and names every node.
    features = vectorizer.get_feature_names_out()

    if show_matrics:
      display(pd.DataFrame(co_occurrence_matrix, index=features, columns=features))

    G = nx.DiGraph()

    # The matrix is symmetric, so walking the upper triangle visits each pair
    # exactly once. Co-occurrence has no direction, so both (a, b) and (b, a)
    # are added: with only the i < j edge every link would point towards the
    # word with the higher vocabulary index and PageRank would accumulate
    # score on the words at the end of the vocabulary.
    n_words = len(features)
    for i in range(n_words):
      for j in range(i + 1, n_words):
        weight = co_occurrence_matrix[i, j]
        if weight > threshold:
          G.add_edge(features[i], features[j], weight=weight)
          G.add_edge(features[j], features[i], weight=weight)
    return G

## Fungsi Ekstrak Kata Teratas

In [None]:
def extract_top_words(x, w=None, threshold=0, show_matrics=False, show_scores=False, index=None):
    """Rank the words of one document with PageRank and pick the top `w`.

    Args:
      x: list of sentence strings making up the document.
      w: number of top-ranked words to select; None selects nothing.
      threshold: forwarded to `graph_co_occurrence`.
      show_matrics: forwarded to `graph_co_occurrence`.
      show_scores: when True, print the document text, the selected keywords
        and the score of every ranked word.
      index: document number, used only in the printed report.

    Returns:
      (G, selected_words): the co-occurrence graph and the selected words in
      alphabetical order, or None in place of the list when `w` is None.
    """
    G = graph_co_occurrence(x, threshold, show_matrics)

    # PageRank over the co-occurrence graph (TextRank).
    scores = nx.pagerank(G)

    # Rank the graph's own nodes. Looking scores up by whitespace-split tokens
    # of the raw text misses every word that only occurs next to punctuation
    # ("ai." is not the node "ai") and pads the ranking with punctuated tokens
    # scored 0. Highest score first; ties fall back to reverse word order.
    ranked_words = sorted(
        ((score, str(word)) for word, score in scores.items()),
        reverse=True,
    )

    # Keep the w best words and present them alphabetically.
    selected_words = sorted(word for _, word in ranked_words[:w]) if w is not None else None

    if show_scores:
      # The report strings are only needed here, so build them lazily.
      full_text = ' '.join(x)
      keywords = ', '.join(selected_words) if selected_words else ''
      print(f'Dokumen ke {index} : {full_text}')
      print(f'{w} Kata Kunci : {keywords}')
      print("TextRank Scores:")
      for score, word in ranked_words:
          print(f"Skor: {score}, Kata: {word}")

    return (G, selected_words)

## Fungsi Ekstrak Kata Kunci Ke Semua Berita

In [None]:
def extract_all(x, w=4, plot=False, show_matrics=False, show_scores=False, index=1, threshold=0):
  """Extract the keywords of one document as a comma-separated string.

  Delegates the ranking to `extract_top_words`, optionally plots the
  co-occurrence graph, and joins the selected words with ', '. Returns an
  empty string when no words were selected.
  """
  graph, top_words = extract_top_words(x, w, threshold, show_matrics, show_scores, index)

  if plot:
    print()
    print("Plot Graph :")
    plot_graph(graph)

  # Nothing selected (None or an empty list) yields an empty keyword string.
  if not top_words:
    return ''
  return ', '.join(top_words)

## Contoh Penggunaan 1 Data

### Ambil 1 Data

In [None]:
# Take the sentence list at positional row 1 as a worked example and show it.
example = df["Tokenizing"].iat[1]
example

['jakarta, cnbc indonesia aktrisnominasi oscar, scarlett johansson, menempuh jalur hukum wajah suaranya ditiru ai.',
 'marah menuntut pengembang aplikasi ai kemiripan wajah suara iklan izin.',
 'iklan berdurasi 22 detik mempromosikan editor gambar ai bernama lisa ai 90s yearbook avatar, dilaporkan versi suara gambar johansson ai.',
 'iklan menampilkan klip asli johansson layar black widow, bertuliskan temanteman?',
 'scarlett denganku.... beralih foto dihasilkan ai versi kloning suaranya mempromosikan aplikasi ai.',
 'tayangan cetakan gambar lisa ai.',
 'hubungannya orang ini.',
 'gugatan hukum dilayangkan, iklan muncul twitter.',
 'aplikasi lisa ai convert software app store google play.',
 'pengacara johansson, kecin yorn, kliennya menangani situasi sesuai kapasitas hukum menganggap enteng perkara tiruan teknologi ai.',
 'tindakan lakukan situasi ini, menanganinya upaya hukum miliki, dikutip engadget, senin 6112023. johansson memiliki salah wajah suara terkenal hollywood.',
 'juru bi

### Ekstrak Keywords

In [None]:
# Extract 3 keywords from the example, also showing the co-occurrence matrix,
# the score report and the graph plot.
keyword = extract_all(
    example,
    w=3,
    plot=True,
    show_matrics=True,
    show_scores=True,
    index=1,
)

Output hidden; open in https://colab.research.google.com to view.

### Keywords yang Dihasilkan

In [None]:
# Show the comma-separated keyword string produced for the example document.
keyword

'wajah, wilayah, yearbook'

## Ekstrak Keyword di semua Berita

In [None]:
# Extract 3 keywords for every article. The per-document score report is left
# off here: printing the full text and score table of every row floods the
# cell output until the notebook truncates it.
df["Kata Kunci"] = [extract_all(sentences, w=3) for sentences in df["Tokenizing"]]

[1;30;43mOutput streaming akan dipotong hingga 5000 baris terakhir.[0m
Skor: 0.0012182492216961057, Kata: 24
Skor: 0.0012182492216961057, Kata: 2021
Skor: 0.0012182492216961057, Kata: 12
Skor: 0, Kata: usaha.
Skor: 0, Kata: upgrade,
Skor: 0, Kata: ungkapnya.
Skor: 0, Kata: umumnya.
Skor: 0, Kata: toraja,
Skor: 0, Kata: tinggi.
Skor: 0, Kata: tinggal.
Skor: 0, Kata: tertulis,
Skor: 0, Kata: tersebut.
Skor: 0, Kata: tersebut,
Skor: 0, Kata: terbaik.
Skor: 0, Kata: sunarso.
Skor: 0, Kata: sub.
Skor: 0, Kata: selesai.
Skor: 0, Kata: selatan,
Skor: 0, Kata: selanjutnya,
Skor: 0, Kata: sekolahnya,
Skor: 0, Kata: sekolah.
Skor: 0, Kata: sekolah,
Skor: 0, Kata: sejahtera,
Skor: 0, Kata: saya,
Skor: 0, Kata: sabariah.
Skor: 0, Kata: rumah.
Skor: 0, Kata: rentenir.
Skor: 0, Kata: ratulangi,
Skor: 0, Kata: permodalan,
Skor: 0, Kata: pegadaian.
Skor: 0, Kata: pegadaian,
Skor: 0, Kata: nasional.
Skor: 0, Kata: namun,
Skor: 0, Kata: miliknya.
Skor: 0, Kata: mikro.
Skor: 0, Kata: mikro,
Skor: 0, Ka

## Membuat DataFrame Baru dengan 3 Kolom (Artikel, Kata Kunci, dan Category)

In [None]:
# Keep only the article text, its extracted keywords and its category, then
# show the resulting frame.
df_KataKunci = df.loc[:, ['Artikel', 'Kata Kunci', 'Category']]
df_KataKunci

Unnamed: 0,Artikel,Kata Kunci,Category
0,"Jakarta, CNBC Indonesia - Astronom yakin alien...","sinyal, university, zaman",Tech
1,"Jakarta, CNBC Indonesia - Aktrisnominasi Oscar...","wajah, wilayah, yearbook",Tech
2,"Jakarta, CNBC Indonesia - Masih ingat Ghozali ...","signifikan, tim, versi",Tech
3,"Jakarta, CNBC Indonesia - Jeff Bezos memutuska...","seattle, tinggal, tuanya",Tech
4,"Jakarta, CNBC Indonesia- PT Bank Rakyat Indone...","tenaga, tingkat, usaha",Tech
...,...,...,...
1495,"Jakarta, CNBCIndonesia -Roadshow Bezzera Latte...","ticket, visual, wedana",Entrepreneur
1496,"Jakarta, CNBC Indonesia - Hingga saat ini, per...","sampah, wealth, zero",Entrepreneur
1497,"Jakarta, CNBCIndonesia -Indonesia akan menyele...","things, tujuan, watch",Entrepreneur
1498,"Jakarta, CNBC Indonesia - Sampoerna Entreprene...","umkm, uui, yayasan",Entrepreneur


## Save DataFrame Baru

In [None]:
# Write the keyword table to a CSV file (relative path, so it lands in the
# current working directory).
output_csv = 'Ekstraksi Kata Kunci.csv'
df_KataKunci.to_csv(output_csv)

## Deployment

[teks link](https://)