# Klasifikasi Berita dengan Metode KNN

In [3]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import re
from nltk.corpus import stopwords
import nltk
from nltk.tokenize import word_tokenize

In [4]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

# Memuat Data

In [6]:
from google.colab import drive
drive.mount('/content/drive')
data_path = '/content/drive/My Drive/Penambangan WEB/Data/Data_Berita_All_Kategori.csv'
df = pd.read_csv(data_path)


Mounted at /content/drive


# Case Folding
membuat huruf menjadi lower

In [11]:
df = df.astype(str)
df["Berita"] = df["Berita"].apply(lambda x: x.lower())
df

Unnamed: 0,Judul,Berita,Kategori,processed_berita
0,Hasil Persib Vs PSM Makassar 0-0: Gagal Menang...,hasil persib vs psm makassar 0-0: gagal menang...,bola,hasil persib vs psm makassar 0-0: gagal menang...
1,Manchester City Pantau Bintang Argentina yang ...,manchester city pantau bintang argentina yang ...,bola,manchester city pantau bintang argentina bersi...
2,"Sepak Terjang Paris Brunner, Sang Pemain Terba...","sepak terjang paris brunner, sang pemain terba...",bola,"sepak terjang paris brunner, sang pemain terba..."
3,"Musim Gemilang Astra Honda Racing Team, Domina...","musim gemilang astra honda racing team, domina...",bola,"musim gemilang astra honda racing team, domina..."
4,Pembelajaran dari Piala Dunia U17 2023 demi Se...,pembelajaran dari piala dunia u17 2023 demi se...,bola,pembelajaran piala dunia u17 2023 sepak bola t...
...,...,...,...,...
391,Fitur di Smartwatch Huawei Watch GT 4 Bisa Pan...,fitur di smartwatch huawei watch gt 4 bisa pan...,tekno,fitur smartwatch huawei watch gt 4 pantau pola...
392,"Hands-on Gelang Pintar Xiaomi Smart Band 8, Ga...","hands-on gelang pintar xiaomi smart band 8, ga...",tekno,"hands-on gelang pintar xiaomi smart band 8, ga..."
393,"Google Rayakan Ulang Tahun Ke-25, Ini Sejarah ...","google rayakan ulang tahun ke-25, ini sejarah ...",tekno,"google rayakan ulang ke-25, sejarah berdirinya..."
394,"Social Commerce dan E-commerce, Apa Bedanya? B...","social commerce dan e-commerce, apa bedanya? b...",tekno,"social commerce e-commerce, bedanya? penjelasa..."


# Tokenisasi

In [16]:
def process_tokenize(text):
    text = text.split()
    return text

df["processed_berita"] = df["Berita"].apply(process_tokenize)
print(df["processed_berita"])

0      [hasil, persib, vs, psm, makassar, 0-0:, gagal...
1      [manchester, city, pantau, bintang, argentina,...
2      [sepak, terjang, paris, brunner,, sang, pemain...
3      [musim, gemilang, astra, honda, racing, team,,...
4      [pembelajaran, dari, piala, dunia, u17, 2023, ...
                             ...                        
391    [fitur, di, smartwatch, huawei, watch, gt, 4, ...
392    [hands-on, gelang, pintar, xiaomi, smart, band...
393    [google, rayakan, ulang, tahun, ke-25,, ini, s...
394    [social, commerce, dan, e-commerce,, apa, beda...
395    [samsung, sudah, siapkan, galaxy, s25,, ini, b...
Name: processed_berita, Length: 396, dtype: object


# Punctuation Removal


In [17]:
def process_punctuation(tokens):
    cleaned_tokens = [re.sub(r'[.,():-]', '', token) for token in tokens]
    cleaned_tokens = [re.sub(r'\d+', '', token) for token in cleaned_tokens]
    return cleaned_tokens

df['processed_berita'] = df['processed_berita'].apply(process_punctuation)
print(df["processed_berita"])

0      [hasil, persib, vs, psm, makassar, , gagal, me...
1      [manchester, city, pantau, bintang, argentina,...
2      [sepak, terjang, paris, brunner, sang, pemain,...
3      [musim, gemilang, astra, honda, racing, team, ...
4      [pembelajaran, dari, piala, dunia, u, , demi, ...
                             ...                        
391    [fitur, di, smartwatch, huawei, watch, gt, , b...
392    [handson, gelang, pintar, xiaomi, smart, band,...
393    [google, rayakan, ulang, tahun, ke, ini, sejar...
394    [social, commerce, dan, ecommerce, apa, bedany...
395    [samsung, sudah, siapkan, galaxy, s, ini, bukt...
Name: processed_berita, Length: 396, dtype: object


# Stopword Removal


In [18]:
def process_stopword_token(tokens):
    stop_words = set(stopwords.words("indonesian"))
    filtered_tokens = [token for token in tokens if token.lower() not in stop_words]
    return " ".join(filtered_tokens)

df['processed_berita'] = df['processed_berita'].apply(process_stopword_token)
print(df["processed_berita"])

0      hasil persib vs psm makassar  gagal menang mau...
1      manchester city pantau bintang argentina bersi...
2      sepak terjang paris brunner sang pemain terbai...
3      musim gemilang astra honda racing team dominas...
4      pembelajaran piala dunia u  sepak bola tanah a...
                             ...                        
391    fitur smartwatch huawei watch gt  pantau pola ...
392    handson gelang pintar xiaomi smart band  ganti...
393    google rayakan ulang sejarah berdirinya tim re...
394    social commerce ecommerce bedanya? penjelasann...
395    samsung siapkan galaxy s buktinya tim redaksi ...
Name: processed_berita, Length: 396, dtype: object


# Membagi data menjadi set pelatihan dan pengujian

In [20]:
X_train, X_test, y_train, y_test = train_test_split(df['processed_berita'], df['Kategori'], test_size=0.2, random_state=42)

# TF-IDF Vectorization

In [26]:
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

feature_names = tfidf_vectorizer.get_feature_names_out()
feature_names

array(['aan', 'aap', 'abad', ..., 'zulfikar', 'zulkifli', 'zumpano'],
      dtype=object)