In [None]:
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [None]:
# Read the Indonesian news-title CSV into a DataFrame.
# The path is relative, so the file must sit in the notebook's working directory.
file_path = 'indonesian-news-title.csv'
news_data = pd.read_csv(filepath_or_buffer=file_path)

In [None]:
# Discard every row that has a missing value in any column, then preview the frame.
news_data = news_data.dropna(axis=0, how='any')
news_data

Unnamed: 0,date,url,title,category
0,02/26/2020,https://finance.detik.com/berita-ekonomi-bisni...,Kemnaker Awasi TKA di Meikarta,finance
1,02/26/2020,https://finance.detik.com/berita-ekonomi-bisni...,BNI Digitalkan BNI Java Jazz 2020,finance
2,02/26/2020,https://finance.detik.com/berita-ekonomi-bisni...,"Terbang ke Australia, Edhy Prabowo Mau Genjot ...",finance
3,02/26/2020,https://finance.detik.com/moneter/d-4916133/oj...,OJK Siapkan Stimulus Ekonomi Antisipasi Dampak...,finance
4,02/26/2020,https://finance.detik.com/berita-ekonomi-bisni...,Saran Buat Anies-RK yang Mangkir Rapat Banjir ...,finance
...,...,...,...,...
91012,02/03/2020,https://travel.detik.com/travel-news/d-4882807...,"Ketumpahan Air Panas di Pesawat, Kamu Bisa Tun...",travel
91013,02/03/2020,https://travel.detik.com/fototravel/d-4882796/...,Foto: Bali & 9 Destinasi Paling Instagramable ...,travel
91014,02/03/2020,https://travel.detik.com/travel-news/d-4882794...,Game Bikin Turis Ini Liburan ke Jepang untuk.....,travel
91015,02/03/2020,https://travel.detik.com/travel-news/d-4882792...,"Sekeluarga Didepak dari Pesawat, Maskapai Bila...",travel


In [None]:
# Function to clean text (replace punctuation with spaces, lowercase, normalize whitespace)
def clean_text(text):
    """Normalize a title or query string before TF-IDF vectorization.

    Every character that is neither a word character (letter, digit,
    underscore) nor whitespace is replaced by a space, the text is
    lowercased, and runs of whitespace are collapsed to a single space.

    Parameters
    ----------
    text : object
        The value to clean. Anything that is not a ``str`` (e.g. ``None``
        or a float NaN) is treated as missing.

    Returns
    -------
    str
        The cleaned text, or an empty string when ``text`` is not a string.
    """
    # Non-string input is mapped to an empty string rather than raising
    if not isinstance(text, str):
        return ''
    # Substitute a space (not an empty string) for punctuation so that
    # compounds such as "Anies-RK" stay two tokens instead of fusing into one
    text = re.sub(r'[^\w\s]', ' ', text).lower()
    # Collapse the extra whitespace left behind by the substitution
    return ' '.join(text.split())

# Preprocessing: store a normalized copy of every title in a new column
news_data['title_cleaned'] = news_data['title'].map(clean_text)

In [None]:
# Preview the frame, which now includes the 'title_cleaned' column
news_data

Unnamed: 0,date,url,title,category,title_cleaned
0,02/26/2020,https://finance.detik.com/berita-ekonomi-bisni...,Kemnaker Awasi TKA di Meikarta,finance,kemnaker awasi tka di meikarta
1,02/26/2020,https://finance.detik.com/berita-ekonomi-bisni...,BNI Digitalkan BNI Java Jazz 2020,finance,bni digitalkan bni java jazz 2020
2,02/26/2020,https://finance.detik.com/berita-ekonomi-bisni...,"Terbang ke Australia, Edhy Prabowo Mau Genjot ...",finance,terbang ke australia edhy prabowo mau genjot b...
3,02/26/2020,https://finance.detik.com/moneter/d-4916133/oj...,OJK Siapkan Stimulus Ekonomi Antisipasi Dampak...,finance,ojk siapkan stimulus ekonomi antisipasi dampak...
4,02/26/2020,https://finance.detik.com/berita-ekonomi-bisni...,Saran Buat Anies-RK yang Mangkir Rapat Banjir ...,finance,saran buat aniesrk yang mangkir rapat banjir d...
...,...,...,...,...,...
91012,02/03/2020,https://travel.detik.com/travel-news/d-4882807...,"Ketumpahan Air Panas di Pesawat, Kamu Bisa Tun...",travel,ketumpahan air panas di pesawat kamu bisa tunt...
91013,02/03/2020,https://travel.detik.com/fototravel/d-4882796/...,Foto: Bali & 9 Destinasi Paling Instagramable ...,travel,foto bali 9 destinasi paling instagramable ta...
91014,02/03/2020,https://travel.detik.com/travel-news/d-4882794...,Game Bikin Turis Ini Liburan ke Jepang untuk.....,travel,game bikin turis ini liburan ke jepang untuk c...
91015,02/03/2020,https://travel.detik.com/travel-news/d-4882792...,"Sekeluarga Didepak dari Pesawat, Maskapai Bila...",travel,sekeluarga didepak dari pesawat maskapai bilan...


In [None]:
# Fit a TF-IDF model on the cleaned titles.
# The fitted vectorizer is kept so queries can be transformed with the same vocabulary.
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(raw_documents=news_data['title_cleaned'])

In [None]:
# Function to perform the recommendation and show all sorted data with similarity scores
def recommend_news_full(keyword):
    """Rank every news title by TF-IDF cosine similarity to ``keyword``.

    The keyword is normalized with ``clean_text``, transformed with the
    already-fitted ``tfidf_vectorizer``, and compared against every row of
    ``tfidf_matrix``.

    Parameters
    ----------
    keyword : str
        Free-text query.

    Returns
    -------
    pandas.DataFrame
        A new frame with the columns ``date``, ``title``, ``url``,
        ``category`` and ``similarity``, containing every row of
        ``news_data`` ordered from most to least similar. Rows with equal
        scores keep their original relative order.
    """
    # Convert the keyword to a TF-IDF vector in the fitted vocabulary
    keyword_vector = tfidf_vectorizer.transform([clean_text(keyword)])

    # Calculate cosine similarity between the query and every title
    cosine_similarities = cosine_similarity(keyword_vector, tfidf_matrix).flatten()

    # Descending order; a stable sort keeps tied rows (e.g. the many 0.0
    # scores) in dataset order instead of an arbitrary one
    sorted_indices = np.argsort(-cosine_similarities, kind='stable')

    # Rows are selected by position (iloc) because the similarity array is
    # positional. ``assign`` returns a new frame, so no column is written
    # onto a derived slice (which would trigger SettingWithCopyWarning).
    return (
        news_data.iloc[sorted_indices][['date', 'title', 'url', 'category']]
        .assign(similarity=cosine_similarities[sorted_indices])
    )

In [None]:
# Run an example query and show the ranked result
sample_keyword = "ekonomi"
results = recommend_news_full(keyword=sample_keyword)
results

Unnamed: 0,date,title,url,category,similarity
13610,02/14/2020,Ekonomi RI Waspada Dampak Corona,https://finance.detik.com/berita-ekonomi-bisni...,finance,0.476884
7265,02/06/2020,5 Tahun Jokowi Ekonomi Mentok di 5%,https://finance.detik.com/energi/d-4887662/5-t...,finance,0.449729
9712,03/04/2020,"Diserang Corona, Ekonomi RI Bisa Bertahan?",https://finance.detik.com/berita-ekonomi-bisni...,finance,0.421157
112,02/26/2020,"Jakarta 'Tenggelam', Ekonomi Lumpuh",https://finance.detik.com/infografis/d-4914524...,finance,0.420515
12366,05/06/2020,Ekonomi RI 2020 Diprediksi Minus,https://finance.detik.com/berita-ekonomi-bisni...,finance,0.418661
...,...,...,...,...,...
30692,04/13/2020,Tolong! Anne Avantie Kesulitan Bahan untuk Bua...,https://hot.detik.com/celeb/d-4975164/tolong-a...,hot,0.000000
30691,04/13/2020,Titi DJ Lega Stephanie Poetri Bisa Pulang dari...,https://hot.detik.com/celeb/d-4975173/titi-dj-...,hot,0.000000
30690,04/13/2020,'The Green Hornet' dan 'Fortress 2' di Bioskop...,https://hot.detik.com/tv-news/d-4975170/the-gr...,hot,0.000000
30754,04/13/2020,Shailene Woodley Ngaku Pernah Sakit Fisik kare...,https://hot.detik.com/celeb/d-4974504/shailene...,hot,0.000000
