 # import required libraries

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import tkinter as tk
from tkinter.scrolledtext import ScrolledText
from datasets import load_dataset

# load the news-data dataset and keep its train split as a DataFrame

In [2]:
# Fetch the news dataset from the Hugging Face Hub.
dataset = load_dataset("okite97/news-data")
# Convert the train split with the dataset's own exporter instead of handing the
# split to the pd.DataFrame constructor, which has to iterate it row by row.
news_df = dataset['train'].to_pandas()

# data cleaning

In [3]:
# Keep only the articles that have both a headline and an excerpt.
news_df = news_df[news_df[['Title', 'Excerpt']].notna().all(axis=1)]
# Show the first rows as a quick sanity check of the cleaned frame.
print(news_df.head(5))

                                               Title  \
0  Uefa Opens Proceedings against Barcelona, Juve...   
1  Amazon Blames Inflation as It Increases Cost o...   
2  Nigeria’s Parliament Passes Amended Electoral ...   
3  Nigeria: Lagos Governor Tests Positive for Cov...   
4  South Africa Calls For Calm as Electoral Refor...   

                                             Excerpt  Category  
0  Uefa has opened disciplinary proceedings again...    sports  
1  The increases are steeper than the 17 percent ...  business  
2  Nigeria's Senate on Tuesday passed the harmoni...  politics  
3  The Lagos State Governor, Mr. Babajide Sanwo-O...    health  
4  South Africa has raised concerns about the det...  politics  


# text vectorizer

In [4]:
# One document per article: the headline and the excerpt joined by a space.
news_corpus = news_df['Title'].str.cat(news_df['Excerpt'], sep=' ')

# TF-IDF features; terms found in more than half of the documents or in fewer
# than two documents are dropped, as are English stop words.
text_vectorizer = TfidfVectorizer(
    stop_words='english',
    min_df=2,
    max_df=0.5,
    use_idf=True,
)
tfidf_news = text_vectorizer.fit_transform(news_corpus)

# create cluster

In [5]:
# Number of clusters to partition the articles into.
n_clusters = 3
# n_init is pinned explicitly: scikit-learn changed its default from 10 to
# 'auto', so leaving it implicit makes the fitted clusters depend on the
# installed version even with a fixed random_state.
news_clusters = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
news_clusters.fit(tfidf_news)

KMeans(n_clusters=3, random_state=42)

# predict cluster

In [6]:
def predict_news_cluster(news_text):
    """Assign a piece of news text to a cluster and name that cluster.

    The text is vectorised with the fitted ``text_vectorizer`` and passed to
    the fitted ``news_clusters`` model. The cluster is then labelled with the
    most frequent ``Category`` among the training articles that the model
    placed in the same cluster.

    Args:
        news_text: Raw text of the article (headline and/or body).

    Returns:
        A ``(predicted_cluster, predicted_category)`` tuple.
        ``predicted_category`` is ``None`` when no labelled training article
        belongs to the predicted cluster.
    """
    input_vector = text_vectorizer.transform([news_text])
    predicted_cluster = news_clusters.predict(input_vector)[0]

    # A cluster id is NOT a row position: look at the training rows that were
    # assigned to this cluster (labels_ is in the same order as the rows of
    # news_df used for fitting) and take their majority category.
    in_cluster = news_clusters.labels_ == predicted_cluster
    category_modes = news_df.loc[in_cluster, 'Category'].mode()
    predicted_category = category_modes.iloc[0] if not category_modes.empty else None
    return predicted_cluster, predicted_category

# cluster prediction

In [7]:
# Main application window.
window = tk.Tk()
window.title("News Clustering")
window.minsize(600, 400)
# Give the text-box cell all surplus space. Without a non-zero weight the
# "nsew" sticky on the text box has nothing to stretch into on resize.
window.rowconfigure(0, weight=1)
window.columnconfigure(0, weight=1)

# Scrollable input area where the user pastes the news text.
news_textbox = ScrolledText(window)
news_textbox.grid(row=0, column=0, padx=5, pady=5, sticky="nsew")

# Result labels; their text is replaced after each prediction.
predicted_cluster_label = tk.Label(window, text="Predicted Cluster:")
predicted_cluster_label.grid(row=1, column=0, padx=5, pady=5, sticky="w")

predicted_category_label = tk.Label(window, text="Predicted Category:")
predicted_category_label.grid(row=2, column=0, padx=5, pady=5, sticky="w")

def predict_and_display():
    """Predict the cluster for the text in the text box and show the result.

    Reads the text box, and if it contains anything other than whitespace,
    runs ``predict_news_cluster`` on it, updates both result labels and
    prints the result. When the box is blank the labels are reset to their
    initial captions and no prediction is made.
    """
    # "end-1c" excludes the trailing newline Tk always appends to a Text widget.
    news_text = news_textbox.get("1.0", "end-1c").strip()
    if not news_text:
        # Blank input would still be mapped to some cluster and shown as if it
        # were a real prediction, so report nothing instead.
        predicted_cluster_label.config(text="Predicted Cluster:")
        predicted_category_label.config(text="Predicted Category:")
        return

    predicted_cluster, predicted_category = predict_news_cluster(news_text)
    predicted_cluster_label.config(text=f"Predicted Cluster: {predicted_cluster}")
    predicted_category_label.config(text=f"Predicted Category: {predicted_category}")

    print("Predicted Cluster number :", predicted_cluster)
    print("Predicted Cluster Category:", predicted_category)
      
# Tkinter GUI components
# Button that runs a prediction on whatever is currently in the text box.
predict_button = tk.Button(
    master=window,
    command=predict_and_display,
    text="Predict Cluster",
)
predict_button.grid(column=0, row=3, pady=5, padx=5)

# Start the Tk event loop for the window.
window.mainloop()

Predicted Cluster number : 0
Predicted Cluster Category: sports
Predicted Cluster number : 1
Predicted Cluster Category: business
