In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import csr_matrix

# Location of the articles dataset; change this to point at your own CSV.
file_path = 'result.csv'

# Read the whole file into a DataFrame in one go.
df = pd.read_csv(filepath_or_buffer=file_path)

# Preprocessing function
def preprocess_text(df):
    """Return a copy of ``df`` with missing text fields replaced by defaults.

    Missing ``short_description`` and ``headline`` values become empty
    strings; missing ``category`` values become ``'Unknown'``. The input
    DataFrame is not modified, so callers that keep a reference to the raw
    data still see the original missing values.

    Args:
        df: DataFrame with ``short_description``, ``headline`` and
            ``category`` columns.

    Returns:
        A new DataFrame with the three columns filled.

    Raises:
        KeyError: If any of the three columns is absent from ``df``.
    """
    fill_values = {
        'short_description': '',
        'headline': '',
        'category': 'Unknown',  # Handle missing categories
    }
    # Work on a copy: the function returns its result, so it should not also
    # rewrite the caller's frame as a hidden side effect.
    cleaned = df.copy()
    for column, default in fill_values.items():
        cleaned[column] = cleaned[column].fillna(default)
    return cleaned

# Preprocess the fetched data; an empty file leaves nothing to recommend from.
if df.empty:
    raise ValueError("The DataFrame is empty. Please check the CSV file.")
df = preprocess_text(df)

# Prompt the user for input. Surrounding whitespace is stripped so that an
# accidental trailing space does not prevent a match.
user_category = input("Enter the category of articles you are interested in: ").strip()

# Filter the DataFrame based on the user input category.
# regex=False makes this a literal, case-insensitive substring match: the
# user's text is free-form input, and characters such as '(', '+' or '|'
# would otherwise be interpreted as a regular expression (raising re.error
# or matching unintended categories).
category_mask = df['category'].str.contains(
    user_category, case=False, na=False, regex=False
)
category_df = df[category_mask].copy()

if category_df.empty:
    raise ValueError(f"No articles found in the category '{user_category}'.")

# Combine the 'headline' and 'short_description' for TF-IDF vectorization
category_df['text'] = category_df['headline'] + " " + category_df['short_description']

# Reset the index so row labels are 0..n-1 and line up with the rows of the
# TF-IDF and similarity matrices built below.
category_df.reset_index(drop=True, inplace=True)

# Vectorize the data. fit_transform already returns a sparse matrix, so no
# additional csr_matrix conversion is needed.
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=10000)
tfidf_matrix = tfidf_vectorizer.fit_transform(category_df['text'])

# Compute pairwise cosine similarity between all articles in the category.
# NOTE: the result is a dense n x n array, so memory grows quadratically with
# the number of articles in the selected category.
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Define the recommendation function
def get_recommendations(df, cosine_sim, num_recommendations=5):
    """Map each article headline to the headlines of its most similar articles.

    Args:
        df: DataFrame with a ``headline`` column. Row *positions* (not index
            labels) must correspond to the rows/columns of ``cosine_sim``.
        cosine_sim: Square, row-indexable similarity matrix where
            ``cosine_sim[i][j]`` is the similarity between article ``i`` and
            article ``j``.
        num_recommendations: Maximum number of similar articles to return
            per article. Values below zero are treated as zero.

    Returns:
        dict mapping each headline to a list of up to ``num_recommendations``
        headlines, ordered from most to least similar. If several rows share
        the same headline, the entry for the last such row wins.
    """
    limit = max(num_recommendations, 0)
    headlines = df['headline'].tolist()

    recommendations = {}
    # Iterate by position rather than by index label so the function also
    # works when the DataFrame index is not a clean 0..n-1 range.
    for position, headline in enumerate(headlines):
        # Exclude the article itself explicitly. Relying on it being ranked
        # first is unsafe: duplicate articles tie with it, an all-zero TF-IDF
        # row has self-similarity 0, and float rounding can reorder near-ties.
        candidates = [
            (other, score)
            for other, score in enumerate(cosine_sim[position])
            if other != position
        ]

        # Stable sort: equally similar articles keep their original order.
        candidates.sort(key=lambda pair: pair[1], reverse=True)

        # Store the most similar articles
        recommendations[headline] = [
            headlines[other] for other, _ in candidates[:limit]
        ]

    return recommendations

# Build the headline -> similar-headlines mapping for the selected category.
recommendations = get_recommendations(category_df, cosine_sim)

# Report: one section per article, followed by a bullet for each suggestion.
print("Recommended Articles based on the category:")
for source_headline, similar_headlines in recommendations.items():
    section = [f"\nArticles similar to '{source_headline}':"]
    section.extend(f"  - {title}" for title in similar_headlines)
    print("\n".join(section))


Enter the category of articles you are interested in: entertainment
Recommended Articles based on the category:

Articles similar to 'Golden Globes Returning To NBC In January After Year Off-Air':
  - Idris Elba Reveals How He 'Hustled' His Way Onto Jay-Z's 'American Gangster' Album
  - Rock Legend Carlos Santana Collapses On Stage
  - Olivia Rodrigo Brings Out Lily Allen For 'F**k You' Duet Aimed At Anti-Abortion Justices
  - Crosby, Stills & Nash's Music Returns To Spotify After Joe Rogan Protest
  - Andrew Garfield Confirms Method Acting Is Possible Without 'Being An Asshole'

Articles similar to 'James Cameron Says He 'Clashed' With Studio Before 'Avatar' Release':
  - James Cameron Presents New Scenes From 'Avatar: The Way Of Water' At D23 Expo
  - 'The Flash' Movie Still Set For Release Despite Ezra Miller's Controversies
  - Jean-Luc Godard, Pioneering French Filmmaker, Dies
  - Viola Davis Feared A Heart Attack During 'The Woman King' Training
  - John Legend Says Kanye's Suppo