# **DATA FETCHING**

In [1]:
import requests

def fetch_news(api_key):
    url = f'https://newsapi.org/v2/top-headlines?country=us&apiKey={api_key}'
    response = requests.get(url)
    data = response.json()

    articles = []
    for article in data['articles']:
        articles.append({'title': article['title'], 'content': article['description']})

    return articles

# Insert your API key here
api_key = '2bb8b3db5aa248228b5b197119330502'
articles = fetch_news(api_key)


In [2]:
print(articles)

[{'title': 'Family of Corey Comperatore, Trump rallygoer shot dead, struggles with loss - BBC.com', 'content': 'Corey Comperatore was killed in the Trump rally assassination attempt. His wife is furious at the security failures that led to his death.'}, {'title': 'EU executive to adopt tariffs on Chinese EVs after vote - Reuters', 'content': "The European Union will press ahead with hefty tariffs on China-made electric vehicles, the EU executive said on Friday, even after the bloc's largest economy Germany rejected them, exposing a rift over its biggest trade row with Beijing in a decade."}, {'title': 'ChatGPT’s ‘canvas’ interface makes it easier to write and code - The Verge', 'content': 'OpenAI has launched a new “canvas” interface for ChatGPT that allows users to adjust sections of text or code generated by the chatbot without a full rewrite.'}, {'title': 'Rocket Report: Falcon 9 second stage stumbles; Japanese rocket nears the end - Ars Technica', 'content': '“I’m pretty darn confi

# **DATA PREPROCESSING**

In [3]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    words = word_tokenize(text.lower())  # Tokenization
    words = [lemmatizer.lemmatize(word) for word in words if word.isalpha()]  # Lemmatization
    words = [word for word in words if word not in stop_words]  # Remove stopwords
    return ' '.join(words)

preprocessed_articles = [preprocess_text(article['content']) for article in articles]


[nltk_data] Error loading punkt: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:1000)>
[nltk_data] Error loading stopwords: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:1000)>
[nltk_data] Error loading wordnet: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:1000)>


LookupError: 
**********************************************************************
  Resource [93mstopwords[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('stopwords')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mcorpora/stopwords[0m

  Searched in:
    - '/Users/bhavya/nltk_data'
    - '/Users/bhavya/Desktop/NLP PROJECT/NEWS RECOMMENDATION/venv/nltk_data'
    - '/Users/bhavya/Desktop/NLP PROJECT/NEWS RECOMMENDATION/venv/share/nltk_data'
    - '/Users/bhavya/Desktop/NLP PROJECT/NEWS RECOMMENDATION/venv/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
**********************************************************************


In [4]:
print(preprocessed_articles)

['corey comperatore wa killed trump rally assassination attempt wife furious security failure led death', 'european union press ahead hefty tariff electric vehicle eu executive said friday even bloc largest economy germany rejected exposing rift biggest trade row beijing decade', 'openai ha launched new canvas interface chatgpt allows user adjust section text code generated chatbot without full rewrite', 'pretty darn confident going good day', 'removed', 'striker celebrating tentative agreement raise pay next six year', 'economy added job september sign labor market robustly healthy unemployment rate fell follow along live update stock bond market including dow jones industrial average p nasdaq', 'former president barack obama plan commence campaign sprint vice president kamala harris next week pennsylvania adviser democratic presidential nominee campaign said hoping star power among democrat help propel', 'follow florida today space team live update morning ula vulcan mission launch c

# **Feature Extraction**

TF-IDF:

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_features=1000)
X_tfidf = vectorizer.fit_transform(preprocessed_articles)


Word2Vec:

In [6]:
from gensim.models import Word2Vec

# Tokenize the text for Word2Vec
tokenized_articles = [article.split() for article in preprocessed_articles]
word2vec_model = Word2Vec(tokenized_articles, vector_size=100, window=5, min_count=2, workers=4)


# **Topic Modelling using LDA**

In [9]:
from sklearn.decomposition import LatentDirichletAllocation

lda = LatentDirichletAllocation(n_components=10, random_state=42)
lda_topics = lda.fit_transform(X_tfidf)

# Display the topics
for idx, topic in enumerate(lda.components_):
    print(f"Topic {idx}:")
    print([vectorizer.get_feature_names_out()[i] for i in topic.argsort()[-10:]])


Topic 0:
['expert', 'overt', 'order', 'doj', 'close', 'case', 'political', 'court', 'policy', 'advises']
Topic 1:
['said', 'next', 'celebrating', 'striker', 'six', 'raise', 'agreement', 'pay', 'tentative', 'year']
Topic 2:
['economist', 'significantly', 'exceeding', 'report', 'expectation', 'broader', 'nonfarm', 'payroll', 'campaign', 'president']
Topic 3:
['healthy', 'industrial', 'bond', 'stock', 'nasdaq', 'labor', 'unemployment', 'job', 'including', 'market']
Topic 4:
['launch', 'today', 'mission', 'ula', 'force', 'death', 'space', 'confirmed', 'far', 'least']
Topic 5:
['also', 'said', 'weirdest', 'cnn', 'anchor', 'eventually', 'promo', 'ever', 'seen', 'beirut']
Topic 6:
['economy', 'next', 'said', 'pretty', 'day', 'good', 'going', 'darn', 'confident', 'removed']
Topic 7:
['israel', 'zone', 'buffer', 'warned', 'evacuate', 'community', 'came', 'outside', 'united', 'fantasy']
Topic 8:
['corey', 'assassination', 'comperatore', 'rally', 'failure', 'attempt', 'killed', 'furious', 'led', 

In [11]:
print("Interaction matrix shape:", interaction_matrix.shape)


Interaction matrix shape: (3, 4)


In [12]:
import numpy as np
import pandas as pd

# Example User-Article interaction matrix
data = {'user_id': [1, 1, 2, 2, 3, 3], 'article_id': [101, 102, 101, 103, 102, 104], 'interaction': [1, 1, 1, 1, 1, 1]}
df_interactions = pd.DataFrame(data)

# Create user-item interaction matrix
interaction_matrix = df_interactions.pivot(index='user_id', columns='article_id', values='interaction').fillna(0).values

# Apply TruncatedSVD for Matrix Factorization
from sklearn.decomposition import TruncatedSVD

n_components = min(3, interaction_matrix.shape[1])  # Setting n_components to 3
svd = TruncatedSVD(n_components=n_components, random_state=42)
collaborative_features = svd.fit_transform(interaction_matrix)



In [13]:
from sklearn.metrics.pairwise import cosine_similarity

# Compute the cosine similarity between the articles
content_similarity = cosine_similarity(X_tfidf)


In [14]:
def hybrid_recommendation(user_id, user_articles, collaborative_features, content_similarity):
    # Collaborative Filtering: Based on user similarity
    collaborative_recommendations = collaborative_features[user_id]

    # Content-Based Filtering: Based on article similarity
    content_recommendations = np.mean([content_similarity[article] for article in user_articles], axis=0)

    # Fusion Layer: Combine both collaborative and content-based filtering
    final_recommendations = (collaborative_recommendations + content_recommendations) / 2
    return np.argsort(final_recommendations)[::-1]


In [15]:
from sklearn.metrics import precision_score, recall_score

def evaluate_recommendation(true_labels, predicted_labels):
    precision = precision_score(true_labels, predicted_labels, average='weighted')
    recall = recall_score(true_labels, predicted_labels, average='weighted')
    return precision, recall

# Example usage:
true_labels = [1, 0, 1, 0]  # True interaction labels
predicted_labels = [1, 1, 1, 0]  # Predicted labels from recommendation system

precision, recall = evaluate_recommendation(true_labels, predicted_labels)
print(f"Precision: {precision}, Recall: {recall}")


Precision: 0.8333333333333333, Recall: 0.75


In [18]:
from flask import Flask, request, jsonify

app = Flask(__name__)

@app.route('/recommend', methods=['POST'])
def recommend():
    user_data = request.json
    user_id = user_data['user_id']
    user_articles = user_data['articles']

    recommendations = hybrid_recommendation(user_id, user_articles, collaborative_features, content_similarity)
    return jsonify({'recommendations': recommendations.tolist()})

if __name__ == '__main__':
    app.run(debug=True)


 * Serving Flask app '__main__'
 * Debug mode: on


 * Running on http://127.0.0.1:5000
INFO:werkzeug:[33mPress CTRL+C to quit[0m
INFO:werkzeug: * Restarting with stat


In [None]:
import streamlit as st

st.title("Personalized News Recommendation System")

user_input = st.text_input("Enter your user ID:")
if st.button('Get Recommendations'):
    user_id = int(user_input)
    user_articles = [101, 102]  # Example: Articles the user has interacted with

    recommendations = hybrid_recommendation(user_id, user_articles, collaborative_features, content_similarity)
    st.write(f"Recommended Articles: {recommendations[:5]}")
