<a href="https://colab.research.google.com/github/ayush-dudhani/Web-Mining/blob/main/webmining.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import requests
from bs4 import BeautifulSoup
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import json
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import LabelEncoder

In [None]:
def scrape_webpage(url, timeout=10):
    """Fetch ``url`` and return the text of its story headline.

    The headline is taken from the first ``<h1>`` element carrying the CSS
    class ``native_story_title``.

    Args:
        url: Address of the article page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on an unresponsive
            host.

    Returns:
        The headline text, or ``None`` when the request fails, the response
        status is not 200, or no matching ``<h1>`` element is present.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.exceptions.RequestException:
        # Connection errors, timeouts, invalid URLs, etc. are reported the
        # same way as any other failure: by returning None.
        return None

    if response.status_code != 200:
        return None

    soup = BeautifulSoup(response.text, 'html.parser')
    h1_element = soup.find('h1', {'class': 'native_story_title'})
    if h1_element is None:
        return None
    return h1_element.get_text()

In [None]:
def analyze_sentiment(text):
    """Return the VADER polarity scores for ``text``.

    A new ``SentimentIntensityAnalyzer`` is created on each call and the
    result of its ``polarity_scores`` method is handed back unchanged.
    """
    return SentimentIntensityAnalyzer().polarity_scores(text)

In [None]:
def get_sentiment_label(compound_score):
    """Translate a compound sentiment score into a text label.

    Returns ``'Positive'`` for scores of 0.05 or more, ``'Negative'`` for
    scores of -0.05 or less, and ``'Neutral'`` for everything else.
    """
    if compound_score >= 0.05:
        return 'Positive'
    # Anything that is not clearly negative falls through to neutral.
    return 'Negative' if compound_score <= -0.05 else 'Neutral'

In [None]:
def read_json_data(file_path):
    """Load ``(headline, category)`` pairs from a JSON Lines file.

    Each non-blank line of the file is parsed as one JSON object. Blank or
    whitespace-only lines (for example a trailing newline at the end of the
    file) are skipped instead of being passed to ``json.loads``, which would
    raise on them.

    Args:
        file_path: Path of the file to read. It is opened as UTF-8 so the
            result does not depend on the platform's default encoding.

    Returns:
        A list of ``(headline, category)`` tuples in file order. A key that
        is missing from a record is replaced by an empty string.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    data = []
    with open(file_path, 'r', encoding='utf-8') as json_file:
        for line in json_file:
            line = line.strip()
            if not line:
                continue
            entry = json.loads(line)
            data.append((entry.get('headline', ''), entry.get('category', '')))
    return data

In [None]:
def classify_headline(headline, tfidf_vectorizer, classifier, label_encoder=None):
    """Predict the category label for a single headline.

    Args:
        headline: Headline text to classify.
        tfidf_vectorizer: Fitted vectorizer; its ``transform`` method is
            called with a one-element list containing ``headline``.
        classifier: Fitted model; its ``predict`` method is called with the
            vectorizer output.
        label_encoder: Encoder whose ``inverse_transform`` maps the predicted
            values back to category names. When omitted, the module-level
            ``label_encoder`` variable is used, which keeps existing
            three-argument calls working.

    Returns:
        The first element of the decoded prediction.

    Raises:
        ValueError: If ``headline`` is ``None`` (e.g. scraping failed).
        NameError: If no ``label_encoder`` is passed and none is defined at
            module level.
    """
    if headline is None:
        raise ValueError("headline must not be None")

    if label_encoder is None:
        # Backward-compatible fallback: earlier callers relied on a global
        # named ``label_encoder`` having been created before the call.
        label_encoder = globals().get('label_encoder')
        if label_encoder is None:
            raise NameError("name 'label_encoder' is not defined")

    headline_tfidf = tfidf_vectorizer.transform([headline])
    predicted_category = label_encoder.inverse_transform(classifier.predict(headline_tfidf))
    return predicted_category[0]

In [None]:
# Article whose headline is scraped, scored for sentiment and categorised.
# Uncomment one of the alternatives below to analyse a different story.
url = 'https://indianexpress.com/article/india/andhra-pradesh-train-accident-live-updates-9004874/'
# url = 'https://indianexpress.com/article/explained/explained-economics/3rd-largest-economy-in-3rd-modi-term-anatomy-of-indias-growth-8862361/'
# url = 'https://indianexpress.com/article/political-pulse/today-in-politics-kerala-blasts-pinarayi-vijayan-shinde-sena-mp-resigns-maratha-quota-9005188/'
text_content = scrape_webpage(url)

# --- Sentiment analysis of the scraped headline ---
if text_content:
    sentiment_scores = analyze_sentiment(text_content)
    sentiment_label = get_sentiment_label(sentiment_scores['compound'])

    print("Scraped Text:")
    print(text_content)
    print("Sentiment Scores:", sentiment_scores)
    print("Sentiment Label:", sentiment_label)
else:
    print("Failed to retrieve the web page.")

# --- Train a headline -> category classifier ---
# NOTE(review): this path looks like a mounted Google Drive location in
# Colab; confirm the drive is mounted before running this cell.
dataset_path = '/content/drive/MyDrive/News_Category_Dataset_v3.json'
json_data = read_json_data(dataset_path)

# TF-IDF features over the headlines, limited to the 5000 top terms.
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_tfidf = tfidf_vectorizer.fit_transform([headline for headline, _ in json_data])

# Category names are encoded as integers for the classifier.
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform([category for _, category in json_data])

classifier = MultinomialNB()
classifier.fit(X_tfidf, y_encoded)

# --- Categorise the scraped headline ---
# Only classify when a headline was actually scraped; passing None to the
# vectorizer would raise right after the failure message printed above.
print()
if text_content:
    predicted_category = classify_headline(text_content, tfidf_vectorizer, classifier)
    print("News Headline:", text_content)
    print("Predicted Category:", predicted_category)
else:
    predicted_category = None
    print("Skipping category prediction: no headline was scraped.")

Scraped Text:
13 dead as two passenger trains collide in Andhra, officials say driver missed red signal
Sentiment Scores: {'neg': 0.394, 'neu': 0.606, 'pos': 0.0, 'compound': -0.7783}
Sentiment Label: Negative

News Headline: 13 dead as two passenger trains collide in Andhra, officials say driver missed red signal
Predicted Category: CRIME
