# Importing Libraries

In [44]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import nltk
import re
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
import joblib

# Downloading needed Language Toolkit

Used for cleaning and analysing the text by :

1.   removing words like this..on..in etc
2.   splitting sentances into wordz
3.   converting words to their dictionary form



In [45]:
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

# Scraping News

Function to scrape multiple pages of news from various sites

In [46]:
def scrape_news_multiple_pages(base_url, category, class_name, pages=10):
    news_data = []
    for page in range(1, pages + 1):
        url = f"{base_url}/page/{page}" if "indiatoday" in base_url else base_url
        print(f"Scraping {category}, Page {page}...")
        try:
            response = requests.get(url)
            soup = BeautifulSoup(response.text, 'html.parser')
            headlines = soup.find_all('span', {'class': class_name})
            for headline in headlines:
                title = headline.get_text().strip()  # Extract the text of the headline
                link = headline.find_parent('a')['href'] if headline.find_parent('a') else None  # Get the link
                if link:
                    full_link = f'https://edition.cnn.com{link}' if link.startswith('/') else link
                    news_data.append({'title': title, 'link': full_link, 'category': category})
        except Exception as e:
            print(f"Error scraping {url}: {e}")
    return news_data

Defining categories nd their urls || collectinfg data for all categroies and saving it to all_news

In [47]:
categories = {
    'Sports': {'url': 'https://edition.cnn.com/sport', 'class_name': 'container__headline-text'},
    'Tech': {'url': 'https://edition.cnn.com/business/tech', 'class_name': 'container__headline-text'},
    'Business': {'url': 'https://edition.cnn.com/business', 'class_name': 'container__headline-text'},
    'Health': {'url': 'https://edition.cnn.com/health', 'class_name': 'container__headline-text'},
    'Entertainment': {'url': 'https://edition.cnn.com/entertainment', 'class_name': 'container__headline-text'},
    'Politics': {'url': 'https://edition.cnn.com/politics', 'class_name': 'container__headline-text'},
    'World News': {'url': 'https://edition.cnn.com/world', 'class_name': 'container__headline-text'},  # New category
    'Lifestyle': {'url': 'https://edition.cnn.com/style', 'class_name': 'container__headline-text'},  # New category
}

all_news = []
for category, info in categories.items():
    news = scrape_news_multiple_pages(info['url'], category, info['class_name'], pages=10 if category == 'Tech' else 5)
    all_news.extend(news)

Scraping Sports, Page 1...
Scraping Sports, Page 2...
Scraping Sports, Page 3...
Scraping Sports, Page 4...
Scraping Sports, Page 5...
Scraping Tech, Page 1...
Scraping Tech, Page 2...
Scraping Tech, Page 3...
Scraping Tech, Page 4...
Scraping Tech, Page 5...
Scraping Tech, Page 6...
Scraping Tech, Page 7...
Scraping Tech, Page 8...
Scraping Tech, Page 9...
Scraping Tech, Page 10...
Scraping Business, Page 1...
Scraping Business, Page 2...
Scraping Business, Page 3...
Scraping Business, Page 4...
Scraping Business, Page 5...
Scraping Health, Page 1...
Scraping Health, Page 2...
Scraping Health, Page 3...
Scraping Health, Page 4...
Scraping Health, Page 5...
Scraping Entertainment, Page 1...
Scraping Entertainment, Page 2...
Scraping Entertainment, Page 3...
Scraping Entertainment, Page 4...
Scraping Entertainment, Page 5...


Convert the collected data into a DataFrame

In [48]:
df = pd.DataFrame(all_news)
print(f"Total news articles collected: {len(df)}")
print(df.head())

Total news articles collected: 1250
                                               title  \
0  ‘Money excuse must stop’: Thibaut Courtois urg...   
1  2024 NFL playoff picture: Who’s clinched and w...   
2  Gabriela Dabrowski, who won an Olympic medal a...   
3  College Football Playoff: The biggest names lo...   
4  Magnus Carlsen to rejoin World Blitz Chess Cha...   

                                                link category  
0  https://edition.cnn.com/2024/12/31/sport/thiba...   Sports  
1  https://edition.cnn.com/2024/12/31/sport/nfl-p...   Sports  
2  https://edition.cnn.com/2024/12/31/sport/gabri...   Sports  
3  https://edition.cnn.com/2024/12/31/sport/colle...   Sports  
4  https://edition.cnn.com/2024/12/30/sport/magnu...   Sports  


# Data Pre-Processing

PreProcessing Fn:

In [49]:
tokenizer = RegexpTokenizer(r'\w+')
stemmer = PorterStemmer()

def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove special characters
    tokens = tokenizer.tokenize(text)
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    tokens = [stemmer.stem(word) for word in tokens]  # Apply stemming
    return ' '.join(tokens)

PreProcessing Application

In [50]:
# Apply preprocessing
df['processed_title'] = df['title'].apply(preprocess_text)

# Feature Extraction
vectorizer = TfidfVectorizer(max_features=10000, stop_words='english', ngram_range=(1, 3))
X_tfidf = vectorizer.fit_transform(df['processed_title'])

# Prepare labels for training
category_map = {category: idx for idx, category in enumerate(categories.keys())}
df['category_label'] = df['category'].map(category_map)

# Balance the dataset
min_samples = df['category_label'].value_counts().min()
balanced_df = df.groupby('category_label').apply(lambda x: x.sample(min_samples)).reset_index(drop=True)

  balanced_df = df.groupby('category_label').apply(lambda x: x.sample(min_samples)).reset_index(drop=True)


## Split dataset into training and testing sets

In [51]:
X_train, X_test, y_train, y_test = train_test_split(
    vectorizer.transform(balanced_df['processed_title']),
    balanced_df['category_label'],
    test_size=0.2,
    random_state=42)

# Training The Model (logistc Regrsn)

In [52]:
model = LogisticRegression(max_iter=500)
model.fit(X_train, y_train)

Evaluation of model

In [53]:
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Logistic Regression Accuracy: {accuracy:.4f}")
print("Classification Report:")
print(classification_report(y_test, y_pred, target_names=category_map.keys(), labels=list(category_map.values())))

# Hyperparameter tuning with GridSearchCV
param_grid = {'C': [0.1, 1, 10, 100]}
grid_search = GridSearchCV(LogisticRegression(max_iter=500), param_grid, cv=5)
grid_search.fit(X_train, y_train)
print(f"Best Parameters: {grid_search.best_params_}")
best_model = grid_search.best_estimator_

Logistic Regression Accuracy: 0.9394
Classification Report:
               precision    recall  f1-score   support

       Sports       0.97      1.00      0.99        33
         Tech       0.85      0.88      0.86        32
     Business       0.88      0.82      0.85        34
       Health       1.00      1.00      1.00        29
Entertainment       1.00      1.00      1.00        37

     accuracy                           0.94       165
    macro avg       0.94      0.94      0.94       165
 weighted avg       0.94      0.94      0.94       165

Best Parameters: {'C': 10}


# Saving Model And Vectorizer

In [54]:
joblib.dump(best_model, 'news_classifier_model.pkl')
joblib.dump(vectorizer, 'tfidf_vectorizer.pkl')


['tfidf_vectorizer.pkl']

# Model Testing !!!

Replace test headlines array with news headline you want to predict

In [55]:
def test_model(headlines):
    loaded_model = joblib.load('news_classifier_model.pkl')
    loaded_vectorizer = joblib.load('tfidf_vectorizer.pkl')

    processed_headlines = [preprocess_text(headline) for headline in headlines]
    X_new = loaded_vectorizer.transform(processed_headlines)
    predictions = loaded_model.predict(X_new)

    # Map predictions back to category names
    category_map_reverse = {v: k for k, v in category_map.items()}
    predicted_categories = [category_map_reverse[pred] for pred in predictions]

    for headline, category in zip(headlines, predicted_categories):
        print(f"Headline: {headline}")
        print(f"Predicted Category: {category}\n")

# Example test
test_headlines = [
    "Apple releases new iPhone with groundbreaking features",
    "India wins the cricket World Cup in a thrilling final",
    "Netflix releases its most-watched series of the year",
    "Stock market hits an all-time high as companies report profits",
    "Doctors warn about a new virus outbreak",
    "New Marvel movie breaks box office records",
    "Presidential election results bring shock to the nation",
    "5 healthy habits to improve your lifestyle in 2025",
    "World leaders meet to discuss global economic reforms"
]
test_model(test_headlines)

Headline: Apple releases new iPhone with groundbreaking features
Predicted Category: Entertainment

Headline: India wins the cricket World Cup in a thrilling final
Predicted Category: Sports

Headline: Netflix releases its most-watched series of the year
Predicted Category: Sports

Headline: Stock market hits an all-time high as companies report profits
Predicted Category: Business

Headline: Doctors warn about a new virus outbreak
Predicted Category: Health

Headline: New Marvel movie breaks box office records
Predicted Category: Entertainment



Accuracy Score and Classifiction Report

In [56]:
# Evaluate on Training Data
y_train_pred = model.predict(X_train)
train_accuracy = accuracy_score(y_train, y_train_pred)
print(f"Training Accuracy: {train_accuracy:.4f}")

# Compare with Test Accuracy
y_test_pred = model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)
print(f"Test Accuracy: {test_accuracy:.4f}")

# Detailed Classification Reports for Training and Test Data
print("\nClassification Report (Training Data):")
print(classification_report(y_train, y_train_pred, target_names=category_map.keys(), labels=list(category_map.values())))

print("\nClassification Report (Test Data):")
print(classification_report(y_test, y_test_pred, target_names=category_map.keys(), labels=list(category_map.values())))


Training Accuracy: 0.9515
Test Accuracy: 0.9394

Classification Report (Training Data):
               precision    recall  f1-score   support

       Sports       1.00      1.00      1.00       132
         Tech       0.83      0.96      0.89       133
     Business       0.96      0.83      0.89       131
       Health       1.00      0.96      0.98       136
Entertainment       1.00      1.00      1.00       128

     accuracy                           0.95       660
    macro avg       0.96      0.95      0.95       660
 weighted avg       0.96      0.95      0.95       660


Classification Report (Test Data):
               precision    recall  f1-score   support

       Sports       0.97      1.00      0.99        33
         Tech       0.85      0.88      0.86        32
     Business       0.88      0.82      0.85        34
       Health       1.00      1.00      1.00        29
Entertainment       1.00      1.00      1.00        37

     accuracy                           0.94  

# Saving DataSet into CSV

In [57]:
df.to_csv("news_dataset.csv", index=False)
print("Dataset saved as 'news_dataset.csv'")


Dataset saved as 'news_dataset.csv'
