# Solution: News Headlines Feature Extraction & Similarity

In [None]:
# Import required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re

# Sample news headlines
# Eight short headlines that form the corpus for the rest of the script:
# they are preprocessed, vectorized and compared pairwise further down.
# The list order matters: "Headline N" in the printed output refers to
# index N - 1 of this list.
headlines = [
    "AI outperforms doctors in diagnosing rare diseases",
    "Stock markets hit new record highs amid global optimism",
    "New vaccine shows promise in early trials",
    "Climate change impacts agriculture across multiple continents",
    "Scientists develop biodegradable plastic from seaweed",
    "Sports teams adapt strategies with big data analytics",
    "Electric vehicles set new sales record worldwide",
    "Breakthrough in quantum computing boosts encryption security"
]

# 1. Preprocessing function
def simple_preprocess(text):
    """Return a normalized copy of *text* for bag-of-words processing.

    The text is lowercased, every character that is neither a word
    character (letters, digits, underscore) nor whitespace is deleted, and
    each run of whitespace is collapsed to a single space with leading and
    trailing whitespace removed.

    Punctuation is deleted rather than replaced by a space, so a hyphenated
    term such as "state-of-the-art" becomes the single token
    "stateoftheart".
    """
    lowered = text.lower()
    without_punctuation = re.sub(r'[^\w\s]', '', lowered)
    single_spaced = re.sub(r'\s+', ' ', without_punctuation)
    return single_spaced.strip()

# Clean every headline up front so the vectorizer only sees normalized text.
processed_headlines = list(map(simple_preprocess, headlines))

# 2. Bag-of-Words vectorization
# English stop words are dropped and the vocabulary is capped at 50 terms.
vectorizer = CountVectorizer(stop_words='english', max_features=50)
bow_matrix = vectorizer.fit_transform(processed_headlines)
feature_names = vectorizer.get_feature_names_out()

# Report the vocabulary, the matrix dimensions and the share of non-zero cells.
n_rows, n_cols = bow_matrix.shape
print("BoW Vocabulary:", feature_names)
print("Matrix shape:", bow_matrix.shape)
print("Matrix density:", bow_matrix.nnz / (n_rows * n_cols))

# 3. Word frequencies and visualization
# Summing the document-term matrix over axis 0 yields one total count per
# vocabulary term. np.asarray(...).ravel() flattens that result whether the
# sum comes back as a 2-D np.matrix (SciPy sparse *matrix*) or as a 1-D
# ndarray (SciPy sparse *array*); the `.A1` attribute exists only on
# np.matrix and would raise AttributeError on a plain ndarray.
word_freq = np.asarray(bow_matrix.sum(axis=0)).ravel()
word_freq_df = pd.DataFrame({'word': feature_names, 'freq': word_freq}).sort_values('freq', ascending=False)

# Take the ten most frequent terms once and reuse the slice for both the
# printed table and the chart.
top_words = word_freq_df.head(10)
print("\nTop 10 most frequent words:\n", top_words)

# Plot: bar chart of the ten most frequent terms.
plt.figure(figsize=(8, 4))
plt.bar(top_words['word'], top_words['freq'], color="skyblue")
plt.title("Top 10 Most Frequent Words in News Headlines")
plt.xlabel("Word")
plt.ylabel("Frequency")
plt.xticks(rotation=30)  # tilt the word labels so they do not overlap
plt.show()

# 4. Document similarity
# Pairwise cosine similarity between the bag-of-words rows; entry [i, j]
# compares headline i with headline j.
similarity = cosine_similarity(bow_matrix)
headline_labels = [f"Headline {number}" for number in range(1, len(headlines) + 1)]
sim_df = pd.DataFrame(similarity, index=headline_labels, columns=headline_labels)
print("\nCosine Similarity Matrix (rounded):\n", sim_df.round(2))

# Identify most similar pair (excluding self-similarity)
# Rank every cell of the matrix by score, convert the flat positions back to
# (row, column) pairs, and reverse so the highest scores come first.
ascending_flat_positions = np.argsort(similarity.ravel())
row_ids, col_ids = np.unravel_index(ascending_flat_positions, similarity.shape)
most_sim_idx = np.column_stack((row_ids, col_ids))[::-1]
for i, j in most_sim_idx:
    if i == j:
        # Diagonal cell: a headline compared with itself, not a real pair.
        continue
    print(f"\nMost similar headlines are {i+1} and {j+1} with similarity {similarity[i,j]:.2f}")
    print("==>", headlines[i])
    print("==>", headlines[j])
    break
