<a href="https://colab.research.google.com/github/arunvithyasegar/News_sentiment_Analysis/blob/main/News_Sentiment_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# News Sentiment Analysis Project
## Part 2: Web Scraping & Sentiment Analysis Using Publicly Accessible News Sources

This notebook performs sentiment analysis on news headlines related to electronics, semiconductors and manufacturing. The analysis follows these main steps:

1. Web scraping from public news sources
2. Sentiment analysis using VADER
3. Interactive visualization of results

Run the cells in order, from top to bottom, to perform the analysis.

# **Importing Essentials**

In [None]:
!pip install requests pandas nltk textblob feedparser plotly newspaper3k pycountry seaborn matplotlib --quiet

  Preparing metadata (setup.py) ... [?25l[?25hdone
  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m28.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.3/81.3 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.1/211.1 kB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m30.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m107.4/107.4 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for tinysegmenter (setup.py) ... [?25l[?25hdone
  Building wheel for feedfinder2 (setup.py) ... [?25l[?25hdone
  Building wheel for jieba3k (setup.py) ... [?25l

In [None]:
# @title Importing Libraries
import os
import re
import time
from datetime import datetime
from urllib.parse import quote_plus

import feedparser
import matplotlib.pyplot as plt
import nltk
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import pycountry
import requests
import seaborn as sns
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from plotly.subplots import make_subplots
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


True

# **Set up API and base Variables**

In [3]:
# @title Assigning keywords for search
# Search terms used to build the Google News RSS query.
keywords = ['electronics', 'semiconductor', 'manufacturing', 'chip', 'technology']
# HTTP headers carrying a desktop-browser User-Agent string.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}
# Notebook-level list of enriched article records (filled in the feature-engineering cell).
news_data = []
# NewsData.io API key. It is read from the environment so the secret is never stored in
# the notebook itself; set NEWSDATA_API_KEY before running this cell
# (for example with `%env NEWSDATA_API_KEY=<your key>`).
# NOTE(review): any key that was previously committed in this notebook should be rotated.
newsapi_key = os.environ.get("NEWSDATA_API_KEY", "")
if not newsapi_key:
    print("Warning: NEWSDATA_API_KEY is not set; the NewsData.io request will likely fail.")

# Create a list of country names for location extraction
country_list = [country.name for country in pycountry.countries]

In [4]:
# @title Defining a function to extract country mentions from text
def extract_countries(text, countries=None):
    """Find country names mentioned in the text.

    Args:
        text: String to scan. ``None``, an empty string or any non-string value
            yields 'Not specified' instead of raising.
        countries: Optional iterable of country names to look for. Defaults to
            the notebook-level ``country_list``.

    Returns:
        The matched names joined with ', ' (in the order they appear in
        ``countries``), or 'Not specified' when nothing matches. Matching is
        case-insensitive and only accepts whole-name occurrences.
    """
    if not isinstance(text, str) or not text:
        return 'Not specified'
    if countries is None:
        countries = country_list

    found_countries = [
        country for country in countries
        # Lookarounds instead of \b: after a name that ends in punctuation, \b only
        # matches when a word character follows, so such names were never found
        # when followed by a space or the end of the text.
        if re.search(r'(?<!\w)' + re.escape(country) + r'(?!\w)', text, re.IGNORECASE)
    ]
    return ', '.join(found_countries) if found_countries else 'Not specified'

## Task 1: Web Scraping
Collecting news headlines from Google News RSS and NewsData.io API

In [None]:
# @title Scraping news from Google News RSS feed
print("Scraping Google News RSS feeds...")
# Placeholder; reassigned with the fetch_news() result at the end of this cell.
google_news = []

def _entry_timestamp(entry):
    """Return a feed entry's publication time as 'YYYY-MM-DD HH:MM:SS'.

    Uses feedparser's pre-parsed ``published_parsed`` value when present, so no
    assumption is made about the textual date format or timezone name. Falls
    back to the raw ``published`` string, or 'Unknown' if the entry has no date.
    """
    parsed = getattr(entry, 'published_parsed', None)
    if parsed is not None:
        return time.strftime('%Y-%m-%d %H:%M:%S', parsed)
    return getattr(entry, 'published', 'Unknown')


def fetch_news(source, keywords, limit=20):
    """Fetch recent headlines for ``keywords`` from a supported news source.

    Args:
        source: Name of the source to query. Only "Google News" is supported.
        keywords: Iterable of search terms; they are joined with '+' into a
            single query (blank terms are ignored).
        limit: Maximum number of feed entries to return.

    Returns:
        A list of dicts with the keys 'title', 'url', 'timestamp' and 'source'.
        An empty list is returned for an unsupported source or when the feed
        cannot be fetched, so callers can always take ``len()`` of the result.
    """
    if source != "Google News":
        print(f"Error fetching news: unsupported source {source!r}")
        return []

    try:
        # URL-encode every keyword so spaces or special characters cannot corrupt the query.
        query = "+".join(quote_plus(k.strip()) for k in keywords if k.strip())
        feed_url = f'https://news.google.com/rss/search?q={query}+business&hl=en-US&gl=US&ceid=US:en'
        feed = feedparser.parse(feed_url)
    except Exception as e:
        print(f"Error fetching news: {str(e)}")
        return []

    # The whole feed arrives in the single request above, so no per-entry delay is needed.
    articles = []
    for entry in feed.entries[:limit]:
        try:
            articles.append({
                'title': entry.title,
                'url': entry.link,
                'timestamp': _entry_timestamp(entry),
                'source': 'Google News'
            })
        except AttributeError as e:
            # Skip only the malformed entry instead of discarding everything collected so far.
            print(f"Skipping malformed feed entry: {e}")

    return articles

# Fetch up to five Google News entries for the configured keywords and report the count.
google_news = fetch_news(source="Google News", keywords=keywords, limit=5)
print(f"Collected {len(google_news)} articles from Google News")

Scraping Google News RSS feeds...
Collected 25 articles from Google News


In [6]:
# @title Connecting and Collecting news from NewsAPI
print("Getting news from NewsAPI...")
newsapi_articles = []

# NewsData.io news endpoint (free plan). The query parameters are passed via `params`
# so that requests URL-encodes them (the search expression contains spaces).
url = 'https://newsdata.io/api/1/news'
params = {
    'apikey': newsapi_key,
    'q': 'electronics OR semiconductor OR manufacturing',
    'language': 'en',
    'category': 'business,technology',
}

articles = []
try:
    # A timeout keeps the notebook from hanging forever on a stalled connection.
    response = requests.get(url, params=params, timeout=15)
except requests.RequestException as e:
    # Only the exception type is printed: the full message can contain the request
    # URL, which includes the API key.
    print(f"Error contacting NewsAPI: {type(e).__name__}")
else:
    if response.status_code == 200:
        try:
            # `results` may be missing or null in the payload; treat both as "no articles".
            articles = response.json().get('results') or []
        except ValueError:
            print("NewsAPI returned a response that is not valid JSON")
    else:
        print(f"NewsAPI request failed with HTTP status {response.status_code}")

for article in articles:
    # Keep only articles that have both a headline and a link.
    if article.get('title') and article.get('link'):
        newsapi_articles.append({
            'title': article['title'],
            'url': article['link'],
            'timestamp': article.get('pubDate', 'Unknown'),
            'source': article.get('source_id', 'NewsAPI')
        })

print(f"Collected {len(newsapi_articles)} articles from NewsAPI")

Getting news from NewsAPI...
Collected 10 articles from NewsAPI


# **Feature Engineering**

In [7]:
# @title Combining all news sources and remove duplicates
print("Combining all news sources...")
all_articles = [*google_news, *newsapi_articles]

# De-duplicate on the exact title, keeping the first article seen for each one
# (dicts preserve insertion order, so the original ordering is retained).
first_by_title = {}
for article in all_articles:
    first_by_title.setdefault(article['title'], article)
unique_titles = set(first_by_title)
unique_articles = list(first_by_title.values())

# Cap the analysis set at 20 articles
final_articles = unique_articles[:20]
print(f"Selected {len(final_articles)} unique articles for analysis")

Combining all news sources...
Selected 20 unique articles for analysis


In [8]:
# @title Extracting country mentions and create the final dataset
print("Extracting country mentions...")
# Rebuild the record list from scratch on every run: appending to the notebook-level
# `news_data` list duplicated every row each time this cell was re-executed.
news_data = [
    {
        'title': article['title'],
        'url': article['url'],
        'timestamp': article['timestamp'],
        'source': article.get('source', 'Unknown'),
        # Countries are detected in the headline text only.
        'country': extract_countries(article['title']),
    }
    for article in final_articles
]
# Create a pandas DataFrame; naming the columns keeps the schema intact even when
# no articles were collected.
news_df = pd.DataFrame(
    news_data,
    columns=['title', 'url', 'timestamp', 'source', 'country'],
)
print("News data collected and organized into DataFrame")

Extracting country mentions...
News data collected and organized into DataFrame


## Task 2: Sentiment Analysis 
Analyzing sentiment of collected headlines using VADER

In [None]:
# @title Setting up and perform sentiment analysis
print("Performing sentiment analysis...")
# Initialize the VADER sentiment analyzer
# (the 'vader_lexicon' resource it uses is downloaded in the imports cell).
sid = SentimentIntensityAnalyzer()

# Map a compound score onto one of three sentiment labels
def get_sentiment_category(compound_score):
    """Classify a compound sentiment score.

    Scores of 0.05 or more are 'Positive', scores of -0.05 or less are
    'Negative', and everything in between is 'Neutral'.
    """
    if compound_score >= 0.05:
        return 'Positive'
    if compound_score <= -0.05:
        return 'Negative'
    return 'Neutral'

# Score every headline, then bucket each score into a sentiment label
def _headline_compound(headline):
    """Return the analyzer's compound polarity score for one headline."""
    return sid.polarity_scores(headline)['compound']

news_df['sentiment_score'] = news_df['title'].apply(_headline_compound)
news_df['sentiment'] = news_df['sentiment_score'].apply(get_sentiment_category)

print("Sentiment analysis complete")
label_counts = {
    label: sum(news_df['sentiment'] == label)
    for label in ('Positive', 'Neutral', 'Negative')
}
print(f"Sentiment distribution: Positive: {label_counts['Positive']}, " +
      f"Neutral: {label_counts['Neutral']}, " +
      f"Negative: {label_counts['Negative']}")

Performing sentiment analysis...
Sentiment analysis complete
Sentiment distribution: Positive: 2, Neutral: 14, Negative: 4


## Results & Visualizations
Explore the results through charts and tables

In [None]:
# @title collected News data
print("Displaying collected news data:")
# A bare expression on the last line of a cell is rendered as the cell's output.
news_df

Displaying collected news data:


Unnamed: 0,title,url,timestamp,source,country,sentiment_score,sentiment
0,Electronics company posts bill with over $36K ...,https://news.google.com/rss/articles/CBMiowFBV...,2025-05-09 20:06:34,Google News,Not specified,0.0,Neutral
1,"Samsung | History, Consumer Products, Leadersh...",https://news.google.com/rss/articles/CBMiYkFVX...,2025-05-08 07:13:00,Google News,Not specified,0.0,Neutral
2,DuPont Announces CEO and Non-Executive Chair f...,https://news.google.com/rss/articles/CBMiwwFBV...,2025-03-17 10:50:10,Google News,Not specified,0.0,Neutral
3,Samsung Electronics says unit Harman acquires ...,https://news.google.com/rss/articles/CBMi1AFBV...,2025-05-07 03:31:04,Google News,Not specified,0.0,Neutral
4,DuPont Files Form 10 To Spin Off Electronics B...,https://news.google.com/rss/articles/CBMingFBV...,2025-04-25 07:00:00,Google News,Not specified,0.0,Neutral
5,A timeline of the U.S. semiconductor market in...,https://news.google.com/rss/articles/CBMijgFBV...,2025-05-10 14:00:00,Google News,Not specified,0.0,Neutral
6,Huawei and other Chinese chip firms are catchi...,https://news.google.com/rss/articles/CBMipAFBV...,2025-05-08 13:01:04,Google News,Not specified,0.0,Neutral
7,Jim Cramer Says Taiwan Semiconductor Manufactu...,https://news.google.com/rss/articles/CBMiigFBV...,2025-05-09 14:23:31,Google News,Not specified,-0.2263,Negative
8,US Chip Tariffs: How Might Europe Respond? - C...,https://news.google.com/rss/articles/CBMidEFVX...,2025-05-09 16:18:05,Google News,Not specified,0.0,Neutral
9,A tale of two cities: Korea’s red tape slows S...,https://news.google.com/rss/articles/CBMi_AFBV...,2025-05-09 07:29:56,Google News,Not specified,0.0,Neutral


In [None]:
# @title A bar chart for sentiment distribution using matplotlib
def plot_sentiment_distribution(df):
    """Create static sentiment distribution chart.

    Args:
        df: DataFrame with a 'sentiment' column of category labels.

    Returns:
        The Matplotlib figure containing one bar per sentiment label, each
        annotated with its headline count.

    Note:
        The seaborn style sheet is applied through ``plt.style.use``, so it also
        affects figures created afterwards.
    """
    sentiment_counts = df['sentiment'].value_counts()

    # Set style. The 'seaborn' sheet was renamed to 'seaborn-v0_8' in Matplotlib 3.6
    # and the old name was removed in 3.8, so use whichever name is installed
    # (and keep the current style if neither exists).
    for style_name in ('seaborn-v0_8', 'seaborn'):
        if style_name in plt.style.available:
            plt.style.use(style_name)
            break

    # Create figure
    fig, ax = plt.subplots(figsize=(10, 6))

    # Color mapping; labels outside the three known categories fall back to grey
    # instead of raising KeyError.
    colors = {'Positive': '#4CAF50', 'Neutral': '#2196F3', 'Negative': '#F44336'}
    bar_colors = [colors.get(s, '#9E9E9E') for s in sentiment_counts.index]

    # Create bars
    bars = ax.bar(sentiment_counts.index, sentiment_counts.values, color=bar_colors)

    # Customize chart
    ax.set_title('Sentiment Distribution of News Headlines', fontsize=14, pad=15)
    ax.set_xlabel('Sentiment Category', fontsize=12)
    ax.set_ylabel('Number of Headlines', fontsize=12)

    # Add value labels centred just above each bar
    for bar in bars:
        height = bar.get_height()
        ax.text(bar.get_x() + bar.get_width()/2., height,
                f'{int(height)}',
                ha='center', va='bottom')

    # Show horizontal grid lines only
    ax.grid(True, axis='y', linestyle='--', alpha=0.7)

    plt.tight_layout()
    return fig

# Create and display chart.
# The figure handle is kept because the export cell later saves it to
# sentiment_distribution.png.
sentiment_fig = plot_sentiment_distribution(news_df)
plt.show()

Creating sentiment distribution chart...


In [None]:
# @title Table view of sentiment analysis results
def create_analysis_table(df):
    """Print a plain-text summary of the sentiment distribution.

    Reads the 'sentiment' column of ``df`` and prints the total number of
    headlines followed by one line per label (Positive, Neutral, Negative, in
    that order) with its count and percentage. Labels that do not occur are
    omitted. Nothing is returned.
    """
    n_headlines = len(df)
    counts = df['sentiment'].value_counts()
    shares = (counts / n_headlines * 100).round(1)

    # Header section
    header_lines = (
        "\nSentiment Analysis Results",
        "=" * 40,
        f"\nTotal Headlines Analyzed: {n_headlines}\n",
        "Distribution by Sentiment:",
        "-" * 30,
    )
    for line in header_lines:
        print(line)

    # One row per label that actually occurs, in a fixed display order
    present = (label for label in ('Positive', 'Neutral', 'Negative') if label in counts)
    for label in present:
        print(f"{label:8s}: {counts[label]:3d} ({shares[label]:5.1f}%)")

# Display results table
create_analysis_table(news_df)

# Display detailed results: to_string() renders the selected columns for every
# row as plain text.
print("\nDetailed Results:")
print("-" * 100)
print(news_df[['title', 'sentiment', 'sentiment_score', 'source']].to_string())

Creating interactive news table...


In [None]:
# @title Combined visualization
def create_dashboard(df):
    """Build a two-panel figure: sentiment counts (bar) and source share (pie).

    Reads the 'sentiment' and 'source' columns of ``df`` and returns the
    Matplotlib figure.
    """
    # One row, two panels side by side
    fig, (ax_sentiment, ax_source) = plt.subplots(1, 2, figsize=(15, 6))

    palette = {'Positive': '#4CAF50', 'Neutral': '#2196F3', 'Negative': '#F44336'}

    # Left panel: number of headlines per sentiment label
    by_sentiment = df['sentiment'].value_counts()
    rects = ax_sentiment.bar(
        by_sentiment.index,
        by_sentiment.values,
        color=[palette[label] for label in by_sentiment.index],
    )
    # Annotate each bar with its count, centred just above the bar
    for rect in rects:
        count = rect.get_height()
        x_center = rect.get_x() + rect.get_width()/2.
        ax_sentiment.text(x_center, count, f'{int(count)}', ha='center', va='bottom')
    ax_sentiment.set_title('Sentiment Distribution')
    ax_sentiment.grid(True, axis='y', alpha=0.3)

    # Right panel: share of headlines contributed by each source
    by_source = df['source'].value_counts()
    ax_source.pie(by_source.values, labels=by_source.index, autopct='%1.1f%%')
    ax_source.set_title('Distribution by Source')

    plt.tight_layout()
    return fig

# Create and display dashboard.
# The figure handle is kept because the export cell later saves it to news_dashboard.png.
dashboard_fig = create_dashboard(news_df)
plt.show()

Creating dashboard...


In [None]:
# @title Save the data to CSV and PNG files for future reference
# Files are written relative to the current working directory.
news_df.to_csv("scraped_news_data.csv", index=False)
sentiment_fig.savefig("sentiment_distribution.png")
dashboard_fig.savefig("news_dashboard.png")

print("Analysis complete! Results saved to CSV and PNG files.")

Analysis complete! Results saved to CSV and HTML files.


## Professional Analysis Summary
Key findings and insights from the sentiment analysis

In [None]:
# Generate professional summary
def generate_summary(df):
    """Print an executive summary of the analysis results.

    Args:
        df: DataFrame with 'sentiment', 'source' and 'sentiment_score' columns.

    Prints the article count, the per-label sentiment distribution, the three
    most frequent sources, the mean sentiment score and an overall trend label
    derived from that mean (>= 0.05 Positive, <= -0.05 Negative, otherwise
    Neutral). For an empty frame only the header and a short notice are
    printed. Nothing is returned.
    """
    total = len(df)

    print("News Sentiment Analysis Summary")
    print("=" * 30)
    print(f"\nTotal Articles Analyzed: {total}")

    if total == 0:
        # The mean of an empty column is NaN; without this guard it was printed as
        # the "average" and then reported as a Neutral market trend.
        print("\nNo articles available to summarize.")
        return

    sentiment_counts = df['sentiment'].value_counts()
    print("\nSentiment Distribution:")
    for sentiment, count in sentiment_counts.items():
        percentage = (count/total) * 100
        print(f"- {sentiment}: {count} articles ({percentage:.1f}%)")

    print("\nTop News Sources:")
    for source, count in df['source'].value_counts().head(3).items():
        print(f"- {source}: {count} articles")

    avg_score = df['sentiment_score'].mean()
    print(f"\nAverage Sentiment Score: {avg_score:.3f}")

    # Same cut-offs as the per-headline classification.
    if avg_score >= 0.05:
        trend = "Positive"
    elif avg_score <= -0.05:
        trend = "Negative"
    else:
        trend = "Neutral"
    print(f"Overall Market Trend: {trend}")

# Display summary (printed as plain text for the full news_df)
generate_summary(news_df)

In [None]:
# Improved visualization function
def create_visualization(df):
    """Create a two-panel figure: sentiment counts and a sentiment timeline.

    Args:
        df: DataFrame with 'sentiment', 'sentiment_score' and 'timestamp'
            columns. It is not modified; the parsed dates live on an internal
            copy.

    Returns:
        The Matplotlib figure, or ``None`` if the figure could not be built (the
        error is printed and any partially built figure is closed).

    Rows whose timestamp cannot be parsed as a date (for example 'Unknown') are
    left out of the timeline panel rather than aborting the whole figure.
    """
    fig = None
    try:
        # Sentiment distribution
        sentiment_counts = df['sentiment'].value_counts()

        # Create figure with subplots
        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

        # Color scheme
        colors = {'Positive': '#4CAF50', 'Neutral': '#2196F3', 'Negative': '#F44336'}

        # Bar plot. `hue` mirrors `x` so the palette is applied per category;
        # passing `palette` without `hue` is deprecated in seaborn 0.13.
        sns.barplot(
            x=sentiment_counts.index,
            y=sentiment_counts.values,
            hue=sentiment_counts.index,
            palette=colors,
            dodge=False,
            ax=ax1
        )
        # The hue legend would only repeat the x tick labels, so drop it if one was drawn.
        bar_legend = ax1.get_legend()
        if bar_legend is not None:
            bar_legend.remove()
        ax1.set_title('Sentiment Distribution')
        ax1.set_ylabel('Number of Articles')

        # Add value labels
        for i, v in enumerate(sentiment_counts.values):
            ax1.text(i, v, str(v), ha='center', va='bottom')

        # Timeline plot. Dates are parsed on a copy so the caller's frame is untouched;
        # unparsable values become NaT and are dropped.
        timeline = df.assign(datetime=pd.to_datetime(df['timestamp'], errors='coerce'))
        timeline = timeline.dropna(subset=['datetime'])
        sns.scatterplot(
            data=timeline,
            x='datetime',
            y='sentiment_score',
            hue='sentiment',
            palette=colors,
            ax=ax2
        )
        ax2.set_title('Sentiment Timeline')
        ax2.set_xlabel('Publication Date')
        ax2.set_ylabel('Sentiment Score')

        plt.tight_layout()
        return fig
    except Exception as e:
        print(f"Error creating visualization: {str(e)}")
        if fig is not None:
            # Do not leave a half-built figure open in the pyplot registry.
            plt.close(fig)
        return None