<a href="https://colab.research.google.com/github/arunvithyasegar/News_sentiment_Analysis/blob/main/News_Sentiment_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# News Sentiment Analysis Project
## Part 2: Web Scraping & Sentiment Analysis Using Publicly Accessible News Sources

This notebook performs sentiment analysis on news headlines related to electronics, semiconductors and manufacturing. The analysis follows these main steps:

1. Web scraping from public news sources
2. Sentiment analysis using VADER
3. Interactive visualization of results

Click cells and run them sequentially to perform the analysis.

# [Click here to view the dashboard on Streamlit](https://newssentimentanalysis.streamlit.app/)
### P.S. Turn off dark mode in your browser

https://electronictradedashboard.streamlit.app/


In [1]:
!pip install requests pandas nltk textblob feedparser plotly newspaper3k pycountry --quiet

  Preparing metadata (setup.py) ... [?25l[?25hdone
  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m28.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.3/81.3 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.1/211.1 kB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m30.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m107.4/107.4 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for tinysegmenter (setup.py) ... [?25l[?25hdone
  Building wheel for feedfinder2 (setup.py) ... [?25l[?25hdone
  Building wheel for jieba3k (setup.py) ... [?25l

In [2]:
# @title Importing Essentials
import os
import re
import time
from datetime import datetime

import feedparser
import nltk
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import pycountry
import requests
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from plotly.subplots import make_subplots
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


True

# **Set up API and base Variables**

In [3]:
# @title Assigning keywords for search
# Search terms; each one is sent as its own query to the Google News RSS feed.
keywords = ['electronics', 'semiconductor', 'manufacturing', 'chip', 'technology']

# Browser-like request headers.
# NOTE(review): no other cell shown in this notebook passes `headers` to a request.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}

# Rows of the final dataset; populated by the dataset-building cell further down.
news_data = []

# The NewsData.io API key is read from the environment so that the secret is
# never stored in the notebook. Set NEWSDATA_API_KEY before running this cell.
newsapi_key = os.environ.get("NEWSDATA_API_KEY", "")
if not newsapi_key:
    print("Warning: NEWSDATA_API_KEY is not set; NewsData.io articles will likely not be collected.")

# Create a list of country names for location extraction
country_list = [country.name for country in pycountry.countries]

In [4]:
# @title Defining a function to extract country mentions from text
def extract_countries(text, countries=None):
    """Find country names mentioned in the text.

    Matching is case-insensitive and anchored on word boundaries. Longer names
    are matched first and claim their span of the text, so a short name that
    only occurs inside a longer one (e.g. "Guinea" inside "Papua New Guinea")
    is not reported a second time.

    Args:
        text: The string to scan. ``None``, non-string values and empty
            strings yield ``'Not specified'``.
        countries: Optional iterable of names to look for. Defaults to the
            notebook-level ``country_list``.

    Returns:
        The matched names joined with ``', '`` in the order they appear in
        ``countries``, or ``'Not specified'`` when nothing matches.
    """
    if not isinstance(text, str) or not text:
        return 'Not specified'

    names = list(country_list if countries is None else countries)

    claimed_spans = []   # (start, end) ranges already attributed to a name
    matched_names = set()
    # dict.fromkeys de-duplicates while keeping a deterministic order for the sort
    for name in sorted(dict.fromkeys(names), key=len, reverse=True):
        pattern = re.compile(r'\b' + re.escape(name) + r'\b', re.IGNORECASE)
        for hit in pattern.finditer(text):
            start, end = hit.span()
            # Accept the hit only if it does not overlap a span that a longer name owns
            if all(end <= taken_start or start >= taken_end
                   for taken_start, taken_end in claimed_spans):
                claimed_spans.append((start, end))
                matched_names.add(name)

    found_countries = [name for name in names if name in matched_names]
    return ', '.join(found_countries) if found_countries else 'Not specified'

## Task 1: Web Scraping
Collecting news headlines from Google News RSS and NewsData.io API

In [5]:
# @title Scraping news from the Google News RSS feed
# Fetch one RSS search feed per keyword and keep the first five entries of each.
print("Scraping Google News RSS feeds...")
google_news = []

for keyword in keywords:
    feed_url = f'https://news.google.com/rss/search?q={keyword}+business&hl=en-US&gl=US&ceid=US:en'
    news_feed = feedparser.parse(feed_url)

    # feedparser flags a feed it could not parse cleanly via `bozo` instead of raising
    if news_feed.get('bozo') and not news_feed.entries:
        print(f"Warning: no entries could be parsed for keyword '{keyword}'")

    for entry in news_feed.entries[:5]:
        # `published_parsed` is the publication date already parsed by feedparser;
        # it is absent or None when the feed carries no usable date.
        published = entry.get('published_parsed')
        google_news.append({
            'title': entry.title,
            'url': entry.link,
            'timestamp': time.strftime('%Y-%m-%d %H:%M:%S', published) if published else 'Unknown',
            'source': 'Google News'
        })

    # Pause once per feed request (not once per entry) to stay polite to the server
    time.sleep(0.5)

print(f"Collected {len(google_news)} articles from Google News")


Scraping Google News RSS feeds...
Collected 25 articles from Google News


In [6]:
# @title Connecting and Collecting news from NewsAPI
# Query the NewsData.io news endpoint once and keep every result that has
# both a title and a link.
print("Getting news from NewsAPI...")
newsapi_articles = []

# NewsAPI endpoint for the free plan
url = 'https://newsdata.io/api/1/news'
query_params = {
    'apikey': newsapi_key,
    'q': 'electronics OR semiconductor OR manufacturing',
    'language': 'en',
    'category': 'business,technology',
}

articles = []
try:
    # `params` lets requests URL-encode the query; `timeout` prevents the cell
    # from hanging indefinitely on an unresponsive server.
    response = requests.get(url, params=query_params, timeout=15)
    if response.status_code == 200:
        data = response.json()
        articles = data.get('results') or []
    else:
        print(f"NewsAPI request returned HTTP {response.status_code}")
except (requests.RequestException, ValueError) as exc:
    # Only the exception type is shown: the full message can contain the
    # request URL, which includes the API key.
    print(f"NewsAPI request failed: {type(exc).__name__}")

for article in articles:
    if article.get('title') and article.get('link'):
        newsapi_articles.append({
            'title': article['title'],
            'url': article['link'],
            'timestamp': article.get('pubDate', 'Unknown'),
            'source': article.get('source_id', 'NewsAPI')
        })

print(f"Collected {len(newsapi_articles)} articles from NewsAPI")

Getting news from NewsAPI...
Collected 10 articles from NewsAPI


# **Feature Engineering**

In [7]:
# @title Combining all news sources and removing duplicates
print("Combining all news sources...")
all_articles = [*google_news, *newsapi_articles]

# Keep the first article seen for each distinct title. A dict preserves
# insertion order, so the surviving articles stay in their original order.
first_by_title = {}
for candidate in all_articles:
    first_by_title.setdefault(candidate['title'], candidate)

unique_titles = set(first_by_title)
unique_articles = list(first_by_title.values())

# Only the first 20 unique articles go forward to the analysis
final_articles = unique_articles[:20]
print(f"Selected {len(final_articles)} unique articles for analysis")

Combining all news sources...
Selected 20 unique articles for analysis


In [8]:
# @title Extracting country mentions and creating the final dataset
print("Extracting country mentions...")

# Build the record list from scratch (instead of appending to the list created
# in an earlier cell) so that re-running this cell never duplicates rows.
news_data = [
    {
        'title': article['title'],
        'url': article['url'],
        'timestamp': article['timestamp'],
        'source': article.get('source', 'Unknown'),
        # Countries are detected from the headline text only
        'country': extract_countries(article['title']),
    }
    for article in final_articles
]

# Naming the columns explicitly keeps them present even when no article was collected
news_df = pd.DataFrame(
    news_data,
    columns=['title', 'url', 'timestamp', 'source', 'country'],
)
print("News data collected and organized into DataFrame")

Extracting country mentions...
News data collected and organized into DataFrame


In [9]:
# @title Setting up and performing sentiment analysis
print("Performing sentiment analysis...")
# Create the VADER analyzer once; the scoring code below reuses it as `sid`
# for every headline.
sid = SentimentIntensityAnalyzer()

# Map a compound sentiment score onto one of three labels
def get_sentiment_category(compound_score):
    """Return the sentiment label for a compound score.

    Scores of 0.05 or more are 'Positive', scores of -0.05 or less are
    'Negative', and anything in between is 'Neutral'.
    """
    return (
        'Positive' if compound_score >= 0.05
        else 'Negative' if compound_score <= -0.05
        else 'Neutral'
    )

# Score every headline and derive its sentiment label
def _compound_score(headline):
    """Return the 'compound' value of the analyzer's polarity scores for one headline."""
    return sid.polarity_scores(headline)['compound']

news_df['sentiment_score'] = news_df['title'].apply(_compound_score)
news_df['sentiment'] = news_df['sentiment_score'].apply(get_sentiment_category)

print("Sentiment analysis complete")

# Number of headlines carrying each label, for the summary line
label_totals = {
    label: sum(news_df['sentiment'] == label)
    for label in ('Positive', 'Neutral', 'Negative')
}
print(
    f"Sentiment distribution: Positive: {label_totals['Positive']}, "
    f"Neutral: {label_totals['Neutral']}, "
    f"Negative: {label_totals['Negative']}"
)


Performing sentiment analysis...
Sentiment analysis complete
Sentiment distribution: Positive: 2, Neutral: 14, Negative: 4


## Interactive Results & Visualizations
Explore the results through interactive charts and tables

In [10]:
# @title Collected news data
print("Displaying collected news data:")
# Bare last expression: the notebook renders the DataFrame as this cell's output.
news_df


Displaying collected news data:


Unnamed: 0,title,url,timestamp,source,country,sentiment_score,sentiment
0,Electronics company posts bill with over $36K ...,https://news.google.com/rss/articles/CBMiowFBV...,2025-05-09 20:06:34,Google News,Not specified,0.0,Neutral
1,"Samsung | History, Consumer Products, Leadersh...",https://news.google.com/rss/articles/CBMiYkFVX...,2025-05-08 07:13:00,Google News,Not specified,0.0,Neutral
2,DuPont Announces CEO and Non-Executive Chair f...,https://news.google.com/rss/articles/CBMiwwFBV...,2025-03-17 10:50:10,Google News,Not specified,0.0,Neutral
3,Samsung Electronics says unit Harman acquires ...,https://news.google.com/rss/articles/CBMi1AFBV...,2025-05-07 03:31:04,Google News,Not specified,0.0,Neutral
4,DuPont Files Form 10 To Spin Off Electronics B...,https://news.google.com/rss/articles/CBMingFBV...,2025-04-25 07:00:00,Google News,Not specified,0.0,Neutral
5,A timeline of the U.S. semiconductor market in...,https://news.google.com/rss/articles/CBMijgFBV...,2025-05-10 14:00:00,Google News,Not specified,0.0,Neutral
6,Huawei and other Chinese chip firms are catchi...,https://news.google.com/rss/articles/CBMipAFBV...,2025-05-08 13:01:04,Google News,Not specified,0.0,Neutral
7,Jim Cramer Says Taiwan Semiconductor Manufactu...,https://news.google.com/rss/articles/CBMiigFBV...,2025-05-09 14:23:31,Google News,Not specified,-0.2263,Negative
8,US Chip Tariffs: How Might Europe Respond? - C...,https://news.google.com/rss/articles/CBMidEFVX...,2025-05-09 16:18:05,Google News,Not specified,0.0,Neutral
9,A tale of two cities: Korea’s red tape slows S...,https://news.google.com/rss/articles/CBMi_AFBV...,2025-05-09 07:29:56,Google News,Not specified,0.0,Neutral


In [11]:
# @title A bar chart for sentiment distribution
print("Creating sentiment distribution chart...")

# Two-column frame: one row per sentiment label with its headline count
sentiment_counts = (
    news_df['sentiment']
    .value_counts()
    .rename_axis('Sentiment')
    .reset_index(name='Count')
)

# Fixed colour per sentiment label; also reused by the dashboard cell
color_map = dict(Positive='#4CAF50', Neutral='#2196F3', Negative='#F44336')

# One bar per label, coloured by label and annotated with its count
fig = px.bar(
    data_frame=sentiment_counts,
    x='Sentiment',
    y='Count',
    text='Count',
    color='Sentiment',
    color_discrete_map=color_map,
    title='Sentiment Distribution of News Headlines',
)

# Axis titles, transparent plot background and sizing
fig.update_layout(
    height=500,
    font={'size': 14},
    plot_bgcolor='rgba(0,0,0,0)',
    xaxis_title='Sentiment Category',
    yaxis_title='Number of Headlines',
)

fig.update_traces(textposition='auto')
fig.show()


Creating sentiment distribution chart...


In [12]:
# @title Table of news articles
print("Creating interactive news table...")

# DataFrame columns shown in the table, left to right
table_columns = ['title', 'timestamp', 'source', 'country', 'sentiment']
row_count = len(news_df)

# Background colour of the sentiment cell; any other label falls back to amber
sentiment_fill = {'Positive': '#4CAF50', 'Negative': '#F44336'}
sentiment_cell_colors = [
    sentiment_fill.get(label, '#FFC107') for label in news_df['sentiment']
]

header_spec = dict(
    values=[
        f'<b>{heading}</b>'
        for heading in ('Title', 'Timestamp', 'Source', 'Country', 'Sentiment')
    ],
    fill_color='#2196F3',
    align='left',
    font=dict(color='white', size=12),
)

cell_spec = dict(
    values=[news_df[column] for column in table_columns],
    # The first four columns stay white; only the sentiment column is colour-coded
    fill_color=[['white'] * row_count for _ in range(4)] + [sentiment_cell_colors],
    align='left',
    font=dict(size=11),
    height=30,
)

table_fig = go.Figure(data=[go.Table(header=header_spec, cells=cell_spec)])

table_fig.update_layout(
    title='Collected News Articles',
    height=600,
    margin=dict(l=10, r=10, t=30, b=10),
)

table_fig.show()

Creating interactive news table...


In [13]:
# @title A comprehensive dashboard with multiple visualizations
print("Creating dashboard...")

# Number of articles per source, as a two-column frame for the pie chart
source_counts = (
    news_df['source']
    .value_counts()
    .rename_axis('Source')
    .reset_index(name='Count')
)

# Top panel: sentiment counts as bars, coloured per label via color_map
sentiment_bar = go.Bar(
    name='Sentiment Count',
    x=sentiment_counts['Sentiment'],
    y=sentiment_counts['Count'],
    text=sentiment_counts['Count'],
    textposition='auto',
    marker_color=[color_map[label] for label in sentiment_counts['Sentiment']],
)

# Bottom panel: share of articles per source as a donut chart
source_pie = go.Pie(
    name='Source Distribution',
    labels=source_counts['Source'],
    values=source_counts['Count'],
    hole=0.4,
)

# Two stacked panels of equal height
fig_dashboard = make_subplots(
    rows=2,
    cols=1,
    specs=[[{"type": "bar"}], [{"type": "pie"}]],
    subplot_titles=("Sentiment Distribution of News Headlines", "News Source Distribution"),
    row_heights=[0.5, 0.5],
    vertical_spacing=0.3,
)
fig_dashboard.add_trace(sentiment_bar, row=1, col=1)
fig_dashboard.add_trace(source_pie, row=2, col=1)

# Overall title and sizing; the legend is hidden
fig_dashboard.update_layout(
    title_text="News Sentiment Analysis Dashboard",
    title_font_size=20,
    height=800,
    showlegend=False,
    plot_bgcolor='rgba(0,0,0,0)',
)

fig_dashboard.show()

Creating dashboard...


In [14]:
# @title Save the data to CSV and HTML files for future reference
# Write the analysed headlines as a CSV file without the index column
news_df.to_csv("scraped_news_data.csv", index=False)

# Write each interactive figure to its own HTML file
for figure, html_path in (
    (fig, "sentiment_distribution.html"),
    (table_fig, "news_table.html"),
    (fig_dashboard, "news_dashboard.html"),
):
    figure.write_html(html_path)

print("Analysis complete! Results saved to CSV and HTML files.")

Analysis complete! Results saved to CSV and HTML files.
