<a href="https://colab.research.google.com/github/arunvithyasegar/News_sentiment_Analysis/blob/main/Test_Ground_Sentiment_analysis_from_NEWS_api.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# 1. Install Required Packages
!pip install textblob
!python -m textblob.download_corpora
!pip install vaderSentiment
!pip install plotly

[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package conll2000 to /root/nltk_data...
[nltk_data]   Package conll2000 is already up-to-date!
[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!
Finished.


In [None]:
# 2. Import Required Libraries
import requests
import json
import pandas as pd
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import plotly.express as px
import plotly.graph_objects as go

In [None]:
# 3. Scrape the data from public news site
base_url = "https://newsdata.io/api/1/latest"
key = "pub_86076086703c94c2637e240672a4a90a30ad9"  # Replace with your actual API key
category = "business"
contents = ["electronics", "semiconductors", "manufacturing"]
num_news = 30  # Number of news articles to retrieve per topic

def get_news(base_url, key, category, contents, num_news):
    headlines = []
    for content in contents:
        url = f"{base_url}?apikey={key}&q={content}&category={category}"
        page = 0
        content_headlines_count = 0
        while content_headlines_count < num_news:
            if page:
                current_url = f"{url}&page={page}"
            else:
                current_url = url
            response = requests.get(current_url)
            if response.status_code == 200:
                data = json.loads(response.text)
                results = data.get("results", [])
                if not results:
                    break  # No more results to fetch

                for i in range(len(results)):
                    news = results[i]
                    title = news.get("title")
                    link = news.get("link")
                    country = news.get("country")
                    pubDate = news.get("pubDate")

                    if title and link:
                        headlines.append({
                            "title": title,
                            "link": link,
                            "country": country,
                            "pubDate": pubDate,
                            "business_headline": content
                        })
                page = data.get("nextPage")
                content_headlines_count += len(results)
                print(content, content_headlines_count)
            else:
                print(f"Error fetching news for '{content}': {response.status_code}")
                break

    return headlines

In [None]:
# Get the news headlines
news_headlines = get_news(base_url, key, category, contents, num_news)

# 4. Data Preprocessing
headlines_df = pd.DataFrame(news_headlines)
print("Duplicate Titles", headlines_df["title"].duplicated().sum())
# Remove duplicates
headlines_df.drop_duplicates(subset='title', inplace=True)

def process_dataframe(df):
    # Group by headline and sample 20 rows
    sampled_df = df.groupby('business_headline').head(20)
    return sampled_df

df = process_dataframe(headlines_df)
df.reset_index(drop=True, inplace=True)

electronics 10
electronics 20
electronics 30
semiconductors 10
semiconductors 20
semiconductors 30
manufacturing 10
manufacturing 20
manufacturing 30
Duplicate Titles 12


In [None]:
df.head()

Unnamed: 0,title,link,country,pubDate,business_headline,sentiment
0,Olympian Motors and Foxconn Partner to Launch ...,https://www.globenewswire.com/fr/news-release/...,[france],2025-05-10 03:45:00,electronics,Neutral
1,Dalal St bleeds on intensifying strikes,https://www.thehansindia.com/business/dalal-st...,[india],2025-05-10 03:42:44,electronics,Negative
2,Getting Big Batteries (BESS) & Pumped Hydro (P...,https://cleantechnica.com/2025/05/09/getting-b...,[united states of america],2025-05-10 03:24:33,electronics,Positive
3,Jorge Hurtado presidirá la planta española d...,https://www.economiadigital.es/valencia/empres...,[spain],2025-05-10 02:55:00,electronics,Negative
4,India-Pakistan war buzz: HAL vs BEL vs Mazagon...,https://www.livemint.com/market/stock-market-n...,[india],2025-05-10 02:29:13,electronics,Negative


In [None]:
# 5. Sentiment Analysis
def analyze_sentiment(text):
    analyzer = SentimentIntensityAnalyzer()
    scores = analyzer.polarity_scores(text)
    compound_score = scores['compound']

    if compound_score >= 0.05:
        return 'Positive'
    elif compound_score <= -0.05:
        return 'Negative'
    else:
        return 'Neutral'
# Apply sentiment analysis
df['sentiment'] = df['title'].apply(analyze_sentiment)

# Display the first few rows with sentiment analysis
df[['title', 'business_headline', 'sentiment']].head()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['sentiment'] = df['title'].apply(analyze_sentiment)


Unnamed: 0,title,business_headline,sentiment
0,Olympian Motors and Foxconn Partner to Launch ...,electronics,Neutral
1,Dalal St bleeds on intensifying strikes,electronics,Negative
2,Getting Big Batteries (BESS) & Pumped Hydro (P...,electronics,Positive
3,Jorge Hurtado presidirá la planta española d...,electronics,Negative
4,India-Pakistan war buzz: HAL vs BEL vs Mazagon...,electronics,Negative


In [None]:
# 6. Visualize Sentiment Distribution

# a. Overall Sentiment Distribution
sentiment_counts = df['sentiment'].value_counts().reset_index()
sentiment_counts.columns = ['Sentiment', 'Count']

fig1 = px.bar(
    sentiment_counts,
    x='Sentiment',
    y='Count',
    color='Sentiment',
    color_discrete_map={'Positive': 'green', 'Neutral': 'gray', 'Negative': 'red'},
    title='Distribution of Sentiments across Headlines (electronics, semiconductors, manufacturing)'
)

fig1.update_layout(
    xaxis_title='Sentiment',
    yaxis_title='Number of Headlines',
    template='plotly_white'
)
fig1.show()


In [None]:
# b. Sentiment Distribution by Topic
topic_sentiment = df.groupby(['business_headline', 'sentiment']).size().reset_index(name='Count')

fig2 = px.bar(
    topic_sentiment,
    x='business_headline',
    y='Count',
    color='sentiment',
    color_discrete_map={'Positive': 'green', 'Neutral': 'gray', 'Negative': 'red'},
    title='Sentiment Distribution by Topic',
    barmode='group'
)

fig2.update_layout(
    xaxis_title='Topic',
    yaxis_title='Number of Headlines',
    legend_title='Sentiment',
    template='plotly_white'
)
fig2.show()

In [None]:
# c. Pie Chart of Sentiment Distribution
fig3 = px.pie(
    sentiment_counts,
    values='Count',
    names='Sentiment',
    color='Sentiment',
    color_discrete_map={'Positive': 'green', 'Neutral': 'gray', 'Negative': 'red'},
    title='Proportion of Sentiments'
)
fig3.show()

In [None]:
# Save results to CSV if needed
df.to_csv('news_sentiment_analysis.csv', index=False)

print("Sentiment analysis completed!")

Sentiment analysis completed!


In [3]:
!pip install newspaper3k

Collecting newspaper3k
  Downloading newspaper3k-0.2.8-py3-none-any.whl.metadata (11 kB)
Collecting cssselect>=0.9.2 (from newspaper3k)
  Downloading cssselect-1.3.0-py3-none-any.whl.metadata (2.6 kB)
Collecting feedparser>=5.2.1 (from newspaper3k)
  Downloading feedparser-6.0.11-py3-none-any.whl.metadata (2.4 kB)
Collecting tldextract>=2.0.1 (from newspaper3k)
  Downloading tldextract-5.3.0-py3-none-any.whl.metadata (11 kB)
Collecting feedfinder2>=0.0.4 (from newspaper3k)
  Downloading feedfinder2-0.0.4.tar.gz (3.3 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting jieba3k>=0.35.1 (from newspaper3k)
  Downloading jieba3k-0.35.1.zip (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m49.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting tinysegmenter==0.3 (from newspaper3k)
  Downloading tinysegmenter-0.3.tar.gz (16 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collec

In [6]:
!pip install lxml-html-clean

Collecting lxml-html-clean
  Downloading lxml_html_clean-0.4.2-py3-none-any.whl.metadata (2.4 kB)
Downloading lxml_html_clean-0.4.2-py3-none-any.whl (14 kB)
Installing collected packages: lxml-html-clean
Successfully installed lxml-html-clean-0.4.2


In [8]:
!pip install pycountry

Collecting pycountry
  Downloading pycountry-24.6.1-py3-none-any.whl.metadata (12 kB)
Downloading pycountry-24.6.1-py3-none-any.whl (6.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m61.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pycountry
Successfully installed pycountry-24.6.1
