# Import sample data

In [5]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import pycountry
import re

# Function to get the various attributes of the article
def getArticles(articles):
    """Collect the title, link, description and publication date of each feed item.

    Args:
        articles: Iterable of parsed feed elements. Each element must provide a
            ``find(name)`` method that returns a child exposing ``.text``, or
            ``None`` when that child is absent.

    Returns:
        list[dict]: One dict per element with the keys ``'title'``, ``'link'``,
        ``'description'`` and ``'published'``. Any field whose child element is
        missing is stored as ``None``.
    """
    def _child_text(node, tag):
        # find() gives None for an absent child; getattr turns that into a None
        # field instead of an AttributeError, so one malformed item cannot
        # abort the whole extraction.
        return getattr(node.find(tag), 'text', None)

    return [
        {
            'title': _child_text(article, 'title'),
            'link': _child_text(article, 'link'),
            'description': _child_text(article, 'description'),
            'published': _child_text(article, 'pubDate'),
        }
        for article in articles
    ]
    
# Function to fetch an RSS feed and parse its items (named for CNN, but it works with any feed URL)
def cnn_news_scrapper(URL, timeout=10, verbose=True):
    """Download an RSS feed and return its ``<item>`` entries as article dicts.

    Nothing in the body is specific to CNN: the function requests ``URL``,
    parses the response as XML and passes every ``<item>`` element to
    ``getArticles``.

    Args:
        URL: Address of the RSS/XML feed to download.
        timeout: Seconds to wait for the server before the request is
            abandoned, so a stalled connection cannot hang the notebook.
        verbose: When True (the default) the raw ``<item>`` elements are
            printed as well as their count. Pass False to keep the cell
            output short.

    Returns:
        The list produced by ``getArticles`` on success, or ``None`` when the
        request, the parsing or the extraction fails. In the failure case the
        exception is printed rather than raised.
    """
    try:
        r = requests.get(URL, timeout=timeout)
        # Route 4xx/5xx responses to the failure branch instead of reporting
        # "Job Succeeded" for an error page.
        r.raise_for_status()
        soupContent = BeautifulSoup(r.content, 'xml')
        print('Job Succeeded returning Status Code: ', r.status_code)
        # Search the document once and reuse the result for both the report
        # and the extraction.
        items = soupContent.find_all('item')
        print('Total News Content')
        print(len(items))
        if verbose:
            print(items)
        return getArticles(items)
    except Exception as e:
        # Best-effort by design: report the problem and let the caller decide.
        print('Scraping failed due to the below exception')
        print(e)
        return None

# RSS feed used as the sample data source (science_and_environment, edition=uk)
FEED_URL = 'https://feeds.bbci.co.uk/news/science_and_environment/rss.xml?edition=uk'

# Scrape the feed, then tabulate the resulting article records
data = cnn_news_scrapper(FEED_URL)
df = pd.DataFrame(data)


Job Succeeded returning Status Code:  200
Total News Content
19
[<item>
<title>Alien life in Universe: Scientists say finding it is 'only a matter of time'</title>
<description>Experts are optimistic of detecting life signs on a faraway world within our lifetimes - possibly in the next few years.</description>
<link>https://www.bbc.co.uk/news/science-environment-66950930?at_medium=RSS&amp;at_campaign=KARANGA</link>
<guid isPermaLink="false">https://www.bbc.co.uk/news/science-environment-66950930</guid>
<pubDate>Fri, 29 Sep 2023 23:20:01 GMT</pubDate>
</item>, <item>
<title>UK unready as wildfires surge, warns firefighters' union</title>
<description>Wildfire response across the UK is an under-resourced "postcode lottery", claims a new union report.</description>
<link>https://www.bbc.co.uk/news/science-environment-66948836?at_medium=RSS&amp;at_campaign=KARANGA</link>
<guid isPermaLink="false">https://www.bbc.co.uk/news/science-environment-66948836</guid>
<pubDate>Fri, 29 Sep 2023 01:22

In [6]:
# Preview the first rows of the scraped article table
df.head()

Unnamed: 0,title,link,description,published
0,Alien life in Universe: Scientists say finding...,https://www.bbc.co.uk/news/science-environment...,Experts are optimistic of detecting life signs...,"Fri, 29 Sep 2023 23:20:01 GMT"
1,"UK unready as wildfires surge, warns firefight...",https://www.bbc.co.uk/news/science-environment...,Wildfire response across the UK is an under-re...,"Fri, 29 Sep 2023 01:22:34 GMT"
2,Single-use plastic ban: Some firms unaware of ...,https://www.bbc.co.uk/news/business-66946643?a...,Plastic cutlery is banned in England from Sund...,"Fri, 29 Sep 2023 10:57:49 GMT"
3,Nature crisis: One in six species at risk of e...,https://www.bbc.co.uk/news/science-environment...,The loss of Britain's wildlife is outpacing ef...,"Wed, 27 Sep 2023 18:07:20 GMT"
4,Scientists get closer to solving mystery of an...,https://www.bbc.co.uk/news/science-environment...,The elusive substance holds the key to discove...,"Wed, 27 Sep 2023 15:06:19 GMT"


# Conduct sentiment analysis using spaCy (TextBlob) & VADER

In [13]:
import pandas as pd
from textblob import TextBlob
import spacy
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Load a SpaCy Model
nlp = spacy.load('en_core_web_sm')

# Initialize VADER SentimentIntensityAnalyzer
analyzer = SentimentIntensityAnalyzer()

# Function to get sentiment
def get_sentiment(text):
    """Return the TextBlob polarity of ``text`` after running it through spaCy.

    Args:
        text: The string to score. Anything that is not a ``str`` (for example
            a ``None`` left by a feed item without a description) is treated
            as missing.

    Returns:
        float: ``TextBlob(...).sentiment.polarity`` for the text, or ``nan``
        when ``text`` is not a string.
    """
    # Missing fields reach this function as None; report them as NaN so a
    # single incomplete article does not abort the whole ``.apply`` call.
    if not isinstance(text, str):
        return float('nan')

    # Process text with SpaCy
    doc = nlp(text)

    # NOTE(review): only ``doc.text`` is handed to TextBlob, so the score is
    # produced by TextBlob alone; the spaCy pipeline output is not used.
    # Confirm whether the spaCy pass is still wanted here.
    blob = TextBlob(doc.text)

    # Return the polarity
    return blob.sentiment.polarity

# Function to apply sentiment analysis
def get_vader_sentiment(text):
    """Return the VADER ``compound`` score for ``text``.

    Args:
        text: The string to score. Anything that is not a ``str`` (for example
            a ``None`` left by a feed item without a description) is treated
            as missing.

    Returns:
        float: The ``'compound'`` entry of ``analyzer.polarity_scores(text)``,
        or ``nan`` when ``text`` is not a string.
    """
    # Missing fields reach this function as None; report them as NaN so a
    # single incomplete article does not abort the whole ``.apply`` call.
    if not isinstance(text, str):
        return float('nan')

    sentiment_score = analyzer.polarity_scores(text)
    # Only the compound value is returned; the full dictionary is available in
    # ``sentiment_score`` if the other entries are ever needed.
    return sentiment_score['compound']

# Score the title and description with both scorers, adding one new column per
# (scorer, text column) pair. Insertion order matches the column order wanted
# in the frame.
sentiment_columns = {
    'spacy_title': ('title', get_sentiment),
    'spacy_description': ('description', get_sentiment),
    'vader_title': ('title', get_vader_sentiment),
    'vader_description': ('description', get_vader_sentiment),
}
for target_column, (source_column, scorer) in sentiment_columns.items():
    df[target_column] = df[source_column].apply(scorer)

In [15]:
# Preview the first five rows, now including the sentiment columns.
# NOTE(review): the recorded output also lists title_sentiment and
# description_sentiment, which no visible cell creates — presumably left over
# from a deleted or unshown cell; confirm with Restart Kernel -> Run All.
df.head(5)

Unnamed: 0,title,link,description,published,title_sentiment,description_sentiment,spacy_title,spacy_description,vader_title,vader_description
0,Alien life in Universe: Scientists say finding...,https://www.bbc.co.uk/news/science-environment...,Experts are optimistic of detecting life signs...,"Fri, 29 Sep 2023 23:20:01 GMT",-0.125,-0.066667,-0.125,-0.066667,0.0258,0.3182
1,"UK unready as wildfires surge, warns firefight...",https://www.bbc.co.uk/news/science-environment...,Wildfire response across the UK is an under-re...,"Fri, 29 Sep 2023 01:22:34 GMT",0.0,0.136364,0.0,0.136364,-0.1027,0.0
2,Single-use plastic ban: Some firms unaware of ...,https://www.bbc.co.uk/news/business-66946643?a...,Plastic cutlery is banned in England from Sund...,"Fri, 29 Sep 2023 10:57:49 GMT",0.068182,0.0,0.068182,0.0,-0.8402,-0.25
3,Nature crisis: One in six species at risk of e...,https://www.bbc.co.uk/news/science-environment...,The loss of Britain's wildlife is outpacing ef...,"Wed, 27 Sep 2023 18:07:20 GMT",0.8,0.0625,0.8,0.0625,-0.2732,0.0772
4,Scientists get closer to solving mystery of an...,https://www.bbc.co.uk/news/science-environment...,The elusive substance holds the key to discove...,"Wed, 27 Sep 2023 15:06:19 GMT",0.0,0.0,0.0,0.0,0.34,0.0


In [16]:
# TODO: When the spaCy (TextBlob) and VADER scores disagree, how do we want to reconcile them?