# Using sample RSS Beautiful Soup dataframe to test

In [186]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import pycountry
import re

In [187]:
# Function to get the various attributes of the article
def getArticles(articles):
    all_articles = []
    for article in articles:
        article_title = article.find('title').text
        article_link = getattr(article.find('link'), 'text', None)
        article_desc = getattr(article.find('description'), 'text', None)
        article_published = getattr(article.find('pubDate'), 'text', None)
        all_articles.append({
            'title':article_title,
            'link':article_link,
            'description':article_desc,
            'published':article_published
        })
    return all_articles
    
# Function to invoke CNN Scrapper
def cnn_news_scrapper(URL):
    try:
        r = requests.get(URL)
        soupContent = BeautifulSoup(r.content,'xml')
        print('Job Succeeded returning Status Code: ', r.status_code)
        items = soupContent.findAll('item')
        print('Total News Content')
        print(len(items))
        print(items)
        return getArticles(soupContent.findAll('item'))
    except Exception as e:
        print('Scraping failed due to the below exception')
        print(e)

In [188]:
data = cnn_news_scrapper('https://feeds.bbci.co.uk/news/science_and_environment/rss.xml?edition=uk')

Job Succeeded returning Status Code:  200
Total News Content
19
[<item>
<title>Alien life in Universe: Scientists say finding it is 'only a matter of time'</title>
<description>Experts are optimistic of detecting life signs on a faraway world within our lifetimes - possibly in the next few years.</description>
<link>https://www.bbc.co.uk/news/science-environment-66950930?at_medium=RSS&amp;at_campaign=KARANGA</link>
<guid isPermaLink="false">https://www.bbc.co.uk/news/science-environment-66950930</guid>
<pubDate>Fri, 29 Sep 2023 23:20:01 GMT</pubDate>
</item>, <item>
<title>UK unready as wildfires surge, warns firefighters' union</title>
<description>Wildfire response across the UK is an under-resourced "postcode lottery", claims a new union report.</description>
<link>https://www.bbc.co.uk/news/science-environment-66948836?at_medium=RSS&amp;at_campaign=KARANGA</link>
<guid isPermaLink="false">https://www.bbc.co.uk/news/science-environment-66948836</guid>
<pubDate>Fri, 29 Sep 2023 01:22

In [189]:
df = pd.DataFrame(data)
df.head(1)

Unnamed: 0,title,link,description,published
0,Alien life in Universe: Scientists say finding...,https://www.bbc.co.uk/news/science-environment...,Experts are optimistic of detecting life signs...,"Fri, 29 Sep 2023 23:20:01 GMT"


# Method 1: Find location based on list of countries/states within the US 

In [190]:
# Creating a list of dictionaries, where each dictionary contains information about a country.
countries_info = []
for country in pycountry.countries:
    countries_info.append({
        'country_name': country.name,
        'alpha_2': country.alpha_2,
        'alpha_3': country.alpha_3,
        'official_name': getattr(country, 'official_name', country.name)  # Some countries might not have an official_name attribute.
    })

# Creating a DataFrame from the list of dictionaries.
df_countries = pd.DataFrame(countries_info)

In [191]:
# Extracting US state information from pycountry.
states_info = [{'state_code': subdiv.code.split('-')[1], 'state_name': subdiv.name} for subdiv in pycountry.subdivisions.get(country_code='US')]

# Creating a DataFrame from the list of dictionaries.
df_states = pd.DataFrame(states_info)

# Display the first few rows of the DataFrame.
print(df_states.head())

  state_code      state_name
0         CO        Colorado
1         IA            Iowa
2         MD        Maryland
3         NC  North Carolina
4         OH            Ohio


In [192]:
# Creating a dictionary to hold each column as a separate list
countries_as_lists = {col_name: df_countries[col_name].tolist() for col_name in df_countries.columns}

# Creating a dictionary to hold each column as a separate list
states_as_lists = {col_name: df_states[col_name].tolist() for col_name in df_states.columns}


In [193]:
country_name = columns_as_lists['country_name']
country_code2 = columns_as_lists['alpha_2']
country_code3 = columns_as_lists['alpha_3']
state_name = states_as_lists['state_name']
state_code = states_as_lists['state_code']

## If title/description matches any item in country/state list, create column identifying location that matches

In [194]:
pttrn = f"({'|'.join(country_name)})"
title_country = df['title'].str.extract(pttrn, flags=re.IGNORECASE, expand=False)
df['title_country'] = title_country[0]

pttrn = f"({'|'.join(state_name)})"
df['title_state'] = df['title'].str.extract(pttrn, flags=re.IGNORECASE, expand=False)

pttrn = f"({'|'.join(country_name)})"
descrip_country = df['description'].str.extract(pttrn, flags=re.IGNORECASE, expand=False)
df['descrip_country'] = descrip_country[0]

pttrn = f"({'|'.join(state_name)})"
df['descrip_state'] = df['description'].str.extract(pttrn, flags=re.IGNORECASE, expand=False)

In [195]:
df

Unnamed: 0,title,link,description,published,title_country,title_state,descrip_country,descrip_state
0,Alien life in Universe: Scientists say finding...,https://www.bbc.co.uk/news/science-environment...,Experts are optimistic of detecting life signs...,"Fri, 29 Sep 2023 23:20:01 GMT",,,,
1,"UK unready as wildfires surge, warns firefight...",https://www.bbc.co.uk/news/science-environment...,Wildfire response across the UK is an under-re...,"Fri, 29 Sep 2023 01:22:34 GMT",,,,
2,Single-use plastic ban: Some firms unaware of ...,https://www.bbc.co.uk/news/business-66946643?a...,Plastic cutlery is banned in England from Sund...,"Fri, 29 Sep 2023 10:57:49 GMT",,,,
3,Nature crisis: One in six species at risk of e...,https://www.bbc.co.uk/news/science-environment...,The loss of Britain's wildlife is outpacing ef...,"Wed, 27 Sep 2023 18:07:20 GMT",,,,
4,Scientists get closer to solving mystery of an...,https://www.bbc.co.uk/news/science-environment...,The elusive substance holds the key to discove...,"Wed, 27 Sep 2023 15:06:19 GMT",,,,
5,Government to delay new environmental building...,https://www.bbc.co.uk/news/science-environment...,Environmentalists say delaying the implementat...,"Tue, 26 Sep 2023 23:00:49 GMT",,,,
6,Climate change: Six young people take 32 count...,https://www.bbc.co.uk/news/world-europe-669235...,They claim governments' slow action on climate...,"Wed, 27 Sep 2023 00:53:18 GMT",,,,
7,Water firms forced to pay back customers for p...,https://www.bbc.co.uk/news/business-66922070?a...,Regulator Ofwat orders companies in England an...,"Tue, 26 Sep 2023 11:51:13 GMT",,,,
8,"Richest oil states should pay climate tax, say...",https://www.bbc.co.uk/news/uk-politics-6690639...,The former Labour PM wants the wealthiest oil ...,"Mon, 25 Sep 2023 05:00:05 GMT",,,,
9,Africa proposes global carbon taxes to fight c...,https://www.bbc.co.uk/news/world-africa-667335...,Despite suffering some of the worst impacts of...,"Thu, 07 Sep 2023 07:39:58 GMT",,,,


# Method 2: Use NLP to detect location

In [196]:
#Download English
# !pip install spacy
# !python -m spacy download en_core_web_sm


Collecting en-core-web-sm==3.6.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.6.0/en_core_web_sm-3.6.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [179]:
import spacy

# Load the SpaCy model
nlp = spacy.load('en_core_web_sm')

# Extract location entities
#GPE = geopolitical entities (cities/countries)
#LOC = non-GPE, mountain ranges/bodies of water

def extract_locations(text):
    doc = nlp(text)
    return [ent.text for ent in doc.ents if ent.label_ in ['GPE', 'LOC']]

# Apply the function to the 'Text' column and create a new column 'Locations' with the results
df['nlp_title'] = df['title'].apply(extract_locations)
df['nlp_descrip'] = df['description'].apply(extract_locations)

# Display the updated DataFrame
display(df)

Unnamed: 0,title,link,description,published,title_country,title_state,descrip_country,descrip_state,Locations,nlp_title,nlp_descrip
0,Alien life in Universe: Scientists say finding...,https://www.bbc.co.uk/news/science-environment...,Experts are optimistic of detecting life signs...,"Fri, 29 Sep 2023 23:20:01 GMT",,,,,[],[],[]
1,"UK unready as wildfires surge, warns firefight...",https://www.bbc.co.uk/news/science-environment...,Wildfire response across the UK is an under-re...,"Fri, 29 Sep 2023 01:22:34 GMT",,,,,[UK],[UK],[UK]
2,Single-use plastic ban: Some firms unaware of ...,https://www.bbc.co.uk/news/business-66946643?a...,Plastic cutlery is banned in England from Sund...,"Fri, 29 Sep 2023 10:57:49 GMT",,,,,[England],[England],[England]
3,Nature crisis: One in six species at risk of e...,https://www.bbc.co.uk/news/science-environment...,The loss of Britain's wildlife is outpacing ef...,"Wed, 27 Sep 2023 18:07:20 GMT",,,,,[Great Britain],[Great Britain],[Britain]
4,Scientists get closer to solving mystery of an...,https://www.bbc.co.uk/news/science-environment...,The elusive substance holds the key to discove...,"Wed, 27 Sep 2023 15:06:19 GMT",,,,,[],[],[]
5,Government to delay new environmental building...,https://www.bbc.co.uk/news/science-environment...,Environmentalists say delaying the implementat...,"Tue, 26 Sep 2023 23:00:49 GMT",,,,,[],[],[]
6,Climate change: Six young people take 32 count...,https://www.bbc.co.uk/news/world-europe-669235...,They claim governments' slow action on climate...,"Wed, 27 Sep 2023 00:53:18 GMT",,,,,[],[],[]
7,Water firms forced to pay back customers for p...,https://www.bbc.co.uk/news/business-66922070?a...,Regulator Ofwat orders companies in England an...,"Tue, 26 Sep 2023 11:51:13 GMT",,,,,[],[],"[England, Wales]"
8,"Richest oil states should pay climate tax, say...",https://www.bbc.co.uk/news/uk-politics-6690639...,The former Labour PM wants the wealthiest oil ...,"Mon, 25 Sep 2023 05:00:05 GMT",,,,,[],[],[]
9,Africa proposes global carbon taxes to fight c...,https://www.bbc.co.uk/news/world-africa-667335...,Despite suffering some of the worst impacts of...,"Thu, 07 Sep 2023 07:39:58 GMT",,,,,[Africa],[Africa],[]
