# Import data

In [2]:
import numpy as np
import pandas as pd
import pycountry
import re
import spacy

In [7]:
df_bbc = pd.read_csv("../data/bbc_science_and_climate_articles_2010-2023.csv")
df_nasa = pd.read_csv("../data/nasa_climate_articles_2020-2023.csv")

In [8]:
df_bbc.head()

Unnamed: 0,title,link,description,published
0,Mars rover's wind sensor damaged,http://www.bbc.co.uk/news/science-environment-...,A sensor on the mast of Nasa's Curiosity rover...,"Tue, 21 Aug 2012 20:03:54 GMT"
1,Bugs sunbathe to 'stay healthy',http://www.bbc.co.uk/nature/19319086,Western boxelder bugs sunbathe to fight off ge...,"Wed, 22 Aug 2012 08:08:03 GMT"
2,Star is caught devouring planet,http://www.bbc.co.uk/news/science-environment-...,Astronomers spot evidence of a distant star co...,"Tue, 21 Aug 2012 15:55:38 GMT"
3,Arctic ice set to hit record low,http://www.bbc.co.uk/news/science-environment-...,Arctic sea ice looks set to reach a record low...,"Tue, 21 Aug 2012 10:55:46 GMT"
4,Shanghai tops 'flood risk list',http://www.bbc.co.uk/news/science-environment-...,Shanghai is the most vulnerable major city in ...,"Tue, 21 Aug 2012 06:58:06 GMT"


In [9]:
df_nasa.head()

Unnamed: 0,title,link,description,published
0,Meteors Great and Small,https://earthobservatory.nasa.gov/images/14869...,While the major meteor showers draw the most a...,"Sat, 14 Aug 2021 00:00:00 -0400"
1,Oceans Primed for Peak of Hurricane Season,https://earthobservatory.nasa.gov/images/14868...,Sea surface temperatures are a fair predictor ...,"Fri, 13 Aug 2021 00:00:00 -0400"
2,Fire Outbreak in Algeria,https://earthobservatory.nasa.gov/images/14868...,"More than 62,000 hectares have burned in the c...","Wed, 11 Aug 2021 18:05:01 -0400"
3,Fire Consumes Large Swaths of Greece,https://earthobservatory.nasa.gov/images/14868...,Fires in the country have consumed five times ...,"Wed, 11 Aug 2021 17:39:23 -0400"
4,Sizing Up How Agriculture Connects to Deforest...,https://earthobservatory.nasa.gov/images/14867...,Using satellite data from the past two decades...,"Wed, 11 Aug 2021 00:00:00 -0400"


In [12]:
df = pd.concat([df_bbc, df_nasa], axis = 0)

In [14]:
df.shape

(20854, 4)

# Use NLP to detect location in titles and descriptions

In [16]:
#Convert title and description to string
df['title'] = df['title'].astype(str)
df['description'] = df['description'].astype(str)

# Load the SpaCy model
nlp = spacy.load('en_core_web_sm')

# Extract location entities
#GPE = geopolitical entities (cities/countries)
#LOC = non-GPE, mountain ranges/bodies of water

def extract_locations(text):
    doc = nlp(text)
    locations = [ent.text for ent in doc.ents if ent.label_ in ['GPE', 'LOC']]
    return ', '.join(locations)

# Apply the function to the 'Text' column and create a new column 'Locations' with the results
df['nlp_title'] = df['title'].apply(extract_locations)
df['nlp_descrip'] = df['description'].apply(extract_locations)

#Save file with parsed locations as a csv file (it takes 7 min to rerun the code)
df.to_csv("../data/bbc_and_nasa_articles_locations_parsed.csv")


# Merge with US city locality data

In [47]:
#Subset rows that have location data
#10,172 out of 20,102 total rows have location data
df1 = df[df['nlp_title'].notna() | df['nlp_descrip'].notna()]

#Import US cities dataframe
us_locations = pd.read_csv("../data/uscities.csv")
us_states = pd.read_csv("../data/us_states.csv")

#Use only city
city_list = us_locations['city'].unique().tolist()
state_list = us_locations['state_name'].unique().tolist()

#Find titles that match city location list
def match_location(row):
    if row['nlp_title'] in city_list or row['nlp_title'] in state_list:
        return row['nlp_title']
    elif row['nlp_descrip'] in city_list or row['nlp_descrip'] in state_list:
        return row['nlp_descrip']
    else:
        return np.nan

#Create column that identifies which city matches
df1['location_match'] = df1.apply(match_location, axis=1)

#Subset if data frame has at least one city_match
df2 = df1[df1['location_match'].notna()]

#Keep only important columns
df3 = df2[['title', 'link', 'description', 'published', 'location_match']]

In [48]:
df3.head()

Unnamed: 0,title,link,description,published,location_match
0,Mars rover's wind sensor damaged,http://www.bbc.co.uk/news/science-environment-...,A sensor on the mast of Nasa's Curiosity rover...,"Tue, 21 Aug 2012 20:03:54 GMT",Mars
2,Star is caught devouring planet,http://www.bbc.co.uk/news/science-environment-...,Astronomers spot evidence of a distant star co...,"Tue, 21 Aug 2012 15:55:38 GMT",Earth
7,Nasa selects another Mars mission,http://www.bbc.co.uk/news/science-environment-...,Just two weeks after landing its Curiosity rov...,"Mon, 20 Aug 2012 20:39:33 GMT",Mars
16,Nasa's Mars rover zaps first rock,http://www.bbc.co.uk/news/science-environment-...,Nasa's Curiosity rover deploys its laser instr...,"Mon, 20 Aug 2012 11:32:28 GMT",Mars
19,Scott's ship found off Greenland,http://www.bbc.co.uk/news/science-environment-...,The wreck of the ship that carried Captain Rob...,"Thu, 16 Aug 2012 14:52:21 GMT",Greenland


In [49]:
#Drop cities that have duplicate keys (same city name but different locality)
us_cities_filtered = us_locations[us_locations['city'].duplicated(keep=False) == False]

#Keep important columns
us_cities_filtered = us_cities_filtered[['city', 'state_name', 'county_name', 'lat', 'lng', 'zips']]


In [50]:
locations_map = df3.set_index('location_match')[['title', 'link', 'description', 'published']]
locations_map

Unnamed: 0_level_0,title,link,description,published
location_match,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Mars,Mars rover's wind sensor damaged,http://www.bbc.co.uk/news/science-environment-...,A sensor on the mast of Nasa's Curiosity rover...,"Tue, 21 Aug 2012 20:03:54 GMT"
Earth,Star is caught devouring planet,http://www.bbc.co.uk/news/science-environment-...,Astronomers spot evidence of a distant star co...,"Tue, 21 Aug 2012 15:55:38 GMT"
Mars,Nasa selects another Mars mission,http://www.bbc.co.uk/news/science-environment-...,Just two weeks after landing its Curiosity rov...,"Mon, 20 Aug 2012 20:39:33 GMT"
Mars,Nasa's Mars rover zaps first rock,http://www.bbc.co.uk/news/science-environment-...,Nasa's Curiosity rover deploys its laser instr...,"Mon, 20 Aug 2012 11:32:28 GMT"
Greenland,Scott's ship found off Greenland,http://www.bbc.co.uk/news/science-environment-...,The wreck of the ship that carried Captain Rob...,"Thu, 16 Aug 2012 14:52:21 GMT"
...,...,...,...,...
Earth,Typhoon Hinnamnor,https://earthobservatory.nasa.gov/images/15029...,"On August 30, the storm became the first categ...","Thu, 01 Sep 2022 18:09:36 -0400"
China,Parched Poyang Lake,https://earthobservatory.nasa.gov/images/15028...,Prolonged heat and drought have drained China’...,"Thu, 01 Sep 2022 16:47:57 -0400"
Russia,From Russia with Questions,https://earthobservatory.nasa.gov/images/14796...,Researchers are puzzling over a distinctive st...,"Tue, 23 Feb 2021 00:00:00 -0500"
Earth,Hints of a Recent Eruption,https://earthobservatory.nasa.gov/images/14795...,Photographing an ongoing volcanic eruption any...,"Sun, 21 Feb 2021 00:00:00 -0500"


In [51]:
us_cities_filtered.columns

Index(['city', 'state_name', 'county_name', 'lat', 'lng', 'zips'], dtype='object')

In [52]:
df4 = pd.concat([pd.merge(us_cities_filtered, df3, left_on='city', right_on = 'location_match'),
           pd.merge(us_states, df3, left_on='state_name', right_on='location_match')])

In [53]:
df4.shape

(2116, 11)

In [55]:
df4.head(50)

Unnamed: 0,city,state_name,county_name,lat,lng,zips,title,link,description,published,location_match
0,New York,New York,Queens,40.6943,-73.9249,11229 11226 11225 11224 11223 11221 11220 1138...,How ancient collision shaped New York skyline,http://www.bbc.co.uk/news/science-environment-...,How the collision of ancient landmasses framed...,"Fri, 07 Jun 2013 01:26:25 GMT",New York
1,New York,New York,Queens,40.6943,-73.9249,11229 11226 11225 11224 11223 11221 11220 1138...,New York 'at risk' as seas rise,http://www.bbc.co.uk/go/rss/int/news/-/news/sc...,New York is set to be a major loser as a resul...,"Fri, 08 Apr 2011 12:00:28 GMT",New York
2,New York,New York,Queens,40.6943,-73.9249,11229 11226 11225 11224 11223 11221 11220 1138...,VIDEO: Get up close to shuttle Enterprise,http://www.bbc.co.uk/news/science-environment-...,Nasa's shuttle Enterprise has moved to a new h...,"Thu, 19 Jul 2012 12:53:27 GMT",New York
3,New York,New York,Queens,40.6943,-73.9249,11229 11226 11225 11224 11223 11221 11220 1138...,VIDEO: 'Bio-hackers' create New York lab,http://www.bbc.co.uk/go/rss/int/news/-/news/wo...,A group of scientists have created the first c...,"Wed, 15 Feb 2012 03:26:00 GMT",New York
4,New York,New York,Queens,40.6943,-73.9249,11229 11226 11225 11224 11223 11221 11220 1138...,Lab rodents drowned in NY floods,http://www.bbc.co.uk/news/science-environment-...,Thousands of lab rodents have drowned after st...,"Thu, 01 Nov 2012 15:43:51 GMT",New York
5,New York,New York,Queens,40.6943,-73.9249,11229 11226 11225 11224 11223 11221 11220 1138...,VIDEO: Shuttle cruises through New York,http://www.bbc.co.uk/news/world-us-canada-1834...,The space shuttle Enterprise is making the fin...,"Wed, 06 Jun 2012 16:37:45 GMT",New York
6,New York,New York,Queens,40.6943,-73.9249,11229 11226 11225 11224 11223 11221 11220 1138...,DNA pioneer's Nobel Prize auctioned,http://www.bbc.co.uk/news/world-us-canada-2211...,The Nobel Prize won by British scientist Franc...,"Thu, 11 Apr 2013 17:54:10 GMT",New York
7,New York,New York,Queens,40.6943,-73.9249,11229 11226 11225 11224 11223 11221 11220 1138...,VIDEO: 'Duelling dinosaurs' for auction,http://www.bbc.co.uk/news/world-us-canada-2495...,Two dinosaurs thought to have been locked in c...,"Fri, 15 Nov 2013 11:33:09 GMT",New York
8,New York,New York,Queens,40.6943,-73.9249,11229 11226 11225 11224 11223 11221 11220 1138...,New frog species spotted in NYC,http://www.bbc.co.uk/go/rss/int/news/-/news/wo...,Scientists say they have spotted a new species...,"Wed, 14 Mar 2012 15:41:46 GMT",New York
9,New York,New York,Queens,40.6943,-73.9249,11229 11226 11225 11224 11223 11221 11220 1138...,New York's real-world 'Sim City',http://www.bbc.co.uk/news/technology-19753721#...,A computer game is being used to plan changes ...,"Tue, 02 Oct 2012 06:30:19 GMT",New York


# Remove if city matches a country name or if it's a planet name (there are articles with Mars and Earth as a location)

In [56]:

# Create a list of world country names
countries_info = []
for country in pycountry.countries:
    countries_info.append({
        'country_name': country.name,
        'alpha_2': country.alpha_2,
        'alpha_3': country.alpha_3,
        'official_name': getattr(country, 'official_name', country.name)  # Some countries might not have an official_name attribute.
    })

# Creating a DataFrame from the list of dictionaries.
df_countries = pd.DataFrame(countries_info)

countries_as_lists = {col_name: df_countries[col_name].tolist() for col_name in df_countries.columns}
country_name = countries_as_lists['country_name']
planet_name = ['Mercury', 'Venus', 'Earth', 'Mars', 'Jupiter', 'Saturn', 'Uranus', 'Neptune']


In [57]:
df5 = df4[~df4['city'].isin(country_name)]
final_df = df5[~df5['city'].isin(planet_name)]

In [58]:
len(final_df)

890

In [59]:
final_df['city'].unique()

array(['New York', 'Chicago', 'Seattle', 'San Francisco', 'New Orleans',
       'Tulsa', 'Santa Barbara', 'Suffolk', 'Carmichael', 'La Palma',
       'Hereford', 'Cape Canaveral', 'Hampshire', 'Belfast', 'North Sea',
       'Dumfries', 'Brisbane', 'Herculaneum', 'Edinburgh', 'North East',
       'Northumberland', 'North Wales', 'Louisiana', 'Stirling',
       'North Pole', 'Maine', 'Rugby', 'England', 'West End', "L'Anse",
       'Edmonton', 'Queensland', 'Baird', 'Farmland', 'Theresa',
       'Sardinia', 'Tripoli', 'Marine', 'Oklahoma', 'Gulf Stream',
       'Patagonia', 'Russia', 'Amanda', 'Virgin', 'Isle', 'Island',
       'Truro', 'Sun', 'Vermont', 'Ohio', 'Everest', 'Fire Island',
       'Amazonia', 'Munich', 'Zephyr', 'Bolivia', 'Midwest', 'Whitehaven',
       'Wimbledon', 'Brussels', 'Mount Etna', 'Nile', 'Donegal',
       'San Cristobal', 'Johannesburg', 'Tennessee', 'Santiago', 'Gulf',
       'Birdsong', 'Perth', nan], dtype=object)

In [60]:
final_df['state_name'].unique()

array(['New York', 'Illinois', 'Washington', 'California', 'Louisiana',
       'Oklahoma', 'Virginia', 'Texas', 'Florida', 'Maine', 'Missouri',
       'Indiana', 'Maryland', 'Pennsylvania', 'New Jersey', 'Alaska',
       'Wisconsin', 'North Dakota', 'Arkansas', 'Michigan', 'Kentucky',
       'Ohio', 'Iowa', 'Arizona', 'Utah', 'Minnesota', 'Kansas',
       'North Carolina', 'Wyoming', 'New Mexico', 'Connecticut',
       'Georgia', 'Hawaii', 'Massachusetts', 'Mississippi', 'Montana',
       'Nevada', 'Oregon', 'Tennessee', 'Vermont', 'West Virginia'],
      dtype=object)

In [62]:
confused_list =['Suffolk','Belfast','North Sea','Hampshire', 'Northumberland','Dumfries','Brisbane','Herculaneum','Edinburgh','North East','North Wales',
                'Louisiana','Stirling','Maine','Rugby','England','West End','Edmonton','Sardinia','Tripoli','Queensland','Oklahoma','Isle','Munich','Johannesburg',
               'Ohio','Everest','Wimbledon','San Cristobal','Tennessee','Santiago','Perth','Vermont','Bolivia','Brussels','Mount Etna','Russia','Patagonia','Amanda',
                'Amazonia','Donegal','Nile']

In [64]:
final_df = final_df.loc[~final_df['city'].isin(confused_list)]
final_df.shape

(446, 11)

In [65]:
final_df.head(30)

Unnamed: 0,city,state_name,county_name,lat,lng,zips,title,link,description,published,location_match
0,New York,New York,Queens,40.6943,-73.9249,11229 11226 11225 11224 11223 11221 11220 1138...,How ancient collision shaped New York skyline,http://www.bbc.co.uk/news/science-environment-...,How the collision of ancient landmasses framed...,"Fri, 07 Jun 2013 01:26:25 GMT",New York
1,New York,New York,Queens,40.6943,-73.9249,11229 11226 11225 11224 11223 11221 11220 1138...,New York 'at risk' as seas rise,http://www.bbc.co.uk/go/rss/int/news/-/news/sc...,New York is set to be a major loser as a resul...,"Fri, 08 Apr 2011 12:00:28 GMT",New York
2,New York,New York,Queens,40.6943,-73.9249,11229 11226 11225 11224 11223 11221 11220 1138...,VIDEO: Get up close to shuttle Enterprise,http://www.bbc.co.uk/news/science-environment-...,Nasa's shuttle Enterprise has moved to a new h...,"Thu, 19 Jul 2012 12:53:27 GMT",New York
3,New York,New York,Queens,40.6943,-73.9249,11229 11226 11225 11224 11223 11221 11220 1138...,VIDEO: 'Bio-hackers' create New York lab,http://www.bbc.co.uk/go/rss/int/news/-/news/wo...,A group of scientists have created the first c...,"Wed, 15 Feb 2012 03:26:00 GMT",New York
4,New York,New York,Queens,40.6943,-73.9249,11229 11226 11225 11224 11223 11221 11220 1138...,Lab rodents drowned in NY floods,http://www.bbc.co.uk/news/science-environment-...,Thousands of lab rodents have drowned after st...,"Thu, 01 Nov 2012 15:43:51 GMT",New York
5,New York,New York,Queens,40.6943,-73.9249,11229 11226 11225 11224 11223 11221 11220 1138...,VIDEO: Shuttle cruises through New York,http://www.bbc.co.uk/news/world-us-canada-1834...,The space shuttle Enterprise is making the fin...,"Wed, 06 Jun 2012 16:37:45 GMT",New York
6,New York,New York,Queens,40.6943,-73.9249,11229 11226 11225 11224 11223 11221 11220 1138...,DNA pioneer's Nobel Prize auctioned,http://www.bbc.co.uk/news/world-us-canada-2211...,The Nobel Prize won by British scientist Franc...,"Thu, 11 Apr 2013 17:54:10 GMT",New York
7,New York,New York,Queens,40.6943,-73.9249,11229 11226 11225 11224 11223 11221 11220 1138...,VIDEO: 'Duelling dinosaurs' for auction,http://www.bbc.co.uk/news/world-us-canada-2495...,Two dinosaurs thought to have been locked in c...,"Fri, 15 Nov 2013 11:33:09 GMT",New York
8,New York,New York,Queens,40.6943,-73.9249,11229 11226 11225 11224 11223 11221 11220 1138...,New frog species spotted in NYC,http://www.bbc.co.uk/go/rss/int/news/-/news/wo...,Scientists say they have spotted a new species...,"Wed, 14 Mar 2012 15:41:46 GMT",New York
9,New York,New York,Queens,40.6943,-73.9249,11229 11226 11225 11224 11223 11221 11220 1138...,New York's real-world 'Sim City',http://www.bbc.co.uk/news/technology-19753721#...,A computer game is being used to plan changes ...,"Tue, 02 Oct 2012 06:30:19 GMT",New York


# Export data

In [66]:
final_df.to_csv("../data/bbc_and_nasa_articles_locations_coordinates.csv")