# Import data

In [217]:
import numpy as np
import pandas as pd
import pycountry
import re
import spacy

In [218]:
df = pd.read_csv("../data/bbc_science_and_climate_articles_2010-2023.csv")

# Use NLP to detect location in titles and descriptions

In [219]:
#Convert title and description to string
df['title'] = df['title'].astype(str)
df['description'] = df['description'].astype(str)

# Load the SpaCy model
nlp = spacy.load('en_core_web_sm')

# Extract location entities
#GPE = geopolitical entities (cities/countries)
#LOC = non-GPE, mountain ranges/bodies of water

def extract_locations(text):
    doc = nlp(text)
    locations = [ent.text for ent in doc.ents if ent.label_ in ['GPE', 'LOC']]
    return ', '.join(locations)

# Apply the function to the 'Text' column and create a new column 'Locations' with the results
df['nlp_title'] = df['title'].apply(extract_locations)
df['nlp_descrip'] = df['description'].apply(extract_locations)

#Save file with parsed locations as a csv file (it takes 7 min to rerun the code)
df.to_csv("../data/articles_locations_parsed.csv")


# Merge with US city locality data

In [221]:
#Subset rows that have location data
#10,172 out of 20,102 total rows have location data
df1 = df[df['nlp_title'].notna() | df['nlp_descrip'].notna()]

#Import US cities dataframe
us_locations = pd.read_csv("../data/uscities.csv")

#Use only city
city_list = us_locations['city'].unique().tolist()

#Find titles that match location list
def match_city(row):
    if row['nlp_title'] in city_list:
        return row['nlp_title']
    elif row['nlp_descrip'] in city_list:
        return row['nlp_descrip']
    else:
        return np.nan
    

#Create column that identifies which city matches
df1['city_match'] = df1.apply(match_city, axis=1)

#Subset if data frame has at least one city_match
df2 = df1[df1['city_match'].notna()]

#Keep only important columns
df3 = df2[['title', 'link', 'description', 'published', 'city_match']]

In [222]:
#Drop cities that have duplicate keys (same city name but different locality)
us_cities_filtered = us_locations[us_locations['city'].duplicated(keep=False) == False]

#Keep important columns
us_cities_filtered = us_cities_filtered[['city', 'state_name', 'county_name', 'lat', 'lng']]


In [223]:
#Rename column
df3 = df3.rename(columns = {'city_match':'city'})

#Merge df4 with us_cities_filtered based on 
df4 = df3.merge(us_cities_filtered, on='city', how='inner')


# Remove if city matches a country name or if it's a planet name (there are articles with Mars and Earth as a location)

In [224]:

# Create a list of world country names
countries_info = []
for country in pycountry.countries:
    countries_info.append({
        'country_name': country.name,
        'alpha_2': country.alpha_2,
        'alpha_3': country.alpha_3,
        'official_name': getattr(country, 'official_name', country.name)  # Some countries might not have an official_name attribute.
    })

# Creating a DataFrame from the list of dictionaries.
df_countries = pd.DataFrame(countries_info)

countries_as_lists = {col_name: df_countries[col_name].tolist() for col_name in df_countries.columns}
country_name = countries_as_lists['country_name']
planet_name = ['Mercury', 'Venus', 'Earth', 'Mars', 'Jupiter', 'Saturn', 'Uranus', 'Neptune']


In [225]:
df5 = df4[~df4['city'].isin(country_name)]
final_df = df5[~df5['city'].isin(planet_name)]

In [226]:
len(final_df)

535

In [227]:
final_df['city'].unique()

array(['Isle', 'England', 'Rugby', 'Sardinia', 'Tripoli', 'Russia',
       'Edinburgh', 'Oklahoma', 'Stirling', 'North Wales', 'Mount Etna',
       'Gulf', 'North Sea', 'New York', 'Queensland', 'North Pole',
       'Farmland', 'Chicago', 'Bolivia', 'Hereford', 'New Orleans',
       'Northumberland', 'Perth', 'Munich', 'Amazonia', 'Sun',
       'Johannesburg', 'Belfast', 'Birdsong', 'Suffolk', 'Patagonia',
       'Seattle', 'Island', 'Santiago', 'Tennessee', 'Herculaneum',
       'Truro', 'Cape Canaveral', 'San Francisco', 'San Cristobal',
       'Brussels', 'Hampshire', 'Everest', 'Wimbledon', 'Louisiana',
       'Gulf Stream', 'Zephyr', 'Midwest', 'Brisbane', 'Donegal',
       'Dumfries', 'Maine', 'Edmonton', 'North East', 'Whitehaven',
       'Nile', 'Carmichael', 'Virgin', 'Amanda', 'Marine', 'Tulsa',
       'Baird', 'Ohio', 'Theresa', 'West End'], dtype=object)

# Export data

In [228]:
final_df.to_csv("../data/articles_locations_coordinates.csv")