# Location Analysis in French

### Setup


In [1]:
# Importing necessary libraries for data manipulation, text processing, and web requests.
import os
import re
import unicodedata

import pandas as pd
import requests
import spacy

In [2]:
# Load the spaCy pipeline "fr_core_news_sm" once at module level.
# find_countries_in_text runs it over article content and reads the
# resulting named entities (doc.ents).
nlp = spacy.load("fr_core_news_sm")

In [6]:
# Read the scraped article corpus, then the four reference tables
# (cities, countries with capitals, cantons, ISO 3166 codes).
df_french = pd.read_csv('../scraping/data/extractor_all_articles_20minutes.csv')
(
    df_cities_french,
    df_countries_french,
    df_cantons_french,
    french_iso_3166,
) = map(
    pd.read_csv,
    (
        'input/list_cities_french.csv',
        'input/list_country_capital_french.csv',
        'input/list_cantons_french.csv',
        'input/french-iso-3166.csv',
    ),
)

In [8]:
# Define functions for text normalization and cleaning.
def normalize_text(text):
    """Return *text* lower-cased with diacritics removed.

    The string is NFKD-decomposed and then encoded to ASCII with errors
    ignored, so accented letters lose their accent ("è" -> "e") while
    characters that have no ASCII decomposition (for example "œ") are
    dropped entirely.
    """
    decomposed = unicodedata.normalize('NFKD', text)
    ascii_only = decomposed.encode('ascii', 'ignore')
    return ascii_only.decode('ascii').lower()

def apply_normalize_text_if_str(x):
    """Normalize *x* with normalize_text when it is a string; return anything else unchanged.

    Lets the normalization be mapped over columns that contain missing
    values (NaN) or other non-string entries without raising.
    """
    if not isinstance(x, str):
        return x
    return normalize_text(x)

In [9]:
# Normalize the article text columns (accent-stripped, lower-cased) so they
# can be compared with the normalized reference names.
# Each column is mapped with Series.map rather than DataFrame.applymap, which
# is deprecated in recent pandas releases; the element-wise result is the same.
columns_to_normalize = ['Content', 'Title', 'Header']
df_french[columns_to_normalize] = df_french[columns_to_normalize].apply(
    lambda column: column.map(apply_normalize_text_if_str))

In [None]:
# Normalize the ISO 3166 country names so they can serve as a merge key
# against the normalized CountryName column.
# Series.map per column replaces the deprecated DataFrame.applymap.
columns_to_normalize = ['Name']
french_iso_3166[columns_to_normalize] = french_iso_3166[columns_to_normalize].apply(
    lambda column: column.map(apply_normalize_text_if_str))

In [15]:
# Normalize the city names, keep only the name and coordinate columns, and
# expose the name under the column "CityName".
df_cities_french = (
    df_cities_french
    .assign(Ville=df_cities_french['Ville'].apply(apply_normalize_text_if_str))
    .loc[:, ['Ville', 'Coordinates', 'Latitude', 'Longitude']]
    .rename(columns={"Ville": "CityName"})
)

In [19]:
# Normalize the canton names, keep only that column, and expose it under the
# column "CantonName".
df_cantons_french = (
    df_cantons_french
    .assign(Canton=df_cantons_french['Canton'].apply(apply_normalize_text_if_str))
    .loc[:, ['Canton']]
    .rename(columns={"Canton": "CantonName"})
)

In [29]:
# Rename the country/capital columns and normalize both, so that they can be
# compared with normalized article text and merged with the other tables.
# Series.map per column replaces the deprecated DataFrame.applymap.
df_countries_french = df_countries_french.rename(columns={'Country': 'CountryName', 'Capital': 'CapitalName'})
columns_to_normalize = ['CountryName', 'CapitalName']
df_countries_french[columns_to_normalize] = df_countries_french[columns_to_normalize].apply(
    lambda column: column.map(apply_normalize_text_if_str))

In [65]:
# Alternative names for countries, keyed by the official name.
# "OfficialName" is the merge key against the normalized CountryName column,
# so every key must already be accent-free and lower-case ("taiwan", not
# "taïwan" -- the accented spelling could never match a normalized name).
country_name_variations = [
    {"OfficialName": "emirats arabes unis", "OtherNames": ["EAU"]},
    {"OfficialName": "etats-unis", "OtherNames": ["USA", "Les États"]},
    {"OfficialName": "royaume-uni", "OtherNames": ["Angleterre", "Écosse", "Wales", "Pays de Galles", "UK"]},
    {"OfficialName": "republique centrafricaine", "OtherNames": ["Centrafrique"]},
    {"OfficialName": "republique du congo", "OtherNames": ["Congo-Brazzaville"]},
    {"OfficialName": "republique democratique du congo", "OtherNames": ["Congo-Kinshasa", "RDC"]},
    {"OfficialName": "coree du nord", "OtherNames": ["République Populaire Démocratique de Corée"]},
    {"OfficialName": "coree du sud", "OtherNames": ["République de Corée"]},
    {"OfficialName": "viet nam", "OtherNames": ["Vietnam"]},
    {"OfficialName": "pays-bas", "OtherNames": ["Hollande"]},
    {"OfficialName": "russie", "OtherNames": ["Fédération de Russie"]},
    {"OfficialName": "iran", "OtherNames": ["République islamique d'Iran"]},
    {"OfficialName": "chine", "OtherNames": ["République populaire de Chine"]},
    {"OfficialName": "taiwan", "OtherNames": ["République de Chine (Taiwan)"]},
    {"OfficialName": "venezuela", "OtherNames": ["République bolivarienne du Venezuela"]},
    {"OfficialName": "bolivie", "OtherNames": ["État plurinational de Bolivie"]},
    {"OfficialName": "syrie", "OtherNames": ["République arabe syrienne"]},
    {"OfficialName": "macedoine du nord", "OtherNames": ["Macédoine"]},
    {"OfficialName": "slovaquie", "OtherNames": ["République slovaque"]},
    {"OfficialName": "republique tcheque", "OtherNames": ["Tchéquie"]},
    {"OfficialName": "birmanie", "OtherNames": ["Myanmar"]},
    {"OfficialName": "laos", "OtherNames": ["République démocratique populaire lao"]},
    {"OfficialName": "libye", "OtherNames": ["Jamahiriya arabe libyenne"]},
    {"OfficialName": "eswatini", "OtherNames": ["Swaziland"]},
    {"OfficialName": "cap-vert", "OtherNames": ["Cabo Verde"]}
]

In [66]:
# Build the alias table and bring it into the same normalized form as the
# data it is compared with: article text and country names are accent-stripped
# and lower-cased, so an alias such as "Écosse" could otherwise never match.
df_country_name_variations = pd.DataFrame(country_name_variations)
df_country_name_variations['OfficialName'] = (
    df_country_name_variations['OfficialName'].apply(apply_normalize_text_if_str))
df_country_name_variations['OtherNames'] = df_country_name_variations['OtherNames'].apply(
    lambda names: [apply_normalize_text_if_str(name) for name in names])

In [None]:
# Attach the alias lists to the country table. The left join keeps every
# country; the join key from the alias table is dropped afterwards.
df_countries_french = pd.merge(
    df_countries_french,
    df_country_name_variations,
    how='left',
    left_on='CountryName',
    right_on='OfficialName',
).drop(columns=['OfficialName'])
# Countries without aliases come out of the join as NaN; give them an empty list.
df_countries_french['OtherNames'] = df_countries_french['OtherNames'].apply(
    lambda aliases: aliases if isinstance(aliases, list) else [])


In [85]:
# Attach ISO 3166 codes to each country (left join on the normalized name).
df_countries_french = df_countries_french.merge(french_iso_3166, left_on='CountryName', right_on='Name', how='left')
# Keep 'OtherNames' in the selection: find_countries_in_text reads
# row['OtherNames'], so dropping the column here makes it fail with a KeyError.
df_countries_french = df_countries_french[['CountryName', 'CapitalName', 'OtherNames', 'ISO2', 'ISO3']]

**Geocoding Functionality**

In [37]:
# Positionstack API key. Prefer supplying it through the POSITIONSTACK_API_KEY
# environment variable so the credential does not have to live in the notebook.
# NOTE(review): the literal fallback keeps existing runs working, but a key
# committed to source control should be rotated and then removed from here.
geocode_api_key = os.environ.get('POSITIONSTACK_API_KEY') or '1983e85e9a97673a09ed6d19417dda0f'

In [33]:
#Fetch coordinates for a given location name using the Positionstack API

def geocode_location_positionstack(location_name, api_key, timeout=10):
    """Look up the coordinates of *location_name* with the Positionstack forward-geocoding API.

    Args:
        location_name: Free-text place name. Missing or blank values
            (None, NaN, "") are not sent to the API.
        api_key: Positionstack access key.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        A ``(latitude, longitude)`` tuple taken from the first result, or
        ``(None, None)`` when the name is missing, the request fails, the
        response is not HTTP 200, or no usable result is returned.
    """
    # Nothing to geocode: avoid spending an API call on NaN / empty cells.
    if not isinstance(location_name, str) or not location_name.strip():
        return None, None

    # NOTE(review): plain HTTP sends the access key unencrypted; switch to
    # https if the Positionstack plan in use supports it.
    base_url = "http://api.positionstack.com/v1/forward"
    params = {'access_key': api_key, 'query': location_name, 'limit': 1}

    try:
        # Without a timeout a stalled connection would block the whole run.
        response = requests.get(base_url, params=params, timeout=timeout)
    except requests.RequestException as exc:
        # Best effort, like the non-200 branch below: one failed lookup must
        # not abort geocoding of the remaining rows.
        print(f"Error: {exc}")
        return None, None

    if response.status_code != 200:
        print(f"Error: {response.status_code}")
        return None, None

    try:
        payload = response.json()
    except ValueError as exc:
        print(f"Error: {exc}")
        return None, None

    results = payload.get('data') if isinstance(payload, dict) else None
    first_result = results[0] if results else None
    # Guard against an empty or malformed first entry instead of raising.
    if not isinstance(first_result, dict):
        return None, None
    return first_result.get('latitude'), first_result.get('longitude')

In [34]:
#Add coordinates to a DataFrame row based on a specified column's location name.

def add_coordinates(row, column_name, api_key):
    """Geocode the place named in ``row[column_name]`` and return its coordinates.

    Returns a pandas Series with three entries: 'Coordinates' (a
    ``(lat, lng)`` tuple), 'Latitude' and 'Longitude'. The values are
    whatever geocode_location_positionstack returns for that name.
    """
    latitude, longitude = geocode_location_positionstack(row[column_name], api_key)
    coordinate_fields = {
        'Coordinates': (latitude, longitude),
        'Latitude': latitude,
        'Longitude': longitude,
    }
    return pd.Series(coordinate_fields)


In [38]:
# Geocode every country's capital and store the result in three new columns.
# This issues one API request per row of the country table.
df_countries_french[['Coordinates', 'Latitude', 'Longitude']] = df_countries_french.apply(
    add_coordinates, axis=1, args=('CapitalName', geocode_api_key))


**Country and City Search Functions**

In [87]:
def _lowercase_strings(values):
    """Return the lower-cased string entries of *values* as a set, skipping non-strings such as NaN."""
    return {value.lower() for value in values if isinstance(value, str)}


def _build_country_lookups(country_data):
    """Index country names by lower-cased capital, official name and alternative name.

    Returns three dicts (by capital, by official name, by alias). Each maps a
    lower-cased key to the list of matching ``CountryName`` values in row order.
    """
    by_capital, by_name, by_alias = {}, {}, {}
    # The alias column is optional: a table without it simply has no aliases.
    has_aliases = 'OtherNames' in country_data.columns
    for _, row in country_data.iterrows():
        country_name = row['CountryName']
        if not isinstance(country_name, str):
            continue
        by_name.setdefault(country_name.lower(), []).append(country_name)

        capital = row['CapitalName']
        if isinstance(capital, str):  # capitals may be missing (NaN)
            by_capital.setdefault(capital.lower(), []).append(country_name)

        aliases = row['OtherNames'] if has_aliases else None
        if isinstance(aliases, list):
            for alias in aliases:
                if isinstance(alias, str):
                    by_alias.setdefault(alias.lower(), []).append(country_name)
    return by_capital, by_name, by_alias


def find_countries_in_text(content, header, country_data, swiss_cities, swiss_cantons):
    """Return the set of country names mentioned in an article.

    Two sources are examined:

    * ``content`` is run through the module-level spaCy pipeline ``nlp``; each
      entity labelled LOC or GPE is resolved with the first rule that matches:
      Swiss city or canton -> "suisse", capital name, official country name,
      alternative country name. Only the first matching country is added.
    * ``header`` is compared as a whole string (exact, case-insensitive)
      against the same tables, and every match is added.

    Args:
        content: Article body; converted with ``str()`` before parsing.
        header: Article header. Non-string values (e.g. NaN) are skipped.
        country_data: DataFrame with 'CountryName' and 'CapitalName' columns
            and, optionally, an 'OtherNames' column holding lists of aliases.
        swiss_cities: DataFrame with a 'CityName' column.
        swiss_cantons: DataFrame with a 'CantonName' column.

    Returns:
        A set of ``CountryName`` values (plus "suisse" for Swiss places).
    """
    mentioned_countries = set()

    # Build the lookup structures once per call instead of re-deriving
    # lower-cased arrays and re-scanning the table for every entity.
    swiss_places = (_lowercase_strings(swiss_cities['CityName'])
                    | _lowercase_strings(swiss_cantons['CantonName']))
    lookups = _build_country_lookups(country_data)

    # Analyze the content using NLP.
    doc = nlp(str(content))
    for ent in doc.ents:
        if ent.label_ not in ('LOC', 'GPE'):
            continue
        text_lower = ent.text.lower()

        if text_lower in swiss_places:
            mentioned_countries.add("suisse")
            continue

        # Capital, then official name, then alias: stop at the first hit.
        for lookup in lookups:
            matches = lookup.get(text_lower)
            if matches:
                mentioned_countries.add(matches[0])
                break

    # Analyze the header using exact text matching. A missing header is a
    # float NaN and has no .lower(), so only strings are considered.
    if isinstance(header, str):
        header_lower = header.lower()
        if header_lower in swiss_places:
            mentioned_countries.add("suisse")
        for lookup in lookups:
            mentioned_countries.update(lookup.get(header_lower, ()))

    return mentioned_countries

In [88]:
# Compute, for every article, the set of countries mentioned in its content
# and header.
df_french['Mentioned_Countries'] = [
    find_countries_in_text(content, header, df_countries_french, df_cities_french, df_cantons_french)
    for content, header in zip(df_french['Content'], df_french['Header'])
]


#### Count Country Mentions

In [102]:
# Tally in how many articles each country is mentioned (one row per
# article/country pair after explode), ordered by descending count.
country_counts = df_french['Mentioned_Countries'].explode().value_counts()
df_country_counts = pd.DataFrame({'Country': country_counts.index, 'Count': country_counts.values})
df_country_counts = df_country_counts.sort_values(by='Count', ascending=False)

In [103]:
# Enrich the mention counts with the country reference data; the left join
# keeps counted names that have no row in the country table.
df_country_count_data = pd.merge(
    df_country_counts,
    df_countries_french,
    how='left',
    left_on='Country',
    right_on='CountryName',
)


#### City Search

In [93]:
# Function to find mentioned Swiss cities in text
def find_swiss_cities_in_text(content, header, df_cities_german):
    """Return the set of city names that occur as whole words in the article text.

    Args:
        content: Article body; converted with ``str()``.
        header: Article header; converted with ``str()``.
        df_cities_german: DataFrame with a 'CityName' column listing the
            cities to search for.

    Returns:
        The set of 'CityName' values whose normalized form appears, delimited
        by word boundaries, in the normalized ``content + " " + header``.
    """
    combined_text = normalize_text(str(content) + " " + str(header)).strip()
    mentioned_swiss_cities = set()
    for city in df_cities_german['CityName']:
        # Missing names (NaN) are not strings and cannot be normalized.
        if not isinstance(city, str):
            continue
        normalized_city = normalize_text(city).strip()
        # An empty name would yield the pattern r'\b\b', which matches at any
        # word boundary and would report the city for every non-empty text.
        if not normalized_city:
            continue
        if re.search(r'\b' + re.escape(normalized_city) + r'\b', combined_text):
            mentioned_swiss_cities.add(city)
    return mentioned_swiss_cities

In [95]:
# Compute, for every article, the set of listed cities found in its content
# and header.
df_french['Mentioned_Swiss_Cities'] = [
    find_swiss_cities_in_text(content, header, df_cities_french)
    for content, header in zip(df_french['Content'], df_french['Header'])
]


In [101]:
# Add Switzerland to the mentioned countries of every article that mentions
# a Swiss city but does not list the country yet.
# The label has to be "suisse": that is the name find_countries_in_text uses
# and the one this very condition tests for. The previous "schweiz" created a
# second, unmatched label for the same country.
for idx, row in df_french.iterrows():
    if row['Mentioned_Swiss_Cities'] and "suisse" not in row['Mentioned_Countries']:
        df_french.at[idx, 'Mentioned_Countries'] = row['Mentioned_Countries'].union({"suisse"})


In [106]:
# Tally in how many articles each city is mentioned (one row per
# article/city pair after explode), ordered by descending count.
city_counts = df_french['Mentioned_Swiss_Cities'].explode().value_counts()
df_city_counts = pd.DataFrame({'City': city_counts.index, 'Count': city_counts.values})
df_city_counts = df_city_counts.sort_values(by='Count', ascending=False)

In [108]:
# Attach coordinates from the city table to the city counts (left join on
# the city name), then keep only the columns needed for the export.
df_city_count_data = pd.merge(
    df_city_counts,
    df_cities_french,
    how='left',
    left_on='City',
    right_on='CityName',
)
df_city_count_data = df_city_count_data.loc[:, ['City', 'Count', 'Coordinates', 'Longitude', 'Latitude']]


 ### Exports

In [None]:
# Export the city table with coordinates. This notebook only defines
# df_cities_french; the previously referenced df_cities_german does not exist
# here and raised a NameError.
csv_file_path = 'output/data/df_cities_french_with_coordinates.csv'
df_cities_french.to_csv(csv_file_path, index=False)

In [110]:
# Export the articles together with their mention columns (row index omitted).
csv_file_path = 'output/data/df_french_with_mentions.csv'
df_french.to_csv(path_or_buf=csv_file_path, index=False)

In [111]:
# Export the per-city mention counts with coordinates (row index omitted).
csv_file_path = 'output/data/df_city_count_data_french.csv'
df_city_count_data.to_csv(path_or_buf=csv_file_path, index=False)

In [112]:
# Export the per-country mention counts with reference data (row index omitted).
csv_file_path = 'output/data/df_country_count_data_french.csv'
df_country_count_data.to_csv(path_or_buf=csv_file_path, index=False)