## Location Analysis German

#### Setup


In [1]:
# Import necessary libraries
import pandas as pd
import spacy
import requests
import time
import re
import unicodedata

In [2]:
# Loading the German language model for SpaCy.
nlp = spacy.load("de_core_news_sm")

In [3]:
# Read CSV files
df_german = pd.read_csv('../scraping/data/extractor_all_articles_20minuten.csv')
df_french = pd.read_csv('../scraping/data/extractor_all_articles_20minutes.csv')

In [30]:
# Read ISO 3166 data for Germany and English
german_iso_3166 = pd.read_csv('input/german-iso-3166.csv', names=['ISO2', 'Name'])

In [4]:
# Function to normalize text by removing accents and converting to lowercase
def normalize_text(text):
    return unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('ascii').lower()

In [5]:
# Function to apply text normalization if the column contains strings
def apply_normalize_text_if_str(x):
    return normalize_text(x) if isinstance(x, str) else x

In [161]:
def normalize_list_of_strings(string_list):
    if isinstance(string_list, list):
        return [normalize_text(s) for s in string_list if isinstance(s, str)]
    return string_list

In [31]:
# Normalize selected columns in df_german
columns_to_normalize = ['Content', 'Title', 'Header']
df_german[columns_to_normalize] = df_german[columns_to_normalize].applymap(apply_normalize_text_if_str)

In [226]:
columns_to_normalize = ['Name']
german_iso_3166[columns_to_normalize] = german_iso_3166[columns_to_normalize].applymap(apply_normalize_text_if_str)

In [148]:
# Read data for German countries, cities, and cantons, and normalize relevant columns
df_countries_german = pd.read_csv('input/list_country_capital_german.csv')
df_countries_german = df_countries_german.rename(columns={"Staat": "CountryName", "Hauptstadt": "CapitalName", "Kontinent": "ContinentName"})


In [149]:
# Name Variations

country_name_variations = [
    {"OfficialName": "Vereinigte Staaten von Amerika", "OtherNames": ["USA"]},
    {"OfficialName": "Vereinigtes Königreich", "OtherNames": ["Großbritannien", "Grossbritannien", "England", "Schottland", "Wales", "UK"]},
    {"OfficialName": "Russland", "OtherNames": ["Russische Föderation"]},
    {"OfficialName": "Südkorea", "OtherNames": ["Korea, Republik", "Republik Korea"]},
    {"OfficialName": "Nordkorea", "OtherNames": ["Korea, Demokratische Volksrepublik", "Demokratische Volksrepublik Korea"]},
    {"OfficialName": "Elfenbeinküste", "OtherNames": ["Côte d'Ivoire"]},
    {"OfficialName": "Tschechien", "OtherNames": ["Tschechische Republik"]},
    {"OfficialName": "Bolivien", "OtherNames": ["Bolivien, Plurinationaler Staat"]},
    {"OfficialName": "Iran", "OtherNames": ["Islamische Republik Iran"]},
    {"OfficialName": "Vatikanstadt", "OtherNames": ["Heiliger Stuhl"]},
    {"OfficialName": "Taiwan", "OtherNames": ["Republik China (Taiwan)"]},
    {"OfficialName": "China", "OtherNames": ["Volksrepublik China"]},
    {"OfficialName": "Myanmar (Früher auch birma)", "OtherNames": ["Birma"]},
    {"OfficialName": "Osttimor", "OtherNames": ["Timor-Leste"]},
    {"OfficialName": "Mazedonien", "OtherNames": ["Nordmazedonien"]},
    {"OfficialName": "Palästina", "OtherNames": ["Westjordanland und Gazastreifen"]},
    {"OfficialName": "Niederlande", "OtherNames": ["Holland"]},
    {"OfficialName": "Republik Kongo", "OtherNames": ["Kongo-Brazzaville"]},
    {"OfficialName": "Mazedonien", "OtherNames": ["Nordmazedonien"]},
    {"OfficialName": "Südsudan", "OtherNames": ["Sudan, Süd-"]}
]

In [150]:
# Conversion of the list into a DataFrame
df_country_name_variations = pd.DataFrame(country_name_variations)


In [274]:
df_countries_german = df_countries_german.merge(df_country_name_variations, left_on='CountryName', right_on='OfficialName', how='left')
df_countries_german = df_countries_german.drop(['OfficialName'], axis = 1)

In [275]:
columns_to_normalize = ['CountryName', 'CapitalName', 'ContinentName']
df_countries_german[columns_to_normalize] = df_countries_german[columns_to_normalize].applymap(apply_normalize_text_if_str)
df_countries_german['OtherNames'] = df_countries_german['OtherNames'].apply(normalize_list_of_strings)

KeyError: 'OtherNames'

In [None]:
columns_to_normalize = ['CountryName', 'CapitalName', 'ContinentName']
df_countries_german[columns_to_normalize] = df_countries_german[columns_to_normalize].applymap(apply_normalize_text_if_str)
df_countries_german['OtherNames'] = df_countries_german['OtherNames'].apply(normalize_list_of_strings)

In [43]:
df_cities_german = pd.read_csv('input/list_cities_german.csv')
df_cities_german['Name'] = df_cities_german['Name'].apply(lambda x: normalize_text(x) if isinstance(x, str) else x)
df_cities_german = df_cities_german.rename(columns={"Name": "CityName"})

In [60]:
df_cantons_german = pd.read_csv('input/list_cantons_german.csv')
df_cantons_german = df_cantons_german.rename(columns={"Abk.": "Abbreviation", "Kanton": "CantonName"})
columns_to_normalize = ['Abbreviation', 'CantonName']
df_cantons_german[columns_to_normalize] = df_cantons_german[columns_to_normalize].applymap(apply_normalize_text_if_str)

In [64]:
# Function to geocode a location using Positionstack API
def geocode_location_positionstack(location_name, api_key):
    base_url = "http://api.positionstack.com/v1/forward"
    params = {'access_key': api_key, 'query': location_name, 'limit': 1}
    response = requests.get(base_url, params=params)
    if response.status_code == 200:
        data = response.json()
        if data['data']:
            latitude = data['data'][0]['latitude']
            longitude = data['data'][0]['longitude']
            return latitude, longitude
        else:
            return None, None
    else:
        print(f"Error: {response.status_code}")
        return None, None
    

In [65]:
# Function to add coordinates to a dataframe row based on a specified column name
def add_coordinates(row, column_name, api_key):
    location_name = row[column_name]
    lat, lng = geocode_location_positionstack(location_name, api_key)
    return pd.Series({'Coordinates': (lat, lng), 'Latitude': lat, 'Longitude': lng})

In [None]:
geocode_api_key = '1983e85e9a97673a09ed6d19417dda0f'
df_cities_german[['Coordinates', 'Latitude', 'Longitude']] = df_cities_german.apply(lambda row: add_coordinates(row, 'CityName', geocode_api_key), axis=1)

In [None]:
indices_with_none = df_cities_german[df_cities_german['Coordinates'] == (None, None)].index

In [None]:
# Retry geocoding for cities with None coordinates

for idx in indices_with_none:
    city_name = df_cities_german.loc[idx, 'CityName']
    print(f"Attempting to geocode: {city_name}") 
    try:

        time.sleep(1)
        
        new_coords = geocode_location_positionstack(city_name, geocode_api_key)
        print(f"Coordinates received: {new_coords}")  
        
        if new_coords is None or len(new_coords) != 2:
            raise ValueError(f"Invalid coordinates received: {new_coords}")
        
        df_cities_german.loc[idx, 'Latitude'] = new_coords[0]
        df_cities_german.loc[idx, 'Longitude'] = new_coords[1]

        df_cities_german.at[idx, 'Coordinates'] = (new_coords[0], new_coords[1])

    except Exception as e:
        print(f"Error geocoding {city_name}: {e}")
        traceback.print_exc() 
        df_cities_german.loc[idx, ['Latitude', 'Longitude']] = [None, None]
        df_cities_german.at[idx, 'Coordinates'] = (None, None)

**Add coordinates to capital cities worldwide**


In [75]:
df_countries_german[['Coordinates', 'Latitude', 'Longitude']] = df_countries_german.apply(lambda row: add_coordinates(row, 'CapitalName', geocode_api_key), axis=1)


In [None]:
# Add ISO2 codes for comparison
df_countries_german = df_countries_german.merge(german_iso_3166, left_on='Country', right_on='Name', how='left')
df_countries_german = df_countries_german.drop(['Name'], axis=1)


### Search Functions

#### Country Search

In [176]:
def find_countries_in_text(content, header, country_data, swiss_cities, swiss_cantons):
    mentioned_countries = set()

    # Analyze the Content using NLP
    doc = nlp(str(content))
    for ent in doc.ents:
        if ent.label_ in ['LOC', 'GPE']:
            text_lower = ent.text.lower()

            # Check if the entity is a Swiss city or canton
            if text_lower in swiss_cities['CityName'].str.lower().values or text_lower in swiss_cantons['CantonName'].str.lower().values:
                mentioned_countries.add("schweiz")
                continue  

            # Check if the entity matches the capital name of a country
            country_from_capital = country_data[country_data['CapitalName'].str.lower() == text_lower]['CountryName']
            if not country_from_capital.empty:
                mentioned_countries.add(country_from_capital.iloc[0])
                continue 

            # Check if the entity matches the official name of a country
            country = country_data[country_data['CountryName'].str.lower() == text_lower]['CountryName']
            if not country.empty:
                mentioned_countries.add(country.iloc[0])
                continue 

            # Check if the entity matches any alternative name of a country
            for index, row in country_data.iterrows():
                other_names = row['OtherNames']
                if isinstance(other_names, list) and text_lower in [name.lower() for name in other_names]:
                    mentioned_countries.add(row['CountryName'])
                    break

    # Analyze the Header using simple text matching
    header_lower = header.lower()
    # Check if the header matches any Swiss city or canton
    if header_lower in swiss_cities['CityName'].str.lower().values:
        mentioned_countries.add("schweiz")
    elif header_lower in swiss_cantons['CantonName'].str.lower().values:
        mentioned_countries.add("schweiz")
    
    # Check if the header matches any capital city
    for index, row in country_data.iterrows():
        if header_lower == row['CapitalName'].lower():
            mentioned_countries.add(row['CountryName'])
            continue

    # Check if the header matches any official country name
    for index, row in country_data.iterrows():
        if header_lower == row['CountryName'].lower():
            mentioned_countries.add(row['CountryName'])
            continue

    # Check if the header matches any alternative country names
    for index, row in country_data.iterrows():
        other_names = row['OtherNames']
        if isinstance(other_names, list) and header_lower in [name.lower() for name in other_names]:
            mentioned_countries.add(row['CountryName'])

    return mentioned_countries

In [177]:
# Apply the combined function to the DataFrame
df_german['Mentioned_Countries'] = df_german.apply(lambda row: find_countries_in_text(row['Content'], row['Header'], df_countries_german, df_cities_german, df_cantons_german), axis=1)


#### Count Country Mentions

In [238]:
# Count and sort country mentions
country_counts = df_german.explode('Mentioned_Countries')['Mentioned_Countries'].value_counts()
df_country_counts = pd.DataFrame({'Country': country_counts.index, 'Count': country_counts.values})
df_country_counts.sort_values(by='Count', ascending=False, inplace=True)

In [240]:
df_country_count_data = df_country_counts.merge(df_countries_german, left_on='Country', right_on='CountryName', how='left')


#### City Search

In [244]:
# Function to find mentioned Swiss cities in text
def find_swiss_cities_in_text(content, header, df_cities_german):
    combined_text = normalize_text(str(content) + " " + str(header)).strip()
    mentioned_swiss_cities = set()
    for city in df_cities_german['CityName']:
        normalized_city = normalize_text(city).strip()
        if re.search(r'\b' + re.escape(normalized_city) + r'\b', combined_text):
            mentioned_swiss_cities.add(city)
    return mentioned_swiss_cities

In [245]:
# Apply the city search function to the German dataframe
df_german['Mentioned_Swiss_Cities'] = df_german.apply(lambda row: find_swiss_cities_in_text(row['Content'], row['Header'], df_cities_german), axis=1)

In [248]:
# Updating Mentioned Countries with Switzerland if Swiss Cities are Mentioned
switzerland_added = False  # Flag to track if 'Schweiz' is added
for idx, row in df_german.iterrows():
    if row['Mentioned_Swiss_Cities'] and "schweiz" not in row['Mentioned_Countries']:
        df_german.at[idx, 'Mentioned_Countries'] = row['Mentioned_Countries'].union({"schweiz"})


In [250]:
# Counting and Sorting Mentioned Swiss Cities
city_counts = df_german.explode('Mentioned_Swiss_Cities')['Mentioned_Swiss_Cities'].value_counts()
df_city_counts = pd.DataFrame({'City': city_counts.index, 'Count': city_counts.values})
df_city_counts.sort_values(by='Count', ascending=False, inplace=True)


In [254]:
# Merging City Count Data with Swiss Cities Data
df_city_count_data = df_city_counts.merge(df_cities_german, left_on='City', right_on='CityName', how='left')
df_city_count_data = df_city_count_data[['City', 'Count', 'Coordinates', 'Longitude', 'Latitude']]


 ### Exports

In [273]:
csv_file_path = 'output/data/df_cities_german_with_coordinates.csv'
df_cities_german.to_csv(csv_file_path, index=False)

In [266]:
csv_file_path = 'output/data/df_german_with_mentions.csv'
df_german.to_csv(csv_file_path, index=False)

In [269]:
csv_file_path = 'output/data/df_city_count_data_german.csv'
df_city_count_data.to_csv(csv_file_path, index=False)

In [271]:
csv_file_path = 'output/data/df_country_count_data_german.csv'
df_country_count_data.to_csv(csv_file_path, index=False)