In [None]:
import os

import deepl
import folium
import pandas as pd
import plotly.express as px
import requests
import spacy

In [None]:
# Initialize DeepL translator
# The authentication key is read from the DEEPL_AUTH_KEY environment variable so
# that the credential never lives in the notebook or its version-control history.
# NOTE(review): a key that was previously committed in this cell should be
# treated as leaked and revoked in the DeepL account settings.
api_key = os.environ.get("DEEPL_AUTH_KEY")
if not api_key:
    raise RuntimeError(
        "DEEPL_AUTH_KEY is not set; export your DeepL API key before running this notebook."
    )
translator = deepl.Translator(api_key)

In [None]:
# Load the three German SpaCy pipelines (the `sm`, `md` and `lg` packages).
# They are loaded in this order and bound to nlp1, nlp2 and nlp3 respectively.
nlp1, nlp2, nlp3 = (
    spacy.load(model_name)
    for model_name in ("de_core_news_sm", "de_core_news_md", "de_core_news_lg")
)

In [None]:
# Read the scraped article tables from CSV, one file per target DataFrame
# (German, French and Italian, in that order).
df_german, df_french, df_italian = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../scraping/data/extractor_all_articles_20minuten.csv',
        '../scraping/data/extractor_all_articles_20minutes.csv',
        '../scraping/data/extractor_all_articles_20minuti.csv',
    )
)

In [None]:
# Load a list of known locations
dict_all_countries = pd.read_csv('dicts/allCountries.csv')

# Blank CSV cells are read as float NaN. They are dropped here because every
# consumer of `known_locations` calls `.lower()` on its members, which a float
# does not have.
known_locations = set(dict_all_countries['LocationName'].dropna())

# 'BiggerEntity' is the last '/'-separated component of 'CountryName'
# (rows with a missing CountryName stay missing).
dict_all_countries['BiggerEntity'] = dict_all_countries['CountryName'].str.split('/').str[-1]

In [None]:
# Define a function to translate text to German using DeepL
def translate_to_german(text):
    """Translate a single string to German with the module-level DeepL `translator`.

    Parameters
    ----------
    text : str
        The text to translate.

    Returns
    -------
    str or None
        The translated text, or None when `text` is not a non-empty string or
        when the translation call raises.
    """
    # Missing values (None / NaN) and empty strings cannot be translated; return
    # the same "failed" result as before without spending an API call on them.
    if not isinstance(text, str) or not text:
        return None

    try:
        result = translator.translate_text(text, target_lang="DE")
        return result.text
    except Exception as e:
        # Best effort: report the failure and let the caller continue with None.
        print(f"An error occurred during translation: {e}")
        return None

In [None]:
# Translate unique location names to German
# Missing 'BiggerEntity' values are dropped first so that NaN is never sent to
# the translation service.
unique_location_names = dict_all_countries['BiggerEntity'].dropna().unique()
translated_names = {name: translate_to_german(name) for name in unique_location_names}

# Same original -> German mapping under its historical name (kept in case other
# cells refer to it).
name_translation_mapping = dict(translated_names)
dict_all_countries['BiggerEntity_German'] = dict_all_countries['BiggerEntity'].map(name_translation_mapping)

# translate_to_german returns None on failure; those entries (and rows without
# a BiggerEntity) must not end up in the set of known location names.
new_locations = set(dict_all_countries['BiggerEntity_German'].dropna().unique())
known_locations.update(new_locations)

In [None]:
# Function to extract locations with voting system, converting text to lowercase

def extract_locations_with_voting_system(text, known_locations):
    """Return the known locations that at least two of the three SpaCy models agree on.

    The text is lowercased, run through `nlp1`, `nlp2` and `nlp3`, and every
    entity labelled 'LOC' is collected per model. A location is kept when it
    was found by two or more models and its lowercase form is in
    `known_locations`.

    Parameters
    ----------
    text : str
        Article text. Any non-string value (e.g. NaN for a missing article
        body) yields an empty set.
    known_locations : iterable
        Location names to accept; compared case-insensitively. Non-string
        members are ignored.

    Returns
    -------
    set of str
        Lowercased location names.
    """
    # pandas represents missing text as NaN/None, which has no .lower().
    if not isinstance(text, str):
        return set()

    # NOTE(review): lowercasing before NER is kept as in the original; confirm
    # it does not hurt the models' recognition of capitalised German nouns.
    text = text.lower()

    locations1, locations2, locations3 = (
        {ent.text.lower() for ent in model(text).ents if ent.label_ == 'LOC'}
        for model in (nlp1, nlp2, nlp3)
    )

    # "At least two of three votes" is the union of the pairwise intersections.
    final_locations = (
        (locations1 & locations2)
        | (locations1 & locations3)
        | (locations2 & locations3)
    )

    # Nothing was voted in, so skip lowercasing the whole reference set.
    if not final_locations:
        return set()

    lower_known_locations = {loc.lower() for loc in known_locations if isinstance(loc, str)}

    return final_locations & lower_known_locations

In [None]:
# Function to merge similar location entities
def merge_similar_entities(entities):
    """Collapse entities that contain, or are contained in, an already kept entity.

    Entities are visited in iteration order. An entity is kept only if it is
    neither a substring nor a superstring of an entity kept before it, so for
    unordered inputs (e.g. a set) the surviving representative depends on the
    iteration order.

    Parameters
    ----------
    entities : iterable of str
        Candidate entity strings. Empty or missing entries are ignored.

    Returns
    -------
    set of str
        The kept entities.
    """
    merged_entities = set()
    for entity in entities:
        # An empty string is a substring of every string, so it carries no
        # information and would otherwise overlap with everything.
        if not entity:
            continue
        # Test the overlap condition itself, not the truthiness of the kept entity.
        overlaps_kept = any(
            entity in kept or kept in entity for kept in merged_entities
        )
        if not overlaps_kept:
            merged_entities.add(entity)
    return merged_entities


In [None]:
def normalize_location(loc):
    """Drop the words 'der', 'die', 'das' and 'den' (any casing) from a location string.

    The string is split on whitespace, the listed words are removed, and the
    remaining words are joined back together with single spaces.
    """
    skipped_words = ('der', 'die', 'das', 'den')
    kept_tokens = []
    for token in loc.split():
        if token.lower() in skipped_words:
            continue
        kept_tokens.append(token)
    return ' '.join(kept_tokens)

In [None]:
# Lowercase the known location names and the article text so that matching is
# case-insensitive. Only real strings are kept: failed translations are None
# and blank CSV cells are float NaN, and neither has a .lower() method.
lower_known_locations = {location.lower() for location in known_locations if isinstance(location, str)}
df_german['Content'] = df_german['Content'].str.lower()

In [None]:
# Run the voting-based location extraction on every article body; the
# lowercased reference set is forwarded as the second positional argument.
df_german['Voted_Locations'] = df_german['Content'].apply(
    extract_locations_with_voting_system,
    args=(lower_known_locations,),
)


In [None]:
# Count the occurrences of each location across all rows of 'Voted_Locations':
# explode the per-row collections into one entry per location, then tally them.
exploded_locations = df_german['Voted_Locations'].explode()
voted_location_counts_new = exploded_locations.value_counts()


In [None]:
# Remove specific entries from the location counts
# The location names are the index of the counts Series, so filter on the index
# and keep the result a Series (the next step calls .reset_index() on it).
entries_to_remove = {'st.', 'bund'}
voted_location_counts_new = voted_location_counts_new[
    ~voted_location_counts_new.index.isin(entries_to_remove)
]


In [None]:
# Turn the counts Series into a two-column DataFrame:
# the index becomes 'LocationName' and the values become 'Count'.
df_voted_locations = (
    voted_location_counts_new
    .rename_axis('LocationName')
    .reset_index(name='Count')
)


In [None]:
# Drop rows whose 'LocationName' is one of the listed entries
entries_to_remove = {'st.', 'tiktok', 'us-präsident', 'problem', 'bund', 'hause', 'land', 'erde', 'aa', 'instagram', 'bundesgericht', 'ki', 'bewohner', 'züri fäscht', 'gesamt-skigebiet', 'bildstrecke'}
is_excluded = df_voted_locations['LocationName'].isin(entries_to_remove)
df_voted_locations = df_voted_locations[~is_excluded]


In [None]:
# Save the location counts to a CSV file
file_path = 'data/occuring_locations_german.csv'
df_voted_locations.to_csv(file_path, index=False)

# The chart is titled "Top 20", so plot only the 20 rows with the highest
# counts. Deriving the frame here keeps the cell runnable on a fresh kernel.
df_top20 = df_voted_locations.nlargest(20, 'Count')

# Create a bar chart using Plotly Express to visualize the location counts
fig = px.bar(df_top20, x='LocationName', y='Count',
             title='Top 20 Mentioned Locations',
             labels={'LocationName': 'Location Name', 'Count': 'Count'},
             color='Count')

# Show the figure
fig.show()
