# Location Analysis French

### Setup

**Important imports**

In [1]:
import os
import re
import time
import unicodedata

import pandas as pd
import requests
import spacy

In [2]:
# Load spaCy's small French pipeline; `nlp` is used below to extract named entities.
nlp = spacy.load("fr_core_news_sm")

In [25]:
# Article table from the 20minutes scrape, plus the lookup tables used for matching
# (Swiss cities, countries with capitals, cantons).
df_french = pd.read_csv('../../scraping/data/extractor_all_articles_20minutes.csv')
df_cities_french = pd.read_csv('inputdata/listedesvilles_suisse.csv')
df_countries_french = pd.read_csv('inputdata/country_capital_french.csv')
# NOTE(review): data_german is not referenced anywhere else in this notebook — confirm it is still needed.
data_german = pd.read_csv('inputdata/df_countries_german_with_capital_and_coordinates.csv')
df_cantons_french = pd.read_csv('inputdata/cantons.csv')

In [90]:
# ISO 3166 lookup tables (French names and English names with their codes).
french_iso_3166 = pd.read_csv('inputdata/french-iso-3166.csv')
english_iso_3166 = pd.read_csv('inputdata/english-iso-3166.csv', usecols=['alpha-2', 'name', 'alpha-3'])
# Rename by label rather than by position: read_csv returns the selected columns in
# file order, not in the order given to usecols, so a positional rename can mislabel them.
english_iso_3166 = english_iso_3166.rename(columns={'name': 'Name', 'alpha-2': 'ISO2', 'alpha-3': 'ISO3'})

In [205]:
# Inspect the canton lookup table.
df_cantons_french

Unnamed: 0,Canton
0,zurich
1,berne
2,lucerne
3,uri
4,schwytz
5,obwald
6,nidwald
7,glaris
8,zoug
9,fribourg


**Loading dataframes and cleaning them**

In [13]:
def normalize_text(text):
    """Return `text` lower-cased with accents and other non-ASCII characters removed.

    The string is decomposed with NFKD so accented letters split into a base
    letter plus combining marks; every character that cannot be encoded as
    ASCII is then dropped.
    """
    decomposed = unicodedata.normalize('NFKD', text)
    ascii_only = decomposed.encode('ascii', 'ignore').decode('ascii')
    return ascii_only.lower()

In [14]:
def apply_normalize_text_if_str(x):
    """Normalize `x` with normalize_text when it is a string; return anything else unchanged."""
    if not isinstance(x, str):
        return x
    return normalize_text(x)

In [15]:
def lowercase_columns(df, columns):
    """Lower-case the listed string columns of `df` in place (nothing is returned)."""
    for name in columns:
        lowered = df[name].str.lower()
        df[name] = lowered

In [27]:
# Strip accents and lower-case the article text columns; non-string cells are left as they are.
columns_to_normalize = ['Content', 'Title', 'Header']
df_french[columns_to_normalize] = df_french[columns_to_normalize].applymap(apply_normalize_text_if_str)

In [None]:
# Normalize the canton names with the shared helper, then keep only that column.
df_cantons_french['Canton'] = df_cantons_french['Canton'].apply(apply_normalize_text_if_str)
df_cantons_french = df_cantons_french.loc[:, ['Canton']]

In [58]:
# Normalize the country table's 'Name' and 'Capital' columns.
# NOTE(review): a later cell normalizes ['Country', 'Capital'] on the same frame and the
# merge below joins on 'Country' — confirm which header the CSV actually has.
columns_to_normalize = ['Name', 'Capital']
df_countries_french[columns_to_normalize] = df_countries_french[columns_to_normalize].applymap(apply_normalize_text_if_str)

In [61]:
# Normalize the French ISO table's 'Name' column, which is the right-hand join key in the merge below.
columns_to_normalize = ['Name']
french_iso_3166[columns_to_normalize] = french_iso_3166[columns_to_normalize].applymap(apply_normalize_text_if_str)

In [32]:
# Normalize the country table's 'Country' and 'Capital' columns ('Country' is the join key below).
columns_to_normalize = ['Country', 'Capital']
df_countries_french[columns_to_normalize] = df_countries_french[columns_to_normalize].applymap(apply_normalize_text_if_str)

In [36]:
# Normalize the city names with the shared helper and keep only the name and coordinate columns.
df_cities_french['Ville'] = df_cities_french['Ville'].apply(apply_normalize_text_if_str)
df_cities_french = df_cities_french.loc[:, ['Ville', 'Coordinates', 'Latitude', 'Longitude']]

In [67]:
# Attach ISO codes by matching each country name against the French ISO 3166 table,
# then keep only the columns used downstream.
df_countries_french = df_countries_french.merge(
    french_iso_3166,
    how='left',
    left_on='Country',
    right_on='Name',
).loc[:, ['Country', 'Capital', 'ISO2', 'ISO3']]

In [22]:
# Same normalization of the article text columns as the earlier cell (same columns, different
# order); re-running it on already normalized text leaves the text unchanged.
columns_to_normalize = ['Title', 'Header', 'Content']
df_french[columns_to_normalize] = df_french[columns_to_normalize].applymap(apply_normalize_text_if_str)

In [23]:
# Inspect the normalized article table.
df_french

Unnamed: 0,Title,Header,Content,Mentioned_Countries
0,dix-huit prevenus condamnes dans laffaire du c...,justice francaise,un dessein purement lucratif. dix-huit des 19 ...,"{royaume-uni, france}"
1,elle sengage sur lautoroute avec le reservoir ...,argovie,malgre un signal lumineux indiquant que le sto...,{suisse}
2,"au portugal, la course au poste de premier min...",politique,les principaux partis ont commence vendredi a ...,{portugal}
3,"le vote de journalistes, dont celui de la rts,...",ballon d'or,dans le debat haaland-messi pour le ballon d'o...,"{danemark, suisse, liban, panama}"
4,la premiere ministre refuse de hausser le sala...,bangladesh,"la premiere ministre du bangladesh, sheikh has...",{bangladesh}
...,...,...,...,...
1055,une tristesse indescriptible aux obseques de t...,france,nous taimons et nous ne toublierons jamais. pr...,{france}
1056,une voiture en feu a provoque la fermeture du ...,uri,le tunnel routier du gothard est reste ferme d...,{suisse}
1057,la commune voisine de paleo veut negocier des ...,trelex (vd),"ces derniers jours, le conseil communal de tre...",{}
1058,les patins crissent a nouveau au parc des bast...,ville de geneve,les mesures covid en 2021 puis les restriction...,{}


### Add Coordinates Functionality

In [42]:
# positionstack access key, read from the environment so the secret is never stored in the notebook.
# NOTE(review): the key that used to be hard-coded here has been committed and should be rotated.
geocode_api_key = os.environ.get('POSITIONSTACK_API_KEY', '')
if not geocode_api_key:
    print("Warning: POSITIONSTACK_API_KEY is not set; geocoding requests will be rejected.")

In [43]:
def geocode_location_positionstack(location_name, api_key, timeout=10):
    """Forward-geocode a place name with the positionstack API.

    Args:
        location_name: Free-text place name. Missing values (None/NaN), other
            non-strings and blank strings are not sent to the API.
        api_key: positionstack access key.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        A (latitude, longitude) tuple taken from the first result, or
        (None, None) when there is nothing to query, the request fails, the
        API answers with a non-200 status, or no result is returned.
    """
    # Nothing sensible can be geocoded from a missing or blank name, so skip the request.
    if not isinstance(location_name, str) or not location_name.strip():
        return None, None

    base_url = "http://api.positionstack.com/v1/forward"
    params = {
        'access_key': api_key,
        'query': location_name,
        'limit': 1,
    }

    try:
        # Without a timeout a stalled connection would block the whole apply() run.
        response = requests.get(base_url, params=params, timeout=timeout)
    except requests.RequestException as exc:
        print(f"Error: request for {location_name!r} failed: {exc}")
        return None, None

    if response.status_code != 200:
        print(f"Error: {response.status_code}")
        return None, None

    results = response.json().get('data')
    first_result = results[0] if results else None
    # Guard against payloads whose first entry is not a result record.
    if not isinstance(first_result, dict):
        return None, None
    return first_result.get('latitude'), first_result.get('longitude')

In [44]:
def add_coordinates(row, column_name, api_key):
    """Geocode the place named in `row[column_name]`.

    Returns a Series with 'Coordinates' (a (lat, lng) tuple), 'Latitude' and
    'Longitude', so it can be assigned to those three columns from `apply`.
    """
    latitude, longitude = geocode_location_positionstack(row[column_name], api_key)
    coordinate_fields = {
        'Coordinates': (latitude, longitude),
        'Latitude': latitude,
        'Longitude': longitude,
    }
    return pd.Series(coordinate_fields)

**Add coordinates to cities in Switzerland**

In [None]:
# Geocode only the cities that still lack coordinates, working on a copy so the lookup
# table is untouched until the results are written back in the next cell.
temp_df = df_cities_french[pd.isna(df_cities_french['Coordinates'])].copy()
temp_df[['Coordinates', 'Latitude', 'Longitude']] = temp_df.apply(lambda row: add_coordinates(row, 'Ville', geocode_api_key), axis=1)

In [133]:
# Copy the freshly geocoded values back into the city table, one row at a time,
# using the index labels that temp_df shares with df_cities_french.
for idx, row in temp_df.iterrows():
    df_cities_french.loc[idx, ['Coordinates', 'Latitude', 'Longitude']] = row[['Coordinates', 'Latitude', 'Longitude']]

**Add coordinates to capital cities worldwide**

In [116]:
# Geocode every capital (one API request per row).
# NOTE(review): the saved output shows repeated "Error: 400" responses — presumably from
# capitals that are missing or carry parenthetical qualifiers; confirm.
df_countries_french[['Coordinates', 'Latitude', 'Longitude']] = df_countries_french.apply(lambda row: add_coordinates(row, 'Capital', geocode_api_key), axis=1)


Error: 400
Error: 400
Error: 400
Error: 400
Error: 400
Error: 400
Error: 400
Error: 400
Error: 400
Error: 400
Error: 400
Error: 400
Error: 400
Error: 400
Error: 400
Error: 400
Error: 400
Error: 400
Error: 400
Error: 400
Error: 400
Error: 400
Error: 400
Error: 400
Error: 400
Error: 400


In [174]:
# Retry pass: geocode only the rows whose Latitude is still missing after the bulk run.
for idx, row in df_countries_french.iterrows():
    if pd.isna(row['Latitude']):
        try:
            lat, lng = geocode_location_positionstack(row['Capital'], geocode_api_key)
            print(f"Fetching for {row['Capital']}: Lat {lat}, Lng {lng}")  # Debug print

            # Only overwrite the row when the lookup returned both values.
            if lat is not None and lng is not None:
                df_countries_french.at[idx, 'Coordinates'] = (lat, lng)
                df_countries_french.at[idx, 'Latitude'] = lat
                df_countries_french.at[idx, 'Longitude'] = lng
        except Exception as e:
            # Report the failure and move on to the next capital.
            print(f"Error fetching coordinates for {row['Capital']}: {e}")


Fetching for canberra: Lat -35.470057, Lng 148.954507
Fetching for caracas: Lat 10.505146, Lng -66.917878
Fetching for castries: Lat 14.008326, Lng -60.989868
Fetching for cetinje (presidentielle): Lat 42.392165, Lng 18.923124
Fetching for podgorica (administrative): Lat 42.44111, Lng 19.26361
Fetching for chisinau: Lat 47.020682, Lng 28.854719
Fetching for colombo (executive et judiciaire): Lat 6.934744, Lng 79.842725
Fetching for sri jayawardenapura kotte (officielle et legislative): Lat 6.907821, Lng 79.894725
Fetching for comayaguela (es): Lat 13.6, Lng -87.71667
Fetching for tegucigalpa: Lat 14.0818, Lng -87.20681
Fetching for conakry: Lat 9.51356, Lng -13.703869
Fetching for copenhague: Lat 55.704438, Lng 12.502119
Fetching for cotonou (de facto): Lat 6.36536, Lng 2.41833
Fetching for porto-novo (officielle): Lat 6.491262, Lng 2.625751
Fetching for dakar: Lat 14.713018, Lng -17.454726
Fetching for damas: Lat 33.5102, Lng 36.29128
Fetching for dacca: Lat 23.7104, Lng 90.40744
Fetc

### Search Functions

#### Country Search

In [72]:
def process_entity(ent, mentioned_countries, country_data_french, swiss_cities_french, swiss_cantons_french):
    """Add the countries implied by one named entity to `mentioned_countries`.

    Only LOC/GPE entities are considered. "suisse" is added when any
    whitespace-separated token of the entity is a known Swiss city or canton.
    The full entity text is then matched against the capitals and, if that
    fails, against the country names; the first matching 'Country' value is added.
    The set is updated in place and nothing is returned.
    """
    if ent.label_ not in ['LOC', 'GPE']:
        return

    entity_text = ent.text.lower()

    # Token-level check against the Swiss lookups.
    is_swiss = any(
        token in swiss_cities_french['Ville'].str.lower().values
        or token in swiss_cantons_french['Canton'].str.lower().values
        for token in entity_text.split()
    )
    if is_swiss:
        mentioned_countries.add("suisse")

    # A capital stands in for its country; stop here when one matches.
    capital_hits = country_data_french[country_data_french['Capital'].str.lower() == entity_text]['Country']
    if not capital_hits.empty:
        mentioned_countries.add(capital_hits.iloc[0])
        return

    # Otherwise the entity may itself be a country name.
    country_hits = country_data_french[country_data_french['Country'].str.lower() == entity_text]['Country']
    if not country_hits.empty:
        mentioned_countries.add(country_hits.iloc[0])

In [73]:
def find_countries_in_text(content, header, country_data_french, swiss_cities_french, swiss_cantons_french):
    """Collect the countries an article refers to.

    The body text is run through the spaCy pipeline and every LOC/GPE entity is
    resolved by process_entity. The header is split on whitespace and each
    token is matched directly against the lookup tables.

    Args:
        content: Article body. Anything that is not a string (e.g. NaN for a
            missing body) is treated as empty text.
        header: Article header; non-strings are treated as empty text.
        country_data_french: DataFrame with 'Country' and 'Capital' columns.
        swiss_cities_french: DataFrame with a 'Ville' column.
        swiss_cantons_french: DataFrame with a 'Canton' column.

    Returns:
        Set of values from the 'Country' column, plus "suisse" whenever a
        Swiss city or canton is matched.
    """
    # Missing cells arrive as float NaN; spaCy and str.lower() both need real strings.
    content = content if isinstance(content, str) else ""
    header = header if isinstance(header, str) else ""

    mentioned_countries = set()

    # Use spaCy for processing the content
    for ent in nlp(content).ents:
        if ent.label_ in ['LOC', 'GPE']:
            process_entity(ent, mentioned_countries, country_data_french, swiss_cities_french, swiss_cantons_french)

    # Process the header with a keyword-based approach
    header_parts = header.lower().split()
    if not header_parts:
        return mentioned_countries

    # Lower-case the lookup columns once instead of once per header token.
    swiss_places = set(swiss_cities_french['Ville'].str.lower().dropna())
    swiss_places.update(swiss_cantons_french['Canton'].str.lower().dropna())
    capitals_lower = country_data_french['Capital'].str.lower()
    countries_lower = country_data_french['Country'].str.lower()

    for part in header_parts:
        if part in swiss_places:
            mentioned_countries.add("suisse")
            continue

        # Check if the part is a capital, then add corresponding country
        country_from_capital = country_data_french.loc[capitals_lower == part, 'Country']
        if not country_from_capital.empty:
            mentioned_countries.add(country_from_capital.iloc[0])
            continue

        # Check if the part is a country
        country = country_data_french.loc[countries_lower == part, 'Country']
        if not country.empty:
            mentioned_countries.add(country.iloc[0])

    return mentioned_countries

In [74]:
# Run the country search on every article; each cell of the new column holds a set of country names.
df_french['Mentioned_Countries'] = df_french.apply(lambda row: find_countries_in_text(row['Content'], row['Header'], df_countries_french, df_cities_french, df_cantons_french), axis=1)


In [79]:
# Inspect the per-country mention counts.
# NOTE(review): df_country_counts is built in the "Count Country Mentions" cell further down,
# so this cell fails on a fresh top-to-bottom run.
df_country_counts

Unnamed: 0,Country,Count
0,suisse,398
1,etats-unis,130
2,france,121
3,russie,42
4,royaume-uni,37
...,...,...
95,seychelles,1
94,guatemala,1
93,guinee equatoriale,1
92,afghanistan,1


#### Count Country Mentions

In [77]:
# One row per (article, country) pair, then count how often each country occurs.
country_counts = df_french.explode('Mentioned_Countries')['Mentioned_Countries'].value_counts()
df_country_counts = pd.DataFrame(
    {'Country': country_counts.index, 'Count': country_counts.values}
).sort_values(by='Count', ascending=False)

In [87]:
# Enrich the counts with ISO codes, capital and coordinates from the country table,
# keeping the columns in export order.
df_country_count_data = df_country_counts.merge(
    df_countries_french,
    left_on='Country',
    right_on='Country',
    how='left',
)[['Country', 'Count', 'ISO2', 'ISO3', 'Capital', 'Coordinates', 'Latitude', 'Longitude']]


In [88]:
# Inspect the enriched country counts.
# NOTE(review): in the saved preview "suisse" has latitude 53.16, which is presumably not in
# Switzerland (the geocoding query carries no country context), and "montenegro" has
# (None, None) — verify before plotting.
df_country_count_data

Unnamed: 0,Country,Count,ISO2,ISO3,Capital,Coordinates,Latitude,Longitude
0,suisse,398,CH,CHE,berne (de facto),"(53.156582, 8.385292)",53.156582,8.385292
1,etats-unis,130,US,USA,washington,"(38.82652, -77.01712)",38.826520,-77.017120
2,france,121,FR,FRA,paris,"(48.858705, 2.342865)",48.858705,2.342865
3,russie,42,RU,RUS,moscou,"(55.741469, 37.615561)",55.741469,37.615561
4,royaume-uni,37,GB,GBR,londres,"(51.509648, -0.099076)",51.509648,-0.099076
...,...,...,...,...,...,...,...,...
126,guatemala,1,GT,GTM,guatemala,"(14.64072, -90.51327)",14.640720,-90.513270
127,guinee equatoriale,1,GQ,GNQ,malabo,"(3.75, 8.78333)",3.750000,8.783330
128,afghanistan,1,AF,AFG,kaboul,"(34.52813, 69.17233)",34.528130,69.172330
129,montenegro,1,ME,MNE,cetinje (presidentielle),"(None, None)",,


**Add translations so that each country also has its English name (joined on the ISO2 code); German names are not added in this section**

In [84]:
# Inspect the country table with ISO codes and geocoded capitals.
df_countries_french

Unnamed: 0,Country,Capital,ISO2,ISO3,Coordinates,Latitude,Longitude
0,emirats arabes unis,abou dabi,AE,ARE,"(24.365909, 54.582942)",24.365909,54.582942
1,nigeria,abuja,NG,NGA,"(9.047364, 7.434935)",9.047364,7.434935
2,ghana,accra,GH,GHA,"(5.545935, -0.210089)",5.545935,-0.210089
3,turkmenistan,achgabat,TM,TKM,"(37.95, 58.38333)",37.950000,58.383330
4,ethiopie,addis-abeba,ET,ETH,"(9.018947, 38.746032)",9.018947,38.746032
...,...,...,...,...,...,...,...
209,cameroun,yaounde,CM,CMR,"(3.86667, 11.51667)",3.866670,11.516670
210,nauru,yaren (de facto),NR,NRU,"(-0.55085, 166.9252)",-0.550850,166.925200
211,croatie,zagreb,HR,HRV,"(45.806026, 15.976218)",45.806026,15.976218
212,kosovo,pristina,XK,,"(42.67272, 21.16688)",42.672720,21.166880


In [91]:
# Look up each country's English name through its ISO2 code and label both name
# columns by language ('Name' comes from english_iso_3166, 'Country' from the counts).
df_final_country_count_data = (
    df_country_count_data
    .merge(english_iso_3166, on='ISO2', how='left')
    .rename(columns={'Name': 'EnglishName', 'Country': 'FrenchName'})
)


In [94]:
# Columns to export. 'ISO3_x' is the left-hand ISO3 column: both merged frames carry
# an 'ISO3' column, so pandas suffixes them with _x and _y.
desired_columns = [
    'FrenchName',
    'Count',
    'Coordinates',
    'Longitude',
    'Latitude',
    'ISO2',
    'ISO3_x',
    'EnglishName',
    'Capital',
]

df_final_country_count_data = (
    df_final_country_count_data[desired_columns]
    .rename(columns={"ISO3_x": "ISO3"})
)


In [102]:
# Inspect the final country table with French and English names.
df_final_country_count_data

Unnamed: 0,FrenchName,Count,Coordinates,Longitude,Latitude,ISO2,ISO3,EnglishName,Capital
0,suisse,398,"(53.156582, 8.385292)",8.385292,53.156582,CH,CHE,Switzerland,berne (de facto)
1,etats-unis,130,"(38.82652, -77.01712)",-77.017120,38.826520,US,USA,United States of America,washington
2,france,121,"(48.858705, 2.342865)",2.342865,48.858705,FR,FRA,France,paris
3,russie,42,"(55.741469, 37.615561)",37.615561,55.741469,RU,RUS,Russian Federation,moscou
4,royaume-uni,37,"(51.509648, -0.099076)",-0.099076,51.509648,GB,GBR,United Kingdom of Great Britain and Northern I...,londres
...,...,...,...,...,...,...,...,...,...
126,guatemala,1,"(14.64072, -90.51327)",-90.513270,14.640720,GT,GTM,Guatemala,guatemala
127,guinee equatoriale,1,"(3.75, 8.78333)",8.783330,3.750000,GQ,GNQ,Equatorial Guinea,malabo
128,afghanistan,1,"(34.52813, 69.17233)",69.172330,34.528130,AF,AFG,Afghanistan,kaboul
129,montenegro,1,"(None, None)",,,ME,MNE,Montenegro,cetinje (presidentielle)


In [108]:
# Inspect the Swiss city lookup with its coordinates.
# NOTE(review): in the saved preview "interlaken" has a longitude of about -121, and the first
# rows look as if they carry a neighbouring city's coordinates — verify before using them.
df_cities_french

Unnamed: 0,Ville,Coordinates,Latitude,Longitude
0,zurich,"(46.201511, 6.143889)",46.201511,6.143889
1,geneve,"(47.553808, 7.592036)",47.553808,7.592036
2,bale,"(46.520381, 6.63141)",46.520381,6.631410
3,lausanne,"(46.946416, 7.396394)",46.946416,7.396394
4,berne,"(47.499693, 8.728495)",47.499693,8.728495
...,...,...,...,...
165,appenzell,"(47.32812, 9.40792)",47.328120,9.407920
166,interlaken,"(36.955382, -121.745858)",36.955382,-121.745858
167,saint-moritz,"(46.498322, 9.843629)",46.498322,9.843629
168,la neuveville,"(47.062931, 7.09602)",47.062931,7.096020


#### City Search

In [109]:
def find_swiss_cities_in_text(content, header, df_cities_german):
    """Return the Swiss cities named in an article's content or header.

    Args:
        content: Article body; converted with str() before matching.
        header: Article header; converted with str() before matching.
        df_cities_german: DataFrame with a 'Ville' column of city names. The
            parameter name is kept for compatibility with existing callers.

    Returns:
        Set of 'Ville' values whose normalized form occurs as a whole word
        (or whole phrase) in the normalized content plus header.
    """
    combined_text = normalize_text(str(content) + " " + str(header)).strip()
    mentioned_swiss_cities = set()

    for city in df_cities_german['Ville']:
        # Missing names (NaN) are not strings and would make normalize_text raise.
        if not isinstance(city, str):
            continue

        normalized_city = normalize_text(city).strip()
        # A blank name would turn the pattern into r'\b\b', which matches any text containing a word.
        if not normalized_city:
            continue

        # Use a regular expression to find whole word matches only
        if re.search(r'\b' + re.escape(normalized_city) + r'\b', combined_text):
            mentioned_swiss_cities.add(city)

    return mentioned_swiss_cities

In [110]:
# Run the city search on every article; each cell of the new column holds a set of city names.
df_french['Mentioned_Swiss_Cities'] = df_french.apply(lambda row: find_swiss_cities_in_text(row['Content'], row['Header'], df_cities_french), axis=1)


In [113]:
# Every article that names a Swiss city must also count as mentioning "suisse".
switzerland_added = False  # Becomes True once any row gains "suisse"

for idx, row in df_french.iterrows():
    # Non-empty city set, but "suisse" not yet among the countries.
    if row['Mentioned_Swiss_Cities'] and "suisse" not in row['Mentioned_Countries']:
        df_french.at[idx, 'Mentioned_Countries'] = row['Mentioned_Countries'].union({"suisse"})
        # `row` was read before the assignment above, so this presumably prints the pre-update values.
        print(f"Row {idx} updated: {row}")
        switzerland_added = True

# Report when no row needed the fix
if not switzerland_added:
    print("No updates made. 'suisse' was already included in all relevant rows.")

Row 41 updated: Title                     grands bandits pour le parquet, amateurs naifs...
Header                                             montagnes neuchateloises
Content                   a-t-elle affaire a une poignee de pieds nickel...
Mentioned_Countries                                                {suisse}
Mentioned_Swiss_Cities                                  {la chaux-de-fonds}
Name: 41, dtype: object
Row 49 updated: Title                     le troisieme entrainement a zermatt a nouveau ...
Header                                                            ski alpin
Content                   experte a meteo suisse, aude untersee avait vu...
Mentioned_Countries                                                {suisse}
Mentioned_Swiss_Cities                                            {zermatt}
Name: 49, dtype: object
Row 100 updated: Title                     a carouge, le plr garde son siege, a onex, le ...
Header                                            elections communa

In [114]:
# One row per (article, city) pair, then count how often each city occurs.
city_counts = df_french.explode('Mentioned_Swiss_Cities')['Mentioned_Swiss_Cities'].value_counts()
df_city_counts = pd.DataFrame(
    {'City': city_counts.index, 'Count': city_counts.values}
).sort_values(by='Count', ascending=False)

In [115]:
# Inspect the per-city mention counts.
df_city_counts

Unnamed: 0,City,Count
0,geneve,101
1,zurich,49
2,berne,45
3,lausanne,44
4,fribourg,34
...,...,...
56,schlieren,1
55,weinfelden,1
54,lenzbourg,1
53,horgen,1


In [117]:
# Attach coordinates to the city counts by matching 'City' against the lookup's 'Ville' column,
# then keep only the columns that are exported.
df_city_count_data = df_city_counts.merge(df_cities_french, left_on='City', right_on='Ville', how='left')
df_city_count_data = df_city_count_data[['City', 'Count', 'Coordinates', 'Longitude', 'Latitude']]

In [118]:
# Inspect the city counts with coordinates.
df_city_count_data

Unnamed: 0,City,Count,Coordinates,Longitude,Latitude
0,geneve,101,"(47.553808, 7.592036)",7.592036,47.553808
1,zurich,49,"(46.201511, 6.143889)",6.143889,46.201511
2,berne,45,"(47.499693, 8.728495)",8.728495,47.499693
3,lausanne,44,"(46.946416, 7.396394)",7.396394,46.946416
4,fribourg,34,"(47.10022, 6.825101)",6.825101,47.100220
...,...,...,...,...,...
75,schlieren,1,"(46.462626, 6.842412)",6.842412,46.462626
76,weinfelden,1,"(47.566894, 9.104073)",9.104073,47.566894
77,lenzbourg,1,"(47.388748, 8.17853)",8.178530,47.388748
78,horgen,1,"(47.261948, 8.596927)",8.596927,47.261948


### Exports

In [149]:
# Save the geocoded country table; the next cell reads it back from the same path.
csv_file_path = 'inputdata/df_countries_french_with_capital_and_coordinates.csv'
df_countries_french.to_csv(csv_file_path, index=False)

In [3]:
# Reload the saved country table instead of geocoding again.
# NOTE(review): after a CSV round trip the 'Coordinates' tuples presumably come back as strings — confirm.
df_countries_french = pd.read_csv('inputdata/df_countries_french_with_capital_and_coordinates.csv')

In [4]:
# Inspect the reloaded country table.
df_countries_french

Unnamed: 0,Country,Capital,ISO2,ISO3,Coordinates,Latitude,Longitude
0,emirats arabes unis,abou dabi,AE,ARE,"(24.365909, 54.582942)",24.365909,54.582942
1,nigeria,abuja,NG,NGA,"(9.047364, 7.434935)",9.047364,7.434935
2,ghana,accra,GH,GHA,"(5.545935, -0.210089)",5.545935,-0.210089
3,turkmenistan,achgabat,TM,TKM,"(37.95, 58.38333)",37.950000,58.383330
4,ethiopie,addis-abeba,ET,ETH,"(9.018947, 38.746032)",9.018947,38.746032
...,...,...,...,...,...,...,...
209,cameroun,yaounde,CM,CMR,"(3.86667, 11.51667)",3.866670,11.516670
210,nauru,yaren (de facto),NR,NRU,"(-0.55085, 166.9252)",-0.550850,166.925200
211,croatie,zagreb,HR,HRV,"(45.806026, 15.976218)",45.806026,15.976218
212,kosovo,pristina,XK,,"(42.67272, 21.16688)",42.672720,21.166880


In [69]:
# Scratch export of the country table to a temporary file.
csv_file_path = 'inputdata/df_temp.csv'
df_countries_french.to_csv(csv_file_path, index=False)

In [273]:
# Export the article table together with its mention columns.
# df_german is never defined in this notebook, so the French frame is written to a
# French-named file instead.
csv_file_path = 'data/df_french_with_mentions.csv'
df_french.to_csv(csv_file_path, index=False)

In [119]:
# Export the city counts with coordinates.
csv_file_path = 'data/df_city_count_data_french.csv'
df_city_count_data.to_csv(csv_file_path, index=False)

In [271]:
# Export the country counts with coordinates.
# NOTE(review): unlike the other exports, this file name has no "_french" suffix — confirm it
# does not collide with an export from another notebook.
csv_file_path = 'data/df_country_count_data.csv'
df_country_count_data.to_csv(csv_file_path, index=False)

In [103]:
# Export the country counts with French and English names.
csv_file_path = 'data/df_translated_country_count_data_french.csv'
df_final_country_count_data.to_csv(csv_file_path, index=False)

### Debugging

In [None]:
# Grab the first article for manual checks (df_german is not defined in this notebook).
test_row = df_french.iloc[0]


In [None]:
# Spot-check the geocoder on a single capital (row position 10 of the country table).
test_row = df_countries_french.iloc[10]
lat, lng = geocode_location_positionstack(test_row['Capital'], geocode_api_key)
print(f"Test - Capital: {test_row['Capital']}, Lat: {lat}, Lng: {lng}")

In [36]:
# Check which entities the French model finds in an accent-free, lower-case sentence.
test_sentence = "geneve est une belle ville."
doc = nlp(test_sentence)

for ent in doc.ents:
    print(ent.text, ent.label_)


geneve LOC


In [35]:
# Confirm that "geneve" is present in the city lookup.
"geneve" in df_cities_french['Ville'].str.lower().values


True

In [41]:
def find_countries_in_text_test(content, header, country_data_french, swiss_cities_french, swiss_cantons_french):
    """Debug scaffold: print every entity spaCy finds and whether it is a Swiss city or canton.

    Content and header are joined with a space and parsed together. Only the
    Swiss check is implemented, so the returned set can contain at most "suisse";
    `country_data_french` is accepted but not used.
    """
    combined_text = str(content) + " " + str(header)
    doc = nlp(combined_text)
    mentioned_countries = set()

    for ent in doc.ents:
        print("Entity:", ent.text, "Label:", ent.label_)  # Debug print
        if ent.label_ in ['LOC', 'GPE']:
            # The whole entity text is compared here, not its individual tokens.
            city_match = ent.text.lower() in swiss_cities_french['Ville'].str.lower().values
            canton_match = ent.text.lower() in swiss_cantons_french['Canton'].str.lower().values
            print("City match:", city_match, "Canton match:", canton_match)  # Debug print

            if city_match or canton_match:
                mentioned_countries.add("suisse")
                continue
            # Rest of the code...

    return mentioned_countries


In [None]:
# Work on a copy: a plain assignment would alias df_french, and the debug run below
# would then overwrite df_french['Mentioned_Countries'].
temp_df_french = df_french.copy()
temp_df_french['Mentioned_Countries'] = temp_df_french.apply(lambda row: find_countries_in_text_test(row['Content'], row['Header'], df_countries_french, df_cities_french, df_cantons_french), axis=1)


In [56]:
def find_countries_in_text_v2(content, header, country_data_french, swiss_cities_french, swiss_cantons_french):
    """Experimental country search that parses content and header as one text.

    Content and header are joined with a space and run through spaCy. For each
    LOC/GPE entity, "suisse" is added when one of its tokens is a Swiss city or
    canton; the full entity text is then matched against capitals and, failing
    that, country names. The final set is printed for debugging and returned.
    """
    combined_text = str(content) + " " + str(header)
    doc = nlp(combined_text)
    mentioned_countries = set()

    for ent in doc.ents:
        if ent.label_ in ['LOC', 'GPE']:
            entity_parts = ent.text.lower().split()

            for part in entity_parts:
                city_match = part in swiss_cities_french['Ville'].str.lower().values
                canton_match = part in swiss_cantons_french['Canton'].str.lower().values

                if city_match or canton_match:
                    mentioned_countries.add("suisse")
                    break

            # The capital lookup must run once per entity, after the token loop; placed
            # behind the `break` it was unreachable and left country_from_capital unbound.
            country_from_capital = country_data_french[country_data_french['Capital'].str.lower() == ent.text.lower()]['Country']
            if not country_from_capital.empty:
                mentioned_countries.add(country_from_capital.iloc[0])
                continue

            country = country_data_french[country_data_french['Country'].str.lower() == ent.text.lower()]['Country']
            if not country.empty:
                mentioned_countries.add(country.iloc[0])

    print(f"Final mentioned countries: {mentioned_countries}")  # Debug print
    return mentioned_countries

# Exercise the v2 function defined in this cell (the call previously went to find_countries_in_text).
test_content = "les patins crissent a nouveau au parc des bast à la ville de geneve..."
test_header = "ville de geneve les mesures covid en 2021 puis les restriction..."
print(find_countries_in_text_v2(test_content, test_header, df_countries_french, df_cities_french, df_cantons_french))


Final mentioned countries: {'suisse'}
{'suisse'}


In [57]:
# Inspect the debug frame produced by the test search above.
temp_df_french

Unnamed: 0,Title,Header,Content,Mentioned_Countries
0,dix-huit prevenus condamnes dans laffaire du c...,justice francaise,un dessein purement lucratif. dix-huit des 19 ...,"{royaume-uni, france}"
1,elle sengage sur lautoroute avec le reservoir ...,argovie,malgre un signal lumineux indiquant que le sto...,{suisse}
2,"au portugal, la course au poste de premier min...",politique,les principaux partis ont commence vendredi a ...,{portugal}
3,"le vote de journalistes, dont celui de la rts,...",ballon d'or,dans le debat haaland-messi pour le ballon d'o...,"{suisse, liban, panama, bosnie-herzegovine, no..."
4,la premiere ministre refuse de hausser le sala...,bangladesh,"la premiere ministre du bangladesh, sheikh has...",{bangladesh}
...,...,...,...,...
1055,une tristesse indescriptible aux obseques de t...,france,nous taimons et nous ne toublierons jamais. pr...,{france}
1056,une voiture en feu a provoque la fermeture du ...,uri,le tunnel routier du gothard est reste ferme d...,{suisse}
1057,la commune voisine de paleo veut negocier des ...,trelex (vd),"ces derniers jours, le conseil communal de tre...",{}
1058,les patins crissent a nouveau au parc des bast...,ville de geneve,les mesures covid en 2021 puis les restriction...,{}


In [60]:
# Compare how the separator between content and header changes the entities spaCy finds
def test_combinations(content, header):
    """Print the entities spaCy finds when content and header are joined in three ways.

    The separators tried are, in order, a space, a period plus space, and a newline.
    """
    separators = [" ", ". ", "\n"]

    for number, separator in enumerate(separators, 1):
        doc = nlp(content + separator + header)
        print(f"Combination {number}:")
        for ent in doc.ents:
            print(ent.text, ent.label_)
        print("\n")

# Run the separator comparison on the sample content/header defined in an earlier debug cell
test_combinations(test_content, test_header)


Combination 1:
parc des bast LOC
ville de geneve LOC


Combination 2:
parc des bast LOC


Combination 3:
parc des bast LOC
ville de geneve LOC




In [None]:
def find_countries_in_text_v2(content, header, country_data_french, swiss_cities_french, swiss_cantons_french):
    """Draft of the newline-joined search that only implements the Swiss check.

    The capital and country lookups are left as a placeholder, so the returned
    set can contain at most "suisse". Later cells in this notebook define the
    same function name again and replace this version when they are run.
    """
    combined_text = str(content) + "\n" + str(header)  # Combine with a newline
    doc = nlp(combined_text)
    mentioned_countries = set()

    for ent in doc.ents:
        if ent.label_ in ['LOC', 'GPE']:
            entity_parts = ent.text.lower().split()  # Split the entity into parts

            for part in entity_parts:
                city_match = part in swiss_cities_french['Ville'].str.lower().values
                canton_match = part in swiss_cantons_french['Canton'].str.lower().values

                if city_match or canton_match:
                    mentioned_countries.add("suisse")
                    break

            # Rest of your original logic...

    return mentioned_countries


In [61]:
def find_countries_in_text_v2(content, header, country_data_french, swiss_cities_french, swiss_cantons_french):
    """Find the countries mentioned in content and header, parsed as one newline-joined text.

    For every LOC/GPE entity, "suisse" is added when one of its tokens is a
    Swiss city or canton; the full entity text is then matched against the
    capitals and, if that fails, against the country names.
    Returns the set of matched country names.
    """
    found = set()
    text = str(content) + "\n" + str(header)  # newline keeps the header apart from the body

    for entity in nlp(text).ents:
        if entity.label_ not in ['LOC', 'GPE']:
            continue

        entity_text = entity.text.lower()

        # Token-level check against the Swiss lookups.
        for token in entity_text.split():
            is_city = token in swiss_cities_french['Ville'].str.lower().values
            is_canton = token in swiss_cantons_french['Canton'].str.lower().values
            if is_city or is_canton:
                found.add("suisse")  # French for Switzerland
                break

        # A capital stands in for its country; move on to the next entity when one matches.
        capital_hits = country_data_french[country_data_french['Capital'].str.lower() == entity_text]['Country']
        if not capital_hits.empty:
            found.add(capital_hits.iloc[0])
            continue

        # Otherwise the entity may itself be a country name.
        country_hits = country_data_french[country_data_french['Country'].str.lower() == entity_text]['Country']
        if not country_hits.empty:
            found.add(country_hits.iloc[0])

    return found


In [68]:
def find_countries_in_text_v2(content, header, country_data_french, swiss_cities_french, swiss_cantons_french):
    """Verbose variant of the newline-joined country search, with debug prints at every step.

    Prints the combined text, each entity with its label, and the city/canton
    result for each token of a LOC/GPE entity. The matching itself is: add
    "suisse" on a Swiss city or canton token, then match the full entity text
    against capitals and, failing that, country names. Returns the set of matches.
    """
    combined_text = str(content) + "\n" + str(header)
    doc = nlp(combined_text)
    mentioned_countries = set()

    print(f"Combined Text: {combined_text}")  # Debug print
    for ent in doc.ents:
        print(f"Entity: {ent.text}, Label: {ent.label_}")  # Debug print

        if ent.label_ in ['LOC', 'GPE']:
            entity_parts = ent.text.lower().split()
            for part in entity_parts:
                print(f"Checking part: {part}")  # Debug print

                city_match = part in swiss_cities_french['Ville'].str.lower().values
                canton_match = part in swiss_cantons_french['Canton'].str.lower().values
                print(f"City match: {city_match}, Canton match: {canton_match}")  # Debug print

                if city_match or canton_match:
                    mentioned_countries.add("suisse")
                    break

            # Check if the entity is a capital, then add corresponding country
            country_from_capital = country_data_french[country_data_french['Capital'].str.lower() == ent.text.lower()]['Country']
            if not country_from_capital.empty:
                mentioned_countries.add(country_from_capital.iloc[0])
                continue

            # Check if the entity is a country
            country = country_data_french[country_data_french['Country'].str.lower() == ent.text.lower()]['Country']
            if not country.empty:
                mentioned_countries.add(country.iloc[0])

    return mentioned_countries

# Test the function with the specific text that is causing issues.
# In the saved output only "parc des bast" is recognised as an entity, so the result is an empty set.
test_content = "les patins crissent a nouveau au parc des bast..."
test_header = "ville de geneve les mesures covid en 2021 puis les restriction..."
print(find_countries_in_text_v2(test_content, test_header, df_countries_french, df_cities_french, df_cantons_french))


Combined Text: les patins crissent a nouveau au parc des bast...
ville de geneve les mesures covid en 2021 puis les restriction...
Entity: parc des bast, Label: LOC
Checking part: parc
City match: False, Canton match: False
Checking part: des
City match: False, Canton match: False
Checking part: bast
City match: False, Canton match: False
set()
