# Shared Code

In [14]:
import pandas as pd

# Load main dataset and demonyms
df = pd.read_csv('all_validate.csv')
df_copy = df.copy()
demonyms_df = pd.read_csv('demonyms.csv', header=None)
demonym_map = dict(zip(demonyms_df[0].str.lower(), demonyms_df[1]))

# 1. GeoText

In [7]:
!pip install geotext

Collecting geotext
  Downloading geotext-0.4.0-py2.py3-none-any.whl.metadata (2.5 kB)
Downloading geotext-0.4.0-py2.py3-none-any.whl (2.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m21.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: geotext
Successfully installed geotext-0.4.0


## With Dymonyms

In [15]:
from geotext import GeoText

def get_country_geotext_with_demonyms(text):
    text = str(text).lower()
    places = GeoText(text)
    countries = places.countries
    if countries:
        return countries[0]
    for word in text.split():
        if word in demonym_map:
            return demonym_map[word]
    return None

df_copy['predicted_country'] = df_copy['title'].apply(get_country_geotext_with_demonyms)
found = df_copy['predicted_country'].notna().sum()
print(f"GeoText + Demonyms: Found {found}/{len(df_copy)} entries")
df_copy.to_csv('all_validate_geotext.csv', index=False)
print(df_copy[df_copy['predicted_country'].notna()].sample(10)[['title', 'predicted_country']])

GeoText + Demonyms: Found 5888/92444 entries
                                                   title predicted_country
70306  Cheese exposed to hip-hop tastes better, finds...       Switzerland
63749  Touch this fence and say hello to the Israeli ...            Israel
41688                    Nothing like a Vulcan Mind Meld            Vulcan
69162  U.S. spacecraft to take slingshot dive inside ...     United States
45014  Police: Warren teen caught having sex with wie...            Vienna
40297  I decided to take this pic of the caribbean Mo...         Caribbean
41551  If it quacks like a duck: boisterous poultry l...            France
38291  German soldier celebrates after successfully g...           Germany
52327  My Chinese manufactured Led Zeppelin LP with a...             China
11650  In 1920’s Germany, a man single-handedly lower...           Germany


## Without Dymonyms

In [16]:
from geotext import GeoText

def get_country_geotext(text):
    places = GeoText(str(text))
    return places.countries[0] if places.countries else None

df_copy['predicted_country'] = df_copy['title'].apply(get_country_geotext)
found = df_copy['predicted_country'].notna().sum()
print(f"GeoText Only: Found {found}/{len(df_copy)} entries")
df_copy.to_csv('all_validate_geotext_nodemonym.csv', index=False)
print(df_copy[df_copy['predicted_country'].notna()].sample(10)[['title', 'predicted_country']])

GeoText Only: Found 3397/92444 entries
                                                   title predicted_country
11269  'Yolocaust' artist provokes debate over commem...           Germany
85134       River basins of the contiguous United States     United States
26359  This lake in India is straight out of a horror...             India
74570  Ronald McDonald resting after an attack on Col...              Iraq
84052               Missles over Syria, Colorized (2018)             Syria
79853  Anesthesia Provision in the United States - It...     United States
91767  China and Russia are teaming up to ban illegal...             China
31883  A man in Australia has scooped a $1m (£536,000...         Australia
72505  U.S., Japan in talks to prevent China acquirin...             Japan
44815  Vatican launches $110 'click to pray' wearable...           Vatican


# 2. Geograpy3

In [17]:
!pip install git+https://github.com/JoshData/geograpy3.git

!pip install lxml[html_clean]
!pip install nltk

Collecting git+https://github.com/JoshData/geograpy3.git
  Cloning https://github.com/JoshData/geograpy3.git to /tmp/pip-req-build-dyfyv_lp
  Running command git clone --filter=blob:none --quiet https://github.com/JoshData/geograpy3.git /tmp/pip-req-build-dyfyv_lp
  fatal: could not read Username for 'https://github.com': No such device or address
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mgit clone --[0m[32mfilter[0m[32m=[0m[32mblob[0m[32m:none --quiet [0m[4;32mhttps://github.com/JoshData/geograpy3.git[0m[32m [0m[32m/tmp/[0m[32mpip-req-build-dyfyv_lp[0m did not run successfully.
  [31m│[0m exit code: [1;36m128[0m
  [31m╰─>[0m See above for output.
  
  [1;35mnote[0m: This error originates from a subprocess, and is likely not a problem with pip.
[1;31merror[0m: [1msubprocess-exited-with-error[0m

[31m×[0m [32mgit clone --[0m[32mfilter[0m[32m=[0m[32mblob[0m[32m:none --quiet [0m[4;32mhttps://github.com/JoshData/

In [18]:
import nltk
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')

import geograpy as geograpy

text = "The prime minister of Canada met with officials from the United States and the United Kingdom."
places = geograpy.get_place_context(text=text)

print("Countries:", places.countries)
print("Regions:", places.regions)
print("Cities:", places.cities)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping chunkers/maxent_ne_chunker.zip.
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.


ModuleNotFoundError: No module named 'geograpy'

## With Dymonyms

In [None]:
import pandas as pd
import geograpy

# Load datasets
df = pd.read_csv('all_validate.csv')
df_copy = df.copy()
demonyms_df = pd.read_csv('demonyms.csv', header=None)
demonym_map = dict(zip(demonyms_df[0].str.lower(), demonyms_df[1]))

# Geograpy + Demonyms
def get_country_geograpy_with_demonyms(text):
    text = str(text).lower()
    places = geograpy.get_place_context(text=text)
    countries = places.countries
    if countries:
        return countries[0]
    for word in text.split():
        if word in demonym_map:
            return demonym_map[word]
    return None

df_copy['predicted_country'] = df_copy['title'].apply(get_country_geograpy_with_demonyms)
found = df_copy['predicted_country'].notna().sum()
print(f"Geograpy + Demonyms: Found {found}/{len(df_copy)} entries")
df_copy.to_csv('all_validate_geograpy.csv', index=False)
print(df_copy[df_copy['predicted_country'].notna()].sample(10)[['title', 'predicted_country']])

# 4. SpacyEr

In [19]:
!pip install spacy
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m31.7 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


## With Demonyms

In [20]:
import spacy

nlp = spacy.load("en_core_web_sm")

def get_country_spacy_with_demonyms(text):
    text = str(text)
    doc = nlp(text)
    for ent in doc.ents:
        if ent.label_ == "GPE":
            return ent.text
    for word in text.lower().split():
        if word in demonym_map:
            return demonym_map[word]
    return None

df_copy['predicted_country'] = df_copy['title'].apply(get_country_spacy_with_demonyms)
found = df_copy['predicted_country'].notna().sum()
print(f"spaCy + Demonyms: Found {found}/{len(df_copy)} entries")
df_copy.to_csv('all_validate_spacy.csv', index=False)
print(df_copy[df_copy['predicted_country'].notna()].sample(10)[['title', 'predicted_country']])

spaCy + Demonyms: Found 14639/92444 entries
                                                   title predicted_country
5861   Washing machine can’t believe how much deterge...               n’t
14224  Traditional Indian parents throw their son a h...             India
77377                             Take us to your leader     United States
70786            "Massacre in Korea" Pablo Picasso, 1951             Korea
87747  Farrakhan: Giuliani Grew Up a 'Privileged Crac...           Florida
68530    The Tostitos logo has two people dipping a chip          Tostitos
60568  Un cavalier de LBD de Gaza sur la route des Gaza.                la
2404     The USA announces our freedom from England 1776           England
85783  UK to back total ban on pesticides harmful to ...                UK
79399                                 The King of Queens            Queens


## Without Demonyms

In [21]:
import spacy

nlp = spacy.load("en_core_web_sm")

def get_country_spacy(text):
    doc = nlp(str(text))
    for ent in doc.ents:
        if ent.label_ == "GPE":
            return ent.text
    return None

df_copy['predicted_country'] = df_copy['title'].apply(get_country_spacy)
found = df_copy['predicted_country'].notna().sum()
print(f"spaCy Only: Found {found}/{len(df_copy)} entries")
df_copy.to_csv('all_validate_spacy_nodemonym.csv', index=False)
print(df_copy[df_copy['predicted_country'].notna()].sample(10)[['title', 'predicted_country']])

spaCy Only: Found 10666/92444 entries
                                                   title predicted_country
47313  PsBattle: Anti-austerity protester in Athens, ...            Athens
59214      TIFU by soaking myself in lingerie on Dropbox           Dropbox
7574   All 25 wanted individuals on Hawaii Island las...     Hawaii Island
77451  Adolf Hitler, leader of Nazi Germany, marches ...           Germany
39404  England forget to tick box to allow cyclist to...           England
38416               Best vending mashine ever in germany           germany
63763  David Ben Gurion, The 1st Prime Minister of Is...            Israel
88280  200 (1975) -- a psychedelic animated short by ...               USA
78836  Egypt: Man's 'drugs test trick' foiled by preg...             Egypt
38973  Tumblrina tries to embarrass Mike Pence at the...            Canada
