# Using stanza for Named Entity Recognition (continued)

## Installation

Run the code cell below to install stanza:

In [1]:
 #installing stanza
 !pip install stanza

Collecting stanza
  Downloading stanza-1.10.1-py3-none-any.whl.metadata (13 kB)
Collecting emoji (from stanza)
  Downloading emoji-2.14.1-py3-none-any.whl.metadata (5.7 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.3.0->stanza)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.3.0->stanza)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.3.0->stanza)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.3.0->stanza)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.3.0->stanza)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata 

## Import library and download language model

After installing it, we import stanza into our notebook.

In [2]:
import stanza
import os
import time


## Creating the pipeline

Download the English language model and build the pipeline (we specify that it should only tokenize the text, separate multiword tokens and perform Named Entity Recognition):


In [3]:
# Download the language model:
stanza.download("en")

# Create the pipeline, specifying the language:
nlp = stanza.Pipeline(lang="en", processors='tokenize,mwt,ner')

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json:   0%|  …

INFO:stanza:Downloaded file to /root/stanza_resources/resources.json
INFO:stanza:Downloading default packages for language: en (English) ...


Downloading https://huggingface.co/stanfordnlp/stanza-en/resolve/v1.10.0/models/default.zip:   0%|          | …

INFO:stanza:Downloaded file to /root/stanza_resources/en/default.zip
INFO:stanza:Finished downloading models and saved to /root/stanza_resources
INFO:stanza:Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json:   0%|  …

INFO:stanza:Downloaded file to /root/stanza_resources/resources.json
INFO:stanza:Loading these models for language: en (English):
| Processor | Package                   |
-----------------------------------------
| tokenize  | combined                  |
| mwt       | combined                  |
| ner       | ontonotes-ww-multi_charlm |

INFO:stanza:Using device: cpu
INFO:stanza:Loading: tokenize
INFO:stanza:Loading: mwt
INFO:stanza:Loading: ner
INFO:stanza:Done loading processors!


#cloning the repository
!git clone https://github.com/ZeeshanKarim-916/FASDH25-portfolio2.git




In [4]:
#Clonning the repository
!git clone https://github.com/ZeeshanKarim-916/FASDH25-portfolio2.git

Cloning into 'FASDH25-portfolio2'...
remote: Enumerating objects: 4365, done.[K
remote: Counting objects: 100% (9/9), done.[K
remote: Compressing objects: 100% (8/8), done.[K
remote: Total 4365 (delta 3), reused 1 (delta 1), pack-reused 4356 (from 2)[K
Receiving objects: 100% (4365/4365), 17.78 MiB | 18.55 MiB/s, done.
Resolving deltas: 100% (9/9), done.


#Extracting the 2024 Articles only


In [5]:
# setting the file pathway of the articles folder
path = "/content/FASDH25-portfolio2/articles"
#list the files in the folder and only use the articles from January 2024
files = os.listdir(path)
# Keep articles from Jan 2024 only
jan_files = [f for f in files if f.startswith("2024-01")]
# Show how many were found
print("January files found:", len(jan_files))


January files found: 326


##Looping through January 2024 files

In [7]:
import os

# Create a dictionary to store place name counts

places = {}

# writing a filepath to our cloned repository's articles
folder = "/content/FASDH25-portfolio2/articles"
jan_files_articles_count = 0 # keep count of January articles.

# Loop through files that begin with "2024-01-"
for filename in os.listdir(folder):
    if filename.startswith("2024-01"):
      jan_files_articles_count += 1
  # create a path to the file:
      path = os.path.join(folder,filename)
  # open and read the file:
      with open(path, encoding="utf-8") as file:
          text = file.read()
    # use the nlp pipeline to analyse the text:
          doc = nlp(text)
    # select only the entities that are place names:
          for e in doc.entities:
              if e.type in ["GPE", "LOC"]:

        # add 1 to the count of the place in our dictionary
        # (and/or add the place to the dictionary if it was not there yet):
                  place = e.text.strip()
                  places[place] = places.get(place, 0) + 1
print("Articles from January 2024:", jan_files_articles_count)
print(places)


Articles from January 2024: 326
{'Gaza': 1605, 'Israel': 1593, 'South Africa': 200, 'Palestine': 124, 'Dublin': 3, 'The Hague': 33, 'Russia': 43, 'Ukraine': 47, 'Moscow': 4, 'US': 706, 'UK': 95, 'West': 24, 'the Global South': 2, 'Ramallah': 24, 'West Bank': 120, 'Gaza Strip': 31, 'the Gaza Strip': 123, 'Israel’s': 31, 'the United States': 97, 'the West Bank': 40, 'East Jerusalem': 23, 'PA': 1, 'Oslo': 2, 'Jerusalem': 26, 'Middle East': 25, 'United States’': 2, 'the Middle East': 77, 'Bahrain': 11, 'Turkey': 25, 'Greece': 8, 'Jordan': 42, 'Qatar': 64, 'the United Arab Emirates': 13, 'Saudi Arabia': 39, 'Egypt': 43, 'Tel Aviv': 49, 'America': 4, 'South Carolina': 4, 'Lebanon': 175, 'Beirut': 84, 'Washington': 60, 'Iran': 206, 'Asia': 18, 'al-Dabshah': 1, 'Khirbet Selm': 1, 'Bint Jbeil': 1, 'Syria': 83, 'Ibil El Saqi': 1, 'al-Tawil’s': 1, 'al-Khader': 1, 'the Bekaa Valley': 1, 'the Litani River': 1, 'Red Sea': 50, 'Alborz': 4, 'the Red Sea': 194, 'Bab al-Mandeb Strait': 4, 'the Gulf of A

### Cleaning the Places Names




In [11]:
import re
import os

normalized_places = {}

# Standard naming conventions dictionary
standard_names = {
    'beruit': 'Beirut',
    'britain': 'United Kingdom',
    'dahiyeb': 'Dahiyeh',
    'gaza': 'Gaza',
    'gaza city': 'Gaza',
    'gaza strip': 'Gaza',
    'islamic republic of iran': 'Iran',
    'republic of yemen': 'Yemen',
    'state of israel': 'Israel',
    'state of palestine': 'Palestine',
    'tel israel': 'Tel Aviv',
    'uae': 'United Arab Emirates',
    'u.s.': 'United States',
    'uk': 'United Kingdom',
    'usa': 'United States',
    'westbank': 'West Bank'
}

for place, count in places.items():
    # Remove possessives like 's
    place = re.sub(r"[’'`]s\b", "", place)
    # Convert newlines to spaces
    place = place.replace('\n', ' ')

    # Remove punctuation
    place = re.sub(r"[^\w\s]", "", place)

    # Remove leading 'the' if it appears
    place = re.sub(r"^the\s+", "", place, flags=re.IGNORECASE)

    # Check for Gaza (special case)
    if re.search(r'gaza', place.lower()):
        place = standard_names['gaza']
    else:
        # Lookup normalized place (lowercase for safety)
        place = standard_names.get(place.lower(), place)

    # Merge counts for normalized places
    if place in normalized_places:
        normalized_places[place] += count
    else:
        normalized_places[place] = count

# Print the cleaned and aggregated place names with counts
print(normalized_places)


{'Gaza': 1830, 'Israel': 1632, 'South Africa': 208, 'Palestine': 125, 'Dublin': 3, 'Hague': 39, 'Russia': 43, 'Ukraine': 47, 'Moscow': 4, 'US': 717, 'United Kingdom': 152, 'West': 24, 'Global South': 2, 'Ramallah': 24, 'West Bank': 164, 'United States': 162, 'East Jerusalem': 23, 'PA': 1, 'Oslo': 2, 'Jerusalem': 26, 'Middle East': 102, 'Bahrain': 11, 'Turkey': 25, 'Greece': 8, 'Jordan': 43, 'Qatar': 65, 'United Arab Emirates': 21, 'Saudi Arabia': 39, 'Egypt': 44, 'Tel Aviv': 52, 'America': 4, 'South Carolina': 4, 'Lebanon': 178, 'Beirut': 88, 'Washington': 62, 'Iran': 210, 'Asia': 18, 'alDabshah': 1, 'Khirbet Selm': 1, 'Bint Jbeil': 1, 'Syria': 84, 'Ibil El Saqi': 1, 'alTawil': 1, 'alKhader': 1, 'Bekaa Valley': 1, 'Litani River': 1, 'Red Sea': 250, 'Alborz': 4, 'Bab alMandeb Strait': 9, 'Gulf of Aden': 27, 'Indian Ocean': 2, 'Africa': 29, 'Yemen': 189, 'Damascus': 17, 'Djibouti': 4, 'Tehran': 25, 'Sanaa': 15, 'Rafah': 40, 'Khreis': 1, 'Strip': 15, 'Deir elBalah': 14, 'Maghazi': 5, 'Nus

### Store Data in TSV File




In [12]:
filename = "ner_counts.tsv"

# Open the file in writing mode and with UTF-8 encoding
with open(filename, mode="w", encoding="utf-8") as file:
    # Write header row
    header = "Place\tCount\n"
    file.write(header)

    # Loop through the cleaned place counts and write each row
    for place, count in normalized_places.items():
        row = f"{place}\t{count}\n"
        file.write(row)


We can improve the readability by adding xml-style opening and closing tags (e.g., `<GPE>Rafah</GPE>`) instead of only a tag at the beginning of the entity. Adapt the code below so that it adds xml-style start and end tags:

In [13]:
with open("/content/ner_counts.tsv", encoding="utf-8") as file:
  print(file.read())

Place	Count
Gaza	1830
Israel	1632
South Africa	208
Palestine	125
Dublin	3
Hague	39
Russia	43
Ukraine	47
Moscow	4
US	717
United Kingdom	152
West	24
Global South	2
Ramallah	24
West Bank	164
United States	162
East Jerusalem	23
PA	1
Oslo	2
Jerusalem	26
Middle East	102
Bahrain	11
Turkey	25
Greece	8
Jordan	43
Qatar	65
United Arab Emirates	21
Saudi Arabia	39
Egypt	44
Tel Aviv	52
America	4
South Carolina	4
Lebanon	178
Beirut	88
Washington	62
Iran	210
Asia	18
alDabshah	1
Khirbet Selm	1
Bint Jbeil	1
Syria	84
Ibil El Saqi	1
alTawil	1
alKhader	1
Bekaa Valley	1
Litani River	1
Red Sea	250
Alborz	4
Bab alMandeb Strait	9
Gulf of Aden	27
Indian Ocean	2
Africa	29
Yemen	189
Damascus	17
Djibouti	4
Tehran	25
Sanaa	15
Rafah	40
Khreis	1
Strip	15
Deir elBalah	14
Maghazi	5
Nuseirat	11
Europe	30
Manchester City	1
Barcelona	1
El Arish	3
Italy	10
Iraq	64
Eastern Mediterranean	1
New York City	1
DC	14
Cape Town	2
Pretoria	8
South Africansfrom	1
Africa4Palestine	1
Johannesburg	4
Western Cape province	1
Aqaba	1
Cape 

The file will now be stored in our colab's session environment. You can see it by clicking the folder icon in the left-hand tool bar in colab. Double-click it to view it in colab. Right-click it and choose "Download" to download the file.

To access it in your script, use the path `/content/ner_counts.tsv`

In [None]:
import requests
import time

geonames_username = "alihasnain"

def get_coordinates(place, username=geonames_username, fuzzy=0, timeout=1):
    """This function gets a single set of coordinates from the geonames API."""
    time.sleep(timeout)  # Wait to avoid overloading the server
    url = "http://api.geonames.org/searchJSON?"
    params = {"q": place, "username": username, "fuzzy": fuzzy, "maxRows": 1, "isNameRequired": True}
    response = requests.get(url, params=params)
    results = response.json()

    try:
        result = results["geonames"][0]
        return {"latitude": result["lat"], "longitude": result["lng"]}
    except (IndexError, KeyError):
        return {"latitude": "NA", "longitude": "NA"}  # Return "NA" if no coordinates found

# get the place names from the tsv file
place = []

# reads the tsv file
with open("/content/ner_counts.tsv", 'r', encoding="utf-8") as file:
    lines = file.readlines()

header = lines[0].strip().split('\t')
place_index = header.index('placename')

# loop through the rest of lines
for line in lines[1:]:
    columns = line.strip().split('\t')
    if len(columns) > place_index:
        place.append(columns[place_index])

# get the coordinates
coordinates_data = []
for place_name in place:
    coordinates = get_coordinates(place_name)
    coordinates_data.append({'Place': place_name, 'Latitude': coordinates['latitude'], 'Longitude': coordinates['longitude']})

    # Print the coordinates of each place
    print(f"{place_name}: {coordinates['latitude']}, {coordinates['longitude']}")

# write coordinates to tsv file
filename = "NER_gazetteer.tsv"
with open(filename, 'w', encoding="utf-8") as file:
    file.write('Place\tLatitude\tLongitude\n')
    # Iterate through the list of dictionaries and write to the file
    for row in coordinates_data:
        file.write(f"{row['Place']}\t{row['Latitude']}\t{row['Longitude']}\n")

print("Coordinates written to NER_gazetteer.tsv")


FileNotFoundError: [Errno 2] No such file or directory: '/content/ner_counts.tsv'

In [None]:
from google.colab import files
uploaded = files.upload()


KeyboardInterrupt: 