# Stanza for NER Q 2B

## Install stanza
the library from stanfor for NLP and NER

In [1]:
!pip install stanza

Collecting stanza
  Downloading stanza-1.10.1-py3-none-any.whl.metadata (13 kB)
Collecting emoji (from stanza)
  Downloading emoji-2.14.1-py3-none-any.whl.metadata (5.7 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.3.0->stanza)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.3.0->stanza)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.3.0->stanza)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.3.0->stanza)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.3.0->stanza)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata 

## Import library and download language model

After installing it, we import stanza into our notebook.

In [2]:
import stanza

## Creating the pipeline

Download the English language model and build the pipeline (we specify that it should only tokenize the text, separate multiword tokens and perform Named Entity Recognition):


In [3]:
# Download the language model:
stanza.download("en")

# Create the pipeline, specifying the language:
nlp = stanza.Pipeline(lang="en")

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json:   0%|  …

INFO:stanza:Downloaded file to /root/stanza_resources/resources.json
INFO:stanza:Downloading default packages for language: en (English) ...


Downloading https://huggingface.co/stanfordnlp/stanza-en/resolve/v1.10.0/models/default.zip:   0%|          | …

INFO:stanza:Downloaded file to /root/stanza_resources/en/default.zip
INFO:stanza:Finished downloading models and saved to /root/stanza_resources
INFO:stanza:Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json:   0%|  …

INFO:stanza:Downloaded file to /root/stanza_resources/resources.json
INFO:stanza:Loading these models for language: en (English):
| Processor    | Package                   |
--------------------------------------------
| tokenize     | combined                  |
| mwt          | combined                  |
| pos          | combined_charlm           |
| lemma        | combined_nocharlm         |
| constituency | ptb3-revised_charlm       |
| depparse     | combined_charlm           |
| sentiment    | sstplus_charlm            |
| ner          | ontonotes-ww-multi_charlm |

INFO:stanza:Using device: cuda
INFO:stanza:Loading: tokenize
INFO:stanza:Loading: mwt
INFO:stanza:Loading: pos
INFO:stanza:Loading: lemma
INFO:stanza:Loading: constituency
INFO:stanza:Loading: depparse
INFO:stanza:Loading: sentiment
INFO:stanza:Loading: ner
INFO:stanza:Done loading processors!


##cloning the portfolio repository

In [4]:
!git clone https://github.com/aamnamemonhyder/FASDH25-portfolio2.git

Cloning into 'FASDH25-portfolio2'...
remote: Enumerating objects: 4358, done.[K
remote: Counting objects: 100% (3/3), done.[K
remote: Compressing objects: 100% (3/3), done.[K
remote: Total 4358 (delta 1), reused 0 (delta 0), pack-reused 4355 (from 2)[K
Receiving objects: 100% (4358/4358), 17.77 MiB | 15.71 MiB/s, done.
Resolving deltas: 100% (7/7), done.


#Defining file path and loop over files to find GPE entities

In [6]:
import os

Places={}

articles_folder='/content/FASDH25-portfolio2/articles'

for filename in os.listdir(articles_folder):
  if filename[:7]=='2024-01':
    file_path=os.path.join(articles_folder, filename)
    with open(file_path,'r', encoding='utf-8') as file:
      text=file.read()
      doc=nlp(text)
      for entity in doc.entities:
        if entity.type =='GPE':
          Places[entity.text]=Places.get(entity.text,0)+ 1
print(Places)

{'West Bank': 120, 'Dura': 2, 'Hebron': 10, 'Tulkarem': 2, 'Gaza Strip': 31, 'the West Bank': 40, 'Israel': 1593, 'Nablus': 5, 'the United States': 97, 'United Kingdom': 12, 'Yemen': 182, 'Iran': 206, 'Gaza': 1605, 'US': 706, 'Sanaa': 15, 'Saudi Arabia': 39, 'Aden': 3, 'Tel Aviv': 49, 'the Gaza Strip': 123, 'UK': 95, 'Palestine': 124, 'Marib': 3, 'the United Arab Emirates': 13, 'Turkey': 25, 'Jordan': 42, 'Qatar': 64, 'UAE': 7, 'Charleston': 1, 'South Carolina': 4, 'Gaza City': 31, 'Doha': 19, 'Hong Kong': 2, 'South Africa': 197, 'the State of Palestine': 1, 'Lebanon': 175, 'Hague': 6, 'Pretoria': 8, 'Uganda': 11, 'China': 28, 'Russia': 43, 'The Hague': 33, 'Kuwait': 2, 'Gaza’s': 16, 'Ukraine': 47, 'Canada': 42, 'Montreal': 1, 'Milton, Ontario': 1, 'Jabalia': 11, 'Israel’s': 31, 'Ottawa': 3, 'Egypt': 43, 'Rafah': 40, 'Toronto': 1, 'Calgary': 1, 'Afghanistan': 7, 'Austria': 3, 'Australia': 12, 'Finland': 3, 'Germany': 31, 'Italy': 10, 'Japan': 9, 'Netherlands': 14, 'Iceland': 1, 'Sweden

### Cleaning the placenames

1. Israel - is labelled as NORP, GPE
2. Gulf, ME, Africa, Gulf, Gulf of Aden, Red Sea, West are listed as LOC
3. Hezbollah's, Hamas's - as GPE

In [9]:
import re
CleanedPlaces={}
for name,count in Places.items():
  cleaned_name=name.lower().strip()
  cleaned_name=re.sub(r"'s$|’s$", '', cleaned_name)
  CleanedPlaces[cleaned_name]=CleanedPlaces.get(cleaned_name,0) +count
def get_count(item):
  return item[1]
sorted_places=sorted(CleanedPlaces.items(), key=get_count, reverse=True)
for place, count in sorted_places:
  print(f"{place}:{count}")

israel:1625
gaza:1621
us:706
iran:209
south africa:203
yemen:188
lebanon:178
the gaza strip:124
palestine:124
west bank:121
the united states:118
uk:95
beirut:87
syria:84
qatar:65
iraq:64
washington:62
tel aviv:51
india:50
ukraine:47
egypt:44
jordan:43
russia:43
canada:42
the west bank:40
rafah:40
united states:40
saudi arabia:39
gaza strip:34
the hague:33
gaza city:32
germany:31
the united kingdom:31
china:30
jerusalem:26
turkey:25
tehran:25
ramallah:24
pakistan:24
east jerusalem:23
khan younis:23
doha:19
jenin:19
london:17
damascus:17
belgium:16
sanaa:15
the united arab emirates:14
netherlands:14
france:14
deir el-balah:14
dc:14
britain:14
morocco:14
australia:13
erbil:13
united kingdom:12
uganda:12
strip:12
dearborn:12
michigan:12
iowa:12
akrotiri:12
jabalia:11
norway:11
u.s.:11
bahrain:11
nuseirat:11
hebron:10
italy:10
namibia:10
japan:9
switzerland:9
cyprus:9
west beirut:9
pretoria:8
paris:8
malaysia:8
manipur:8
mizoram:8
baghdad:8
haifa:8
greece:8
uae:7
afghanistan:7
spain:7
alge

### Make a TSV

In [14]:
import pandas as pd

def write_tsv(data, column_list,path):
  items=list(data.items())
  df=pd.DataFrame.from_records(items, columns=column_list)
  df.to_csv(path, sep='\t', index=False)

columns=['NER', 'COUNT']
Q2b_filename='ner_counts_aamna.tsv'
write_tsv(CleanedPlaces, columns, Q2b_filename)

### print tsv

In [16]:
with open(Q2b_filename, 'r', encoding='utf-8') as file:
  print(file.read())

NER	COUNT
west bank	121
dura	2
hebron	10
tulkarem	2
gaza strip	34
the west bank	40
israel	1625
nablus	5
the united states	118
united kingdom	12
yemen	188
iran	209
gaza	1621
us	706
sanaa	15
saudi arabia	39
aden	3
tel aviv	51
the gaza strip	124
uk	95
palestine	124
marib	3
the united arab emirates	14
turkey	25
jordan	43
qatar	65
uae	7
charleston	1
south carolina	4
gaza city	32
doha	19
hong kong	2
south africa	203
the state of palestine	1
lebanon	178
hague	6
pretoria	8
uganda	12
china	30
russia	43
the hague	33
kuwait	2
ukraine	47
canada	42
montreal	1
milton, ontario	1
jabalia	11
ottawa	3
egypt	44
rafah	40
toronto	1
calgary	1
afghanistan	7
austria	3
australia	13
finland	3
germany	31
italy	10
japan	9
netherlands	14
iceland	1
sweden	3
switzerland	9
romania	4
the united kingdom	31
washington, dc	4
jerusalem	26
gretna	2
louisiana	3
new orleans	5
@mirandacleland	1
east jerusalem	23
scotland	2
ireland	3
norway	11
#october7	2
syria	84
jenin	19
ramallah	24
abwein	1
qalqilya	2
jericho	1
u.s.	11
bahr