Ex 2A


In [1]:
!git clone https://github.com/Zainab1317/FASDH25-portfolio2.git

Cloning into 'FASDH25-portfolio2'...
remote: Enumerating objects: 4409, done.[K
remote: Counting objects: 100% (30/30), done.[K
remote: Compressing objects: 100% (27/27), done.[K
remote: Total 4409 (delta 12), reused 7 (delta 3), pack-reused 4379 (from 2)[K
Receiving objects: 100% (4409/4409), 19.31 MiB | 18.17 MiB/s, done.
Resolving deltas: 100% (24/24), done.


In [2]:
import re #imports regular expression to find text patterns
import os # imports to enable interaction with file system
import pandas as pd #imports to hadle tabular data and export tsv file

def write_tsv(rows, column_list, path): # function writes a list of data rows intotsv file using panadas
    df = pd.DataFrame(rows, columns=column_list) #list of rows is converted into panads DataFrame
    df.to_csv(path, sep="\t", index=False) #DataFrame is written into tsv

#Setup paths
repo_path = "FASDH25-portfolio2" #define path to repository
folder = "FASDH25-portfolio2/articles" #defining folder where articles are present
gazetteer_path = "FASDH25-portfolio2/gazetteers/geonames_gaza_selection.tsv" #define path and load gazetteer from the tsv file having place names and alternate names

#Read Gazetteer File
with open(gazetteer_path, encoding="utf-8") as file:
    data = file.read()

patterns = {} #an empty dictionary is created for each place names and a count for matches
rows = data.split("\n") #split gazetteer data in new lines to get eeach row on file

for row in rows[1:]:  # Skip header because the pattern starts from next row
    columns = row.split("\t") #seprates each column in tsv by tabs
    asciiname = columns[0] #ensures that first column has names for the place
    name_variants = [asciiname] #lists name variants consisting of the standard name
    alternate_names = columns[5].strip() #gets the alternate names from the 6th column which is counted as the 5th column, if present

    if alternate_names:
      alternate_list = alternate_names.split(",") #splits the alternate names with comma and gets list of other names
      for name in alternate_list: #loops through each alternate name in the list
        name_variants.append(name.strip()) #.strip will remove whitespace from alternate names and .append will add alternate names to the list if prenest

# build a regex pattern that will work to find all names and match diffrent varianats of the place names aswell
    regex_pattern = "|".join (re.escape(name) for name in name_variants) #using re.escape to escape any special characters in place names and "|" is used for alternation
    patterns[asciiname] = {"pattern": regex_pattern, "count": 0} #includes all names and their variants with numbers

mentions_per_month = {} #dictionary stores how many times each name was mentioned
war_start_date = "2023-10-07" #set the date to filt

for filename in os.listdir(folder):
    date_str = filename.split("_")[0]
    if date_str < war_start_date:
        continue

    file_path = f"{folder}/{filename}"
    with open(file_path, encoding="utf-8") as file:
        text = file.read()

    for place in patterns:
        pattern = patterns[place]["pattern"]
        matches = re.findall(pattern, text, re.IGNORECASE)
        count = len(matches)
        patterns[place]["count"] += count

        month_str = date_str[:7]

        if place not in mentions_per_month:
            mentions_per_month[place] = {}
        if month_str not in mentions_per_month[place]:
            mentions_per_month[place][month_str] = 0

        mentions_per_month[place][month_str] += count

#Print Output
for place in mentions_per_month:
    print(f'"{place}": {{')
    month_list = list(mentions_per_month[place].keys())
    for month in month_list:
        count = mentions_per_month[place][month]
        if month != month_list[-1]:
            print(f'    "{month}": {count},')
        else:
            print(f'    "{month}": {count}')
    print("},")

#Prepare Rows and Write TSV
output_rows = []
for place in mentions_per_month:
    for month in mentions_per_month[place]:
        count = mentions_per_month[place][month]
        output_rows.append((place, month, count))

write_tsv(output_rows, ["place", "month", "count"], "regex_counts.tsv")

"Jabalya": {
    "2023-10": 37,
    "2023-12": 46,
    "2024-02": 7,
    "2024-04": 3,
    "2024-03": 8,
    "2024-01": 12,
    "2023-11": 103
},
"Bayt Lahya": {
    "2023-10": 11,
    "2023-12": 8,
    "2024-02": 3,
    "2024-04": 2,
    "2024-03": 1,
    "2024-01": 2,
    "2023-11": 14
},
"Jabalia": {
    "2023-10": 33,
    "2023-12": 44,
    "2024-02": 7,
    "2024-04": 2,
    "2024-03": 8,
    "2024-01": 12,
    "2023-11": 102
},
"Bayt Hanun": {
    "2023-10": 19,
    "2023-12": 4,
    "2024-02": 3,
    "2024-04": 10,
    "2024-03": 7,
    "2024-01": 1,
    "2023-11": 17
},
"Khan Younis": {
    "2023-10": 74,
    "2023-12": 173,
    "2024-02": 63,
    "2024-04": 25,
    "2024-03": 40,
    "2024-01": 74,
    "2023-11": 132
},
"An Nusayrat": {
    "2023-10": 22,
    "2023-12": 26,
    "2024-02": 6,
    "2024-04": 8,
    "2024-03": 9,
    "2024-01": 11,
    "2023-11": 16
},
"Al Burayj": {
    "2023-10": 3,
    "2023-12": 17,
    "2024-02": 2,
    "2024-04": 0,
    "2024-03": 5,
    "2

Ex 2B

In [3]:
!pip install stanza

Collecting stanza
  Downloading stanza-1.10.1-py3-none-any.whl.metadata (13 kB)
Collecting emoji (from stanza)
  Downloading emoji-2.14.1-py3-none-any.whl.metadata (5.7 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.3.0->stanza)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.3.0->stanza)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.3.0->stanza)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.3.0->stanza)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.3.0->stanza)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata 

In [4]:
#import libraries and import language model
import re
import stanza
import os
stanza.download("en")
#create pipeline and specify language
nlp = stanza.Pipeline(lang="en", processors="tokenize,mwt,ner")
#path to the repository
!git clone https://github.com/Zainab1317/FASDH25-portfolio2.git

corpus = "/content/FASDH25-portfolio2/articles"

files = os.listdir(corpus)

jan_2024_files = []
#loop through the entities putting them in a separate list
for file in files:
    if file.startswith("2024-01"):
       jan_2024_files.append(file)
#opening and reading the file
place_counts = {}
for file in jan_2024_files:
    filepath = f"{corpus}/{file}"
    with open(filepath, encoding="utf8") as file:
        text = file.read()
#isolating required entities (gpes and locs)
# 1) Chatgpt corrected code, varable consistency
    doc = nlp(text)
    for sentence in doc.sentences:
      for entity in sentence.ents:
        if entity.type in ["GPE", "LOC", "FAC"]:
          place = entity.text.strip()
          place_counts[place] = place_counts.get(place, 0) + 1

#cleaning the named entities
# 2) Help taken from Chatgpt to fix error
clean_counts = {}

for place, count in place_counts.items():
    place = re.sub(r"['`]s\b", "", place)

    place = re.sub(r"[^\w\s]", "", place)

    place = re.sub(r"^the\s+", "", place, flags=re.IGNORECASE)

    clean_counts[place] = clean_counts.get(place, 0) + count

# Write cleaned and sorted data to a TSV file
items = [[place, count] for place, count in clean_counts.items()]

with open("ner_counts.tsv", "w", encoding="utf-8") as file:
    file.write("place\tcount\n")
    for item in items:
        file.write(f"{item[0]}\t{item[1]}\n")

# Read and print the TSV file contents
with open("/content/ner_counts.tsv", encoding="utf-8") as file:
    print(file.read())


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json:   0%|  …

INFO:stanza:Downloaded file to /root/stanza_resources/resources.json
INFO:stanza:Downloading default packages for language: en (English) ...


Downloading https://huggingface.co/stanfordnlp/stanza-en/resolve/v1.10.0/models/default.zip:   0%|          | …

INFO:stanza:Downloaded file to /root/stanza_resources/en/default.zip
INFO:stanza:Finished downloading models and saved to /root/stanza_resources
INFO:stanza:Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json:   0%|  …

INFO:stanza:Downloaded file to /root/stanza_resources/resources.json
INFO:stanza:Loading these models for language: en (English):
| Processor | Package                   |
-----------------------------------------
| tokenize  | combined                  |
| mwt       | combined                  |
| ner       | ontonotes-ww-multi_charlm |

INFO:stanza:Using device: cuda
INFO:stanza:Loading: tokenize
INFO:stanza:Loading: mwt
INFO:stanza:Loading: ner
INFO:stanza:Done loading processors!


fatal: destination path 'FASDH25-portfolio2' already exists and is not an empty directory.
place	count
Israel	1594
Gaza	1605
Palestine	124
United States	160
Welchs	1
US	717
Iraq	62
West	24
Global South	2
Qatar	64
Gulf	10
Egypt	43
East Jerusalem	23
Netanyahus	7
Gaza Strip	156
South Africa	200
Russia	43
Ukraine	47
China	28
South Africas	8
Malaysia	8
Turkey	25
Jordan	42
Bolivia	4
Maldives	1
Namibia	10
Pakistan	24
Columbia	3
Khan Younis	24
Middle East	102
Hague	39
Bangladesh	2
Comoros	2
Djibouti	4
Netherlands	14
United Kingdom	43
Myanmar	6
Beirut	84
Dahiyeh	6
Lebanon	175
Iran	206
Yemen	182
Beiruts Shatila	1
Red Sea	250
Africa	29
Suez Canal	26
Gulf of Aden	27
Cape of Good Hope	12
Singapore	2
Mediterranean	11
Indian Ocean	2
Europe	30
Asia	18
Spain	7
Canada	42
Australia	12
Britain	14
Germany	31
Italy	10
Switzerland	9
Finland	3
Estonia	1
Japan	9
Austria	3
Romania	4
Israels	31
West Bank	160
Syria	83
October7	2
Jerusalem	26
Dearborn	12
Michigan	12
Mackinac Island	1
Great Lakes	1
Lake Michigan	1


Ex 3

In [None]:
!pip install stanza



In [5]:
import requests
import time

geonames_username = "zainab128"

def get_coordinates(place, username=geonames_username, fuzzy=0, timeout=1):

  # wait a short while, so that we don't overload the server:
  time.sleep(timeout)
  # make the API call:
  url = "http://api.geonames.org/searchJSON?"
  params = {"q": place, "username": username, "fuzzy": fuzzy, "maxRows": 1, "isNameRequired": True}
  response = requests.get(url, params=params)
  # convert the response into a dictionary:
  results = response.json()
  print(results)
  # get the first result:
  try:
    result = results["geonames"][0]
    return {"latitude": result["lat"], "longitude": result["lng"]}
  except (IndexError, KeyError):
    print("No results found for your API call", response.request.url)

import csv

filename = "ner_gazetteer.tsv"

with open(filename, mode="w", encoding="utf-8") as file:
  header = "place\tlatitude\tlongitude\n"
  file.write(header)
  with open("ner_counts.tsv", encoding="utf-8") as infile:
    reader = csv.DictReader(infile, delimiter='\t')
    for row in reader:
      place_name = row['place']
      coordinates = get_coordinates(place_name)
      if coordinates:
        latitude = coordinates["latitude"]
        longitude = coordinates["longitude"]
        row_data = f"{place_name}\t{latitude}\t{longitude}\n"
        file.write(row_data)
      else:
           row_data = f"{place_name}\tNA\tNA\n"
      file.write(row_data)


{'totalResultsCount': 33, 'geonames': [{'adminCode1': '00', 'lng': '34.75', 'geonameId': 294640, 'toponymName': 'State of Israel', 'countryId': '294640', 'fcl': 'A', 'population': 8883800, 'countryCode': 'IL', 'name': 'Israel', 'fclName': 'country, state, region,...', 'countryName': 'Israel', 'fcodeName': 'independent political entity', 'adminName1': '', 'lat': '31.5', 'fcode': 'PCLI'}]}
{'totalResultsCount': 40, 'geonames': [{'adminCode1': 'GZ', 'lng': '34.46672', 'geonameId': 281133, 'toponymName': 'Gaza', 'countryId': '6254930', 'fcl': 'P', 'population': 410000, 'countryCode': 'PS', 'name': 'Gaza', 'fclName': 'city, village,...', 'adminCodes1': {}, 'countryName': 'Palestine', 'fcodeName': 'seat of a first-order administrative division', 'adminName1': 'Gaza Strip', 'lat': '31.50161', 'fcode': 'PPLA'}]}
{'totalResultsCount': 49, 'geonames': [{'adminCode1': '00', 'lng': '35.20329', 'geonameId': 6254930, 'toponymName': 'Palestine', 'countryId': '6254930', 'fcl': 'A', 'population': 45690

KeyboardInterrupt: 

Ex 4a

In [None]:
# Importing the necessary library
import plotly.express as px
import pandas as pd

# Load frequency data
freq_df = pd.read_csv("regex_counts.tsv", sep="\t")

# Load geocoded coordinates
geo_df = pd.read_csv("ner_gazetteer.tsv", sep="\t")

# The expected merge column is 'place'
# Check if geo_df has 'name' instead of 'place'
if 'name' in geo_df.columns and 'place' not in geo_df.columns:
    geo_df.rename(columns={'name': 'place'}, inplace=True)

# merge the two tables, using the common column “asciiname”
merged_df = pd.merge(geo_df, freq_df, on="place")


# Plotly animated geo map
fig = px.scatter_geo(
    merged_df,
    lat="latitude",
    lon="longitude",
    size="count",
    color="place",
    hover_name="place",
    animation_frame="month",
    projection="natural earth",
    size_max=20
)
# Save interactive HTML map
fig.write_html("regex_map.html")
# Show the figure
fig.show()

FileNotFoundError: [Errno 2] No such file or directory: 'regex_counts.tsv'

Ex 4b

In [None]:
# Load NER frequency data (January 2024 only)
ner_df = pd.read_csv("ner_counts.tsv", sep="\t")

# Load geocoded coordinates
geo_df = pd.read_csv("ner_gazetteer.tsv", sep="\t")

# Merge the two tables using the common column "place"
merged_df = pd.merge(ner_df, geo_df, on="place")

merged_df['count'] = pd.to_numeric(merged_df['count'], errors='coerce')

# Drop rows with NaN values in 'count', 'latitude', or 'longitude'
merged_df = merged_df.dropna(subset=['count', 'latitude', 'longitude'])

# Create static and interactive maps
fig = px.scatter_geo(
    merged_df,
    lat="latitude",
    lon="longitude",
    size="count",
    color="place",
    hover_name="place",
    projection="natural earth",
    size_max=20,
    title="NER Place Frequencies - January 2024"
)

# Save interactive HTML map
fig.write_html("ner_map.html")

# Show the figure
fig.show()