Ex 2A


In [None]:
!git clone https://github.com/Zainab1317/FASDH25-portfolio2.git

Cloning into 'FASDH25-portfolio2'...
remote: Enumerating objects: 4427, done.[K
remote: Counting objects: 100% (12/12), done.[K
remote: Compressing objects: 100% (10/10), done.[K
remote: Total 4427 (delta 4), reused 2 (delta 2), pack-reused 4415 (from 3)[K
Receiving objects: 100% (4427/4427), 19.37 MiB | 15.22 MiB/s, done.
Resolving deltas: 100% (35/35), done.


In [None]:
import re #imports regular expression to find text patterns
import os # imports to enable interaction with file system
import pandas as pd #imports to hadle tabular data and export tsv file

def write_tsv(rows, column_list, path): # function writes a list of data rows intotsv file using panadas
    df = pd.DataFrame(rows, columns=column_list) #list of rows is converted into panads DataFrame
    df.to_csv(path, sep="\t", index=False) #DataFrame is written into tsv

#Setup paths
repo_path = "FASDH25-portfolio2" #define path to repository
folder = "FASDH25-portfolio2/articles" #defining folder where articles are present
gazetteer_path = "FASDH25-portfolio2/gazetteers/geonames_gaza_selection.tsv" #define path and load gazetteer from the tsv file having place names and alternate names

#Read Gazetteer File
with open(gazetteer_path, encoding="utf-8") as file:
    data = file.read()

patterns = {} #an empty dictionary is created for each place names and a count for matches
rows = data.split("\n") #split gazetteer data in new lines to get eeach row on file

for row in rows[1:]:  # Skip header because the pattern starts from next row
    columns = row.split("\t") #seprates each column in tsv by tabs
    asciiname = columns[0] #ensures that first column has names for the place
    name_variants = [asciiname] #lists name variants consisting of the standard name
    alternate_names = columns[5].strip() #gets the alternate names from the 6th column which is counted as the 5th column, if present

    if alternate_names:
      alternate_list = alternate_names.split(",") #splits the alternate names with comma and gets list of other names
      for name in alternate_list: #loops through each alternate name in the list
        name_variants.append(name.strip()) #.strip will remove whitespace from alternate names and .append will add alternate names to the list if prenest

# build a regex pattern that will work to find all names and match diffrent varianats of the place names aswell
    regex_pattern = "|".join (re.escape(name) for name in name_variants) #using re.escape to escape any special characters in place names and "|" is used for alternation
    patterns[asciiname] = {"pattern": regex_pattern, "count": 0} #includes all names and their variants with numbers

mentions_per_month = {} #dictionary stores how many times each name was mentioned
war_start_date = "2023-10-07" #set the date to filt

for filename in os.listdir(folder):
    date_str = filename.split("_")[0]
    if date_str < war_start_date:
        continue

    file_path = f"{folder}/{filename}"
    with open(file_path, encoding="utf-8") as file:
        text = file.read()

    for place in patterns:
        pattern = patterns[place]["pattern"]
        matches = re.findall(pattern, text, re.IGNORECASE)
        count = len(matches)
        patterns[place]["count"] += count

        month_str = date_str[:7]

        if place not in mentions_per_month:
            mentions_per_month[place] = {}
        if month_str not in mentions_per_month[place]:
            mentions_per_month[place][month_str] = 0

        mentions_per_month[place][month_str] += count

#Print Output
for place in mentions_per_month:
    print(f'"{place}": {{')
    month_list = list(mentions_per_month[place].keys())
    for month in month_list:
        count = mentions_per_month[place][month]
        if month != month_list[-1]:
            print(f'    "{month}": {count},')
        else:
            print(f'    "{month}": {count}')
    print("},")

#Prepare Rows and Write TSV
output_rows = []
for place in mentions_per_month:
    for month in mentions_per_month[place]:
        count = mentions_per_month[place][month]
        output_rows.append((place, month, count))

write_tsv(output_rows, ["place", "month", "count"], "regex_counts.tsv")

"Jabalya": {
    "2023-10": 37,
    "2024-02": 7,
    "2023-12": 46,
    "2024-01": 12,
    "2023-11": 103,
    "2024-04": 3,
    "2024-03": 8
},
"Bayt Lahya": {
    "2023-10": 11,
    "2024-02": 3,
    "2023-12": 8,
    "2024-01": 2,
    "2023-11": 14,
    "2024-04": 2,
    "2024-03": 1
},
"Jabalia": {
    "2023-10": 33,
    "2024-02": 7,
    "2023-12": 44,
    "2024-01": 12,
    "2023-11": 102,
    "2024-04": 2,
    "2024-03": 8
},
"Bayt Hanun": {
    "2023-10": 19,
    "2024-02": 3,
    "2023-12": 4,
    "2024-01": 1,
    "2023-11": 17,
    "2024-04": 10,
    "2024-03": 7
},
"Khan Younis": {
    "2023-10": 74,
    "2024-02": 63,
    "2023-12": 173,
    "2024-01": 74,
    "2023-11": 132,
    "2024-04": 25,
    "2024-03": 40
},
"An Nusayrat": {
    "2023-10": 22,
    "2024-02": 6,
    "2023-12": 26,
    "2024-01": 11,
    "2023-11": 16,
    "2024-04": 8,
    "2024-03": 9
},
"Al Burayj": {
    "2023-10": 3,
    "2024-02": 2,
    "2023-12": 17,
    "2024-01": 5,
    "2023-11": 15,
    "

Ex 2B

In [None]:
!pip install stanza

Collecting stanza
  Downloading stanza-1.10.1-py3-none-any.whl.metadata (13 kB)
Collecting emoji (from stanza)
  Downloading emoji-2.14.1-py3-none-any.whl.metadata (5.7 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.3.0->stanza)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.3.0->stanza)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.3.0->stanza)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.3.0->stanza)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.3.0->stanza)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata 

In [None]:
import stanza
# Download the language model:
stanza.download("en")

# Create the NLP pipeline for tokenization,multi-word token expansion and named entity recognition and specifying the language:
nlp = stanza.Pipeline(lang="en", processors='tokenize,mwt,ner')

#import additional necessary libraries
import os
import pandas as pd
import re



#setting path to folder that contains required text articles
folder = "/content/FASDH25-portfolio2/articles"

#empty variable for jan 2024 articles
jan_articles=0
# Dictionary to store place name counts
place_counts = {}
# loop through the entities in the folder:
for filename in os.listdir(folder):
  # loop through files from Jan 2024 make sure that the articles end with .txt
  if "2024-01" in filename:
    jan_articles += 1
    # make path
    path = os.path.join(folder, filename)
    #open and read files
    with open(path, encoding="utf-8") as file:
        text = file.read()
#Using the pipeline for named entities focusing on Geopolitical entities and place based entities
# 1) Chatgpt corrected code, varable consistency
    doc = nlp(text)
    for sentence in doc.sentences:
      for entity in sentence.ents:
        #specifying the type of named entites we want (geographic/place-based entities)
        if entity.type in ["GPE", "LOC"]:
          place = entity.text.strip()
          # Increment the count for each place
          place_counts[place] = place_counts.get(place, 0) + 1

#cleaning the named entities
# 2) Help taken from Chatgpt to fix error
# Dictionary to store cleaned and normalized place names with counts
clean_counts = {}

for place, count in place_counts.items():
    #Removes possessive endings like 's
    place = re.sub(r"['`]s\b", "", place)
    #Removes punctuation from entites
    place = re.sub(r"[^\w\s]", "", place)
    #removes "the" from entities
    place = re.sub(r"^the\s+", "", place, flags=re.IGNORECASE)
    # Combine counts for places with equivalent cleaned names
    clean_counts[place] = clean_counts.get(place, 0) + count

# printing normalized place names with their total counts
print(clean_counts)

filename = "ner_counts.tsv"
# write results to a TSV file with columns "place" and "count"
with open("ner_counts.tsv", mode= "w", encoding= "utf-8") as file:
  # create a header of the tsv files:
  header = "place\tcount\n"
  file.write(header)
  # loop through the places dictionary, creating a row for all items in the dictionary
  for place, count in clean_counts.items():
    row = f"{place}\t{count}\n"
    file.write(row)

#open file and print normalised results
with open("/content/ner_counts.tsv", encoding="utf-8") as file:
  print(file.read())


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json:   0%|  …

INFO:stanza:Downloaded file to /root/stanza_resources/resources.json
INFO:stanza:Downloading default packages for language: en (English) ...


Downloading https://huggingface.co/stanfordnlp/stanza-en/resolve/v1.10.0/models/default.zip:   0%|          | …

INFO:stanza:Downloaded file to /root/stanza_resources/en/default.zip
INFO:stanza:Finished downloading models and saved to /root/stanza_resources
INFO:stanza:Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json:   0%|  …

INFO:stanza:Downloaded file to /root/stanza_resources/resources.json
INFO:stanza:Loading these models for language: en (English):
| Processor | Package                   |
-----------------------------------------
| tokenize  | combined                  |
| mwt       | combined                  |
| ner       | ontonotes-ww-multi_charlm |

INFO:stanza:Using device: cpu
INFO:stanza:Loading: tokenize
INFO:stanza:Loading: mwt
INFO:stanza:Loading: ner
INFO:stanza:Done loading processors!


{'Morocco': 13, 'Israel': 1594, 'Gaza': 1605, 'Rabat': 3, 'United States': 160, 'United Arab Emirates': 14, 'UAE': 7, 'Bahrain': 11, 'Sudan': 3, 'US': 717, 'Western Sahara': 4, 'Washington': 60, 'Tel Aviv': 49, 'Algeria': 7, 'Marrakesh': 1, 'Moroccos': 1, 'Maghreb': 1, 'Ukraine': 47, 'Saudi Arabia': 39, 'California': 3, 'West Bank': 160, 'Dena': 1, 'Israels': 31, 'Oakland': 1, 'South Africa': 200, 'Jordan': 42, 'Jerusalem': 26, 'East Jerusalem': 23, 'Egypt': 43, 'Qatar': 64, 'Kuala Lumpur': 4, 'Malaysia': 8, 'Palestine': 124, 'Indonesias': 1, 'Jakarta': 2, 'Johannesburg': 4, 'London': 17, 'Paris': 8, 'Vienna': 1, 'Berlin': 5, 'Amman': 6, 'Washington DC': 7, 'UK': 95, 'Manchester': 1, 'Yemen': 182, 'India': 50, 'Hyderabad': 1, 'Colombos Kollupitiya': 1, 'Namibia': 10, 'Germany': 31, 'Palestinian Territories': 1, 'Sweden': 3, 'Iran': 206, 'Kerman': 6, 'Lebanon': 175, 'Bethlehem': 4, 'Nairoukh': 1, 'China': 28, 'Italy': 10, 'Spain': 7, 'Turkey': 25, 'Shawawra': 1, 'Hague': 39, 'South Afri

Ex 3

In [None]:
!pip install stanza



In [None]:
import requests
import time

geonames_username = "zainab128"

def get_coordinates(place, username=geonames_username, fuzzy=0, timeout=1):

  # wait a short while, so that we don't overload the server:
  time.sleep(timeout)
  # make the API call:
  url = "http://api.geonames.org/searchJSON?"
  params = {"q": place, "username": username, "fuzzy": fuzzy, "maxRows": 1, "isNameRequired": True}
  response = requests.get(url, params=params)
  # convert the response into a dictionary:
  results = response.json()
  print(results)
  # get the first result:
  try:
    result = results["geonames"][0]
    return {"latitude": result["lat"], "longitude": result["lng"]}
  except (IndexError, KeyError):
    print("No results found for your API call", response.request.url)

import csv

filename = "ner_gazetteer.tsv"

with open(filename, mode="w", encoding="utf-8") as file:
  header = "place\tlatitude\tlongitude\n"
  file.write(header)
  with open("ner_counts.tsv", encoding="utf-8") as infile:
    reader = csv.DictReader(infile, delimiter='\t')
    for row in reader:
      place_name = row['place']
      coordinates = get_coordinates(place_name)
      if coordinates:
        latitude = coordinates["latitude"]
        longitude = coordinates["longitude"]
        row_data = f"{place_name}\t{latitude}\t{longitude}\n"
        file.write(row_data)
      else:
           row_data = f"{place_name}\tNA\tNA\n"
           file.write(row_data)


{'totalResultsCount': 10, 'geonames': [{'adminCode1': '00', 'lng': '-10', 'geonameId': 2542007, 'toponymName': 'Kingdom of Morocco', 'countryId': '2542007', 'fcl': 'A', 'population': 36029138, 'countryCode': 'MA', 'name': 'Morocco', 'fclName': 'country, state, region,...', 'countryName': 'Morocco', 'fcodeName': 'independent political entity', 'adminName1': '', 'lat': '28.5', 'fcode': 'PCLI'}]}
{'totalResultsCount': 33, 'geonames': [{'adminCode1': '00', 'lng': '34.75', 'geonameId': 294640, 'toponymName': 'State of Israel', 'countryId': '294640', 'fcl': 'A', 'population': 8883800, 'countryCode': 'IL', 'name': 'Israel', 'fclName': 'country, state, region,...', 'countryName': 'Israel', 'fcodeName': 'independent political entity', 'adminName1': '', 'lat': '31.5', 'fcode': 'PCLI'}]}
{'totalResultsCount': 40, 'geonames': [{'adminCode1': 'GZ', 'lng': '34.46672', 'geonameId': 281133, 'toponymName': 'Gaza', 'countryId': '6254930', 'fcl': 'P', 'population': 410000, 'countryCode': 'PS', 'name': 'G

Ex 4a

In [None]:
# Importing the necessary library
import plotly.express as px
import pandas as pd

# Load frequency data
freq_df = pd.read_csv("regex_counts.tsv", sep="\t")

# Load geocoded coordinates
geo_df = pd.read_csv("ner_gazetteer.tsv", sep="\t")


# merge the two tables, using the common column “place”
merged_df = pd.merge(geo_df, freq_df, on="place")


# Plotly animated geo map
fig = px.scatter_geo(
    merged_df,
    lat="latitude",
    lon="longitude",
    size="count",
    color="place",
    hover_name="place",
    animation_frame="month",
    projection="natural earth",
    size_max=20
)
# Save interactive HTML map
fig.write_html("regex_map.html")
# Show the figure
fig.show()

Ex 4b

In [14]:
# Importing the necessary library
import plotly.express as px
import pandas as pd

# Load NER frequency data (January 2024 only)
ner_df = pd.read_csv("ner_counts.tsv", sep="\t")

# Load geocoded coordinates
geo_df = pd.read_csv("ner_gazetteer.tsv", sep="\t")

# Merge NER data with coordinates
merged_df = pd.merge(ner_df, geo_df, on="place")

# Drop rows where coordinates or counts are missing
merged_df = merged_df.dropna(subset=["latitude", "longitude", "count"])

# Create static and interactive maps
fig = px.scatter_geo(
    merged_df,
    lat="latitude",
    lon="longitude",
    size="count",
    color="place",
    hover_name="place",
    projection="natural earth",
    size_max=20,
    title="NER Place Frequencies - January 2024"
)

# Save interactive HTML map
fig.write_html("ner_map.html")

# Show the figure
fig.show()