In [11]:
import os
import re
import nltk
import time
import spacy
import random
import dateparser
from datetime import datetime
from nltk import sent_tokenize
from striprtf.striprtf import rtf_to_text

nlp = spacy.load("en_core_web_sm")  

In [None]:
# download nltk resources
# nltk.download('punkt', download_dir='/Users/andreaparra/Virtual_envs/python3.12/lib/nltk_data')
# nltk.download('maxent_ne_chunker', download_dir='/Users/andreaparra/Virtual_envs/python3.12/lib/nltk_data')
# nltk.download('words', download_dir='/Users/andreaparra/Virtual_envs/python3.12/lib/nltk_data')
# nltk.download('averaged_perceptron_tagger_eng', download_dir='/Users/andreaparra/Virtual_envs/python3.12/lib/nltk_data')
# nltk.download('maxent_ne_chunker_tab',download_dir='/Users/andreaparra/Virtual_envs/python3.12/lib/nltk_data')


### Testing Code

In [None]:
test_file = "data/nexis uni/Files(486)/No Headline In Original(48).RTF"

with open(test_file, "r", encoding="utf-8") as f:
    rtf_content = f.read()
    plain_text = rtf_to_text(rtf_content)
    
    pattern = re.compile(HIGH_RECALL_DATE_PATTERN, re.IGNORECASE | re.VERBOSE)
    date_matches = pattern.findall(plain_text)
    print(plain_text)
    test = extract_raid_dates(date_matches, "2020-01-01")
    print(date_matches)
    print(test)

In [4]:
# frist lets sue the regex approach to extract phrases around mention of raids: 

# strict pattern
pat1 = re.compile(
    r"\bimmigration(?:\s+\w+){0,3}\s+raid(?:ed)?\b",
    re.IGNORECASE
)

# high recall
pat2 = re.compile(
    r"(?i)\b(raid(?:ed)?)\b.{0,80}\b(immigration|ICE|undocumented|detain|arrest|federal agents|workers|farmworkers)\b"
    r"|"
    r"\b(immigration|ICE|undocumented|detain|arrest|federal agents|workers|farmworkers)\b.{0,80}\b(raid(?:ed)?)\b"
)



patterns = [pat1, pat2]

def get_raid_sentences(text):
    sentences = sent_tokenize(text)
    raid_sents = []
    for s in sentences:
        if any(p.search(s) for p in patterns):
            raid_sents.append(s)
    return raid_sents

In [5]:
# regex for Month day, year (e.g., "September 8, 2025")
DATE_PATTERN = re.compile(
    r"(January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{1,2},\s+\d{4}"
)

HIGH_RECALL_DATE_PATTERN = r"""
\b
(
  # Month-name formats: September 8, 2025
  (?:jan|feb|mar|apr|may|jun|jul|aug|sep|sept|oct|nov|dec)[a-z]*
  [\s.-]+
  \d{1,2}(?:st|nd|rd|th)?
  (?:,\s*|\s+)
  \d{4}
  |
  # Day Month-name formats: 8 September 2025
  \d{1,2}(?:st|nd|rd|th)?
  [\s.-]+
  (?:jan|feb|mar|apr|may|jun|jul|aug|sep|sept|oct|nov|dec)[a-z]*
  (?:,\s*|\s+)
  \d{4}
  |
  # ISO-like numeric: 2025-09-08 or 2025/9/8
  \d{4}[-/]\d{1,2}[-/]\d{1,2}
  |
  # Generic numeric: 09/08/2025, 9.8.25, etc.
  \d{1,2}[/.-]\d{1,2}[/.-]\d{2,4}
)
\b
"""

# TODO: include this in processing 
def resolve_date(expr, pub_date):
    # uses publication date as reference to get an absolute date

    dt = dateparser.parse(
        expr,
        settings={
            "RELATIVE_BASE": datetime(pub_date.year, pub_date.month, pub_date.day),
            "PREFER_DATES_FROM": "past",
        },
    )
    return dt.date() if dt else None

def extract_raid_dates(dates, pub_date_str):
    """
    text: full article text (plain text)
    pub_date_str: e.g. 'September 12, 2025'
    returns: list of {'expr': ..., 'date': ...}
    """
    pub_date = dateparser.parse(pub_date_str).date()
    

    results = []
    for date_str in dates:
        abs_date = resolve_date(date_str, pub_date)
        if abs_date:
            results.append({"expr": date_str, "date": abs_date.isoformat()})
            

        # Look into using this bit of code to get even more info out. 
        # for token in doc:
        #     if token.text.lower() in ["monday","tuesday","wednesday","thursday","friday","saturday","sunday"] \
        #        and token.ent_type_ != "DATE":
        #         expr = token.text
        #         abs_date = resolve_date(expr, pub_date)
        #         if abs_date:
        #             results.append({"expr": expr, "date": abs_date.isoformat(), "sentence": s})

    # deduplicate by date
    seen = set()
    unique = []
    for r in results:
        key = (r["date"], r["expr"])
        if key not in seen:
            seen.add(key)
            unique.append(r)
            
    return unique


In [6]:
#Geopolitical Entity, location and faciliyt
LOCATION_LABELS = {"GPE", "LOC", "FAC"} 


# this fucniton I didn't really make, it uses spacy to extract 
# locations in sentences that mention 'raid'
def extract_locations_from_doc(doc):
    locations = set()

    for sent in doc.sents:
        # only focus on sentences that include 'raid'
        if not any(tok.lemma_.lower() == "raid" for tok in sent):
            continue

        # 1) NER-based locations in this sentence
        for ent in sent.ents:
            if ent.label_ in LOCATION_LABELS:
                locations.add(ent.text.strip())

        # 2) Dependency-based: raid + prepositions (in/at/near/on)
        for tok in sent:
            if tok.lemma_.lower() == "raid":
                for child in tok.children:
                    if child.dep_ == "prep" and child.text.lower() in {"in", "at", "near", "on"}:
                        pobj = next((c for c in child.children if c.dep_ == "pobj"), None)
                        if pobj is not None:
                            # collect the full span of the pobj subtree
                            subtree_tokens = list(pobj.subtree)
                            span_text = doc[subtree_tokens[0].i : subtree_tokens[-1].i + 1].text
                            locations.add(span_text.strip())

    return list(locations)


In [7]:
ARREST_LEMMAS = {"arrest", "detain"}
PEOPLE_NOUNS = {"people", "persons", "workers", "employees",
                "immigrants", "farmworkers", "detainees", "individuals"}

APPROX_WORDS = {"dozens", "scores", "hundreds", "thousands"}

def parse_int_safe(text):
    # remove commas and try to cast
    try:
        return int(text.replace(",", ""))
    except ValueError:
        return None

def extract_arrest_counts_from_doc(doc):
    results = []

    for sent in doc.sents:
        # only if sentence mentions arrest/detain/raid-ish things
        if not any(tok.lemma_.lower() in ARREST_LEMMAS for tok in sent):
            continue

        for tok in sent:
            if tok.lemma_.lower() in ARREST_LEMMAS and tok.pos_ == "VERB":
                # 1) passive subject: "360 people were arrested"
                for child in tok.children:
                    if child.dep_ in {"nsubjpass", "dobj", "obj"} and child.lemma_.lower() in PEOPLE_NOUNS:
                        # look for numeric modifiers of that noun
                        for gchild in child.children:
                            if gchild.dep_ == "nummod":
                                val = gchild.text
                                approx = val.lower() in APPROX_WORDS
                                num_val = parse_int_safe(val) if not approx else None
                                results.append({
                                    "value": num_val if num_val is not None else val,
                                    "approx": approx,
                                    "sentence": sent.text.strip()
                                })

                # 2) "arrested about 360 people" – number attached to the verb
                # look for NUM children and then a people noun
                num_child = None
                people_child = None
                for child in tok.children:
                    if child.pos_ == "NUM":
                        num_child = child
                    if child.lemma_.lower() in PEOPLE_NOUNS:
                        people_child = child
                if num_child is not None and people_child is not None:
                    val = num_child.text
                    approx = val.lower() in APPROX_WORDS
                    num_val = parse_int_safe(val) if not approx else None
                    results.append({
                        "value": num_val if num_val is not None else val,
                        "approx": approx,
                        "sentence": sent.text.strip()
                    })

        # 3) phrases like "netted hundreds of arrests"
        for tok in sent:
            if tok.lemma_.lower() == "net" and tok.pos_ == "VERB":
                # look for "hundreds of arrests"
                for child in tok.children:
                    if child.pos_ == "NUM" or child.text.lower() in APPROX_WORDS:
                        # check that "arrests" appears somewhere nearby
                        if any(t.lemma_.lower() == "arrest" and t.pos_ == "NOUN" for t in sent):
                            val = child.text
                            approx = True  # "hundreds" etc.
                            results.append({
                                "value": val,
                                "approx": approx,
                                "sentence": sent.text.strip()
                            })

    return results

In [8]:
def analyze_raid_snippet(text):
    doc = nlp(text)
    locations = extract_locations_from_doc(doc)
    arrests = extract_arrest_counts_from_doc(doc)
    return {
        "locations": locations,
        "arrest_counts": arrests
    }


TODO:

1. Add timer and move on to next file if it gets stuck
2. Write data to file as we go to avoid data loss
3. Figure out way to process resulting data file
    - Consolidate data into specific raids
        - Extract likely date (date distribution?)
        - Geocode
        - Themes(?)
    - Assign value of "virality" of a given raid (how represented it is in the news)
4. Validation
    - Select some articles with and without raids identified and validate manually
    - See if mapped raids correspoond to general trends of ICE arrests/abductions

In [None]:
start_time = time.perf_counter()
# Your code to be timed
time.sleep(1) # Simulate some work
end_time = time.perf_counter()
elapsed_time = end_time - start_time
print(f"Elapsed time: {elapsed_time:.4f} seconds")

In [15]:
batch = "96-216"
dir_path = "data/nexis uni/pg" + batch

PROCESS_DATES = True
OUTPUT_FILE = f"data/nexis uni/processed/processed_data_{batch}.txt"


data = []
files_processed = []
errors = []
idx = 0

with open(OUTPUT_FILE, "w", encoding="utf-8") as out_f:
    
    for file in os.listdir(dir_path):    
        if "doclist.RTF" in file:
            continue
        # only process RTF files
        if ".RTF" in file:
            clean_name = re.sub(r"\(\d+\)\.RTF$", ".RTF", file, flags=re.IGNORECASE)
            if (clean_name in files_processed) and not (clean_name.startswith("No Headline In Original")):
                print("Skipping already processed file:", file)
                continue
            file_path = os.path.join(dir_path, file)
            print(f"Processing file {idx}/{len(os.listdir(dir_path))}: {file}")
            with open(file_path, "r", encoding="utf-8") as f:
                data_record = {}
                data_record["file_name"] = file
                rtf_content = f.read()
                plain_text = rtf_to_text(rtf_content)
                
                lines = [line.strip() for line in plain_text.splitlines() if line.strip()]
                
                # get all dats in the articel (we will then process based on pubdate)
                pattern = re.compile(HIGH_RECALL_DATE_PATTERN, re.IGNORECASE | re.VERBOSE)
                date_matches = pattern.findall(plain_text)
                
                for i, line in enumerate(lines):
                    if line:
                        # stop parsing (based on text structure)
                        if line.startswith("Classification"):
                            break
                        # extract date
                        if line.startswith("Copyright"):
                            # the date should be the line immediately before this
                            if i == 0:
                                date_line = None
                            else:  
                                date_line = lines[i - 1]
                                # extract date pattern
                                match = DATE_PATTERN.search(date_line)
                                if match:
                                    date_str = match.group(0)
                            # recrod date
                            data_record["pub-date"] = date_str
                        else:
                        # assume fist date found is publication date
                            data_record["pub-date"] = date_matches[0] if date_matches else None
                        
                        data_record["raid_dates"] = date_matches
                        if data_record["pub-date"] and PROCESS_DATES:
                            # for later cause it is really slow
                            # raid_dates = extract_raid_dates(date_matches, data_record["pub-date"])
                            # if more than 10 dates in file, shuffle and pick first 10
                            if len(date_matches) > 10:
                                random.shuffle(date_matches)
                                date_matches = date_matches[:10]
                            try:
                                raid_dates = [dateparser.parse(d).date() for d in date_matches if dateparser.parse(d)]
                                data_record["raid_dates"] = raid_dates
                            except AttributeError:
                                errors.append(file)
                            
                        # extract raid sentences
                        raid_sentence = get_raid_sentences(line)
                        if raid_sentence:
                            # add new found sentenses
                            found_sentences = data_record["raid_sentences"] if "raid_sentences" in data_record else []
                            found_sentences.extend(raid_sentence)
                            data_record["raid_sentences"] = found_sentences
                            
                            # analyze each sentence
                            raid_analysis = analyze_raid_snippet(raid_sentence[0])
                            found_raids = data_record["raid_analysis"] if "raid_analysis" in data_record else []
                            found_raids.append(raid_analysis)
                            data_record["raid_analysis"] = found_raids
                # only keep recrods where raids were found
                if "raid_analysis" in data_record:
                    out_f.write(str(data_record) + "\n")
                    data.append(data_record)
                # document all files that were processed
                files_processed.append(file)
                idx += 1

    
data

Processing file 0/495: BAY BUZZ_ POLITICAL NEWS OF TAMPA BAY.RTF
Processing file 1/495: Immigration blitz nets nearly 100 meatworkers FRAUD RACKET.RTF
Processing file 2/495: The Home Secretary is a walking disaster.RTF
Processing file 3/495: Religion News in Brief(2).RTF
Processing file 4/495: Intercepté en pleine mer(2).RTF
Processing file 5/495: MORE TROUBLE FOR SMITH AS MOLE SPARKS MYSTERY OF DOZEN LEAKS.RTF
Processing file 6/495: Record heroin haul made in Britain and SA(2).RTF
Processing file 7/495: Fed_ Eighteen illegal workers found hiding in Vic , WA.RTF
Processing file 8/495: Jurors find former Agriprocessors slaughterhouse manager not guilty of child labour violations(2).RTF
Processing file 9/495: Two Canadians tied to controversial U.S. mosque arrested(2).RTF
Processing file 10/495: Illegal immigrants cultivated cannabis in Cheltenham.RTF
Processing file 11/495: Israel to Deport Aid Ship Activists.RTF
Processing file 12/495: Saturday Review_ ARTS_ Nowhere to hide_ His criti

[{'file_name': 'Religion News in Brief(2).RTF',
  'pub-date': 'March 10, 2010',
  'raid_dates': [datetime.date(2010, 3, 10), datetime.date(2010, 3, 11)],
  'raid_sentences': ['POSTVILLE, Iowa (AP) _ Ten undocumented aliens who were arrested and charged in the May 2008 raid at the former Agriprocessors kosher slaughterhouse in Postville have learned that they will be deported by March 31.'],
  'raid_analysis': [{'locations': ['the former Agriprocessors', 'Postville'],
    'arrest_counts': []}]},
 {'file_name': 'Jurors find former Agriprocessors slaughterhouse manager not guilty of child labour violations(2).RTF',
  'pub-date': 'June 7, 2010',
  'raid_dates': [datetime.date(2010, 6, 7), datetime.date(2010, 6, 17)],
  'raid_sentences': ['Rubashkin had been charged following a May 2008 immigration raid in which 389 workers were arrested.',
   'The alleged violations occurred between September 2007 and May 2008, when the plant was raided by federal immigration agents.'],
  'raid_analysis': 

In [None]:
print(f"Files with errors: {len(errors)}")
print(f"Files processed: {len(files_processed)}")
print(f"Records with raids found: {len(data)}")