In [6]:
import json
import pandas as pd
from thefuzz import fuzz, process

# Read the combined dataset from disk and wrap it in a DataFrame
with open("Data/all-combined-dataset.json", "r", encoding="utf-8") as dataset_file:
    data = json.loads(dataset_file.read())
df = pd.DataFrame(data)

# Distinct, non-null values of the "perpetrator" column
names = df["perpetrator"].dropna().unique()

# A pair is reported only when its score is strictly greater than this (tweak as needed)
SIMILARITY_THRESHOLD = 85

# Pairwise fuzzy comparison: every name is scored against each name that
# comes after it. A name that has already been reported as the right-hand
# side of a match is skipped as a left-hand side, so a group of near-identical
# names is listed from a single representative rather than from each member.
matches = []
seen = set()

for position, candidate in enumerate(names):
    if candidate in seen:
        continue
    for later_name in names[position + 1:]:
        similarity = fuzz.token_sort_ratio(candidate, later_name)
        if similarity > SIMILARITY_THRESHOLD:
            matches.append((candidate, later_name, similarity))
            seen.add(later_name)

# Print one line per matched pair
for a, b, score in matches:
    print(f"{a} <--> {b} (Similarity: {score})")


David Brian Stone Jr. <--> David Brian Stone Sr. (Similarity: 95)
Brandon Clint Russell (1) <--> Brandon Clint Russell (2) (Similarity: 96)
Jeffrey Raphiel Clark, Jr. <--> Jeffrey Raphiel Clark Jr. (Similarity: 100)
William Garfield Bilbrough IV (1) <--> William Garfield Bilbrough IV (2) (Similarity: 97)
Patrik Jordan Mathews (2) <--> Patrik Jordan Mathews (1) (Similarity: 96)
Patrik Jordan Mathews (2) <--> Patrik Jordan Mathews (3) (Similarity: 96)
Brian Mark Lemley Jr. (2) <--> Brian Mark Lemley Jr. (1) (Similarity: 95)
Brian Mark Lemley Jr. (2) <--> Brian Mark Lemley Jr. (3) (Similarity: 95)
Steven Carrillo (2) <--> Steven Carrillo (1) (Similarity: 94)
Steven Carrillo (2) <--> Steven Carillo (Similarity: 90)
Robert Alvin Justus, Jr. <--> Robert Alvin Justus Jr. (Similarity: 100)
Justen Michael Watkins (1) <--> Justen Michael Watkins (2) (Similarity: 96)
Michael Allen Jones <--> Michael Alan Jones (Similarity: 92)
Hatchet M. Speed <--> Hatchet M . Speed (2) (Similarity: 94)


In [4]:
from thefuzz import fuzz, process

In [7]:
import json
import pandas as pd
from datetime import datetime

# Read the combined dataset (UTF-8 encoded JSON) from disk
with open("Data/all-combined-dataset.json", "r", encoding="utf-8") as dataset_file:
    data = json.loads(dataset_file.read())

# Build the working DataFrame from the parsed JSON
df = pd.DataFrame(data)

# strptime format used by each known value of the "source" field.
_SOURCE_DATE_FORMATS = {
    "lone_actor": "%d/%m/%Y",       # day first
    "manual_entry": "%d/%m/%Y",     # day first
    "accelerationism": "%m/%d/%Y",  # month first
}


def normalize_date(row):
    """Convert a row's source-specific date string to ISO ``YYYY-MM-DD``.

    Parameters
    ----------
    row : mapping-like
        Anything indexable by ``"date"`` and ``"source"`` (a dict, or a
        pandas Series as passed by ``DataFrame.apply(..., axis=1)``).

    Returns
    -------
    The date formatted as ``YYYY-MM-DD`` when the source is one of the known
    sources and the date string parses with that source's format. In every
    other case (unknown source, non-string date, unparseable date) the
    original ``row["date"]`` value is returned unchanged.

    Notes
    -----
    A missing ``"date"`` or ``"source"`` key is not handled here; the lookup
    error propagates to the caller.
    """
    raw_date = row["date"]
    source = row["source"]

    # Only string sources can name a known format; this also keeps unhashable
    # values (lists, dicts) away from the dict lookup.
    date_format = _SOURCE_DATE_FORMATS.get(source) if isinstance(source, str) else None
    if date_format is None:
        return raw_date  # Leave untouched if source unknown

    # Missing or non-text dates (None, NaN, numbers) cannot be parsed.
    if not isinstance(raw_date, str):
        return raw_date

    try:
        # Surrounding whitespace would otherwise make strptime reject a valid date.
        parsed = datetime.strptime(raw_date.strip(), date_format)
        return parsed.strftime("%Y-%m-%d")  # Convert to ISO format
    except ValueError:
        # Parsing failed (wrong layout, impossible date, already ISO, ...):
        # keep the original value rather than guessing.
        return raw_date

# Run normalize_date over every row and overwrite the "date" column with the result
normalized_dates = df.apply(normalize_date, axis=1)
df["date"] = normalized_dates

# Write the updated frame out as an indented JSON array of records
df.to_json(
    "dates-normalized.json",
    orient="records",
    indent=2,
)

print("Dates have been normalized to YYYY-MM-DD format and saved to 'dates-normalized.json'")


Dates have been normalized to YYYY-MM-DD format and saved to 'dates-normalized.json'
