In [32]:
import pandas as pd
import re
import os
import pandas as pd
import tqdm.notebook as tqdm
import os
import requests
from bs4 import BeautifulSoup
from zipfile import ZipFile
from io import BytesIO

In [20]:
# Read the CAMEO codes from the text file
cameo = pd.read_csv("cameo.txt", delimiter=':', header=None)

cameo.columns = ['EventType', 'EventDesc']
cameo['EventDesc'] = cameo['EventDesc'].str.strip()
cameo

Unnamed: 0,EventType,EventDesc
0,1,Make public statement
1,10,Make statement
2,11,Decline comment
3,12,Make pessimistic comment
4,13,Make optimistic comment
...,...,...
311,202,Engage in mass killings
312,203,Engage in ethnic cleansing
313,204,Use weapons of mass destruction
314,2041,"Use chemical, biological, or radiological weapons"


SELECT
  MonthYear,
  EventCode,
  QuadClass,
  GoldsteinScale,
  AvgTone,
  ActionGeo_CountryCode,
  Actor1Geo_CountryCode,
  Actor1Geo_ADM1Code,
  SOURCEURL
FROM
  `gdelt-bq.full.events`
WHERE
  (Actor1Geo_CountryCode = "US" OR ActionGeo_CountryCode = "US")
  AND MonthYear >= 201501
  AND MonthYear <= 202106


In [None]:
# Set your destination folder
data_dir = 'data'
os.makedirs(data_dir, exist_ok=True)

# GDELT events archive URL
base_url = "http://data.gdeltproject.org/events/"

# Step 1: Get the list of ZIP files
response = requests.get(base_url)
soup = BeautifulSoup(response.content, 'html.parser')

# Step 2: Filter links that end with .zip
zip_links = [
    a['href']
    for a in soup.find_all('a', href=True)
    if a['href'].endswith('.zip') and a['href'].startswith('2')
]

# OPTIONAL: limit number of files to download (for testing)
zip_links = zip_links[:1]  # e.g., first 10 files

# Step 3: Download and extract each ZIP file
for link in zip_links:
    zip_url = base_url + link
    print(f'Downloading: {zip_url}')
    r = requests.get(zip_url)
    
    if r.status_code == 200:
        with ZipFile(BytesIO(r.content)) as z:
            # Assume there's only 1 CSV file per zip
            for file_name in z.namelist():
                print(f'  Extracting: {file_name}')
                z.extract(file_name, data_dir)
    else:
        print(f'  Failed to download: {zip_url} (Status code {r.status_code})')

Downloading: http://data.gdeltproject.org/events/20250513.export.CSV.zip
  Extracting: 20250513.export.CSV


In [None]:
def is_article_id(part):
    # Check if the part starts with 'article_' followed by a UUID and optional .html extension
    article_uuid_pattern = re.compile(
        r'^article_'
        r'[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}'
        r'(\.html)?$',
        re.IGNORECASE
    )
    return bool(article_uuid_pattern.match(part))

def extract_title_from_url(url):
    # Split the URL into parts, stripping any leading/trailing slashes
    parts = url.strip('/').split('/')
    # Iterate from the end to find the first article ID part
    for i in tqdm.tqdm(reversed(range(len(parts)))):
        part = parts[i]
        if is_article_id(part):
            # Found an article ID, check if there's a previous part for the title
            if i > 0:
                title_part = parts[i-1]
                return title_part.replace('-', ' ').strip()
            else:
                continue  # No previous part, keep looking
    # If no article ID found, take the last part and remove any file extension
    last_part = parts[-1]
    if '.' in last_part:
        last_part = last_part.split('.')[0]
    return last_part.replace('-', ' ').strip()

csv_files = [f for f in os.listdir(data_dir) if f.endswith('.csv')]

df_all = pd.DataFrame()
# column_dict = {
#     0: 'event_id',
#     1: 'SQLDATE',
#     2: 'MonthYear',
#     3: 'Year',
#     4: 'FractionDate',
#     5: 'Actor1Code',
#     6: 'Actor1Name',
#     7: 'Actor1CountryCode',
#     8: 'Actor1KnownGroupCode',
#     9: 'Actor1EthnicCode',
#     10: 'Actor1Religion1Code',
#     11: 'Actor1Religion2Code',
#     12: 'Actor1Type1Code',
#     13: 'Actor1Type2Code',
#     14: 'Actor1Type3Code',
#     15: 'Actor2Code',
#     16: 'Actor2Name',
#     17: 'Actor2CountryCode',
#     18: 'Actor2KnownGroupCode',
#     19: 'Actor2EthnicCode',
#     20: 'Actor2Religion1Code',
#     21: 'Actor2Religion2Code',
#     22: 'Actor2Type1Code',
#     23: 'Actor2Type2Code',
#     24: 'Actor2Type3Code',
#     25: 'IsRootEvent',
#     26: 'EventCode',
#     27: 'EventBaseCode',
#     28: 'EventRootCode',
#     29: 'QuadClass',
#     30: 'GoldsteinScale',
#     31: 'NumMentions',
#     32: 'NumSources',
#     33: 'NumArticles',
#     34: 'AvgTone',
#     35: 'Actor1Geo_Type',
#     36: 'Actor1Geo_FullName',
#     37: 'Actor1Geo_CountryCode',
#     38: 'Actor1Geo_ADM1Code',
#     39: 'Actor1Geo_Lat',
#     40: 'Actor1Geo_Long',
#     41: 'Actor1Geo_FeatureID',
#     42: 'Actor2Geo_Type',
#     43: 'Actor2Geo_FullName',
#     44: 'Actor2Geo_CountryCode',
#     45: 'Actor2Geo_ADM1Code',
#     46: 'Actor2Geo_Lat',
#     47: 'Actor2Geo_Long',
#     48: 'Actor2Geo_FeatureID',
#     49: 'ActionGeo_Type',
#     50: 'ActionGeo_FullName',
#     51: 'ActionGeo_CountryCode',
#     52: 'ActionGeo_ADM1Code',
#     53: 'ActionGeo_Lat',
#     54: 'ActionGeo_Long',
#     55: 'ActionGeo_FeatureID',
#     56: 'date',
#     57: 'url',
# }

# Iterate over CSV files, format the DF, and extract title from URL
for csv_file in csv_files:
    file_path = os.path.join(data_dir, csv_file)
    print(f'Importing {csv_file}')
    
    df = pd.read_csv(file_path, delimiter=',', low_memory=False)
    #df = df.rename(columns=column_dict)
    #df = df[column_dict.values()]

    if 'SOURCEURL' in df.columns:  
        df['url_title'] = df['SOURCEURL'].apply(extract_title_from_url)
    else:
        print(f"Warning: 'url' not found for file {csv_file}")
    
    # Append results to the df_all DataFrame
    df_all = pd.concat([df_all, df], ignore_index=True)

# Clean the data by removing duplicates
df_all_clean = df_all.drop_duplicates(subset=['MonthYear', 'url_title']).copy()

# Map the EventCode column in df_all_clean to get the corresponding event descriptions
df_all_clean['EventCode'] = df_all_clean['EventCode'].astype(int)
cameo_dict = dict(zip(cameo['EventType'], cameo['EventDesc']))
df_all_clean['EventDesc'] = df_all_clean['EventCode'].map(cameo_dict)

print('---------------------------------')
print(f"Total rows after cleaning: {len(df_all_clean)}")

df_all_clean

  0%|          | 0/1 [00:00<?, ?it/s]

Importing bq-202106.csv
---------------------------------
Total rows after cleaning: 270864


Unnamed: 0,MonthYear,EventCode,QuadClass,GoldsteinScale,AvgTone,ActionGeo_CountryCode,Actor1Geo_CountryCode,Actor1Geo_ADM1Code,SOURCEURL,url_title,EventDesc
0,202106,831,2,5.0,3.612479,US,US,USAR,https://www.kasu.org/2022-06-12/arkansas-state...,arkansas state university chancellor finalists...,Accede to demands for change in leadership
1,202106,72,2,8.3,-1.870324,FR,US,US,https://newsrnd.com/news/2022-06-12-sipri-annu...,2022 06 12 sipri annual report nuclear powers...,Provide military aid
2,202106,194,4,-10.0,0.813008,US,US,US,https://govmatters.tv/us-army-military-trainin...,us army military training technology weapons s...,Fight with artillery and tanks
3,202106,81,2,5.0,-2.711324,US,US,USIL,https://www.breitbart.com/local/2022/06/11/pol...,police kim foxxs husband alleges she slapped h...,Ease administrative sanctions
5,202106,138,3,-7.0,-6.174558,US,SY,SY13,https://www.jpost.com/middle-east/iran-news/ar...,article 709189,Threaten with military force
...,...,...,...,...,...,...,...,...,...,...,...
1189169,202106,193,4,-10.0,-8.449074,UK,US,US,https://www.sfgate.com/news/article/Second-tee...,Second teenager charged over London shooting o...,Fight with small arms and light weapons
1189187,202106,193,4,-10.0,1.556553,US,US,USDC,https://www.washingtontimes.com/news/2021/jun/...,billionaire jeff bezos will ride on the first ...,Fight with small arms and light weapons
1189203,202106,193,4,-10.0,2.487562,US,US,USDC,https://www.urdupoint.com/en/world/us-governme...,us government extends shelf life of jj coron 1...,Fight with small arms and light weapons
1189206,202106,193,4,-10.0,-4.034235,US,US,USDC,https://www.europesun.com/news/269862657/china...,china support for myanmar military govt grows,Fight with small arms and light weapons


In [None]:
# Save the cleaned data to a CSV file
df_all_clean.to_csv('cleaned_data.csv', index=False)

In [26]:
# Test on the first 10 rows
df_all_clean_mini = df_all_clean.head(10).copy()
df_all_clean_mini.shape

(10, 60)

In [28]:
from newsfeed.utils import fulltext as ft
import time
import tqdm.notebook as tqdm
import sys
import contextlib
import signal

def timeout_handler(signum, frame):
    raise TimeoutError("Request timed out")

# Create empty lists to store titles and full_text
titles = []
full_texts = []
successful_downloads = 0
failed_downloads = 0

# Get total number of entries
total_entries = len(df_all_clean)

# Loop through each row in df_all_clean with improved progress tracking
for index, row in tqdm.tqdm(df_all_clean.iterrows(), total=total_entries, leave=True):
    with contextlib.redirect_stdout(None):
        url = row['url']
        start_time = time.time()  # Define start_time for each iteration
        
        try:
            # Set timeout alarm
            signal.signal(signal.SIGALRM, timeout_handler)
            signal.alarm(15)  # 15 seconds timeout
            
            # Download and parse article
            article = ft.download(url=url)
            article.download()
            article.parse()
            
            # Clear the alarm
            signal.alarm(0)
            
            # Extract title and full text
            titles.append(article.title)
            full_texts.append(article.text)
            successful_downloads += 1
            
        except Exception as e:
            print(f"Error processing URL: {e}")
            # Add placeholder values for failures
            titles.append(None)
            full_texts.append(None)
            failed_downloads += 1
        
    # Add a small delay to avoid overloading servers
    time.sleep(0.5)
    # Check if this iteration took too long, but continue to next article rather than breaking
    if time.time() - start_time > 15:
        print(f"Timeout reached for URL: {url}")


# Add the new columns to df_all_clean
df_all_clean['title'] = titles
df_all_clean['full_text'] = full_texts

# Print summary statistics
print(f"Download complete: {successful_downloads} successful, {failed_downloads} failed out of {total_entries} articles")

df_all_clean

  0%|          | 0/28454 [00:00<?, ?it/s]

KeyboardInterrupt: 

# === Global Terrorism Dataset (Target Variable) ===

In [None]:
# Load the Global Terrorism Dataset
attacks1 = pd.read_excel('terrorismdb_2020.xlsx')
attacks2 = pd.read_excel('terrorismdb_2021.xlsx')

In [None]:
# Filter attacks1 for year 2020 and country code 217
attacks1_filtered = attacks1[attacks1['iyear'] == 2020]
attacks1_filtered = attacks1_filtered[attacks1_filtered['country'] == 217]
attacks2_filtered = attacks2[attacks2['country'] == 217]

# Concatenate the filtered datasets
attacks = pd.concat([attacks1_filtered, attacks2_filtered], ignore_index=True)
attacks = attacks[['iyear', 'imonth', 'country', 'provstate', 'targtype1_txt']].copy()

# Clean up to free memory
del attacks1, attacks2, attacks1_filtered, attacks2_filtered

# Create a mapping of state names to ADM1 codes
state_to_code = {
    'Alabama': 'USAL',
    'Alaska': 'USAK',
    'Arizona': 'USAZ',
    'Arkansas': 'USAR',
    'California': 'USCA',
    'Colorado': 'USCO',
    'Connecticut': 'USCT',
    'Delaware': 'USDE',
    'District of Columbia': 'USDC',
    'Florida': 'USFL',
    'Georgia': 'USGA',
    'Hawaii': 'USHI',
    'Idaho': 'USID',
    'Illinois': 'USIL',
    'Indiana': 'USIN',
    'Iowa': 'USIA',
    'Kansas': 'USKS',
    'Kentucky': 'USKY',
    'Louisiana': 'USLA',
    'Maine': 'USME',
    'Maryland': 'USMD',
    'Massachusetts': 'USMA',
    'Michigan': 'USMI',
    'Minnesota': 'USMN',
    'Mississippi': 'USMS',
    'Missouri': 'USMO',
    'Montana': 'USMT',
    'Nebraska': 'USNE',
    'Nevada': 'USNV',
    'New Hampshire': 'USNH',
    'New Jersey': 'USNJ',
    'New Mexico': 'USNM',
    'New York': 'USNY',
    'North Carolina': 'USNC',
    'North Dakota': 'USND',
    'Ohio': 'USOH',
    'Oklahoma': 'USOK',
    'Oregon': 'USOR',
    'Pennsylvania': 'USPA',
    'Rhode Island': 'USRI',
    'South Carolina': 'USSC',
    'South Dakota': 'USSD',
    'Tennessee': 'USTN',
    'Texas': 'USTX',
    'Utah': 'USUT',
    'Vermont': 'USVT',
    'Virginia': 'USVA',
    'Washington': 'USWA',
    'West Virginia': 'USWV',
    'Wisconsin': 'USWI',
    'Wyoming': 'USWY',
    'Puerto Rico': 'USPR'
}

# Apply the mapping to create the new column
attacks['Actor1Geo_ADM1Code'] = attacks['provstate'].map(state_to_code)

attacks

Unnamed: 0,iyear,imonth,country,provstate,targtype1_txt,Actor1Geo_ADM1Code
0,2020,1,217,Delaware,Abortion Related,USDE
1,2020,1,217,Florida,Private Citizens & Property,USFL
2,2020,1,217,California,Religious Figures/Institutions,USCA
3,2020,1,217,Tennessee,Telecommunication,USTN
4,2020,1,217,California,Religious Figures/Institutions,USCA
...,...,...,...,...,...,...
124,2021,5,217,New York,Private Citizens & Property,USNY
125,2021,5,217,Maryland,Police,USMD
126,2021,5,217,Tennessee,Government (General),USTN
127,2021,6,217,Colorado,Police,USCO


transform this dataframe (attacks) to have the structure: columns should be all the ADM1 codes of states. Rows should be 1 if 

In [68]:
# Group the data by state code and month/year, then count occurrences
attack_counts = attacks.groupby(['Actor1Geo_ADM1Code', 'iyear', 'imonth']).size().reset_index(name='attack_count')

pivot_attacks = attack_counts.pivot_table(
    index=['iyear', 'imonth'], 
    columns='Actor1Geo_ADM1Code', 
    values='attack_count',
    fill_value=0
)

pivot_attacks = pivot_attacks.sort_index()

pivot_attacks.columns.name = None
pivot_attacks.index.names = ['Year', 'Month']
pivot_attacks

Unnamed: 0_level_0,Unnamed: 1_level_0,USAL,USAR,USAZ,USCA,USCO,USCT,USDC,USDE,USFL,USGA,...,USNY,USOH,USOR,USPA,USSC,USTN,USTX,USVA,USWA,USWI
Year,Month,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
2020,1,0.0,0.0,0.0,2.0,0.0,0.0,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2020,2,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,...,2.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2020,3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2020,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2020,5,0.0,0.0,1.0,4.0,0.0,0.0,1.0,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
2020,6,0.0,0.0,0.0,2.0,0.0,2.0,0.0,0.0,0.0,3.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0
2020,7,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,2.0
2020,8,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2020,9,0.0,0.0,0.0,2.0,0.0,0.0,1.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0
2020,10,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [69]:
pivot_attacks.to_csv('attacks_by_state.csv')