In [1]:
import requests
from bs4 import BeautifulSoup
import pyperclip
import pandas as pd
import re
import time
import json
from glob import glob
import ast
from tqdm import tqdm
import os
import random
import string
from tqdm import tqdm
from crewai_tools import ScrapeWebsiteTool
from selenium.webdriver.support.ui import WebDriverWait
import xml.etree.ElementTree as ET
from pandarallel import pandarallel
pandarallel.initialize(nb_workers = 16, progress_bar=True)

INFO: Pandarallel will run on 16 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


## MAS data

In [None]:
# Base URL of the MAS site; it is prepended to the hrefs scraped from the enforcement table.
website = 'https://www.mas.gov.sg'

In [None]:
# Listing page of MAS enforcement actions (presumably `rows=All` returns every
# action on a single page -- confirm against the live site).
url = 'https://www.mas.gov.sg/regulation/enforcement/enforcement-actions/?page=1&q=&sort=&rows=All#MasXbeEnforcementActionKeyword'
# Browser-like User-Agent sent with the request.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
}
# Bound the wait and fail on HTTP errors, so a hung connection cannot stall the
# notebook and an error page is never parsed as if it were the listing.
response = requests.get(url, headers=headers, timeout=30)
response.raise_for_status()

In [None]:
# Parse the downloaded listing page with the built-in 'html.parser' backend.
soup = BeautifulSoup(response.content, 'html.parser')


In [None]:
# The enforcement listing is read from the first <table> on the page.
table = soup.find_all('table')
if not table:
    raise ValueError("No <table> found on the MAS enforcement actions page")

# Column names come from the header cells. They get their own variable because
# `headers` holds the HTTP request headers, which get_all_mass_offenses still
# passes to requests.get; overwriting it with a list breaks those requests.
column_names = [header.get_text(strip=True) for header in table[0].find_all('th')]
column_names.append('url_link')

# Extract rows
rows = []
for row in table[0].find_all('tr')[1:]:  # Skip the header row
    cells = row.find_all('td')
    if not cells:
        # Rows without data cells carry no enforcement action.
        continue
    row_data = [cell.get_text(strip=True) for cell in cells]
    # Always store the row's first hyperlink as the LAST value, so it lines up
    # with the trailing 'url_link' column whichever cell contains the anchor.
    anchor = row.find('a', href=True)
    row_data.append(anchor['href'] if anchor is not None else None)
    rows.append(row_data)

# Create a pandas DataFrame
df = pd.DataFrame(rows, columns=column_names)
# Prefix the site base URL; rows without a link stay empty instead of becoming
# a bogus "<website>None" URL.
df["url_link"] = df["url_link"].apply(lambda x: f"{website}{x}" if x else None)

In [None]:
def get_all_mass_offenses(url, request_headers=None, timeout=30):
    """Download one MAS enforcement-action page and return its text content.

    Args:
        url: Absolute URL of the page to fetch.
        request_headers: Optional mapping of HTTP headers. When omitted, a
            browser-like User-Agent is used, so the function does not depend
            on any module-level variable.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The page text with runs of newlines collapsed to one newline and runs
        of spaces collapsed to one space. An empty string is returned when
        `url` is missing/not a string, or when the request fails, so that one
        bad page does not abort a whole (parallel) apply.
    """
    # Missing links (None / NaN / "") cannot be fetched.
    if not isinstance(url, str) or not url:
        return ""

    if request_headers is None:
        request_headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
        }

    try:
        response = requests.get(url, headers=request_headers, timeout=timeout)
        # Do not store the body of an HTTP error page as if it were the offense text.
        response.raise_for_status()
    except requests.RequestException as error:
        print(f"Failed to fetch {url}: {error}")
        return ""

    soup = BeautifulSoup(response.content, 'html.parser')
    text = soup.get_text()
    text = re.sub(r'\n+', '\n', text.strip("\t"))
    return re.sub(r' +', ' ', text.strip("\t")).strip()

In [None]:
def _offense_text_for(record):
    """Fetch the page text for the URL stored in one row's 'url_link' field."""
    return get_all_mass_offenses(record['url_link'])


# Fetch the detail page of every row through pandarallel, then persist the table.
df["economic_offense"] = df.parallel_apply(_offense_text_for, axis=1)
df.to_csv("mas_enforcement_actions.csv", index=False)

## UNSC sanctioned list

In [None]:
# Load and parse the XML file
file_path = "consolidated_unsc.xml"  # Change this to your file path
tree = ET.parse(file_path)
root = tree.getroot()

# Extract all individuals: one flat record per <INDIVIDUAL> element.
individuals = []
for individual in root.findall('.//INDIVIDUAL'):
    # Keep only the birthplace parts that are present, so a missing city or
    # state no longer leaves dangling ", " separators (e.g. ", , Iraq").
    birthplace_parts = [
        individual.findtext(f"INDIVIDUAL_PLACE_OF_BIRTH/{tag}", "").strip()
        for tag in ("CITY", "STATE_PROVINCE", "COUNTRY")
    ]
    data = {
        "DATAID": individual.findtext("DATAID", ""),
        "VERSIONNUM": individual.findtext("VERSIONNUM", ""),
        "FIRST_NAME": individual.findtext("FIRST_NAME", ""),
        "SECOND_NAME": individual.findtext("SECOND_NAME", ""),
        "THIRD_NAME": individual.findtext("THIRD_NAME", ""),
        "FOURTH_NAME": individual.findtext("FOURTH_NAME", ""),
        "UN_LIST_TYPE": individual.findtext("UN_LIST_TYPE", ""),
        "REFERENCE_NUMBER": individual.findtext("REFERENCE_NUMBER", ""),
        "LISTED_ON": individual.findtext("LISTED_ON", ""),
        "NAME_ORIGINAL_SCRIPT": individual.findtext("NAME_ORIGINAL_SCRIPT", ""),
        "COMMENTS1": individual.findtext("COMMENTS1", ""),
        # Multi-valued fields are flattened into one comma-separated string.
        "NATIONALITY": ", ".join([n.text for n in individual.findall("NATIONALITY/VALUE") if n.text]),
        "LIST_TYPE": ", ".join([l.text for l in individual.findall("LIST_TYPE/VALUE") if l.text]),
        # Only the YEAR child of the first date-of-birth element is read.
        "DATE_OF_BIRTH": individual.findtext("INDIVIDUAL_DATE_OF_BIRTH/YEAR", ""),
        "PLACE_OF_BIRTH": ", ".join(part for part in birthplace_parts if part),
    }
    individuals.append(data)

# Create DataFrame
df = pd.DataFrame(individuals)
# Make sure the output directory exists before writing into it.
os.makedirs("./csv", exist_ok=True)
df.to_csv("./csv/unsc_sanctioned_individuals.csv", index=False)

In [None]:
import xml.etree.ElementTree as ET
import pandas as pd

# Parse the consolidated UNSC XML file.
file_path = "consolidated_unsc.xml"  # Change this to your file path
tree = ET.parse(file_path)
root = tree.getroot()

# Child tags copied verbatim into same-named columns (in output column order).
entity_text_tags = (
    "DATAID",
    "VERSIONNUM",
    "FIRST_NAME",
    "SECOND_NAME",
    "THIRD_NAME",
    "FOURTH_NAME",
    "UN_LIST_TYPE",
    "REFERENCE_NUMBER",
    "LISTED_ON",
    "NAME_ORIGINAL_SCRIPT",
    "COMMENTS1",
)
# Tags whose <VALUE> children are flattened into one comma-separated string.
entity_list_tags = ("NATIONALITY", "LIST_TYPE")

# Build one flat record per <ENTITY> element.
entities = []
for entity in root.findall('.//ENTITY'):
    data = {tag: entity.findtext(tag, "") for tag in entity_text_tags}
    for tag in entity_list_tags:
        value_texts = (value.text for value in entity.findall(f"{tag}/VALUE"))
        # filter(None, ...) drops <VALUE> elements that have no text.
        data[tag] = ", ".join(filter(None, value_texts))
    entities.append(data)

# Tabulate the records and write them out.
df_entities = pd.DataFrame(entities)
df_entities.to_csv("./csv/unsc_sanctioned_individuals_entities.csv", index=False)

## OFAC list

In [None]:
file_path = "./csv/sdn.xml"  # Replace with your actual file path
tree = ET.parse(file_path)
root = tree.getroot()

# Namespace used by the elements of the export; the "ns:" prefix in the path
# expressions below is resolved through this mapping.
namespace = {'ns': 'https://sanctionslistservice.ofac.treas.gov/api/PublicationPreview/exports/XML'}

# Build one flat record per <sdnEntry> directly under the root.
sdn_entries = []
for entry in root.findall("ns:sdnEntry", namespace):
    program_names = [
        program.text
        for program in entry.findall("ns:programList/ns:program", namespace)
        if program.text
    ]

    # Each alias is rendered as "<type> (<category>): <lastName>".
    alias_labels = []
    for aka in entry.findall("ns:akaList/ns:aka", namespace):
        aka_type = aka.findtext("ns:type", "", namespace)
        aka_category = aka.findtext("ns:category", "", namespace)
        aka_name = aka.findtext("ns:lastName", "", namespace)
        alias_labels.append(f"{aka_type} ({aka_category}): {aka_name}")

    # Each address is rendered as "<address1>, <city>, <postalCode>, <country>".
    address_labels = []
    for addr in entry.findall("ns:addressList/ns:address", namespace):
        address_fields = [
            addr.findtext(f"ns:{tag}", "", namespace)
            for tag in ("address1", "city", "postalCode", "country")
        ]
        address_labels.append(", ".join(address_fields))

    sdn_entries.append({
        "UID": entry.findtext("ns:uid", "", namespace),
        "LAST_NAME": entry.findtext("ns:lastName", "", namespace),
        "SDN_TYPE": entry.findtext("ns:sdnType", "", namespace),
        "PROGRAMS": ", ".join(program_names),
        "AKA_LIST": "; ".join(alias_labels),
        "ADDRESSES": "; ".join(address_labels),
    })

# Tabulate the records and write them out.
df_sdn_entries = pd.DataFrame(sdn_entries)
df_sdn_entries.to_csv("./csv/ofac_list.csv", index=False)

## Interpol

In [None]:
# Load the first Interpol export (CSV) into a DataFrame.
# NOTE(review): the variable name `inidviduals` is a misspelling of "individuals";
# it is left unchanged because the following cell refers to it by this name.
inidviduals = pd.read_csv("./csv/interpol_1.csv")
# Disabled: json.loads expects JSON text, not a file path, so this call would
# need to read the file first (e.g. json.load on an open file) before it can work.
# json.loads("./csv/interpol_2.json")


In [None]:
# Number of rows read from the Interpol CSV.
len(inidviduals)

## SEC data

### Get all the litigations

In [None]:
# from crewai_tools import ScrapeWebsiteTool

# # To enable scraping any website it finds during its execution
# tool = ScrapeWebsiteTool()

# # Initialize the tool with the website URL,
# # so the agent can only scrape the content of the specified website
# tool = ScrapeWebsiteTool(website_url='https://www.sec.gov/enforcement-litigation/litigation-releases/lr-26233')

# # Extract the text from the site
# text = tool.run()
# print(text)

In [None]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager

# Configure Selenium with Chrome (Headless Mode)
chrome_options = Options()
chrome_options.add_argument("--headless")  # Run in headless mode (no UI)
chrome_options.add_argument("--disable-gpu")
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--window-size=1920x1080")
chrome_options.add_argument("start-maximized")
chrome_options.add_argument("disable-infobars")
chrome_options.add_argument("--disable-dev-shm-usage")
chrome_options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36")

# chrome_options.add_argument("--headless")  # Run without GUI
# chrome_options.add_argument("--disable-gpu")
# chrome_options.add_argument("--no-sandbox")
# chrome_options.add_argument("--disable-dev-shm-usage")  
# chrome_options.add_argument("--disable-blink-features=AutomationControlled")  # Prevent detection
# chrome_options.add_argument("--disable-extensions")
# chrome_options.add_argument("--disable-infobars")
# chrome_options.add_argument("--disable-popup-blocking")
# chrome_options.add_argument("--disable-background-networking")

In [2]:
# Build one row per calendar month from January 1996 to December 2025, attach
# the SEC litigation-releases listing URL for that month, then drop the last
# ten rows so that the table ends at February 2025.
sec_year_month = (
    pd.DataFrame({"year": list(range(1996, 2026))})
    # Give every year the list 1..12, then explode it into one row per month.
    .assign(month=lambda years: [list(range(1, 13)) for _ in range(len(years))])
    .explode("month")
    .reset_index(drop=True)
    .assign(
        page_url=lambda months: [
            f"https://www.sec.gov/enforcement-litigation/litigation-releases?populate=&year={year}&month={month}"
            for year, month in zip(months["year"], months["month"])
        ]
    )
    .iloc[:-10]
)

In [6]:
# Remove the column-width limit so the full page URLs are shown untruncated.
# This changes a global pandas display option for the rest of the session.
pd.set_option('display.max_colwidth', None);
# Show the last ten (year, month, URL) rows.
sec_year_month.tail(10)

Unnamed: 0,year,month,page_url
340,2024,5,https://www.sec.gov/enforcement-litigation/litigation-releases?populate=&year=2024&month=5
341,2024,6,https://www.sec.gov/enforcement-litigation/litigation-releases?populate=&year=2024&month=6
342,2024,7,https://www.sec.gov/enforcement-litigation/litigation-releases?populate=&year=2024&month=7
343,2024,8,https://www.sec.gov/enforcement-litigation/litigation-releases?populate=&year=2024&month=8
344,2024,9,https://www.sec.gov/enforcement-litigation/litigation-releases?populate=&year=2024&month=9
345,2024,10,https://www.sec.gov/enforcement-litigation/litigation-releases?populate=&year=2024&month=10
346,2024,11,https://www.sec.gov/enforcement-litigation/litigation-releases?populate=&year=2024&month=11
347,2024,12,https://www.sec.gov/enforcement-litigation/litigation-releases?populate=&year=2024&month=12
348,2025,1,https://www.sec.gov/enforcement-litigation/litigation-releases?populate=&year=2025&month=1
349,2025,2,https://www.sec.gov/enforcement-litigation/litigation-releases?populate=&year=2025&month=2


In [None]:
# Module-level ChromeDriver service; get_page_details below passes it to every
# webdriver.Chrome(...) session it opens.
service = Service(ChromeDriverManager().install())
def get_page_details(url):
    """Scrape one SEC litigation-releases listing page into a CSV file.

    The output path is derived from the query string that follows
    ``populate=`` in `url` and lives under ``./sec_data``. If that file
    already exists the page is skipped, so the scrape can be resumed.

    Relies on the module-level `service` and `chrome_options` objects.

    Args:
        url: Listing page URL containing a ``populate=`` query parameter.

    Returns:
        None. The result is the CSV written to disk; every failure is reported
        on stdout (with the exception detail) and no file is written for it.
    """
    base_name = url.split("populate=")[-1].replace("&", "_").replace("=", "_")
    file_path = f"./sec_data/sec_data{base_name}.csv"
    if os.path.exists(file_path):
        return

    # Without the output directory every to_csv call would fail and be
    # reported as a scraping error.
    os.makedirs(os.path.dirname(file_path), exist_ok=True)

    driver = webdriver.Chrome(service=service, options=chrome_options)
    try:
        driver.get(url)
        time.sleep(1)  # Allow time for page to load

        # Find the table containing litigation releases
        table = driver.find_element(By.TAG_NAME, "table")
        rows = table.find_elements(By.TAG_NAME, "tr")

        # Column names from the header row, plus one extra column for the links.
        column_names = [th.text.strip() for th in rows[0].find_elements(By.TAG_NAME, "th")]
        column_names.append("Link")

        # Extract data from rows
        data = []
        for row in rows[1:]:  # Skip header row
            cols = row.find_elements(By.TAG_NAME, "td")
            row_data = [col.text.strip() for col in cols]
            # Collect every hyperlink found in the SECOND cell (index 1).
            links = [
                link.get_attribute("href")
                for link in cols[1].find_elements(By.TAG_NAME, "a")
            ]
            row_data.append(links)  # The list of links goes in the last column
            data.append(row_data)

        df = pd.DataFrame(data, columns=column_names)
        df.to_csv(file_path, index=False)
        print(f"Completed file of {len(df)} for {url}")
    except Exception as e:
        # Best effort: report the page and the reason, then carry on.
        print(f"❌ Error scraping table for:  {url}")
        print(f"   {type(e).__name__}: {e}")
    finally:
        # Always release the browser, including when driver.get itself fails.
        driver.quit()
        
    
# Scrape every monthly listing page. get_page_details works purely through
# side effects (CSV files) and returns None, so a plain loop with a progress
# bar replaces DataFrame.apply, which only produced a displayed column of None.
for page_url in tqdm(sec_year_month["page_url"], desc="SEC listing pages"):
    get_page_details(page_url)

### Get details of all the litigations

In [3]:
# Read every per-month listing CSV and stack them into a single frame.
all_lits = list(map(pd.read_csv, glob("./sec_data/*csv")))
all_lits_sec = pd.concat(all_lits, ignore_index=True)
print(len(all_lits_sec))

# The year is the second comma-separated piece of the date text; int()
# ignores the surrounding whitespace. Newest years are placed first.
all_lits_sec = (
    all_lits_sec
    .assign(
        year=all_lits_sec["Date\nSort descending"].map(
            lambda date_text: int(date_text.split(',')[1])
        )
    )
    .sort_values(by=["year"], ascending=False)
    .reset_index(drop=True)
)

11464


In [None]:
service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service, options=chrome_options)

# Without this directory every open() below fails, and the original
# catch-all `except` hid that completely.
os.makedirs("./sec_text", exist_ok=True)

try:
    for index_, row in tqdm(all_lits_sec.iterrows(), total=len(all_lits_sec)):
        # Prepare the pieces of the output file name; skip rows whose fields
        # are missing or malformed, and say why.
        try:
            respondents = row["Respondents"].replace('\n', ' ')
            respondents = respondents.translate(str.maketrans('', '', string.punctuation))
            date = row["Date\nSort descending"]
            # The "Link" column holds the textual form of a Python list of URLs.
            links = ast.literal_eval(row["Link"])
        except (AttributeError, TypeError, ValueError, SyntaxError) as e:
            print(f"Skipping row {index_}: {e!r}")
            continue

        for i, link in enumerate(links):
            # File naming is kept unchanged so previously saved files are still
            # recognised. NOTE(review): two releases with the same date and the
            # same first 15 respondent characters map to the same file name.
            file_name = f"./sec_text/{date}_{respondents.split(',')[0][:15]}_link_{i}.txt"
            if not link or os.path.exists(file_name) or ".pdf" in link:
                continue
            # Handle failures per link, so one bad page no longer skips the
            # remaining links of the same release.
            try:
                driver.get(link)
                # Extract full page content
                full_content = driver.find_element("tag name", "body").text.strip()
            except Exception as e:
                print(f"Failed to scrape {link}: {type(e).__name__}: {e}")
                continue
            # Explicit UTF-8 so non-ASCII page text is written on any platform.
            with open(file_name, "w", encoding="utf-8") as f:
                f.write(full_content)
finally:
    # Release the browser even when the loop is interrupted.
    driver.quit()

### FCA data

## Financial Times

In [None]:
# NOTE(review): no Financial Times scraping code follows in this section; this
# cell only builds a ChromeDriver service again, as earlier cells already do.
service = Service(ChromeDriverManager().install())

### Reuters data

In [None]:
from urllib.parse import quote_plus

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC


def _collect_title_links(page_url, wait_seconds):
    """Return (text, href) pairs for the title links on one Reuters search page.

    A new Chrome session is opened for the page and is always closed before
    returning, so no browser process is left behind between pages.

    Args:
        page_url: URL of the search-results page to open.
        wait_seconds: Maximum time to wait for the title links to be present.

    Raises:
        selenium TimeoutException: if no title link is present in time.
    """
    driver = webdriver.Chrome()  # Ensure the ChromeDriver is in your PATH
    try:
        driver.get(page_url)
        # Wait for elements to be present
        elements = WebDriverWait(driver, wait_seconds).until(
            EC.presence_of_all_elements_located((By.CSS_SELECTOR, '[data-testid="TitleLink"]'))
        )
        return [(element.text, element.get_attribute('href')) for element in elements]
    finally:
        driver.quit()


query = "bribery and corruption"
# quote_plus turns spaces into "+" and also escapes other reserved characters,
# so queries containing e.g. "&" or "#" still produce a valid URL.
url = f"https://www.reuters.com/site-search/?query={quote_plus(query)}"

# Collected link texts and URLs, in page order.
link_text_ls = []
link_url_ls = []

# First results page.
for link_text, link_url in _collect_title_links(url, 10):
    link_text_ls.append(link_text)
    link_url_ls.append(link_url)

# Further result pages.
# NOTE(review): the offsets requested are 20, 30 and 40; offset 10 is never
# requested. Confirm the site's page size -- results may be skipped or repeated.
for page in range(2, 5):
    print("click next button")
    # Random pause between page loads.
    time.sleep(random.uniform(2, 5))
    page_links = _collect_title_links(f"{url}&offset={page*10}", 15)
    print(len(page_links))
    for link_text, link_url in page_links:
        link_text_ls.append(link_text)
        link_url_ls.append(link_url)

if len(link_text_ls)>0:
    result_df = pd.DataFrame({"link_text":link_text_ls, "link_url": link_url_ls})
    print(len(result_df))
    result_df.to_csv(f"./csv/reuters_{query}.csv", index=False)

In [None]:
# Stack every per-query Reuters search result file into one frame.
reuters_files = glob("./csv/reuters_*.csv")
reuters_frames = list(map(pd.read_csv, reuters_files))
reuters_articles = pd.concat(reuters_frames, ignore_index=True)
print(len(reuters_articles))

In [None]:
# Scrape the page text of every URL in reuters_articles["link_url"].
article_text_ls = []

for index_, row in tqdm(reuters_articles.iterrows(), total=len(reuters_articles)):
    try:
        # A tool instance is bound to one article URL.
        tool = ScrapeWebsiteTool(website_url=row["link_url"])
        # Extract the text from the site
        text = tool.run()
    except Exception as e:
        # Same policy as the News API loop in this notebook: report the
        # failure and keep an empty placeholder, so one bad article does not
        # discard everything scraped so far and the list stays aligned with
        # the rows of reuters_articles.
        print(f"Failed to scrape {row['link_url']}: {e}")
        text = ""
    article_text_ls.append(text)

In [None]:
# Attach the scraped texts as a new column (article_text_ls must hold one entry
# per row of reuters_articles) and write the combined table to disk.
reuters_articles["article"] = article_text_ls
reuters_articles.to_csv("./csv/all_reuters_scraped.csv", index=False)

### News API

In [None]:
# Stack every per-query result file ("query_*.csv") into one frame.
newsapi_files = glob("./csv/query_*.csv")
newsapi_frames = list(map(pd.read_csv, newsapi_files))
newsapi_files_articles = pd.concat(newsapi_frames, ignore_index=True)

In [None]:
# One scraped text per row of newsapi_files_articles; "" marks a failed scrape.
news_api_text = []

for _, article_row in tqdm(newsapi_files_articles.iterrows()):
    try:
        # Extract the text from the site
        scraped_text = ScrapeWebsiteTool(website_url=article_row["url"]).run()
    except Exception as scrape_error:
        print(scrape_error)
        scraped_text = ""
    news_api_text.append(scraped_text)

newsapi_files_articles["article"] = news_api_text
# Keep only the rows for which some text was actually scraped.
has_article_text = newsapi_files_articles["article"].str.len() > 0
newsapi_files_articles = newsapi_files_articles.loc[has_article_text]
newsapi_files_articles.to_csv("./csv/all_newsapi_scraped.csv", index=False)

In [None]:
# Preview the first rows of the Reuters articles table.
reuters_articles.head()