In [1]:
import re
import json
import numpy as np
import pandas as  pd
from pprint import pprint# Gensim
from tqdm import tqdm
import matplotlib.pyplot as plt
import string
from glob import glob
import random
%matplotlib inline

### AI-generated summaries of SEC data and Reuters articles

Add the link to the original source to each summary so that the original articles can be attributed.

#### SEC summary

In [None]:
def get_file_name(row):
    """Build the ``./sec_text`` path prefix that identifies one SEC litigation row.

    The prefix is ``./sec_text/<date>_<first 15 chars of respondents>_link_``,
    where the respondents text has line breaks turned into spaces and every
    punctuation character removed.

    NOTE: a later cell in this notebook defines another function with the same
    name; once that cell runs, it replaces this one in the kernel namespace.

    Args:
        row: Mapping-like record providing ``"Respondents"`` and
            ``"Date\\nSort descending"`` (the header contains a line break).

    Returns:
        The path prefix as a string.
    """
    respondents = row["Respondents"].replace('\n', ' ')
    # Commas are punctuation too, so after this step there is nothing left to
    # split on; the first 15 characters can be taken directly.
    respondents = respondents.translate(str.maketrans('', '', string.punctuation))
    date = row["Date\nSort descending"]
    return f"./sec_text/{date}_{respondents[:15]}_link_"


# Read every SEC litigation export and stack them into one frame.
all_lits = list(map(pd.read_csv, glob("./sec_data/*csv")))
all_lits_sec = pd.concat(all_lits, ignore_index=True)

# The year is the text after the first comma of the date column.
all_lits_sec["year"] = all_lits_sec["Date\nSort descending"].map(
    lambda date_text: int(date_text.split(',')[1].strip())
)

# Newest first, then attach the text-file prefix for every row.
all_lits_sec = all_lits_sec.sort_values(by=["year"], ascending=False)
all_lits_sec["file_name"] = all_lits_sec.apply(get_file_name, axis=1)
all_lits_sec = all_lits_sec.reset_index(drop=True)

# File-name prefix -> link to the original SEC release.
fName_link_map = dict(zip(all_lits_sec["file_name"], all_lits_sec["Link"]))

In [None]:
sec_summaries = []
# Translation table that deletes every punctuation character except "_",
# built once instead of on every loop iteration.
punctuation_without_underscore = string.punctuation.replace('_', "")
strip_punctuation = str.maketrans('', '', punctuation_without_underscore)

for json_file in glob("sec_json/*.json"):
    # Map "sec_json/<name>.txt.json" back to the "./sec_text/<name>" prefix and
    # drop the final character so it lines up with the keys of fName_link_map.
    base_file_name = json_file.replace(".txt.json", "").replace("sec_json", "./sec_text")[:-1]
    splits = base_file_name.split(", ")
    if len(splits) > 1:
        # Keep the part before the first ", " untouched; strip punctuation from
        # the part after it.
        base_file_name = splits[0]+", "+splits[1].translate(strip_punctuation)

    original_article_links = fName_link_map.get(base_file_name, "")
    if not isinstance(original_article_links, str) or len(original_article_links) == 0:
        # Fall back to a literal substring search. regex=False matters because
        # the path contains "." and must not be interpreted as a pattern.
        try_again = base_file_name.replace("_link_", "")
        matches = all_lits_sec.loc[
            all_lits_sec.file_name.str.contains(try_again, regex=False), "Link"
        ]
        if len(matches) == 0:
            # Same policy as the Reuters cell: report the file and move on
            # instead of aborting the whole run with an IndexError.
            print(json_file)
            continue
        original_article_links = matches.values[0]

    with open(json_file) as f:
        d = json.load(f)
    sec_summaries.append(f"Litigants\n{d['litigants']}\nSummary\n{d['summary']}\nLinks to original article\n{original_article_links}")

#### Reuters News API

In [None]:
def get_file_name(row):
    """Return the last non-empty path segment of ``row['link_url']``, capped at 50 characters.

    When the URL ends with "/", the final segment is empty and the one before
    it is used instead.
    """
    segments = row['link_url'].split("/")
    # A trailing slash leaves an empty last segment; fall back one step.
    slug = segments[-1] or segments[-2]
    # Slicing is a no-op for names that are already short enough.
    return slug[:50]

# NOTE(review): these are absolute, machine-specific paths; the notebook only
# runs where this directory exists.
csv_files = [
    "/Users/abhinav.sunderrajan/Desktop/finacial_fraudsters/csv/all_newsapi_scraped.csv",
    "/Users/abhinav.sunderrajan/Desktop/finacial_fraudsters/csv/all_reuters_scraped.csv",
]

# Keep only the URL and article text from each scrape and stack them.
newapi_reuters_df = pd.concat(
    [pd.read_csv(csv_path)[['link_url', 'article']] for csv_path in csv_files],
    ignore_index=True,
)
newapi_reuters_df["file_name"] = newapi_reuters_df.apply(get_file_name, axis=1)

# File name -> article URL. This rebinds the name that the SEC cell above used
# for its own mapping.
fName_link_map = dict(zip(newapi_reuters_df["file_name"], newapi_reuters_df["link_url"]))


news_api_reuters = []
for json_file in glob("reuter_newsapi_json/*.json"):
    with open(json_file) as f:
        d = json.load(f)
    if len(d['summary']) == 0:
        # Nothing was summarised for this article.
        continue

    # Remove only the ".json" extension. str.strip(".json") would instead
    # strip any of the characters ".", "j", "s", "o", "n" from the end and eat
    # the tail of names such as "...-sanctions".
    stem = json_file[:-len(".json")] if json_file.endswith(".json") else json_file
    json_file_name = stem.split("/")[1][:45]

    # Literal substring match: URL slugs can contain characters that are
    # regex metacharacters (".", "?", "+", ...).
    tmp_df = newapi_reuters_df.loc[
        newapi_reuters_df.file_name.str.contains(json_file_name, regex=False)
    ]
    if len(tmp_df) == 0:
        # No source row found: report the file and skip it.
        print(json_file)
        continue

    original_article_links = tmp_df.link_url.values[0]
    news_api_reuters.append(f"Article category\n{d['article_category']}\nPeople mentioned\n{d['people_mentioned']}\nSummary\n{d['summary']}\nLinks to original article\n{original_article_links}")

In [None]:
# Show the first formatted Reuters/NewsAPI document to eyeball the layout.
print(news_api_reuters[0])

### MAS scraped data

In [None]:
def _extract_mas_article(page_text):
    """Return the article body found in one scraped MAS page.

    The body is the text between "Published Date:" and "Subscribe to Updates".
    Surrounding whitespace is removed, and so is the literal phrase
    "Monetary Authority of Singapore" when it sits at either end.

    The phrase is removed as a whole. Passing it to ``str.strip`` would treat
    it as a *set of characters* and also chop ordinary letters off the start
    and end of the article.
    """
    boilerplate = "Monetary Authority of Singapore"
    body = page_text.split("Published Date:")[1].split("Subscribe to Updates")[0].strip()
    while body.startswith(boilerplate):
        body = body[len(boilerplate):].lstrip()
    while body.endswith(boilerplate):
        body = body[:-len(boilerplate)].rstrip()
    return body


mas_data = pd.read_csv("./csv/mas_enforcement_actions.csv")
mas_data["economic_offense_strip"] = mas_data.economic_offense.apply(_extract_mas_article)

In [None]:
def get_mas_input(row):
    """Format one MAS enforcement row as a labelled text block.

    Each label and each value sits on its own line, and every line (including
    the trailing one) is indented by four spaces.
    """
    line_break = "\n    "
    # (label shown in the text, column read from the row)
    layout = (
        ("Person/Company", 'Person/Company'),
        ("Action Type", 'Action Type'),
        ("Title", 'Title'),
        ("Article", 'economic_offense_strip'),
        ("Links to original article", 'url_link'),
    )
    lines = []
    for label, column in layout:
        lines.append(label)
        lines.append(f"{row[column]}")
    return line_break + line_break.join(lines) + line_break
# One formatted document per enforcement action.
mas_data["mas_data"] = mas_data.apply(get_mas_input, axis=1)

### UNSC sanctions

In [2]:
def get_unsc_inidvidual_details(row):
    """Describe one UNSC-sanctioned individual in a single sentence.

    Args:
        row: Mapping-like record with ``FIRST_NAME``, ``SECOND_NAME``,
            ``UN_LIST_TYPE``, ``NATIONALITY``, ``DATE_OF_BIRTH`` (0 means
            unknown), ``LISTED_ON`` and ``COMMENTS1``.

    Returns:
        The sentence, followed by " More details:\\n<comments>" only when
        ``COMMENTS1`` holds non-blank text. A non-string placeholder such as
        the ``0`` written by ``fillna(0)`` is left out.
    """
    # Join only the non-blank name parts so a missing second name does not
    # leave a stray space before "born".
    name_parts = (row["FIRST_NAME"], row["SECOND_NAME"])
    person = " ".join(part.strip() for part in name_parts if part.strip()).title()

    nationality = row["NATIONALITY"]
    list_type = row["UN_LIST_TYPE"]

    year_of_birth = int(row["DATE_OF_BIRTH"])
    if year_of_birth == 0:
        year_of_birth = "an unknown date"

    # Only mention the list type when it adds something beyond the nationality.
    # The leading space lives in the clause so the sentence never ends in " .".
    affiliation = "" if list_type == nationality else f" belonging to {list_type}"

    sentence = f"{person} born on {year_of_birth} is a UNSC sanctioned individual listed on {row['LISTED_ON']} from {nationality}{affiliation}."

    info = row["COMMENTS1"]
    if isinstance(info, str) and info.strip():
        sentence += f" More details:\n{info}"
    return sentence

def get_unsc_entity_details(row):
    """Describe one UNSC-sanctioned organization in a single sentence.

    Args:
        row: Mapping-like record with ``FIRST_NAME`` (the organization name),
            ``UN_LIST_TYPE``, ``LISTED_ON`` and ``COMMENTS1``.

    Returns:
        The sentence, followed by " More details:\\n<comments>" only when
        ``COMMENTS1`` holds non-blank text. A non-string placeholder such as
        the ``0`` written by ``fillna(0)`` is left out.
    """
    org_name = row["FIRST_NAME"].title()
    org_affliation = row["UN_LIST_TYPE"]
    sentence = f"{org_name} is a UNSC sanctioned organization listed on {row['LISTED_ON']} affiliated with {org_affliation}."

    info = row["COMMENTS1"]
    if isinstance(info, str) and info.strip():
        sentence += f" More details:\n{info}"
    return sentence



In [3]:
# Load the sanctioned individuals and replace missing values: empty strings for
# the optional name parts, 0 for the birth date and the comments.
unsc_individual = pd.read_csv("./csv/unsc_sanctioned_individuals.csv").assign(
    SECOND_NAME=lambda frame: frame["SECOND_NAME"].fillna(""),
    THIRD_NAME=lambda frame: frame["THIRD_NAME"].fillna(""),
    DATE_OF_BIRTH=lambda frame: frame["DATE_OF_BIRTH"].fillna(0),
    COMMENTS1=lambda frame: frame["COMMENTS1"].fillna(0),
)
# One descriptive sentence per individual.
unsc_individual["consolidated_string"] = unsc_individual.apply(
    get_unsc_inidvidual_details, axis=1
)

In [4]:
# Load the sanctioned entities; missing comments become 0.
unsc_entity = pd.read_csv("./csv/unsc_sanctioned_individuals_entities.csv").assign(
    COMMENTS1=lambda frame: frame["COMMENTS1"].fillna(0)
)
# One descriptive sentence per entity.
unsc_entity["consolidated_string"] = unsc_entity.apply(
    get_unsc_entity_details, axis=1
)

In [7]:
# Print the consolidated text of one randomly sampled entity row (the row
# changes from run to run because no random_state is passed).
print(unsc_entity.sample(1).consolidated_string.values[0])

Al-Qaida In Iraq is a UNSC sanctioned organization listed on 2004-10-18 affiliated with Al-Qaida. More details:
Review pursuant to Security Council resolution 1822 (2008) was concluded
on 25 May 2010. 
Review pursuant to Security Council resolution 2368 (2017) was concluded on 24 November 2020.

 INTERPOL-UN Security Council Special Notice web link:https://www.interpol.int/en/How-we-work/Notices/View-UN-Notices-Individuals


### Interpol data

In [8]:
# Treat only truly empty cells as missing. With pandas' default NA parsing the
# literal text "NA" is read as NaN, which would drop a country whose alpha-2
# code is "NA" (Namibia in ISO 3166-1; presumably present in this file, to be
# confirmed against the CSV).
country_code = pd.read_csv(
    "./csv/country_code.csv", keep_default_na=False, na_values=[""]
)
country_code = (
    country_code[["name", "alpha-2"]]
    .dropna(subset=["alpha-2"])
    # Lower-case the codes so they match the codes looked up later on.
    .assign(**{"alpha-2": lambda frame: frame["alpha-2"].str.lower()})
)

# Lower-case alpha-2 code -> country name.
code_dict = dict(zip(country_code["alpha-2"], country_code["name"]))

def get_interpol_details(row):
    """Describe one Interpol notice row in a single sentence.

    Args:
        row: Mapping-like record with ``name``, ``countries`` (country codes
            separated by ";"), ``dataset`` and ``sanctions``.

    Returns:
        "<Name> from <countries> has an <dataset>. More details:\\n<sanctions>".
        Codes are looked up in the module-level ``code_dict``. Blank or unknown
        codes are skipped, and "an unknown country" is used when none resolve.
    """
    person = row["name"].title()

    country_names = []
    for code in row["countries"].split(";"):
        # Normalise before the lookup: the keys of code_dict are lower-case.
        country = code_dict.get(code.strip().lower(), "")
        if country:
            country_names.append(country)
    nationality = ";".join(country_names) if country_names else "an unknown country"

    interpol_notice = row["dataset"]
    info = row["sanctions"]
    return f"{person} from {nationality} has an {interpol_notice}. More details:\n{info}"

# Load the Interpol notices; rows without a country get an empty string so the
# formatter can still split the field.
inidviduals = pd.read_csv("./csv/interpol_1.csv").assign(
    countries=lambda frame: frame["countries"].fillna("")
)
inidviduals["interpol_details"] = inidviduals.apply(get_interpol_details, axis=1)

In [10]:
# Print the formatted text of one randomly sampled Interpol row.
print(inidviduals["interpol_details"].sample(1).values[0])

Sebastian Enrique Marset Cabrera from Uruguay has an INTERPOL Red Notices. More details:
Red Notice - TRAFICO ILICITO DE ESTUPEFACIENTES Y DROGAS PELIGROSAS, ASOCIACION CRIMINAL Y LAVADO DE DINERO.


### OFAC data

In [None]:
def ofac_details(row):
    """Describe one OFAC list row in a single sentence.

    Args:
        row: Mapping-like record with ``LAST_NAME``, ``SDN_TYPE``, ``PROGRAMS``
            and ``ADDRESSES``. Empty strings mean "not available".

    Returns:
        "<Name> is an OFAC sanctioned <type>", followed by the program clause
        and the address clause only when those fields are non-empty.
    """
    name = row["LAST_NAME"].lower().title()
    sdn_type = row["SDN_TYPE"]
    program = row["PROGRAMS"]
    address = row["ADDRESSES"]

    # rstrip() avoids a trailing space when the SDN type is empty.
    sentence = f"{name} is an OFAC sanctioned {sdn_type}".rstrip()
    if program:
        sentence += f" with program {program}"
    if address:
        sentence += f" with address {address}"
    return sentence
    
    

# Load the OFAC list with every missing cell replaced by an empty string.
ofac_list = pd.read_csv("./csv/ofac_list.csv").fillna("")
ofac_list["ofac_details"] = ofac_list.apply(ofac_details, axis=1)

### Collate all documents

In [None]:
# Concatenate the documents of every source, in a fixed order, into one list.
all_docs = [
    *sec_summaries,
    *news_api_reuters,
    *mas_data["mas_data"],
    *unsc_individual["consolidated_string"],
    *unsc_entity["consolidated_string"],
    *inidviduals["interpol_details"],
    *ofac_list["ofac_details"],
]
len(all_docs)

In [None]:
# Discard documents of 10 characters or fewer.
all_docs = list(filter(lambda text: len(text) > 10, all_docs))
len(all_docs)

### Vector encoding

In [None]:
import ollama
import faiss

In [None]:
# Persist the collated documents as a one-column CSV (column "all_docs", no
# index column).
pd.DataFrame({"all_docs":all_docs}).to_csv("./csv/all_docs.csv", index=False)

In [None]:
# Embed a throwaway sentence only to learn how wide the model's vectors are,
# then create an empty flat L2 index of that width.
_probe_response = ollama.embeddings(
    model='nomic-embed-text',
    prompt='The sky is blue because of rayleigh scattering',
)
dimension = np.array(_probe_response.embedding).shape[0]
index = faiss.IndexFlatL2(dimension)

In [None]:
# Embed the documents one at a time, in the same order as all_docs.
all_embeddings = []
for doc in tqdm(all_docs):
    response = ollama.embeddings(model='nomic-embed-text', prompt=doc)
    all_embeddings.append(response.embedding)


In [None]:
# Keep documents and vectors in lock-step. A vector of the wrong length must
# take its document with it, otherwise every later position in the vector
# array points at the wrong entry of all_docs.
EXPECTED_EMBEDDING_DIM = 768
aligned_pairs = [
    (doc, embed)
    for doc, embed in zip(all_docs, all_embeddings)
    if len(embed) == EXPECTED_EMBEDDING_DIM
]
all_docs = [doc for doc, _ in aligned_pairs]
all_embeddings = [embed for _, embed in aligned_pairs]
print(len(all_embeddings))

# faiss works on float32, so store the vectors in that dtype.
np.save('all_embeddings.npy', np.array(all_embeddings, dtype="float32")) # save
# Re-write the document CSV so the saved documents and saved vectors match
# row for row.
pd.DataFrame({"all_docs": all_docs}).to_csv("./csv/all_docs.csv", index=False)

#### Load the numpy array

In [None]:
all_embeddings = np.load('all_embeddings.npy').astype("float32") # load
# Start from an empty index sized from the loaded array. Re-running this cell
# then no longer adds the same vectors a second time, and the cell does not
# need the embedding service just to learn the dimension.
index = faiss.IndexFlatL2(all_embeddings.shape[1])
index.add(all_embeddings)

In [None]:
query = "Ponzi scheme"
# faiss expects float32 input; embed the query and shape it as a (1, dim)
# batch holding that single vector.
query_embedding = np.array(
    ollama.embeddings(model='nomic-embed-text', prompt=query).embedding,
    dtype="float32",
)
query_embedding = np.expand_dims(query_embedding, axis=0)

In [None]:


# Search for the 5 most similar documents.
D, I = index.search(query_embedding, k=5)

# Print results
for idx in I[0]:
    if idx < 0:
        # faiss fills missing results with -1 when the index holds fewer than
        # k vectors; -1 would otherwise pick the last document.
        continue
    print(f"=========== Similar News ==========\n: {all_docs[idx]}")