In [1]:
# use if autocompletion is not working
%config Completer.use_jedi = False

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
import os
import pandas as pd
import json
import numpy as np
from tqdm import tqdm

from fuzzywuzzy import fuzz
from dask_jobqueue import PBSCluster
from dask.distributed import Client, get_worker
from typing import List, Callable, Tuple
from copy import deepcopy
from sys import getsizeof
from time import sleep
from IPython.display import clear_output

import re

In [4]:
from modern_slavery_registry import get_root_path

In [5]:
PROJECT_PATH = get_root_path()
DATA_PATH = os.path.join(PROJECT_PATH, "data")

COMPANIES_JSON = "companies_060421.json"

In [6]:
%%time
companies_info = ""
with open(os.path.join(PROJECT_PATH, DATA_PATH, COMPANIES_JSON),encoding='utf8') as f:
    for i, line in enumerate(f):
        companies_info += line
companies_info = companies_info.replace("\n", "")
companies_info = json.loads(companies_info)

CPU times: user 95.7 ms, sys: 24.1 ms, total: 120 ms
Wall time: 115 ms


In [7]:
print(f"Number of companies: {len(companies_info)}")

Number of companies: 19690


In [8]:
for i, (k,v) in enumerate(companies_info.items()):
    print(k, v)
    if i == 10: break 

755391 {'company': 'Eolus Vind', 'headquarters': 'SE', 'sectors': {'Energy': ['Wind energy']}}
755392 {'company': 'Aquila Capital', 'headquarters': 'DE', 'sectors': {'Finance': ['Finance & banking']}}
755402 {'company': 'Banco Internacional de Moçambique - International Bank of Mozambique', 'headquarters': 'MZ', 'sectors': {'Finance': ['Finance & banking']}}
755404 {'company': 'Fábrica de Explosivos de Moçambique', 'headquarters': 'MZ', 'sectors': {'Chemical': ['Chemical: General']}}
755427 {'company': 'Al Raqeeb Buildings General Contracting Co.', 'headquarters': 'KW', 'sectors': {'Construction & building materials': ['Construction']}}
755459 {'company': 'Abbott Laboratories', 'headquarters': 'US', 'sectors': {'Health Sector': ['Pharmaceutical']}}
755460 {'company': 'Alcoa', 'headquarters': 'US', 'sectors': {'Metals/plastics/basic materials': ['Metals & steel']}}
755461 {'company': 'Anglo American', 'headquarters': 'GB', 'sectors': {'Natural resources': ['Mining']}}
755462 {'company':

In [9]:
companies = [info["company"] for info in companies_info.values()]

## Checking if duplicate companies

In [10]:
dup_comp_idxs = np.arange(len(companies))[pd.DataFrame(companies).duplicated()]
dup_comp = [companies[i] for i in dup_comp_idxs]
len(dup_comp)

58

In [11]:
dup_comp[10]

'Favorita Fruit Company'

## Converting companies as keys and merging info for same companies

In [12]:
companies_info_new = {}
for k, v in tqdm(companies_info.items(), leave=False):
    if v["company"] not in companies_info_new:
        info = {"id": [k], "headquarters": [v["headquarters"]], "sectors": v["sectors"]}
        companies_info_new[v["company"]] = info
    else: 
        info = deepcopy(companies_info_new[v["company"]])
        
        info["id"].append(k)
        info["headquarters"].append(v["headquarters"])
        for k2, v2 in v["sectors"].items():
            if k2 in info["sectors"]:
                info["sectors"][k2] += v2
            else:
                info["sectors"][k2] = v2   
                
#         info["id"] = np.unique(info["id"]).tolist()
#         info["headquarters"] = np.unique(info["headquarters"]).tolist()
        
#         for k3, v3 in info["sectors"].items():
#             info["sectors"][k3] = np.unique(v3).tolist()
        
        companies_info_new[v["company"]] = info

                                         

In [13]:
len(companies_info), len(companies_info_new)

(19690, 19632)

In [14]:
for i, (k,v) in enumerate(companies_info_new.items()):
    print(k, v)
    if i == 10: break 

Eolus Vind {'id': ['755391'], 'headquarters': ['SE'], 'sectors': {'Energy': ['Wind energy']}}
Aquila Capital {'id': ['755392'], 'headquarters': ['DE'], 'sectors': {'Finance': ['Finance & banking']}}
Banco Internacional de Moçambique - International Bank of Mozambique {'id': ['755402'], 'headquarters': ['MZ'], 'sectors': {'Finance': ['Finance & banking']}}
Fábrica de Explosivos de Moçambique {'id': ['755404'], 'headquarters': ['MZ'], 'sectors': {'Chemical': ['Chemical: General']}}
Al Raqeeb Buildings General Contracting Co. {'id': ['755427'], 'headquarters': ['KW'], 'sectors': {'Construction & building materials': ['Construction']}}
Abbott Laboratories {'id': ['755459'], 'headquarters': ['US'], 'sectors': {'Health Sector': ['Pharmaceutical']}}
Alcoa {'id': ['755460'], 'headquarters': ['US'], 'sectors': {'Metals/plastics/basic materials': ['Metals & steel']}}
Anglo American {'id': ['755461'], 'headquarters': ['GB'], 'sectors': {'Natural resources': ['Mining']}}
AstraZeneca {'id': ['75546

In [15]:
companies = list(companies_info_new.keys())

In [16]:
statements = pd.read_excel(os.path.join(PROJECT_PATH, DATA_PATH, "sheets","subset_data.xlsx"))

In [17]:
statements.head()

Unnamed: 0,URL,Company,final_statement,years_with_and_without_act,years_with_act,final_statement_cleaned,len(final_statement_cleaned)
0,https://img1.wsimg.com/blobby/go/7695baff-3f0f...,"""K"" Line Holding Europe Limited",66 99 “K” Line Holding (Europe) Limited kM K L...,"['2015', '2019', '2018']","['2015', '2019']",66 99 km sh foor eum hold europe ltd 200 alder...,1873
1,https://1spatial.com/who-we-are/legal/modern-s...,1Spatial Plc,1Spatial Modern Slavery Act Policy Statement H...,['2015'],['2015'],home solution government boundary law enforcem...,2128
2,https://www.shazans.com/slavery-and-human-traf...,1Stop Halal Limited,Slavery and Human Trafficking Statement – Shaz...,"['2015', '2019', '2018']","['2015', '2018']",shazans shazan food continue monitor covid 19 ...,1840
3,https://www.business-humanrights.org/sites/def...,1st Step Solutions Limited,7/28/2019 Modern Slavery Statement 2018 - 1st ...,"['2015', '2019', '2018']","['2015', '2018']",28 2019 2018 statement make pursuant sec 54 20...,1843
4,https://www.2agriculture.com/wp-content/upload...,2 Agriculture Limited,fh Modern Slavery Act 2015: slavery and human ...,"['2015', '2019', '2018']","['2015', '2018']",fh 2015 introduction uk act require business s...,1372


## Processing company names (str) for better matching

In [18]:
statements_companies = deepcopy(statements["Company"].to_frame())
statements_companies.drop_duplicates(inplace=True)
statements_companies["processed_company"] = statements_companies["Company"].apply(lambda x: x.lower().strip())

json_companies = pd.DataFrame(data=[(comp, comp.lower().strip()) for comp in companies], columns=["json", "processed_company"])

len(statements), len(statements_companies), len(companies_info), len(json_companies)

(9993, 8177, 19690, 19632)

## Filter out companies for which we got exact match and for which we require fuzzy matching

In [19]:
exact_match_df = pd.merge(left=statements_companies, right=json_companies, on="processed_company", how="inner")
fuzzy_match_df = statements_companies[~statements_companies["Company"].isin(exact_match_df["Company"])]

len(statements_companies), len(exact_match_df), len(fuzzy_match_df)

(8177, 5922, 2255)

In [20]:
exact_match_df.head()

Unnamed: 0,Company,processed_company,json
0,1Spatial Plc,1spatial plc,1Spatial Plc
1,1Stop Halal Limited,1stop halal limited,1Stop Halal Limited
2,1st Step Solutions Limited,1st step solutions limited,1st Step Solutions Limited
3,2 Agriculture Limited,2 agriculture limited,2 Agriculture Limited
4,22 Bishopsgate (Devco) Limited,22 bishopsgate (devco) limited,22 Bishopsgate (Devco) Limited


In [21]:
fuzzy_match_df.head()

Unnamed: 0,Company,processed_company
0,"""K"" Line Holding Europe Limited","""k"" line holding europe limited"
6,2 Moto Limited,2 moto limited
7,2 Sisters Food Group Limited,2 sisters food group limited
11,3M Company,3m company
12,3M United Kingdom plc,3m united kingdom plc


## Add additional info for exact matches

In [22]:
exact_match_list = []
for r1 in tqdm(exact_match_df.iterrows(), leave=False):

    r1 = r1[1:][0].to_dict()
    
    for r2 in statements[statements["Company"]==r1["Company"]].iterrows():
        info = r2[1:][0].to_dict()
        
        additional_info = deepcopy(companies_info_new[r1["json"]])
        additional_info["matched_company"] = r1["json"]
        
        info["additional_info"] = additional_info
        
        exact_match_list.append(info)

                          

In [23]:
len(exact_match_list)

7198

In [24]:
exact_match_list[100]

{'URL': 'https://www.asmtech.com/wp-content/uploads/2018/06/slavery.pdf',
 'Company': 'ASM Technologies Limited',
 'final_statement': 'ASM Technologies Limited Anti-Slavery and Human Trafficking Statement We are committed to improving our practices to combat slavery and human trafficking. The Modern Slavery Act 2015 (the “Act”) requires us to disclose information regarding our efforts to assist with the eradication of slavery and human trafficking from our supply chain and within our own business. Our Structure We deliver agile technology distribution across the IT channel concentrating on Tier 2 and 3 brands. ASM Technologies Limited has a 100% subsidiary, ASM Technologies GmbH, based in Germany. We have a combined workforce of 70 employees, principally based and operating out of the UK. Our Policies on slavery and human trafficking Slavery and human trafficking are illegal and a violation of human rights. There are many forms of modern slavery, which involve a person losing their fre

In [25]:
# fuzz.ratio("Catherine M. Gitau","Gitau Catherine"), fuzz.partial_ratio("Catherine M. Gitau","Gitau Catherine"), fuzz.token_sort_ratio("Catherine M. Gitau", "Gitau Catherine")

## Fuzzy Matching

In [26]:
fuzzy_match_df.head()

Unnamed: 0,Company,processed_company
0,"""K"" Line Holding Europe Limited","""k"" line holding europe limited"
6,2 Moto Limited,2 moto limited
7,2 Sisters Food Group Limited,2 sisters food group limited
11,3M Company,3m company
12,3M United Kingdom plc,3m united kingdom plc


In [27]:
%%time
num_best_match = 5
best_matches = {}

for comp1 in tqdm(fuzzy_match_df["processed_company"].values[:10], leave=False):
    scores = [
        fuzz.partial_ratio(comp1, comp2.lower().strip()) + 
        fuzz.token_set_ratio(comp1, comp2.lower().strip()) for comp2 in companies]
    best_match_idxs = np.argsort(scores)[-num_best_match:][::-1]
    best_matches[comp1] = {companies[idx]: scores[idx]//2 for idx in best_match_idxs}

                                               

CPU times: user 8.08 s, sys: 21.1 ms, total: 8.1 s
Wall time: 8.09 s




In [28]:
best_matches

{'"k" line holding europe limited': {'K Line': 91,
  'TWG Europe Limited': 88,
  'NDK Europe Limited': 88,
  'KCG Europe Limited': 88,
  'EE Limited': 86},
 '2 moto limited': {'M53 Motors Limited': 83,
  'Otis Limited': 82,
  'Moto Investments Limited': 81,
  'QA Limited': 81,
  'EE Limited': 81},
 '2 sisters food group limited': {'2 Sisters Food Group': 100,
  'JS Group Limited': 89,
  'HF Group Limited': 89,
  'FK Group Limited': 89,
  'SC Group Limited': 89},
 '3m company': {'3M': 100,
  'National Gypsum Company (New NGC)': 86,
  'Siberian Urals Aluminium Company (SUAL)': 86,
  'Zoeller Pump Company': 86,
  'Ndola Lime Company': 86},
 '3m united kingdom plc': {'3M': 100,
  'EPC United Kingdom plc': 91,
  'CPM United Kingdom Limited': 83,
  'CSM (United Kingdom) Limited': 83,
  'Sudzucker United Kingdom Limited': 80},
 '3i group plc': {'3i': 100,
  'Ei Group plc': 92,
  'API Group plc': 90,
  'FIH Group Plc': 90,
  'Christie Group plc': 89},
 '4imprint group plc': {'Ei Group plc': 84

## Using HPC

In [29]:
N_WORKERS = 16 # number of parallel threads 
CORES = 64 # cores_per_worker = cores/ n_workers
MEMORY = "64GB" # memory_per_worker = memory/ n_workers

cluster = PBSCluster(
    n_workers=N_WORKERS,
    cores=CORES,
    memory=MEMORY)

cluster

VBox(children=(HTML(value='<h2>PBSCluster</h2>'), HBox(children=(HTML(value='\n<div>\n  <style scoped>\n    .d…

In [30]:
client = Client(cluster)

In [31]:
%%time
def test(a, b):
    return a + b
results_future = {}
for i in range(10):
    results_future[i] = client.submit(test, a=10+i, b=10+i)
results = client.gather(results_future)

CPU times: user 271 ms, sys: 29.3 ms, total: 301 ms
Wall time: 9.74 s


In [32]:
# cluster.scale(n=16, jobs=4, memory=48, cores=48)

In [33]:
%%time
def test(a, b):
    return a + b
results_future = {}
for i in range(1000):
    results_future[i] = client.submit(test, a=10+i, b=10+i)
results = client.gather(results_future)

CPU times: user 651 ms, sys: 62.4 ms, total: 713 ms
Wall time: 674 ms


In [34]:
len(companies_info), len(companies), getsizeof(companies)/(1024*1024)

(19690, 19632, 0.14983367919921875)

In [35]:
def distributed_match(
    comp1:str,
    companies: List[str], 
    num_best_match: int) -> List[int]:
    
    scores = [
        fuzz.partial_ratio(comp1, comp2.lower().strip()) + 
        fuzz.token_set_ratio(comp1, comp2.lower().strip()) for comp2 in companies]
    best_match_idxs = np.argsort(scores)[-num_best_match:][::-1]
    return [{companies[idx]: scores[idx]//2 for idx in best_match_idxs}]

In [36]:
[broadcasted_companies] = client.scatter(deepcopy([companies]), broadcast=True)

In [37]:
results_future = {}
n = 5
for comp in tqdm(fuzzy_match_df["processed_company"].values, leave=False):
    results_future[comp] = client.submit(
        distributed_match, 
        comp1=comp,
        companies=broadcasted_companies, 
        num_best_match=n)

                                                     

In [38]:
# len(client.get_versions(check=True)["workers"])

In [39]:
# client.close()
# client.cancel(results_future)

In [40]:
statuses = pd.DataFrame([result.status for result in results_future.values()]).value_counts()
while ("finished" not in statuses) or (statuses["finished"] < len(results_future)):
    clear_output(wait=True)
    print(pd.DataFrame([result.status for result in results_future.values()]).value_counts())
    sleep(5)

finished    2255
dtype: int64


KeyboardInterrupt: 

In [41]:
results_gathered = {comp: client.gather(results_future[comp])[0] for comp in tqdm(results_future.keys(), leave=False) if results_future[comp].status=="finished"}

                                                    

In [42]:
len(results_future), len(results_gathered)

(2255, 2255)

### Checking number of successful (above threshold) fuzzy matches 

In [69]:
min_match_score = 90
num_matches = 0
for comp, matches in results_gathered.items():
    if list(matches.values())[0] >= min_match_score: num_matches += 1

num_matches

1360

In [44]:
for i, (comp, matches) in enumerate(results_gathered.items()):
    print(comp, matches)
    if i ==10: break

"k" line holding europe limited {'K Line': 91, 'TWG Europe Limited': 88, 'NDK Europe Limited': 88, 'KCG Europe Limited': 88, 'EE Limited': 86}
2 moto limited {'M53 Motors Limited': 83, 'Otis Limited': 82, 'Moto Investments Limited': 81, 'QA Limited': 81, 'EE Limited': 81}
2 sisters food group limited {'2 Sisters Food Group': 100, 'JS Group Limited': 89, 'HF Group Limited': 89, 'FK Group Limited': 89, 'SC Group Limited': 89}
3m company {'3M': 100, 'National Gypsum Company (New NGC)': 86, 'Siberian Urals Aluminium Company (SUAL)': 86, 'Zoeller Pump Company': 86, 'Ndola Lime Company': 86}
3m united kingdom plc {'3M': 100, 'EPC United Kingdom plc': 91, 'CPM United Kingdom Limited': 83, 'CSM (United Kingdom) Limited': 83, 'Sudzucker United Kingdom Limited': 80}
3i group plc {'3i': 100, 'Ei Group plc': 92, 'API Group plc': 90, 'FIH Group Plc': 90, 'Christie Group plc': 89}
4imprint group plc {'Ei Group plc': 84, 'S Group': 84, 'GB Group plc': 84, 'Yu Group plc': 84, 'IFG Group plc': 83}
a m 

In [45]:
fuzzy_match_df.head()

Unnamed: 0,Company,processed_company
0,"""K"" Line Holding Europe Limited","""k"" line holding europe limited"
6,2 Moto Limited,2 moto limited
7,2 Sisters Food Group Limited,2 sisters food group limited
11,3M Company,3m company
12,3M United Kingdom plc,3m united kingdom plc


In [46]:
best_matches = []
for comp, matches in tqdm(results_gathered.items(), leave=False):
    best_match, best_match_score = list(matches.keys())[0], list(matches.values())[0] 
    if best_match_score >= min_match_score: best_matches.append((comp, best_match))
best_matches = pd.DataFrame(best_matches, columns=["processed_company", "json"])

fuzzy_match_df = pd.merge(fuzzy_match_df, best_matches, on="processed_company", how="left")
fuzzy_match_df.head()

                                        

Unnamed: 0,Company,processed_company,json
0,"""K"" Line Holding Europe Limited","""k"" line holding europe limited",K Line
1,2 Moto Limited,2 moto limited,
2,2 Sisters Food Group Limited,2 sisters food group limited,2 Sisters Food Group
3,3M Company,3m company,3M
4,3M United Kingdom plc,3m united kingdom plc,3M


In [68]:
sum(~fuzzy_match_df["json"].isna())

1360

In [47]:
fuzzy_match_list = []
for r1 in tqdm(fuzzy_match_df.iterrows(), leave=False):

    r1 = r1[1:][0].to_dict()
    
    # picking all entries from statement df with same company
    for r2 in statements[statements["Company"]==r1["Company"]].iterrows():
        info = r2[1:][0].to_dict()
        
        if str(r1["json"])!="nan":
            additional_info = deepcopy(companies_info_new[r1["json"]])
            additional_info["matched_company"] = r1["json"]
        else:
            additional_info = None
        
        info["additional_info"] = additional_info
        
        fuzzy_match_list.append(info)

                          

In [48]:
fuzzy_match_list[0]

{'URL': 'https://img1.wsimg.com/blobby/go/7695baff-3f0f-44e8-8390-94fa7eb0e674/downloads/2019%20%20modern%20slavery.pdf?ver=1554998068059',
 'Company': '"K" Line Holding Europe Limited',
 'final_statement': '66 99 “K” Line Holding (Europe) Limited kM K LINE sh Foor eum HOLDING (EUROPE) LTD. 200 Aldersgate Street London ECIA 4HD Tel: 020 7382 6500 E-mail: keulongen@uk.kline.com Modern Slavery Act Transparency Statement crane lineure eee Published: 22 March 2019 The UK Modern Slavery Act 2015 requires large entities carrying on a business in the UK to publish a statement detailing their efforts (if any) to combat human trafficking and modern-day slavery. This statement relates to actions and activities during the financial year 1 April 2018 to 31 March 2019. As part of the shipping industry, “K” Line Group recognizes that it has a responsibility to take a robust approach to slavery and human trafficking. "K” Line Group is absolutely committed to preventing slavery and human trafficking i

In [49]:
len(fuzzy_match_list), len(fuzzy_match_df)

(2795, 2255)

In [50]:
complete_match_list = exact_match_list + fuzzy_match_list
len(complete_match_list), len(statements)

(9993, 9993)

In [51]:
complete_match_list_new = []
for info in tqdm(complete_match_list, leave=False):
    modified_info = deepcopy(info)
    
    modified_info["years_with_act"] = [re.sub("\D","",year) for year in modified_info["years_with_act"].split(",") if re.sub("\D","",year)!= ""]
    modified_info["years_with_act"] = [int(year) for year in modified_info["years_with_act"]] 
    
    
    modified_info["years_with_and_without_act"] = [re.sub("\D","",year) for year in modified_info["years_with_and_without_act"].split(",") if re.sub("\D","",year)!= ""]
    modified_info["years_with_and_without_act"] = [int(year) for year in modified_info["years_with_act"]]
    
    complete_match_list_new.append(modified_info)
    
del complete_match_list

                                                      

In [52]:
len(complete_match_list_new)

9993

In [53]:
complete_match_list_new[0]

{'URL': 'https://1spatial.com/who-we-are/legal/modern-slavery-act-policy-statement/',
 'Company': '1Spatial Plc',
 'final_statement': '1Spatial Modern Slavery Act Policy Statement Home Solutions Solutions Government Boundaries Law Enforcement Rural Payments Next Generation 911 Utilities ArcGIS Utility Network Accelerator Traffic Management Plan Automation (TMPA) Automated Schematics Pipe Inference Incident Management Leakage Solutions Transport and Infrastructure Supply Chain Data Quality Highway Performance Monitoring System Next Generation 911 Services Consultancy Support Training 1Spatial Training FME Training Geocortex Training "1Spatial provides excellent knowledge and support to meet our current business needs. Individuals also go the extra mile in responding to critical issues and timescales, which is always very much appreciated and essential to the continued success of our business." Danny O\'Reilly | DAERA Industries Industries Utilities Water and Wastewater Gas Electricity T

In [54]:
with open(os.path.join(DATA_PATH, 'subset_data_with_additional_info.json'), 'w') as f:
    json.dump(complete_match_list_new, f)

In [55]:
client.close()

In [59]:
fuzzy_matches = deepcopy(fuzzy_match_df[["Company", "json"]].dropna())
fuzzy_matches.columns = ["Company", "Match"]
fuzzy_matches.head()

Unnamed: 0,Company,Match
0,"""K"" Line Holding Europe Limited",K Line
2,2 Sisters Food Group Limited,2 Sisters Food Group
3,3M Company,3M
4,3M United Kingdom plc,3M
5,3i Group plc,3i


In [61]:
fuzzy_matches.to_excel(
    os.path.join(DATA_PATH,"sheets","fuzzy_matches.xlsx"),
    index=False)