Text Processing and Cleaning
* Combining statements from Edgar and Nitin
* removing stopwords, special characters
* removing company name from the statement - compant name in the statement is not relevant for textual analysis

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import os
import concurrent.futures
from tqdm import tqdm
from typing import Optional, List, Dict, Callable
from urlextract import URLExtract
import nltk
import re
from matplotlib import pyplot as plt
from nltk.corpus import stopwords

from collections import Counter
from copy import deepcopy
from typing import Dict, Tuple

In [3]:
from modern_slavery import get_root_path
from modern_slavery.text_parser_v2 import (
    word_expantions,
    find_urls_in_text,
    CustomWordNetLemmatizer
)

In [4]:
DATA_PATH = os.path.join(get_root_path(), "data")
SHEETS_PATH = os.path.join(DATA_PATH,  "sheets")

In [5]:
df = pd.read_csv(
    os.path.join(SHEETS_PATH, "modern_slavery_dataset.csv"))

print(f"Number of rows before removing rows with NULL statements: {len(df)}")
df.dropna(subset=["Text"], inplace=True, axis=0)
print(f"Number of rows after removing rows with NULL statements: {len(df)}")

print(f"Number of rows before removing duplicate rows: {len(df)}")
df.drop_duplicates(inplace=True)
df.reset_index(drop=True, inplace=True)
print(f"Number of rows after removing duplicate rows: {len(df)}")

df["Text"] = df["Text"].astype(str)
df.head()

Number of rows before removing rows with NULL statements: 28417
Number of rows after removing rows with NULL statements: 18622
Number of rows before removing duplicate rows: 18622
Number of rows after removing duplicate rows: 18621


Unnamed: 0,Company ID,Company,Is Publisher,Statement ID,URL,Override URL,Companies House Number,Industry,HQ,Is Also Covered,UK Modern Slavery Act,California Transparency in Supply Chains Act,Australia Modern Slavery Act,Period Covered,Text
0,7676,"""K"" Line Holding Europe Limited",True,35092.0,https://img1.wsimg.com/blobby/go/7695baff-3f0f...,,5005018.0,Marine,United Kingdom,False,True,False,False,2018-2019,K LINE\nHOLDINC (EUROPE) LTD.\nModern Slavery ...
1,28660,"""K"" Line Bulk Shipping (UK) Limited",False,35092.0,https://img1.wsimg.com/blobby/go/7695baff-3f0f...,,4830352.0,Marine,United Kingdom,True,True,False,False,2018-2019,K LINE\nHOLDINC (EUROPE) LTD.\nModern Slavery ...
2,28659,"""K"" Line (Europe) Limited",False,35092.0,https://img1.wsimg.com/blobby/go/7695baff-3f0f...,,5639474.0,Marine,United Kingdom,True,True,False,False,2018-2019,K LINE\nHOLDINC (EUROPE) LTD.\nModern Slavery ...
3,28661,"""K"" Line LNG Shipping Limited",False,35092.0,https://img1.wsimg.com/blobby/go/7695baff-3f0f...,,,Marine,United Kingdom,True,True,False,False,2018-2019,K LINE\nHOLDINC (EUROPE) LTD.\nModern Slavery ...
4,28658,Polar LNG Shipping (UK) Limited,False,35092.0,https://img1.wsimg.com/blobby/go/7695baff-3f0f...,,2205323.0,Marine,United Kingdom,True,True,False,False,2018-2019,K LINE\nHOLDINC (EUROPE) LTD.\nModern Slavery ...


## Getting level of data

In [6]:
df.nunique(axis=0)

Company ID                                      13207
Company                                         13207
Is Publisher                                        2
Statement ID                                    12181
URL                                             10827
Override URL                                        3
Companies House Number                          10367
Industry                                           71
HQ                                                 64
Is Also Covered                                     2
UK Modern Slavery Act                               2
California Transparency in Supply Chains Act        2
Australia Modern Slavery Act                        2
Period Covered                                     17
Text                                            10025
dtype: int64

In [7]:
print(f"Number of rows: {len(df)}")
cols = ["Company ID", "Statement ID"]
for i in range(1, len(cols)+1):
    print(f'Unique entries across cols {cols[:i]}: {len(df.drop_duplicates(cols[:i]))}')

Number of rows: 18621
Unique entries across cols ['Company ID']: 13207
Unique entries across cols ['Company ID', 'Statement ID']: 18621


### <font color="blue">Level of data = ["Company ID", "Statement ID"]</font>

## Adding `Row ID` as unique identifier of each row

In [8]:
df.insert(loc=0, column='Row ID', value=df.index, allow_duplicates=False)
df.head()

Unnamed: 0,Row ID,Company ID,Company,Is Publisher,Statement ID,URL,Override URL,Companies House Number,Industry,HQ,Is Also Covered,UK Modern Slavery Act,California Transparency in Supply Chains Act,Australia Modern Slavery Act,Period Covered,Text
0,0,7676,"""K"" Line Holding Europe Limited",True,35092.0,https://img1.wsimg.com/blobby/go/7695baff-3f0f...,,5005018.0,Marine,United Kingdom,False,True,False,False,2018-2019,K LINE\nHOLDINC (EUROPE) LTD.\nModern Slavery ...
1,1,28660,"""K"" Line Bulk Shipping (UK) Limited",False,35092.0,https://img1.wsimg.com/blobby/go/7695baff-3f0f...,,4830352.0,Marine,United Kingdom,True,True,False,False,2018-2019,K LINE\nHOLDINC (EUROPE) LTD.\nModern Slavery ...
2,2,28659,"""K"" Line (Europe) Limited",False,35092.0,https://img1.wsimg.com/blobby/go/7695baff-3f0f...,,5639474.0,Marine,United Kingdom,True,True,False,False,2018-2019,K LINE\nHOLDINC (EUROPE) LTD.\nModern Slavery ...
3,3,28661,"""K"" Line LNG Shipping Limited",False,35092.0,https://img1.wsimg.com/blobby/go/7695baff-3f0f...,,,Marine,United Kingdom,True,True,False,False,2018-2019,K LINE\nHOLDINC (EUROPE) LTD.\nModern Slavery ...
4,4,28658,Polar LNG Shipping (UK) Limited,False,35092.0,https://img1.wsimg.com/blobby/go/7695baff-3f0f...,,2205323.0,Marine,United Kingdom,True,True,False,False,2018-2019,K LINE\nHOLDINC (EUROPE) LTD.\nModern Slavery ...


# Text Cleaning

In [9]:
%%time
# mss: modern_slavery_statement
def is_mss(statement: str) -> bool:
    """Classify whether given statement is modern slavery statement or not."""
    if re.findall(
        r'modern|slavery|modern slavery|'
        'supply chain|protect|human|trafficking|'
        'supply|business|suppliers|statement|risk'
        'act|chain|rights|labour',
        statement.lower()):
        return True
    return False

is_statement = df["Text"].apply(lambda x: is_mss(x))

def filter_mss_and_generate_vocab(
    df: pd.core.frame.DataFrame, 
    bool_idxs: pd.core.series.Series) -> Tuple[pd.core.frame.DataFrame, Dict[str, int]]:
    """Filter statements by given boolean series."""
    df = deepcopy(df[bool_idxs].reset_index(drop=True))
    vocab = Counter()
    for statement in tqdm(df['Text'].values, leave=False, position=0):
        for word in statement.lower().split():
            vocab.update({word:1})

    for word in stopwords.words('english'):
        del vocab[word]

    vocab = {word: count for word, count in sorted(
        vocab.items(), key=lambda x: x[1], reverse=True)}
    return df, vocab

statement_df, statement_vocab = filter_mss_and_generate_vocab(df, is_statement)
print(f"Number of statements: {len(statement_df)}")
print(f"Length of vocab: {len(statement_vocab)}")

noise_df, noise_vocab = filter_mss_and_generate_vocab(df, ~is_statement)
print(f"Number of noisy statements: {len(noise_df)}")
print(f"Length of noisy vocab: {len(noise_vocab)}")

                                                       

Number of statements: 17268
Length of vocab: 148179


                                        

Number of noisy statements: 1353
Length of noisy vocab: 11115
CPU times: user 18.6 s, sys: 156 ms, total: 18.7 s
Wall time: 18.7 s




In [10]:
# df.iloc[18616]["Text"]

In [11]:
# is_statement

In [12]:
def clean_statement(
    statement: str,
    url_extractor: Optional[URLExtract]=None, 
    lemmatizer: Optional[Callable[[str], str]]=None,
    fix_expantions: bool=True,
    remove_stopwords: bool=True) -> str:
    """Clean given statement.
    
    1. removing urls
    2. lowercasing 
    3. lemmatizing
    4. removing stopwords
    
    Args:
        statement : modern slavery statement
        
    Returns:
        Cleaned version of statement
    """
    if statement:
        statement = statement.lower()
    
    # removing urls within statement
    if statement and url_extractor is not None:
        for url in url_extractor.find_urls(statement):
            statement = statement.replace(url, " ")
            
    # removing sentences within statement with following keywords
    statement = " ".join([
        sentence for sentence in nltk.tokenize.sent_tokenize(statement) if not re.findall(
        (r"cookie|newsletter|facebook|twitter|signup|"
        "subscribe|download|where are you travelling|do you|would you|"
         "latest jobs|jacket|jackets|\?"), 
        sentence)])
    
    # lemmatizing sentence 
    if statement and lemmatizer is not None:
        statement = lemmatizer(statement)
        
    cleaned_statement = []
    if statement:
        for token in statement.split():
            if fix_expantions and word_expantions.get(token):
                token = word_expantions.get(token)
            if remove_stopwords and token in stopwords.words('english'):
                continue
            cleaned_statement.append(token)
            
    return " ".join(cleaned_statement)

In [13]:
# from modern_slavery.utils import nltk_resource_downloader

In [14]:
# url_extractor = URLExtract(extract_email=True)
# lemmatizer = CustomWordNetLemmatizer()
# clean_statement("www.google.com abc www.fb.com better nice playing", url_extractor, lemmatizer=lemmatizer.lemmatize)

In [15]:
# nltk_resource_downloader("stopwords")

In [16]:
%%time
futures = {}
results = {}

url_extractor = URLExtract(extract_email=True)
lemmatizer = CustomWordNetLemmatizer()

with concurrent.futures.ProcessPoolExecutor(max_workers=os.cpu_count()-5) as executor:
    for row_id, statement in statement_df[["Row ID", "Text"]].values:
        futures[executor.submit(
            clean_statement, 
            statement, 
            url_extractor, 
            lemmatizer.lemmatize,
        )] = row_id

    for future in tqdm(concurrent.futures.as_completed(futures), leave=False, position=0):
        results[futures[future]] = future.result()
        
cleaned_statements_df = pd.DataFrame.from_dict(results, orient="index").reset_index()
cleaned_statements_df.columns = ["Row ID", "Cleaned Text"]
cleaned_statements_df.sort_values(["Row ID"], inplace=True)
cleaned_statements_df.reset_index(drop=True, inplace=True)
if "Cleaned Text" in statement_df.columns:
    statement_df.drop("Cleaned Text", axis=1, inplace=True)
statement_df = pd.merge(statement_df, cleaned_statements_df, on="Row ID", how="left")
del futures, results, url_extractor, lemmatizer, cleaned_statements_df

print(f"Number of rows: {len(statement_df)}")

                          

Number of rows: 17268
CPU times: user 9.91 s, sys: 3.36 s, total: 13.3 s
Wall time: 8min 54s


In [17]:
statement_df.to_csv(os.path.join(SHEETS_PATH, "modern_slavery_dataset_clean_02062022.csv"), index=False)

In [18]:
statement_df.head()

Unnamed: 0,Row ID,Company ID,Company,Is Publisher,Statement ID,URL,Override URL,Companies House Number,Industry,HQ,Is Also Covered,UK Modern Slavery Act,California Transparency in Supply Chains Act,Australia Modern Slavery Act,Period Covered,Text,Cleaned Text
0,0,7676,"""K"" Line Holding Europe Limited",True,35092.0,https://img1.wsimg.com/blobby/go/7695baff-3f0f...,,5005018.0,Marine,United Kingdom,False,True,False,False,2018-2019,K LINE\nHOLDINC (EUROPE) LTD.\nModern Slavery ...,k line holdinc ( europe ) ltd. modern slavery ...
1,1,28660,"""K"" Line Bulk Shipping (UK) Limited",False,35092.0,https://img1.wsimg.com/blobby/go/7695baff-3f0f...,,4830352.0,Marine,United Kingdom,True,True,False,False,2018-2019,K LINE\nHOLDINC (EUROPE) LTD.\nModern Slavery ...,k line holdinc ( europe ) ltd. modern slavery ...
2,2,28659,"""K"" Line (Europe) Limited",False,35092.0,https://img1.wsimg.com/blobby/go/7695baff-3f0f...,,5639474.0,Marine,United Kingdom,True,True,False,False,2018-2019,K LINE\nHOLDINC (EUROPE) LTD.\nModern Slavery ...,k line holdinc ( europe ) ltd. modern slavery ...
3,3,28661,"""K"" Line LNG Shipping Limited",False,35092.0,https://img1.wsimg.com/blobby/go/7695baff-3f0f...,,,Marine,United Kingdom,True,True,False,False,2018-2019,K LINE\nHOLDINC (EUROPE) LTD.\nModern Slavery ...,k line holdinc ( europe ) ltd. modern slavery ...
4,4,28658,Polar LNG Shipping (UK) Limited,False,35092.0,https://img1.wsimg.com/blobby/go/7695baff-3f0f...,,2205323.0,Marine,United Kingdom,True,True,False,False,2018-2019,K LINE\nHOLDINC (EUROPE) LTD.\nModern Slavery ...,k line holdinc ( europe ) ltd. modern slavery ...


## Analyzing statements

### Checking number of words in a statement as a proxy to remove non-mss



In [19]:
# word_counts = statement_df["Cleaned Text"].apply(
#     lambda x: len(x.split()) if x is not None and isinstance(x, str) else 0)
# word_counts.sort_values(inplace=True)

# smallest = 1000
# largest = 50
# fig, (ax1, ax2, ax3) = plt.subplots(ncols=3, figsize=(16, 4))
# plt.suptitle("Distribution of word counts", x=.5, y=1.05)
# word_counts.plot.box(ax=ax1)
# ax1.set_title(f"All statments")

# word_counts.iloc[:smallest].plot.hist(ax=ax2)
# ax2.set_title(f"Smallest {smallest} statments")

# word_counts.iloc[-largest:].plot.hist(ax=ax3, bins=50)
# ax3.set_title(f"Largest {largest} statments")

# plt.show()

In [20]:
# min_words = 100
# max_words = 10000
# print(f"Number of statements with word count <= {min_words}: {len(word_counts[word_counts<=min_words])}")
# print(f"Number of statements with word count > {max_words}: {len(word_counts[word_counts>max_words])}")

In [21]:
# statement_df["Cleaned Text"] = statement_df["Cleaned Text"].apply(lambda x: x if x else "#NA")
# statement_df["Cleaned Text"] = statement_df["Cleaned Text"].apply(lambda x: x if smallest <= len(x.split()) < largest else "#NA")

In [22]:
# (statement_df["Cleaned Text"]=="#NA").sum()

In [23]:
# statement_df["Cleaned Text"] = statement_df["Cleaned Text"].apply(lambda x: x if min_words <= len(x.split()) < max_words else None) 

In [24]:
#  len(statement_df["Cleaned Text"].loc[x.iloc[-largest-1:-largest].index[0]].split()), statement_df["Text"].loc[x.iloc[-largest-1:-largest].index[0]]

In [25]:
# subset_data = data[data["final_statement_cleaned"]!="#NA"][["URL", "final_statement_cleaned"]].copy()
# subset_data.drop_duplicates(subset="URL", inplace=True)
# subset_data.head(6)