In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from pathlib import Path
import spacy
import re
# Seed passed as random_state to train_test_split so the train/test split is reproducible
RSEED = 182026

In [2]:
# Base directory = parent of the notebook's working directory.
# The previous Path(__name__) only worked by accident: in a notebook __name__ is the
# string "__main__", so it resolved to <cwd>/__main__ and two .parent hops ended up
# one level above the working directory. Path.cwd() states that intent directly.
link = Path.cwd().resolve().parent

# Labeled risk paragraphs; df.info() gives a quick schema / null check
df = pd.read_csv(link / 'data/risk_paragraph_labeled.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 423 entries, 0 to 422
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   cik         423 non-null    int64 
 1   paragraph   423 non-null    object
 2   word_count  423 non-null    int64 
 3   year        423 non-null    int64 
 4   labels      423 non-null    object
dtypes: int64(3), object(2)
memory usage: 16.6+ KB


In [3]:
# number of rows per value of the year column
df['year'].value_counts()

year
2022    255
2023    168
Name: count, dtype: int64

In [4]:
# Some paragraphs carry the "•" bullet glyph (and similar symbols) that should be removed;
# collect those rows and count them.
contains_dot_df = df.loc[df['paragraph'].str.contains("•")]
contains_dot_df.shape[0]

168

In [5]:
# Sentence-split the first bulleted paragraph to see how "•" appears inside sentences
nlp = spacy.load('en_core_web_sm')
for sentence in nlp(contains_dot_df['paragraph'].iloc[0]).sents:
    print(f"\n{sentence}")


Uncertain global macro-economic and political conditions could materially adversely affect our results of operations and financial condition.

Uncertain global macro-economic and political conditions that affect the economy and the economic outlook of the United States, Europe, Asia and other parts of the world could materially adversely affect our results of operations and financial condition.

These uncertainties, include, among other things: • election results; • changes to laws and policies governing foreign trade (including, without limitation, the United States-Mexico-Canada Agreement (USMCA), the EU-UK Trade and Cooperation Agreement of December 2020 and other international trade agreements); • greater restrictions on imports and exports; • supply chain disruptions; • changes in laws and policies governing health care or data privacy; • tariffs and sanctions; • changes to the relationship between the United States and China; • sovereign debt levels; • the inability of political

In [6]:
#function to take care of such characters
def clean_text(text: str) -> str:
    """Strip list-bullet glyphs and list markers, then normalise whitespace.

    Args:
        text: Raw paragraph text.

    Returns:
        The text with bullet glyphs (• ▪ ■ ● ◦ ·) replaced by a space, a leading
        "-" or "*" removed from the start of each line, every run of whitespace
        collapsed to a single space, and surrounding whitespace trimmed.
    """
    # The quantifier belongs outside the character class: inside it, "+" is a literal
    # and would delete every plus sign in the text (e.g. "10+ years").
    # Substituting a space (not '') keeps words on either side of a bullet apart;
    # the surplus spaces are collapsed below.
    text = re.sub(r'[•▪■●◦·]+', ' ', text)
    text = re.sub(r'(?m)^[\-\*]\s*', '', text)  # leading - or * on any line
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

# Sanity check: clean the same sample paragraph and print it sentence by sentence
cleaned_text = clean_text(contains_dot_df['paragraph'].iloc[0])
cleaned_doc = nlp(cleaned_text)
for sentence in cleaned_doc.sents:
    print(f"\n{sentence}")


Uncertain global macro-economic and political conditions could materially adversely affect our results of operations and financial condition.

Uncertain global macro-economic and political conditions that affect the economy and the economic outlook of the United States, Europe, Asia and other parts of the world could materially adversely affect our results of operations and financial condition.

These uncertainties, include, among other things: election results; changes to laws and policies governing foreign trade (including, without limitation, the United States-Mexico-Canada Agreement (USMCA), the EU-UK Trade and Cooperation Agreement of December 2020 and other international trade agreements); greater restrictions on imports and exports; supply chain disruptions; changes in laws and policies governing health care or data privacy; tariffs and sanctions; changes to the relationship between the United States and China; sovereign debt levels; the inability of political institutions to e

In [7]:
# Run clean_text over every paragraph
df['paragraph'] = df['paragraph'].map(clean_text)

# Verify: no row should still contain the bullet glyph (expect an empty frame)
df.loc[df['paragraph'].str.contains("•")]

Unnamed: 0,cik,paragraph,word_count,year,labels


In [8]:
# preview the first rows after cleaning the paragraph column
df.head()

Unnamed: 0,cik,paragraph,word_count,year,labels
0,1000228,The health care products distribution industry...,1037,2022,"Market Risk,Credit Risk,Operational Risk,Legal..."
1,1000228,Uncertain global macro-economic and political ...,423,2022,"Market Risk,Credit Risk,Operational Risk,Liqui..."
2,1000228,Security risks generally associated with our i...,1240,2022,"Market Risk,Credit Risk,Operational Risk,Legal..."
3,1000697,The Company’s international operations may be ...,592,2022,"Market Risk,Credit Risk,Operational Risk,Liqui..."
4,1000697,We may not be able to attract and retain quali...,273,2022,"Market Risk,Operational Risk,Legal/Regulatory ..."


In [9]:
# (rows, columns) of the full frame
df.shape

(423, 5)

In [10]:
# Count the rows that remain once repeated paragraph texts are dropped
df.drop_duplicates(subset='paragraph').shape[0]

# No exact duplicates: 423 equals the number of rows in df

423

In [11]:
# Order rows by labels, then company (cik), then text so look-alike paragraphs sit together
df.sort_values(by=['labels', 'cik', 'paragraph'])

# Rows with index 36 and 52 look identical apart from the year -> investigate next

Unnamed: 0,cik,paragraph,word_count,year,labels
265,5272,The following is a summary of the material ris...,40,2022,Ambiguous
316,732717,In connection with the separation of the Warne...,281,2022,Ambiguous
345,899051,We use internally developed and third-party ve...,40,2022,Ambiguous
71,1031296,"When we make distributions to shareholders, we...",265,2022,Ambiguous
87,1035443,An investment in our securities involves vario...,32,2022,Ambiguous
...,...,...,...,...,...
249,318154,Our products face substantial competition and ...,86,2022,Strategic Risk
254,318154,Our products face substantial competition and ...,79,2023,Strategic Risk
36,1018724,In addition to risks described elsewhere in th...,213,2022,Strategic Risk
52,1018724,In addition to risks described elsewhere in th...,213,2023,Strategic Risk


In [12]:
# All paragraphs of cik 1018724, ordered so matching texts from different years are adjacent
possible_duplicated_paragraphs = (
    df.loc[df['cik'] == 1018724]
    .sort_values(by=['labels', 'cik', 'paragraph'])
)
possible_duplicated_paragraphs.head(10)
# The same / very similar risk paragraphs from 2022 appear to be reused in 2023

Unnamed: 0,cik,paragraph,word_count,year,labels
53,1018724,"We accept payments using a variety of methods,...",449,2023,"Credit Risk,Operational Risk,Legal/Regulatory ..."
37,1018724,"We accept payments using a variety of methods,...",447,2022,"Credit Risk,Operational Risk,Legal/Regulatory ..."
42,1018724,"Our contracts with U.S., as well as state, loc...",102,2022,"Liquidity Risk,Legal/Regulatory Risk"
55,1018724,Some of the products we sell or manufacture ex...,185,2023,Market Risk
40,1018724,Some of the products we sell or manufacture ex...,143,2022,Market Risk
45,1018724,Demand for our products and services can fluct...,382,2023,"Market Risk,Credit Risk,Liquidity Risk"
35,1018724,"We have significant suppliers, including conte...",978,2022,"Market Risk,Credit Risk,Operational Risk,Legal..."
51,1018724,"We have significant suppliers, including conte...",987,2023,"Market Risk,Credit Risk,Operational Risk,Legal..."
44,1018724,Our international activities are significant t...,754,2023,"Market Risk,Credit Risk,Operational Risk,Liqui..."
24,1018724,Our international activities are significant t...,752,2022,"Market Risk,Credit Risk,Operational Risk,Liqui..."


In [13]:
# Print the first two rows (the 2023 and 2022 versions) for cik 1018724, one sentence per line

for row_pos in range(2):
    paragraph_doc = nlp(possible_duplicated_paragraphs['paragraph'].iloc[row_pos])
    for sentence in paragraph_doc.sents:
        print(f"{sentence}")
    # two rows of asterisks mark the end of each paragraph
    for _ in range(2):
        print(50*"*")

We accept payments using a variety of methods, including credit card, debit card, credit accounts (including promotional financing), gift cards, direct debit from a customer’s bank account, consumer invoicing, physical bank check, and payment upon delivery.
For existing and future payment options we offer to our customers, we currently are subject to, and may become subject to additional, regulations and compliance requirements (including obligations to implement enhanced authentication processes that could result in significant costs and reduce the ease of use of our payments products), as well as fraud.
For certain payment methods, including credit and debit cards, we pay interchange and other fees, which may increase over time and raise our operating costs and lower profitability.
We rely on third parties to provide certain Amazon-branded payment methods and payment processing services, including the processing of credit cards, debit cards, electronic checks, and promotional financi

In [14]:
# Because the same or near-identical paragraphs recur across years, and this project is a
# prototype, only the 2022 paragraphs are used from here on.

In [15]:
# Keep only the 2022 rows; .copy() gives an independent frame to add columns to
df_copy = df.loc[df['year'].eq(2022)].copy()
df_copy.info()

<class 'pandas.core.frame.DataFrame'>
Index: 255 entries, 0 to 417
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   cik         255 non-null    int64 
 1   paragraph   255 non-null    object
 2   word_count  255 non-null    int64 
 3   year        255 non-null    int64 
 4   labels      255 non-null    object
dtypes: int64(3), object(2)
memory usage: 12.0+ KB


In [16]:
# preview of the 2022-only subset
df_copy.head()

Unnamed: 0,cik,paragraph,word_count,year,labels
0,1000228,The health care products distribution industry...,1037,2022,"Market Risk,Credit Risk,Operational Risk,Legal..."
1,1000228,Uncertain global macro-economic and political ...,423,2022,"Market Risk,Credit Risk,Operational Risk,Liqui..."
2,1000228,Security risks generally associated with our i...,1240,2022,"Market Risk,Credit Risk,Operational Risk,Legal..."
3,1000697,The Company’s international operations may be ...,592,2022,"Market Risk,Credit Risk,Operational Risk,Liqui..."
4,1000697,We may not be able to attract and retain quali...,273,2022,"Market Risk,Operational Risk,Legal/Regulatory ..."


In [17]:
#function that removes stop words and characters that wont add value to model
def extra_clean(text: str) -> str:
    """Reduce `text` to its lowercased lemmas, joined by single spaces.

    The text is parsed with the module-level spaCy pipeline `nlp`. Tokens flagged as
    number-like, punctuation, digits, currency symbols, whitespace or stop words are
    skipped; each remaining token contributes `token.lemma_.lower()`.
    """
    # token attributes that mark a token as noise for the model
    skip_flags = ("like_num", "is_punct", "is_digit", "is_currency", "is_space", "is_stop")
    kept_lemmas = [
        token.lemma_.lower()
        for token in nlp(text)
        if not any(getattr(token, flag) for flag in skip_flags)
    ]
    return " ".join(kept_lemmas)



In [18]:
# Before / after view of extra_clean on the second-to-last 2022 paragraph
sample_paragraph = df_copy['paragraph'].iloc[-2]
print(f"original paragraph: {sample_paragraph}")

print(f"Cleaned paragraph: {extra_clean(sample_paragraph)}")

# Stop words such as "the", "some", "of" are gone, which
# cuts noise in the features fed to the model

original paragraph: Some of the provisions of our bye-laws and our shareholders agreement may have the effect of hindering, delaying or preventing third party takeovers or changes in management initiated by shareholders. These provisions may also prevent our shareholders from receiving premium prices for their shares in an unsolicited takeover. There are regulatory limitations on the ownership and transfer of our common shares. Arch Capital is a holding company and is dependent on dividends and other distributions from its operating subsidiaries. General market conditions and unpredictable factors could adversely affect market prices for our outstanding preferred shares. Dividends on our preferred shares are non-cumulative. Our preferred shares are equity and are subordinate to our existing and future indebtedness. The voting rights of holders of our preferred shares are limited.
Cleaned paragraph: provision bye law shareholder agreement effect hindering delay prevent party takeover ch

In [19]:
# Lemmatise / de-noise every paragraph into a new column
df_copy['paragraph_cleaned'] = df_copy['paragraph'].map(extra_clean)
df_copy.head()

Unnamed: 0,cik,paragraph,word_count,year,labels,paragraph_cleaned
0,1000228,The health care products distribution industry...,1037,2022,"Market Risk,Credit Risk,Operational Risk,Legal...",health care product distribution industry high...
1,1000228,Uncertain global macro-economic and political ...,423,2022,"Market Risk,Credit Risk,Operational Risk,Liqui...",uncertain global macro economic political cond...
2,1000228,Security risks generally associated with our i...,1240,2022,"Market Risk,Credit Risk,Operational Risk,Legal...",security risk generally associate information ...
3,1000697,The Company’s international operations may be ...,592,2022,"Market Risk,Credit Risk,Operational Risk,Liqui...",company international operation negatively aff...
4,1000697,We may not be able to attract and retain quali...,273,2022,"Market Risk,Operational Risk,Legal/Regulatory ...",able attract retain qualified employee future ...


In [20]:
# Persist the 2022 subset (raw and cleaned paragraphs) without the index column.
# Note: written before the label_list column is added and before "Ambiguous" rows are dropped.
df_copy.to_csv(link / "data/risk_data.csv", index=False)

In [21]:
# This is a multi-label problem, so the labels column can be encoded with a
# MultiLabelBinarizer.

# That needs each row's labels as a list: split on commas, trim, drop empty pieces
df_copy['label_list'] = df_copy['labels'].map(
    lambda raw: [piece for piece in (part.strip() for part in raw.split(',')) if piece]
)

In [22]:
# confirm the new label_list column holds the parsed labels
df_copy.head(3)

Unnamed: 0,cik,paragraph,word_count,year,labels,paragraph_cleaned,label_list
0,1000228,The health care products distribution industry...,1037,2022,"Market Risk,Credit Risk,Operational Risk,Legal...",health care product distribution industry high...,"[Market Risk, Credit Risk, Operational Risk, L..."
1,1000228,Uncertain global macro-economic and political ...,423,2022,"Market Risk,Credit Risk,Operational Risk,Liqui...",uncertain global macro economic political cond...,"[Market Risk, Credit Risk, Operational Risk, L..."
2,1000228,Security risks generally associated with our i...,1240,2022,"Market Risk,Credit Risk,Operational Risk,Legal...",security risk generally associate information ...,"[Market Risk, Credit Risk, Operational Risk, L..."


In [23]:
# frequency of each distinct label combination
df_copy["label_list"].value_counts()

label_list
[Market Risk, Credit Risk, Operational Risk, Liquidity Risk, Legal/Regulatory Risk, Strategic Risk]                       18
[Market Risk, Credit Risk, Operational Risk, Liquidity Risk, Legal/Regulatory Risk]                                       17
[Market Risk, Operational Risk, Legal/Regulatory Risk, Strategic Risk]                                                    14
[Ambiguous]                                                                                                               12
[Market Risk, Operational Risk, Legal/Regulatory Risk, Strategic Risk, Reputational Risk]                                 10
[Market Risk, Operational Risk, Liquidity Risk, Legal/Regulatory Risk, Strategic Risk]                                    10
[Market Risk, Operational Risk, Legal/Regulatory Risk]                                                                    10
[Market Risk, Credit Risk, Operational Risk, Liquidity Risk, Legal/Regulatory Risk, Strategic Risk, Reputational R

In [24]:
# Keep only rows that have at least one label and are not tagged 'Ambiguous'
df_copy = df_copy.loc[
    df_copy['label_list'].map(lambda labels: bool(labels) and 'Ambiguous' not in labels)
]

In [25]:
# Model input (cleaned text as an array) and targets (one list of labels per row)
X = df_copy['paragraph_cleaned'].to_numpy()
y_list = list(df_copy['label_list'])

In [26]:
# Learn the label vocabulary, then encode every label list as a 0/1 indicator row
mlb = MultiLabelBinarizer().fit(y_list)
y = mlb.transform(y_list)

# Column order of y, i.e. the classes the models will be trained on
mlb.classes_

array(['Credit Risk', 'Legal/Regulatory Risk', 'Liquidity Risk',
       'Market Risk', 'Operational Risk', 'Reputational Risk',
       'Strategic Risk'], dtype=object)

In [27]:
# Hold out 20% of the paragraphs; RSEED fixes the shuffle.
# NOTE(review): the split is row-wise, so paragraphs from the same cik can land on both
# sides -- consider a grouped split if that matters for the evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=RSEED,
)

# TF-IDF over unigrams and bigrams present in at least 2 training paragraphs,
# capped at 5000 features
vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),
    min_df=2,
    max_features=5000,
)

# Fit the vocabulary on the training paragraphs only
X_train_vec = vectorizer.fit_transform(X_train)

In [28]:
# One binary classifier per class (one-vs-rest): e.g. "is this paragraph a Credit Risk or not?".
# Fitted models are kept in a dict keyed by class name.
all_models = {}
for class_name, class_targets in zip(mlb.classes_, y_train.T):
    print(f"Training {class_name}...")
    # class_targets is the 0/1 column of y_train belonging to class_name
    all_models[class_name] = LogisticRegression(
        C=1.0,
        class_weight='balanced',
        max_iter=1000,
    ).fit(X_train_vec, class_targets)

Training Credit Risk...
Training Legal/Regulatory Risk...
Training Liquidity Risk...
Training Market Risk...
Training Operational Risk...
Training Reputational Risk...
Training Strategic Risk...


In [29]:
# mapping of class name -> fitted LogisticRegression, one entry per label
all_models

{'Credit Risk': LogisticRegression(class_weight='balanced', max_iter=1000),
 'Legal/Regulatory Risk': LogisticRegression(class_weight='balanced', max_iter=1000),
 'Liquidity Risk': LogisticRegression(class_weight='balanced', max_iter=1000),
 'Market Risk': LogisticRegression(class_weight='balanced', max_iter=1000),
 'Operational Risk': LogisticRegression(class_weight='balanced', max_iter=1000),
 'Reputational Risk': LogisticRegression(class_weight='balanced', max_iter=1000),
 'Strategic Risk': LogisticRegression(class_weight='balanced', max_iter=1000)}

In [30]:
# Predictions on the training split: one 0/1 column per class, obtained by thresholding
# each model's positive-class probability at 0.5 (float matrix, same layout as y_train)
y_pred = np.column_stack([
    (all_models[class_name].predict_proba(X_train_vec)[:, 1] >= 0.5).astype(int)
    for class_name in mlb.classes_
]).astype(float)

In [31]:
# Overall F1 on the training split under three averaging schemes
f1_micro, f1_macro, f1_samples = (
    f1_score(y_train, y_pred, average=averaging)
    for averaging in ('micro', 'macro', 'samples')
)
print("\nOverall Training Metrics:")
print(f"F1 (micro): {f1_micro:.3f}")
print(f"F1 (macro): {f1_macro:.3f}")
print(f"F1 (samples): {f1_samples:.3f}")


Overall Training Metrics:
F1 (micro): 0.974
F1 (macro): 0.968
F1 (samples): 0.966


In [32]:
# Held-out evaluation: vectorize with the already-fitted TF-IDF vocabulary, then
# threshold each per-class model's positive-class probability at 0.5
X_test_vec = vectorizer.transform(X_test)
y_pred_test = np.column_stack([
    (all_models[class_name].predict_proba(X_test_vec)[:, 1] >= 0.5).astype(int)
    for class_name in mlb.classes_
]).astype(float)


# Overall F1 on the test split under three averaging schemes
f1_micro, f1_macro, f1_samples = (
    f1_score(y_test, y_pred_test, average=averaging)
    for averaging in ('micro', 'macro', 'samples')
)
print("\nOverall Test Metrics:")
print(f"F1 (micro): {f1_micro:.3f}")
print(f"F1 (macro): {f1_macro:.3f}")
print(f"F1 (samples): {f1_samples:.3f}")


Overall Test Metrics:
F1 (micro): 0.823
F1 (macro): 0.800
F1 (samples): 0.761


In [33]:
# With the ideas shown here, a script will be developed for training the models.
# Logistic regression and XGBoost will follow similar logic.

In [34]:
# number of paragraphs used for modelling (after the Ambiguous rows were dropped)
X.shape

(243,)