In [None]:
import numpy as np
import pandas as pd 
import fasttext
import re
from transformers import pipeline
import time
# to install Yfinance if needed
# !pip -q install yfinance requests rapidfuzz
import yfinance as yf
from rapidfuzz import fuzz


The dataset was initially preprocessed using Mixtral 8×7B, a large language model developed by [Mistral AI](https://arxiv.org/abs/2401.04088). Mixtral 8×7B is based on a Mixture-of-Experts architecture composed of eight expert subnetworks of 7 billion parameters each. For each input token, only two experts are dynamically selected, allowing the model to achieve strong performance while keeping inference costs low. This model was used to generate structured summaries from raw textual data.

In [2]:
data=pd.read_csv("dataset.csv")
data.head()

Unnamed: 0,Date,Subject,Content,ParaphrasedSubject,CompactedSummary,DetailedSummary,Impact
0,3-Mar-24,BAWAN,Bawan Co. announces the board of director’s de...,Bawan Co. Declares Cash Dividends for Second H...,Bawan Co. announces the distribution of cash d...,Bawan Co. has announced its board of directors...,Shareholders who meet the eligibility criteria...
1,3-Mar-24,SABIC AGRI-NUTRIENTS,Addendum Announcement from SABIC Agri-Nutrient...,SABIC Agri-Nutrients Company Extends MoU with ...,SABIC Agri-Nutrients Company and Saudi Agricu...,SABIC Agri-Nutrients Company has announced an...,The extension of the MoU between SABIC Agri-N...
2,3-Mar-24,GAS,Gas Arabian Services Co. Announces Contract Si...,GAS Arabian Services Co. Inks Contract with Sa...,GAS Arabian Services Co. has signed a contrac...,GAS Arabian Services Company has announced the...,This contract marks a significant milestone fo...
3,3-Mar-24,GAS,Gas Arabian Services Co. Announces Contract Si...,GAS Arabian Services Co. Inks Contract with Sa...,GAS Arabian Services Co. has signed a contrac...,GAS Arabian Services Company has announced the...,This contract marks a significant milestone fo...
4,3-Mar-24,ADVANCED,ADVANCED PETROCHEMICAL COMPANY ANNOUNCES THE L...,Advanced Petrochemical Company Resumes Propyle...,Advanced Petrochemical Company has resumed op...,Advanced Petrochemical Company has announced t...,The resumption of operations at Advanced Petro...


In [None]:
data['Date'] = pd.to_datetime(data['Date'], dayfirst=True, errors='coerce')
# data['Date'] = pd.to_datetime(data['Date'], format='%d-%b-%y' )

  data['Date'] = pd.to_datetime(data['Date'], dayfirst=True, errors='coerce')


Teacher model (Mixtral) produced summaries

Student model learns to mimic them (often smaller/faster)

Then you can improve beyond the teacher by:

- filtering low-quality labels
- adding factuality constraints
- adding domain-specific evaluation (numbers/entities)


## DATASET AUDIT

---

### 1 Dataset shape & columns

In [7]:
print("Shape:", data.shape)
print("\nColumns:")
for col in data.columns:
    print("-", col)


Shape: (1839, 7)

Columns:
- Date
- Subject
- Content
- ParaphrasedSubject
- CompactedSummary
- DetailedSummary
- Impact


### 2 Missing values & empty strings

In this analysis we are checking for possible "red flags" for the LLM-generated labels (Mixtral 8x7B), more specifically, we perform a sanity check to see if there is : 
- Missing Impact or Summary
- Empty Content rows
- Any column with >5% missing

In [8]:
missing = (
    data.isna().sum()
    .to_frame("NaN_count")
    .assign(
        empty_string_count=lambda df: [
            (data[col].astype(str).str.strip() == "").sum()
            for col in data.columns
        ]
    )
)

missing["total_missing"] = missing["NaN_count"] + missing["empty_string_count"]
missing["percent_missing"] = (
    missing["total_missing"] / data.shape[0] * 100
)
missing


Unnamed: 0,NaN_count,empty_string_count,total_missing,percent_missing
Date,1,0,1,0.054377
Subject,0,0,0,0.0
Content,0,1,1,0.054377
ParaphrasedSubject,0,0,0,0.0
CompactedSummary,28,0,28,1.522567
DetailedSummary,43,0,43,2.338227
Impact,44,0,44,2.392605


### 3 Date audit

In [9]:
# Range and distribution of dates 

print("Date range:")
print(data["Date"].min(), "→", data["Date"].max())

print("\nTop dates by frequency:")
data["Date"].value_counts().head()


#  min date is 2024 ??

Date range:
2024-03-03 00:00:00 → 2024-06-10 00:00:00

Top dates by frequency:


Date
2024-06-09    59
2024-06-05    55
2024-05-16    53
2024-05-23    53
2024-05-13    51
Name: count, dtype: int64

In [10]:
# Missing / invalid dates

data["Date"].isna().sum()

# I believe these NaN correspond 


np.int64(1)

In [11]:
# Articles per day (important for leakage later)

articles_per_day = data.groupby("Date").size()
articles_per_day.describe()


count    62.000000
mean     29.645161
std      15.711640
min       1.000000
25%      21.000000
50%      31.500000
75%      41.750000
max      59.000000
dtype: float64

### 4 Text length statistics


In [12]:
def word_count(s):
    return len(str(s).split())

text_cols = [
    "Subject",
    "Content",
    "CompactedSummary",
    "DetailedSummary",
    "Impact",
]

length_stats = {}

for col in text_cols:
    length_stats[col] = data[col].apply(word_count)

length_df = pd.DataFrame(length_stats)
length_df.describe(percentiles=[.1, .25, .5, .75, .9, .95])


Unnamed: 0,Subject,Content,CompactedSummary,DetailedSummary,Impact
count,1839.0,1839.0,1839.0,1839.0,1839.0
mean,5.383361,330.088091,32.311039,121.947254,48.319195
std,7.717705,317.096013,13.241173,64.625639,14.347689
min,1.0,0.0,1.0,1.0,1.0
10%,1.0,22.8,18.0,48.0,34.0
25%,1.0,121.0,24.0,74.0,40.0
50%,2.0,221.0,30.0,115.0,48.0
75%,7.0,447.5,39.0,159.0,56.0
90%,19.0,788.2,49.0,205.0,65.0
95%,23.0,985.1,57.0,235.0,71.0


### 5 Extreme outliers (long / short)


In [13]:
# Very short or empty content
data.loc[length_df["Content"] < 50, ["Date", "Subject", "Content"]].head(5)


Unnamed: 0,Date,Subject,Content
50,2024-03-05,AMERICANA,Americana Restaurants International PLC Announ...
70,2024-03-05,The Securities Depository Center Company (Edaa...,The Securities Depository Center Company (Edaa...
91,2024-03-06,NBM,"No English translation, kindly refer to the Ar..."
97,2024-03-07,The Securities Depository Center Company (Edaa...,The Securities Depository Center Company (Edaa...
157,2024-03-11,The Securities Depository Center (Edaa) Announ...,The Securities Depository Center Company (Edaa...


In [14]:
# Extremely long content (PDF dumps?)

data.loc[length_df["Content"] > 3000, ["Date", "Subject"]].head(5)


Unnamed: 0,Date,Subject


### 6 Duplicate detection

In [15]:
# 6.1 Exact duplicate contents
print(f'We have {data.duplicated(subset=["Content"]).sum()} duplicate content entries.')
data[data.duplicated(subset=["Content"], keep=False)][
    ["Date", "Subject"]
].head(10)


We have 178 duplicate content entries.


Unnamed: 0,Date,Subject
91,2024-03-06,NBM
159,2024-03-11,NBM
160,2024-03-11,NBM
238,2024-03-22,Resume trading on SHUAA shares after disclosin...
239,2024-03-22,Resume trading on NIH shares after disclosing ...
240,2024-03-22,"Reminder: Today, 22/03/2024 is the ex-dividend..."
270,2024-03-25,Suspend trading on ASNIC shares starting from ...
271,2024-03-25,Suspend trading on ITHMR shares starting from ...
287,2024-03-26,Resume trading on ITHMR shares after disclosin...
288,2024-03-26,Resume trading on ASNIC shares after disclosin...


In [16]:
# Same subject + same date
# data.duplicated(subset=["Date", "Subject"]).sum()


### 7 Summary leakage check (extractiveness)
We want to know if summaries are copy-pasted from content.

Interpretation:

~0.4–0.6 $\rightarrow$ mixed abstractive

0.8 $\rightarrow$ mostly extractive

very low $\rightarrow$ possible hallucination

In [17]:
def overlap_ratio(text, summary):
    text_words = set(str(text).lower().split())
    summary_words = str(summary).lower().split()
    if len(summary_words) == 0:
        return 0
    overlap = sum(w in text_words for w in summary_words)
    return overlap / len(summary_words)

data["compact_overlap"] = data.apply(
    lambda r: overlap_ratio(r["Content"], r["CompactedSummary"]),
    axis=1
)

data["detailed_overlap"] = data.apply(
    lambda r: overlap_ratio(r["Content"], r["DetailedSummary"]),
    axis=1
)

data[["compact_overlap", "detailed_overlap"]].describe()


Unnamed: 0,compact_overlap,detailed_overlap
count,1839.0,1839.0
mean,0.627678,0.636109
std,0.244103,0.251458
min,0.0,0.0
25%,0.6,0.61808
50%,0.7,0.716495
75%,0.769231,0.783328
max,1.0,0.964476


### 8 Language check (quick heuristic)

If close to 1 → English-dominated

If mixed → we’ll need language filtering.

In [46]:
data["Content"].str.contains(r"\b(the|and|is|with|for)\b", case=False).mean()


  data["Content"].str.contains(r"\b(the|and|is|with|for)\b", case=False).mean()


np.float64(0.9974635383639823)

#### 8.1 Language cleaning 

In [21]:
model = fasttext.load_model("lid.176.bin")

def is_english(text):
    if not isinstance(text, str) or not text.strip():
        return False

    try:
        labels, probs = model.predict(
            text.replace("\n", " "),
            k=1
        )
        return labels[0] == "__label__en" and probs[0] > 0.8
    except ValueError:
        return False

texts = (
    data["Content"]
    .fillna("")
    .astype(str)
    .str.replace("\n", " ", regex=False)
    .tolist()
)

labels, probs = model.predict(texts, k=1)

mask = [
    lbl[0] == "__label__en" and pr[0] > 0.8
    for lbl, pr in zip(labels, probs)
]

data = data.loc[mask].reset_index(drop=True)
data["Content"].str.contains(r"\b(the|and|is|with|for)\b", case=False).mean()

  data["Content"].str.contains(r"\b(the|and|is|with|for)\b", case=False).mean()


np.float64(0.9974635383639823)

In [22]:
print(f"After filtering non-English content, we have {data.shape[0]} rows left.")

After filtering non-English content, we have 1577 rows left.


### 10 Final audit snapshot

In [23]:
audit_summary = {
    "num_rows": len(data),
    "date_min": data["Date"].min(),
    "date_max": data["Date"].max(),
    "avg_content_words": length_df["Content"].mean(),
    "avg_compact_words": length_df["CompactedSummary"].mean(),
    "avg_detailed_words": length_df["DetailedSummary"].mean(),
    "avg_impact_words": length_df["Impact"].mean(),
    "duplicate_contents": data.duplicated(subset=["Content"]).sum(),
}

pd.Series(audit_summary)


num_rows                             1577
date_min              2024-03-03 00:00:00
date_max              2024-06-10 00:00:00
avg_content_words              330.088091
avg_compact_words               32.311039
avg_detailed_words             121.947254
avg_impact_words                48.319195
duplicate_contents                      9
dtype: object

---

# Finance NER and Entity linking

### Structure : 
1. Named Entity Recognition
2. Entity linking (Yfinance)
3. Pair creation

<hr style="width:200px; margin-left:0;">

## F.1 Named Entity Recognition and ticker extraction 


In [25]:

# Ensure these exist
assert "Content" in data.columns

# Include Subject for better company detection (often contains issuer)
use_subject = "Subject" in data.columns

data["text_for_ner"] = (
    (data["Subject"].fillna("").astype(str) + "\n") if use_subject else ""
) + data["Content"].fillna("").astype(str)

# Light cleanup to reduce weird whitespace
data["text_for_ner"] = data["text_for_ner"].str.replace(r"\s+", " ", regex=True).str.strip()


# Solution to the appearence of non ORG entities in the filings (e.g. "board of directors", "general assembly", "audit committee", "capital market authority", etc.)
# Phrases that are NOT issuers (very common in filings)
BAD_ORG_PHRASES = {
    "board of directors", "board", "directors", "of directors",
    "general assembly", "ordinary general assembly", "extraordinary general assembly", "assembly",
    "audit committee", "committee", "screening committee",
    "capital market authority", "cma", "saudi exchange", "tadawul",
    "securities depository center", "securities depository center company", "edaa",
    "investor relations", "investor relations department", "department",
    "company", "the company", "additional information", "distribution date",
}

# Quick tokens that are almost always junk
BAD_TOKENS = {"b", "r", "s", "al", "el", "no", "sa", "tad", "gf", "ema", "trans", "element", "extra"}


In [26]:
# A classic pretrained NER model (general-domain but solid)
ner = pipeline(
    "token-classification",
    model="dslim/bert-base-NER",
    aggregation_strategy="simple"
)
# Use truncation to avoid super long docs slowing you down
MAX_CHARS = 4000
data["text_for_ner_trunc"] = data["text_for_ner"].str.slice(0, MAX_CHARS)

Loading weights:   0%|          | 0/199 [00:00<?, ?it/s]

BertForTokenClassification LOAD REPORT from: dslim/bert-base-NER
Key                      | Status     |  | 
-------------------------+------------+--+-
bert.pooler.dense.weight | UNEXPECTED |  | 
bert.pooler.dense.bias   | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


In [27]:
def extract_ner_entities(text: str):
    if not isinstance(text, str) or not text.strip():
        return []
    try:
        ents = ner(text)
    except Exception:
        return []
    return [{"text": e["word"], "label": e["entity_group"]} for e in ents if e.get("word")]

def clean_entity_text(s: str) -> str:
    s = str(s)

    # Remove BERT WordPiece artifacts: "##IPCO" -> "IPCO"
    s = s.replace("##", "")

    # Normalize whitespace
    s = re.sub(r"\s+", " ", s).strip()

    # Strip punctuation
    s = s.strip(" \t\n\r.,;:()[]{}\"'“”")

    return s

def is_good_issuer_candidate(name: str) -> bool:
    if not name:
        return False

    n = name.lower().strip()

    # remove obvious boilerplate phrases
    if n in BAD_ORG_PHRASES:
        return False

    # too short or junk tokens
    if len(n) <= 2:
        return False
    if n in BAD_TOKENS:
        return False

    # looks like a fragment (single word that’s too generic)
    generic_words = {"board", "assembly", "committee", "company", "authority", "exchange", "depository", "department"}
    if n in generic_words:
        return False

    # drop if mostly non-letters
    letters = sum(ch.isalpha() for ch in name)
    if letters < max(2, int(0.4 * len(name))):
        return False

    return True

def extract_company_candidates(entities):
    # keep ORG only (simple)
    orgs = [e["text"] for e in entities if e["label"] == "ORG"]
    cleaned = [clean_entity_text(o) for o in orgs]
    cleaned = [o for o in cleaned if is_good_issuer_candidate(o)]

    # dedupe case-insensitive
    out, seen = [], set()
    for o in cleaned:
        key = o.lower()
        if key not in seen:
            seen.add(key)
            out.append(o)
    return out

data["entities_all"] = data["text_for_ner_trunc"].apply(extract_ner_entities)
data["issuer_candidates"] = data["entities_all"].apply(extract_company_candidates)

data[["Subject" if use_subject else "text_for_ner_trunc", "issuer_candidates"]].head(5)


You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


Unnamed: 0,Subject,issuer_candidates
0,BAWAN,"[BAWAN Bawan Co, Bawan Co]"
1,SABIC AGRI-NUTRIENTS,"[SABIC, SABIC Agri - Nutrients Company, Saudi ..."
2,ADVANCED,[Advanced Petrochemical Company]
3,ALASEEL,"[ALASEE, Thobe Al - Aseel, Company El, Thobe A..."
4,GROUP FIVE,"[Five Pipe Saudi Co, Audi]"


In [28]:
def pick_primary_issuer(subject, candidates):
    if not candidates:
        return None
    subj = subject.lower() if isinstance(subject, str) else ""
    
    # Prefer candidates that appear in the subject
    for c in candidates:
        if c.lower() in subj:
            return c
    
    # Otherwise take the first candidate (already cleaned)
    return candidates[0]

data["primary_issuer"] = data.apply(
    lambda r: pick_primary_issuer(r["Subject"] if use_subject else "", r["issuer_candidates"]),
    axis=1
)

data[["Subject" if use_subject else "text_for_ner_trunc", "issuer_candidates", "primary_issuer"]].head(5)


Unnamed: 0,Subject,issuer_candidates,primary_issuer
0,BAWAN,"[BAWAN Bawan Co, Bawan Co]",BAWAN Bawan Co
1,SABIC AGRI-NUTRIENTS,"[SABIC, SABIC Agri - Nutrients Company, Saudi ...",SABIC
2,ADVANCED,[Advanced Petrochemical Company],Advanced Petrochemical Company
3,ALASEEL,"[ALASEE, Thobe Al - Aseel, Company El, Thobe A...",ALASEE
4,GROUP FIVE,"[Five Pipe Saudi Co, Audi]",Five Pipe Saudi Co


In [29]:
print("Coverage (primary issuer not null):", data["primary_issuer"].notna().mean())
data[["Subject","primary_issuer","issuer_candidates"]].sample(5, random_state=42)


Coverage (primary issuer not null): 0.9923906150919467


Unnamed: 0,Subject,primary_issuer,issuer_candidates
1246,Opening the trading limit of INOVEST B.S.C.,Bahrain Bourse,[Bahrain Bourse]
813,FIPCO,IPCO,"[IPCO Filling and Packing Materials MFG Co, IPCO]"
1173,2P,2P Perfect Presentation for Commercial Service...,[2P Perfect Presentation for Commercial Servic...
534,DFM Regulated Short Sell – Weekly Summary - April,DFM,"[DFM, GFH Financial Group B, Emaar Properties,..."
514,ENMA ALRAWABI,ENMA ALRAWABI Enma Alrawabi Company,"[ENMA ALRAWABI Enma Alrawabi Company, Enma Alr..."



<hr style="width:200px; margin-left:0;">

## Tickers - Entity Linking 

In [30]:
# We define a threshold for how many times an issuer must appear to be included in the mapping 
threshold = 3  # <-- This cqn be tuned (try 3,4,5,etc.) 
# With more than 3 len of issuers to map drops significantly, but you can experiment

issuer_freq = (
    data.dropna(subset=["primary_issuer"])
        .groupby("primary_issuer")
        .size()
        .sort_values(ascending=False)
)

issuers_to_map = issuer_freq[issuer_freq >= threshold].index.tolist()

print("Unique issuers:", issuer_freq.shape[0])
print(f"Issuers to map (freq >= {threshold}): {len(issuers_to_map)}")
issuer_freq.head(20)


Unique issuers: 878
Issuers to map (freq >= 3): 165


primary_issuer
SAU                                                   13
NBM                                                   10
Fransi Capital                                         9
MIS                                                    8
DFM                                                    8
Saudi Exchange Company                                 7
ALKHALEEJ                                              7
GAS Gas Arabian Services Co                            7
SAUDI CABLE Saudi Cable Company                        7
ALRAJHI Al Rajhi Bank                                  7
ALRAJHI TAKAF                                          6
ALMUNAJEM Almunajem Foods Co                           6
Budget Saudi                                           6
Saudi Aramco                                           6
CBUAE                                                  6
AZM                                                    6
Appeal Committee for Resolution of Securities Disp     6
ALHOKAIR GRO    

In [31]:
def issuer_query_string(primary_issuer: str) -> str:
    if not isinstance(primary_issuer, str):
        return ""
    s = re.sub(r"\s+", " ", primary_issuer).strip()

    parts = s.split()
    # If first token looks like a code (ALLCAPS or digits) then drop it
    if parts and (parts[0].isupper() or parts[0].isdigit()) and len(parts[0]) >= 2:
        s2 = " ".join(parts[1:]).strip()
        return s2 if len(s2) >= 3 else s

    return s

data["issuer_query"] = data["primary_issuer"].apply(issuer_query_string)
data[["primary_issuer","issuer_query"]].head(10)


Unnamed: 0,primary_issuer,issuer_query
0,BAWAN Bawan Co,Bawan Co
1,SABIC,SABIC
2,Advanced Petrochemical Company,Advanced Petrochemical Company
3,ALASEE,ALASEE
4,Five Pipe Saudi Co,Five Pipe Saudi Co
5,Five Pipe Saudi Co,Five Pipe Saudi Co
6,NBM,NBM
7,ZAIN KSA,KSA
8,KKR Saudi Limited Company,Saudi Limited Company
9,KKR Saudi Limited Company,Saudi Limited Company


In [32]:
def yf_search_quotes(query: str, max_results: int = 10):
    """
    Returns a list of quote dicts from yfinance Search.
    """
    try:
        s = yf.Search(query, max_results=max_results)
        return s.quotes or []
    except Exception:
        return []

def pick_best_yf_quote(issuer_query: str, quotes: list):
    """
    Pick best match using name similarity; prefer equities.
    """
    best = None
    for q in quotes:
        symbol = q.get("symbol")
        qtype  = q.get("quoteType", "")
        name   = q.get("longname") or q.get("shortname") or ""

        if not symbol:
            continue
        if qtype and qtype not in {"EQUITY", "ETF"}:
            continue

        score = 0
        if name:
            score = fuzz.token_set_ratio(issuer_query, name)

        cand = {"symbol": symbol, "name": name, "quoteType": qtype, "score": score}

        if best is None or cand["score"] > best["score"]:
            best = cand
    return best


In [None]:
def validate_symbol(symbol: str) -> bool:
    try:
        hist = yf.Ticker(symbol).history(period="5d")
        return hist is not None and len(hist) > 0
    except Exception:
        return False


In [34]:
issuers_to_map = issuer_freq[issuer_freq >= threshold].index.tolist()

issuer_to_ticker = {}
issuer_to_debug = {}

SLEEP = 0.35  # be gentle to avoid rate limits

def resolve_with_yfinance(primary_issuer: str):
    if primary_issuer in issuer_to_ticker:
        return issuer_to_ticker[primary_issuer]

    query = issuer_query_string(primary_issuer)
    if not query:
        issuer_to_ticker[primary_issuer] = None
        issuer_to_debug[primary_issuer] = {"reason": "empty_query"}
        return None

    quotes = yf_search_quotes(query, max_results=10)
    best = pick_best_yf_quote(query, quotes)

    if not best:
        issuer_to_ticker[primary_issuer] = None
        issuer_to_debug[primary_issuer] = {"reason": "no_quotes", "query": query}
        return None

    sym = best["symbol"]
    ok = validate_symbol(sym)

    issuer_to_ticker[primary_issuer] = sym if ok else None
    issuer_to_debug[primary_issuer] = {**best, "query": query, "validated": ok}

    time.sleep(SLEEP)
    return issuer_to_ticker[primary_issuer]

mapped = []
for issuer in issuers_to_map:
    mapped.append((issuer, resolve_with_yfinance(issuer), issuer_freq[issuer]))

mapping_df = pd.DataFrame(mapped, columns=["primary_issuer", "ticker", "freq"])
mapping_df.sort_values(["freq"], ascending=False).head(10)


Unnamed: 0,primary_issuer,ticker,freq
0,SAU,7010.SR,13
1,NBM,NBMFF,10
2,Fransi Capital,,9
3,MIS,MRX.DE,8
4,DFM,DFMTF,8
5,Saudi Exchange Company,,7
6,ALKHALEEJ,9631.SR,7
7,GAS Gas Arabian Services Co,4146.SR,7
8,SAUDI CABLE Saudi Cable Company,,7
9,ALRAJHI Al Rajhi Bank,1120.SR,7


In [35]:
failed = mapping_df[mapping_df["ticker"].isna()].sort_values("freq", ascending=False)
print("Failed mappings:", len(failed))
failed.head(10)


Failed mappings: 48


Unnamed: 0,primary_issuer,ticker,freq
2,Fransi Capital,,9
5,Saudi Exchange Company,,7
8,SAUDI CABLE Saudi Cable Company,,7
12,Budget Saudi,,6
14,CBUAE,,6
16,Appeal Committee for Resolution of Securities ...,,6
22,Scientific & Medical Equipment House Co,,5
23,Sipchem,,5
24,JABAL OMAR Jabal Omar Development Company,,5
27,SASCO,,5


In [36]:
print("Total issuers mapped:", len(mapping_df))
print("Mapped tickers:", mapping_df["ticker"].notna().sum())
print("Mapping success rate:", mapping_df["ticker"].notna().mean())


Total issuers mapped: 165
Mapped tickers: 117
Mapping success rate: 0.7090909090909091


In [37]:
data = data.merge(
    mapping_df[["primary_issuer", "ticker"]],
    on="primary_issuer",
    how="left"
)

print("Ticker column added.")
data[["primary_issuer", "ticker"]].head(10)


Ticker column added.


Unnamed: 0,primary_issuer,ticker
0,BAWAN Bawan Co,1302.SR
1,SABIC,2020.SR
2,Advanced Petrochemical Company,
3,ALASEE,
4,Five Pipe Saudi Co,9523.SR
5,Five Pipe Saudi Co,9523.SR
6,NBM,NBMFF
7,ZAIN KSA,
8,KKR Saudi Limited Company,
9,KKR Saudi Limited Company,


In [38]:
# Filter to rows with valid ticker

data_valid = data.dropna(subset=["ticker"]).copy()

print("Rows with valid ticker:", len(data_valid))
print("Unique tickers:", data_valid["ticker"].nunique())
data_valid = data.dropna(subset=["ticker"]).copy()
print("Date range:", data_valid["Date"].min(), "→", data_valid["Date"].max())



Rows with valid ticker: 466
Unique tickers: 110
Date range: 2024-03-03 00:00:00 → 2024-06-10 00:00:00


In [39]:
print("Total rows:", len(data))
print("Rows with ticker:", data["ticker"].notna().sum())
print("Mapping success rate:", data["ticker"].notna().mean())


Total rows: 1577
Rows with ticker: 466
Mapping success rate: 0.29549778059606846


<hr style="width:200px; margin-left:0;">

## F.2 Data filtering via ticker frequency 


In [40]:
ticker_freq = (
    data_valid.groupby("ticker")
    .size()
    .sort_values(ascending=False)
)

ticker_freq.head(20)


ticker
7010.SR            17
NBMFF              10
SUKOONTAKAFL.AE    10
DFMTF               8
MRX.DE              8
1120.SR             7
4146.SR             7
9631.SR             7
2283.SR             7
4162.SR             6
4020.SR             6
2223.SR             6
2020.SR             6
1140.SR             6
4100.SR             6
AZM.MI              6
PLUG                6
RWAYI               6
BA                  6
9568.SR             6
dtype: int64

In [41]:
min_events = 3
valid_tickers = ticker_freq[ticker_freq >= min_events].index

data_final = data_valid[data_valid["ticker"].isin(valid_tickers)].copy()

print("Rows after ticker frequency filter:", len(data_final))
print("Unique tickers:", data_final["ticker"].nunique())


Rows after ticker frequency filter: 466
Unique tickers: 110


In [42]:
drop_cols = [
    "compact_overlap", "detailed_overlap",
    "text_for_ner", "text_for_ner_trunc", "entities_all",
    "issuer_candidates", "issuer_query"
]

drop_cols = [c for c in drop_cols if c in data_final.columns]
data_final = data_final.drop(columns=drop_cols)

print(data_final.columns)
data_final.head(5)

Index(['Date', 'Subject', 'Content', 'ParaphrasedSubject', 'CompactedSummary',
       'DetailedSummary', 'Impact', 'primary_issuer', 'ticker'],
      dtype='object')


Unnamed: 0,Date,Subject,Content,ParaphrasedSubject,CompactedSummary,DetailedSummary,Impact,primary_issuer,ticker
0,2024-03-03,BAWAN,Bawan Co. announces the board of director’s de...,Bawan Co. Declares Cash Dividends for Second H...,Bawan Co. announces the distribution of cash d...,Bawan Co. has announced its board of directors...,Shareholders who meet the eligibility criteria...,BAWAN Bawan Co,1302.SR
1,2024-03-03,SABIC AGRI-NUTRIENTS,Addendum Announcement from SABIC Agri-Nutrient...,SABIC Agri-Nutrients Company Extends MoU with ...,SABIC Agri-Nutrients Company and Saudi Agricu...,SABIC Agri-Nutrients Company has announced an...,The extension of the MoU between SABIC Agri-N...,SABIC,2020.SR
4,2024-03-03,GROUP FIVE,Group Five Pipe Saudi Co. Announces Appointmen...,Group Five Pipe Saudi Co. Appoints Audit Commi...,Group Five Pipe Saudi Co. has appointed a new...,Group Five Pipe Saudi Co. has announced the ap...,The addition of a new audit committee member t...,Five Pipe Saudi Co,9523.SR
5,2024-03-03,GROUP FIVE,Group Five Pipe Saudi Co. Announces the Result...,Group Five Pipe Saudi Co. Reveals Outcomes of ...,Group Five Pipe Saudi Co. disclosed the resul...,Group Five Pipe Saudi Co. has published the ou...,The announcement impacts shareholders and stak...,Five Pipe Saudi Co,9523.SR
6,2024-03-03,NBM,"No English translation, kindly refer to the Ar...",NBM General Assembly Meeting Details,NBM held a General Assembly meeting on 2024-0...,The National Bank of Mesopotamia (NBM) conduc...,"This announcement is primarily informational,...",NBM,NBMFF


In [43]:
# Save the final cleaned dataset 
# data_final.to_csv("data_final_with_debug_columns.csv", index=False)


<hr style="width:200px; margin-left:0;">

## F.3 (date,ticker) pairs creation 

In [44]:
# We create pairs that would be used for the Yfinance integration

pairs = (
    data_final[["Date", "ticker"]]
        .drop_duplicates()
        .sort_values(["Date", "ticker"])
        .reset_index(drop=True)
)
pairs.head(20)

print("Original rows with ticker:", len(data_final))
print("Unique (Date, ticker) pairs:", len(pairs))



Original rows with ticker: 466
Unique (Date, ticker) pairs: 414


<hr style="width:200px; margin-left:0;">

## F.4 Train/test split 

In [45]:
pairs = pairs.sort_values("Date").reset_index(drop=True)

# Split by date quantile to avoid leakage (events from the same day are in the same set)
unique_dates = np.array(sorted(pairs["Date"].unique()))

cut_idx = int(0.8 * len(unique_dates))
cutoff_date = unique_dates[cut_idx]

train_pairs = pairs[pairs["Date"] < cutoff_date].copy()
test_pairs  = pairs[pairs["Date"] >= cutoff_date].copy()

print("Cutoff date:", cutoff_date)
print("Train events:", len(train_pairs))
print("Test events:", len(test_pairs))

print("Date range train:", train_pairs["Date"].min(), "→", train_pairs["Date"].max())
print("Date range test:", test_pairs["Date"].min(), "→", test_pairs["Date"].max())



Cutoff date: 2024-05-28 00:00:00
Train events: 350
Test events: 64
Date range train: 2024-03-03 00:00:00 → 2024-05-27 00:00:00
Date range test: 2024-05-28 00:00:00 → 2024-06-10 00:00:00
