# Download the Speeches Data from BIS

In [None]:
!pip install gingado



In [None]:
from gingado.datasets import load_CB_speeches

In [None]:
# Fetch the central bank speeches dataset through gingado's loader.
# NOTE(review): presumably a pandas DataFrame (it is written with .to_csv in the next cell) — confirm against the gingado docs.
all_speeches = load_CB_speeches()

In [None]:
# Persist the downloaded speeches to the working directory (index=False: no row-index column).
# NOTE(review): later cells read 'Data/speeches data_final.csv'; the step that turns this
# file into that one is not shown in this notebook — confirm the provenance.
all_speeches.to_csv("central_bank_speeches.csv", index=False)

"""
Summarize BIS Central Bank Speeches by Country
----------------------------------------------

This script reads the full BIS speech dataset and generates a summary table
showing the number of speeches delivered by central bankers in each country.
"""

In [1]:
import os
import pandas as pd
from IPython.display import display

# Read the speech dataset
input_file = 'Data/speeches data_final.csv'
df = pd.read_csv(input_file)

# Count speeches per country and shape the result as a two-column table
country_counts = (
    df['country']
    .value_counts()
    .rename_axis('Country')
    .reset_index(name='Count')
)

# Render the table in the notebook
display(country_counts)

# Make sure the output directory is present before writing
results_folder = 'Results'
os.makedirs(results_folder, exist_ok=True)

# Write the table to disk without the row index
output_filename = os.path.join(results_folder, '1-country_count.csv')
country_counts.to_csv(output_filename, index=False)

# Report where the file went
print(f'File saved as: {output_filename}')


Unnamed: 0,Country,Count
0,US,2090
1,Germany,1052
2,European Central Bank,1035
3,India,964
4,France,793
...,...,...
110,Guatemala,1
111,Uruguay,1
112,Belize,1
113,Aruba,1


File saved as: Results\1-country_count.csv


"""
Inspect BIS Speech Data Structure and Missing Values
----------------------------------------------------

This script performs a quick integrity check on the BIS speech dataset by:
- Verifying the data type and structure of the 'text' column
- Displaying sample content from the speech text
- Counting missing values in the 'text' field to identify incomplete entries
"""

In [2]:
import pandas as pd

# Read the speech dataset
input_file = 'Data/speeches data_final.csv'
df = pd.read_csv(input_file)

# Work on the speech body column only
speech_text = df['text']

# Report the column dtype and preview the first rows
print("Data Type of 'text' Column:", speech_text.dtype)
print("\nFirst 5 Rows of 'text' Column:")
print(speech_text.head())

# Count rows that have no speech text at all
missing_values = speech_text.isna().sum()
print(f"\nMissing values in 'text' column: {missing_values}")

# Print one full speech to inspect the raw layout
sample_index = 0
print("\nSample text data:")
print(speech_text.iloc[sample_index])


Data Type of 'text' Column: object

First 5 Rows of 'text' Column:
0    Mr. Chen discusses monetary relations between ...
1    Mr. Dai looks at the possibilities of strength...
2    Mr. Dai assesses the outlook for Hong Kong as ...
3    Mr. Rangarajan examines the objectives of mone...
4    M. Trichet presents the monetary policy guidel...
Name: text, dtype: object

Missing values in 'text' column: 0

Sample text data:
Mr. Chen discusses monetary relations between China and Hong Kong
Speech by the Deputy Governor of the People's Bank of China, Mr. Chen Yuan, at the Bank of
England Seminar held in London on 10/9/96.
INTRODUCTION
In less than three hundred days, China will resume the exercise of sovereignty
over Hong Kong. Given Hong Kong's role as an international financial centre, it is natural for
the international investment community to be interested in the future of Hong Kong. I am
grateful to Governor George for giving me this opportunity to share with this distinguished
audience 

In [31]:
import nltk
# Recent NLTK releases (3.8.2+) load the sentence tokenizer from the 'punkt_tab'
# resource rather than 'punkt'. Fetch it quietly first; if it cannot be fetched the
# call returns False instead of raising, so older setups are unaffected.
nltk.download('punkt_tab', quiet=True)
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

"""
Preprocess BIS Central Bank Speeches for Sentence-Level Analysis
----------------------------------------------------------------

This script prepares speech data for sentence-level CBDC classification by:
- Cleaning and lowercasing raw text
- Splitting speeches into well-formed sentences using NLTK's tokenizer
- Filtering out very short sentences (e.g., section headers)
- Saving the processed data for downstream analysis

Notes:
- Sentence format: lowercase, whitespace-normalized, filtered by length
- Designed for consistent preprocessing across all speeches prior to keyword matching or model training
"""


In [7]:
import os
import pandas as pd
import nltk
from nltk.tokenize import sent_tokenize

# Ensure the NLTK sentence tokenizer data is available.
# 'punkt' covers older NLTK releases; NLTK 3.8.2+ reads the 'punkt_tab' resource in
# sent_tokenize, so that one is fetched as well (quietly; a failed fetch returns
# False instead of raising).
nltk.download('punkt')
nltk.download('punkt_tab', quiet=True)

# Define input and output file paths
input_file = os.path.join("Data", "speeches data_final.csv")
output_file = os.path.join("Data", "speeches_data_preprocessed.csv")

# Load the dataset
df = pd.read_csv(input_file)

# Function to clean and split text into sentences
def preprocess_text(text):
    """Turn one raw speech into a list of cleaned, lowercase sentences.

    Parameters
    ----------
    text : str or missing value
        Raw speech text. Missing values (anything `pd.isna` reports as NA)
        yield an empty list.

    Returns
    -------
    list of str
        Sentences produced by `sent_tokenize` on the whitespace-normalized,
        lowercased text, keeping only those with more than three
        whitespace-separated words.
    """
    if pd.isna(text):
        return []

    # Collapse newlines and any run of whitespace into single spaces, then lowercase.
    # NOTE(review): lowercasing happens before sentence splitting; the sample output
    # suggests this can merge adjacent sentences. Left as is because downstream cells
    # match labeled sentences against these exact strings.
    normalized = " ".join(text.strip().replace("\n", " ").split()).lower()

    # Keep sentences of four or more words; shorter fragments (e.g. headers) are dropped
    return [
        sentence
        for sentence in sent_tokenize(normalized)
        if len(sentence.split()) > 3
    ]

# Split every speech into its list of cleaned sentences
sentence_lists = df['text'].apply(preprocess_text)
df['processed_text'] = sentence_lists

# Write the enriched dataset to disk
df.to_csv(output_file, index=False, encoding='utf-8')

# Show the sentences obtained for the first speech
first_row_sentences = df['processed_text'].iloc[0]
print("Sample preprocessed sentences from first row:")
print(first_row_sentences)

print(f"\nPreprocessed dataset saved at: {output_file}")


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Sample preprocessed sentences from first row:
["mr. chen discusses monetary relations between china and hong kong speech by the deputy governor of the people's bank of china, mr. chen yuan, at the bank of england seminar held in london on 10/9/96.", 'introduction in less than three hundred days, china will resume the exercise of sovereignty over hong kong.', "given hong kong's role as an international financial centre, it is natural for the international investment community to be interested in the future of hong kong.", 'i am grateful to governor george for giving me this opportunity to share with this distinguished audience the chinese government\'s policy on the monetary relationship between the mainland of china and hong kong after 1997. by way of introduction, let me first explain the origin of the basic principle of "one country, two systems", which forms the backbone of china\'s policy towards the hong kong special administrative region as from 1 july 1997. this will help people

"""
CBDC Sentence Extraction from BIS Central Bank Speeches
--------------------------------------------------------

This script identifies CBDC-related sentences from preprocessed BIS central bank speeches
using a structured list of CBDC-related keywords.

Functionality:
- Supports two types of keywords:
    • Hard keywords: Direct CBDC-related terms (e.g., "cbdc", "digital euro").
    • Soft keywords: General digital finance terms (e.g., "digital currency") — only matched when "central bank" is also present in the sentence.
- Uses regular expressions for efficient and precise keyword matching with word boundaries.
- Processes pre-tokenized (sentence-split) speech data stored as lists in the 'processed_text' column.
- Each sentence is recorded at most once, with the first keyword that matches it (hard keywords are checked before soft ones).
- Matching is case-insensitive, assuming pre-lowered and cleaned text.
"""


In [3]:
import pandas as pd
import os
import re
from tqdm import tqdm

# === 1. Load and classify CBDC keywords ===
keywords_path = "cbdc_keywords.csv"
keywords_df = pd.read_csv(keywords_path)

# Normalize both columns: lowercase, surrounding whitespace removed
keywords_df['keyword'] = keywords_df['cbdc keywords'].astype(str).str.lower().str.strip()
keywords_df['keyword_type'] = keywords_df['keyword type'].str.lower().str.strip()


def _keywords_of_type(type_label):
    """Return the normalized keywords whose type equals `type_label`, in file order."""
    is_type = keywords_df['keyword_type'] == type_label
    return keywords_df.loc[is_type, 'keyword'].tolist()


def _whole_word_patterns(keywords):
    """Pair each keyword with a compiled regex matching it between word boundaries."""
    return [(kw, re.compile(r'\b' + re.escape(kw) + r'\b')) for kw in keywords]


# Hard keywords match on their own; soft keywords need extra context in the match loop
hard_keywords = _keywords_of_type('hard keyword')
soft_keywords = _keywords_of_type('soft keyword')

# Compile once so the match loop does not rebuild patterns per sentence
hard_patterns = _whole_word_patterns(hard_keywords)
soft_patterns = _whole_word_patterns(soft_keywords)

# === 2. Load preprocessed speech data ===
data_path = "Data/speeches_data_preprocessed.csv"
df = pd.read_csv(data_path)

# === 3. Match loop ===
# literal_eval only parses Python literals, so a crafted CSV cell cannot execute code
# (the previous eval() could).
from ast import literal_eval

matches = []
skipped_rows = []  # index labels of rows whose 'processed_text' could not be used


def _parse_sentences(raw):
    """Return the sentence list stored in a 'processed_text' cell, or None if unusable.

    Strings are parsed as Python literals; values that are already lists/tuples are
    passed through. Anything else (e.g. NaN, malformed text) yields None.
    """
    if isinstance(raw, str):
        try:
            raw = literal_eval(raw)
        except (ValueError, SyntaxError):
            return None
    return raw if isinstance(raw, (list, tuple)) else None


def _first_keyword_match(sentence_lc):
    """Return (keyword, match_type) for the first matching keyword, or None.

    Hard keywords are tried first and need no context. Soft keywords are only
    tried when no hard keyword matched and "central bank" occurs in the sentence.
    """
    for kw, pat in hard_patterns:
        if pat.search(sentence_lc):
            return kw, "hard"
    if "central bank" in sentence_lc:
        for kw, pat in soft_patterns:
            if pat.search(sentence_lc):
                return kw, "soft"
    return None


for idx, row in tqdm(df.iterrows(), total=len(df), desc="🔍 Matching CBDC sentences"):
    sentences = _parse_sentences(row['processed_text'])
    if sentences is None:
        # Remember the row instead of silently dropping it
        skipped_rows.append(idx)
        continue

    for sentence in sentences:
        found = _first_keyword_match(sentence.lower())
        if found is None:
            continue

        kw, match_type = found
        # One record per sentence: only the first matching keyword is kept
        matches.append({
            "url": row.get("url", ""),
            "description": row.get("description", ""),
            "date": row.get("date", ""),
            "author": row.get("author", ""),
            "country": row.get("country", ""),
            "processed_text": sentence,
            "keyword": kw,
            "sentence": sentence,
            "match_type": match_type
        })

if skipped_rows:
    print(f"⚠️ Skipped {len(skipped_rows)} rows with unusable 'processed_text' "
          f"(first index labels: {skipped_rows[:10]})")

# === 4. Save matches ===
# Make sure the output directory is present, then write every match as one CSV row
os.makedirs("Results", exist_ok=True)
output_file = "Results/2-cbdc_sentences.csv"

result_df = pd.DataFrame(matches)
result_df.to_csv(output_file, encoding='utf-8', index=False)

print(f"\n✅ Saved {len(result_df)} matches to {output_file}")


🔍 Matching CBDC sentences: 100%|██████████| 19609/19609 [09:30<00:00, 34.35it/s]


✅ Saved 5446 matches to Results/2-cbdc_sentences.csv





# Manual Review
The CBDC sentences extracted by keyword matching were manually reviewed and labeled; the next cell loads the result from `Results/3-cbdc_sentences_labeled.csv`.

In [4]:
import pandas as pd

# Read the manually labeled sentences
file_path = "Results/3-cbdc_sentences_labeled.csv"
df = pd.read_csv(file_path)

# Coerce labels to numbers; values that cannot be parsed become NaN
df['label'] = pd.to_numeric(df['label'], errors='coerce')

# Rows per label value, ordered by label (NaN labels are not counted)
label_counts = df['label'].value_counts().sort_index()
non_cbdc_count = label_counts.get(0, 0)
cbdc_count = label_counts.get(1, 0)

print("📊 CBDC Sentence Label Distribution:")
print(f"Non-CBDC (label=0): {non_cbdc_count}")
print(f"CBDC     (label=1): {cbdc_count}")

# Share of CBDC-labeled rows among all rows with a numeric label
total = label_counts.sum()
if total > 0:
    cbdc_ratio = cbdc_count / total
    print(f"\n⚖️ CBDC ratio in data: {cbdc_ratio:.2%}")


📊 CBDC Sentence Label Distribution:
Non-CBDC (label=0): 56
CBDC     (label=1): 5390

⚖️ CBDC ratio in data: 98.97%


"""
Generate 5,610 Labeled Non-CBDC Sentences for Model Training
-------------------------------------------------------------

This script constructs a balanced set of 5,610 non-CBDC sentences from BIS central bank speeches
to train a CBDC classifier. It includes:

1. Pre-labeled Non-CBDC Sentences (56):
   - Already manually verified as not CBDC-related.

2. Hard Negatives (2,777 sentences):
   - From speeches that contain CBDC sentences.
   - Not labeled as CBDC.
   - Up to 5 randomly sampled per speech.

3. Easy Negatives (2,777 sentences):
   - From general speeches without any CBDC-labeled content.
   - Up to 5 randomly sampled per speech.
"""

In [5]:
import pandas as pd
import os
import random
from ast import literal_eval
from tqdm import tqdm

# === File paths ===
preprocessed_file = "Data/speeches_data_preprocessed.csv"
labeled_cbdc_file = "Results/3-cbdc_sentences_labeled.csv"
output_file = "Results/4-non_cbdc_sentences_labeled.csv"

# === Parameters ===
max_per_speech = 5
target_hard = 2777
target_easy = 2777
random_seed = 42  # fixed so a fresh kernel reproduces the same sample

# Dedicated generator: sampling neither depends on nor disturbs the global `random` state
rng = random.Random(random_seed)

# === Load data ===
df_speeches = pd.read_csv(preprocessed_file)
df_cbdc_labeled = pd.read_csv(labeled_cbdc_file)

# Ensure processed_text is list (the CSV stores each list as its Python literal text)
df_speeches['processed_text'] = df_speeches['processed_text'].apply(
    lambda x: literal_eval(x) if isinstance(x, str) else x
)

# Get sets
cbdc_urls = set(df_cbdc_labeled[df_cbdc_labeled['label'] == 1]['url'])
cbdc_sentences_set = set(df_cbdc_labeled[df_cbdc_labeled['label'] == 1]['sentence'])

# Load pre-labeled non-CBDC (label=0)
pre_labeled_non_cbdc = df_cbdc_labeled[df_cbdc_labeled['label'] == 0]
pre_labeled_non_cbdc = pre_labeled_non_cbdc.drop_duplicates(subset=["sentence"])
print(f"✅ Loaded {len(pre_labeled_non_cbdc)} pre-labeled non-CBDC sentences.")

# Sentences that must never be drawn as negatives: CBDC-labeled ones (would create
# conflicting labels) and the pre-labeled negatives (would create duplicates).
excluded_sentences = cbdc_sentences_set | set(pre_labeled_non_cbdc['sentence'])


def _sample_negatives(speeches, from_speech, target, excluded, desc):
    """Draw up to `max_per_speech` distinct negative sentences from each speech.

    Parameters
    ----------
    speeches : DataFrame whose 'processed_text' cells hold lists of sentences.
    from_speech : value written to the 'from_speech' column of every record.
    target : maximum number of records returned (after shuffling).
    excluded : set of sentences that may not be sampled.
    desc : progress-bar label.

    Returns
    -------
    list of dict
        Label-0 records; no sentence appears twice.
    """
    records = []
    drawn = set()  # sentences already taken from an earlier speech
    for _, row in tqdm(speeches.iterrows(), total=len(speeches), desc=desc):
        sentences = row['processed_text']
        if not isinstance(sentences, list):
            continue  # missing or unparsed cell: nothing to sample
        # dict.fromkeys drops in-speech repeats while keeping the original order
        candidates = [
            s for s in dict.fromkeys(sentences)
            if s not in excluded and s not in drawn
        ]
        for sentence in rng.sample(candidates, min(len(candidates), max_per_speech)):
            drawn.add(sentence)
            records.append({
                "url": row['url'],
                "description": row['description'],
                "date": row['date'],
                "author": row['author'],
                "country": row['country'],
                # Store the sentence itself, as the keyword-extraction step does,
                # rather than the speech's whole sentence list on every row.
                "processed_text": sentence,
                "from_speech": from_speech,
                "sentence": sentence,
                "label": 0
            })
    rng.shuffle(records)
    return records[:target]


# === Hard negatives ===
df_cbdc_speeches = df_speeches[df_speeches['url'].isin(cbdc_urls)]

print("🔍 Sampling hard negatives from CBDC speeches...")
hard_negatives = _sample_negatives(
    df_cbdc_speeches, "cbdc_speech", target_hard, excluded_sentences, "CBDC Speeches"
)

# === Easy negatives ===
df_general_speeches = df_speeches[~df_speeches['url'].isin(cbdc_urls)]

print("🔍 Sampling easy negatives from general speeches...")
easy_negatives = _sample_negatives(
    df_general_speeches,
    "general_speech",
    target_easy,
    # also keep out anything already chosen as a hard negative
    excluded_sentences | {record["sentence"] for record in hard_negatives},
    "General Speeches",
)

# === Combine all ===
final_df = pd.concat([
    pre_labeled_non_cbdc,
    pd.DataFrame(hard_negatives),
    pd.DataFrame(easy_negatives)
], ignore_index=True)

# Save
os.makedirs("Results", exist_ok=True)
final_df.to_csv(output_file, index=False, encoding="utf-8")

print(f"\n✅ Final non-CBDC labeled sentences saved: {len(final_df)}")
print(f"📁 Saved to: {output_file}")

✅ Loaded 56 pre-labeled non-CBDC sentences.
🔍 Sampling hard negatives from CBDC speeches...


CBDC Speeches: 100%|██████████| 588/588 [00:01<00:00, 376.62it/s]


🔍 Sampling easy negatives from general speeches...


General Speeches: 100%|██████████| 19021/19021 [00:03<00:00, 4846.71it/s]



✅ Final non-CBDC labeled sentences saved: 5610
📁 Saved to: Results/4-non_cbdc_sentences_labeled.csv


"""
Generate BERT Training Data: 5,390 CBDC + 5,610 Non-CBDC Sentences
------------------------------------------------------------------

This script prepares labeled training data for a CBDC sentence classifier (BERT-based),
by combining:

1. All 5,390 CBDC-labeled sentences (label=1).
2. 5,610 Non-CBDC-labeled sentences (label=0), including:
   - 56 manually verified examples from the CBDC-labeled file.
   - 5,554 additional random examples from preselected non-CBDC data.

The final dataset contains 11,000 labeled examples (approximately balanced between the two classes for model training).
"""

In [6]:
import pandas as pd
import os

# === File paths ===
cbdc_file = "Results/3-cbdc_sentences_labeled.csv"
non_cbdc_file = "Results/4-non_cbdc_sentences_labeled.csv"
output_file = "Results/bert_training_data.csv"

# === Parameters ===
target_non_cbdc = 5610  # total number of label-0 sentences wanted in the training set
random_seed = 42

# === Load datasets ===
df_cbdc_full = pd.read_csv(cbdc_file)
df_non_cbdc_pool = pd.read_csv(non_cbdc_file)

# === Split manually verified non-CBDC from CBDC file (label=0) ===
df_verified_non_cbdc = df_cbdc_full[df_cbdc_full["label"] == 0]
df_cbdc_only = df_cbdc_full[df_cbdc_full["label"] == 1]

# === Remove sentences that are already accounted for from the pool ===
# The pool file was written with the verified label-0 rows included, so sampling from
# it unfiltered and then appending the verified rows again would duplicate them.
# Dropping every sentence present in the reviewed file also prevents a pool sentence
# from contradicting a CBDC label.
known_sentences = set(df_cbdc_full["sentence"])
df_non_cbdc_candidates = df_non_cbdc_pool[~df_non_cbdc_pool["sentence"].isin(known_sentences)]

# === Determine how many more non-CBDC sentences are needed ===
non_cbdc_needed = max(target_non_cbdc - len(df_verified_non_cbdc), 0)
if non_cbdc_needed > len(df_non_cbdc_candidates):
    # DataFrame.sample raises when n exceeds the number of rows; take what exists instead
    print(f"⚠️ Only {len(df_non_cbdc_candidates)} non-CBDC candidates available; "
          f"{non_cbdc_needed} requested.")
    non_cbdc_needed = len(df_non_cbdc_candidates)

# === Sample additional non-CBDC sentences ===
df_non_cbdc_sampled = df_non_cbdc_candidates.sample(n=non_cbdc_needed, random_state=random_seed)

# === Combine all parts ===
df_final = pd.concat([
    df_cbdc_only[["url", "sentence", "label"]],
    df_verified_non_cbdc[["url", "sentence", "label"]],
    df_non_cbdc_sampled[["url", "sentence", "label"]]
], ignore_index=True)

# === Shuffle ===
df_final = df_final.sample(frac=1, random_state=random_seed).reset_index(drop=True)

# === Save ===
os.makedirs("Results", exist_ok=True)
df_final.to_csv(output_file, index=False, encoding="utf-8")

# === Report ===
print(f"\n✅ Final dataset saved to: {output_file}")
print("📊 Label distribution:")
print(df_final['label'].value_counts())
print(f"🧾 Total samples: {len(df_final)}")



✅ Final dataset saved to: Results/bert_training_data.csv
📊 Label distribution:
label
0    5610
1    5390
Name: count, dtype: int64
🧾 Total samples: 11000
