In [1]:
# notebooks/00_data_labeling.ipynb
import html
from pathlib import Path

import ipywidgets as widgets
import pandas as pd
from IPython.display import display, clear_output

In [2]:
# Load data
# Project root = parent of the kernel's working directory (assumes the kernel
# runs from the notebooks/ folder -- TODO confirm for other launch setups).
# `__file__` is not defined inside a notebook, and the previous
# `Path(__name__)` only reached the same directory because "__main__" was
# resolved as a relative file name against the working directory.
data_path = Path.cwd().resolve().parent

unlabeled_csv = data_path / 'data/risk_paragraphs_unlabeled.csv'
labeled_csv = data_path / 'data/risk_paragraphs_labeled.csv'

# Resume from previously saved labels when they exist. Always reloading the
# unlabeled file meant the first "Save & Next" after a kernel restart
# overwrote the labeled CSV with a frame that had lost every earlier label.
if labeled_csv.exists():
    # Read `labels` as text so a column of only-empty cells is not inferred
    # as float, which would reject the string labels written later.
    df = pd.read_csv(labeled_csv, dtype={'labels': str})
else:
    df = pd.read_csv(unlabeled_csv)

In [3]:
# Preview the first five rows of the loaded frame
df.head(n=5)

Unnamed: 0,cik,paragraph,word_count,year
0,1000228,The health care products distribution industry...,1037,2022
1,1000228,Uncertain global macro-economic and political ...,423,2022
2,1000228,Security risks generally associated with our i...,1240,2022
3,1000697,The Company’s international operations may be ...,592,2022
4,1000697,We may not be able to attract and retain quali...,273,2022


In [4]:
# Label options offered for every paragraph (the labeling interface builds
# one checkbox per entry, in this order).
categories = [
    "Market Risk",
    "Credit Risk",
    "Operational Risk",
    "Liquidity Risk",
    "Legal/Regulatory Risk",
    "Strategic Risk",
    "Reputational Risk",
    "Ambiguous",
]

In [5]:
# Ensure a 'labels' column exists to write into; a frame that already has
# one is left untouched.
if 'labels' not in set(df.columns):
    # Append as the last column, filled with empty strings.
    df.insert(len(df.columns), 'labels', '')

In [6]:
# Preview again: the frame should now carry the 'labels' column
df.head(5)

Unnamed: 0,cik,paragraph,word_count,year,labels
0,1000228,The health care products distribution industry...,1037,2022,
1,1000228,Uncertain global macro-economic and political ...,423,2022,
2,1000228,Security risks generally associated with our i...,1240,2022,
3,1000697,The Company’s international operations may be ...,592,2022,
4,1000697,We may not be able to attract and retain quali...,273,2022,


In [7]:
# Labeling interface
def create_labeling_interface(start_index: int = 0, save_every: int = 1) -> None:
    """Show an ipywidgets form for tagging each paragraph in ``df``.

    Reads the module-level ``df``, ``categories`` and ``data_path``. Every
    click on "Save & Next" stores the ticked categories, comma-joined, in
    ``df['labels']`` for the current row and advances; "Skip" advances
    without writing anything.

    Parameters
    ----------
    start_index : int, default 0
        Zero-based row position to start from (negative values are treated
        as 0).
    save_every : int, default 1
        Write the frame to ``data/risk_paragraphs_labeled.csv`` after every
        ``save_every``-th row position. The default writes on every
        "Save & Next" click. Any labels still unwritten when the last row is
        passed are written at that point.

    Returns
    -------
    None
        The widgets are rendered with ``display`` as a side effect.

    Raises
    ------
    ValueError
        If ``save_every`` is smaller than 1.
    """
    if save_every < 1:
        raise ValueError("save_every must be >= 1")

    total = len(df)
    output_csv = data_path / 'data/risk_paragraphs_labeled.csv'
    # Rows are read by position (iloc), so they must be written by position
    # too; label-based `df.at[idx, ...]` only agrees with that on a default
    # 0..n-1 index.
    labels_col = df.columns.get_loc('labels')

    current_index = [max(0, start_index)]  # Mutable to track state
    unsaved = [False]  # True while df holds labels not yet written to disk

    # UI elements
    text_display = widgets.HTML()
    checkboxes = {cat: widgets.Checkbox(description=cat, value=False)
                  for cat in categories}

    save_button = widgets.Button(description='Save & Next')
    skip_button = widgets.Button(description='Skip')
    progress_label = widgets.Label()
    # Text printed from a button callback is not reliably shown below the
    # cell; routing it through an Output widget keeps it visible.
    status_out = widgets.Output()

    def set_controls_disabled(flag):
        """Enable or disable every interactive control at once."""
        save_button.disabled = flag
        skip_button.disabled = flag
        for cb in checkboxes.values():
            cb.disabled = flag

    def write_progress(idx):
        """Write ``df`` to the labeled CSV and report the outcome."""
        with status_out:
            clear_output(wait=True)
            try:
                df.to_csv(output_csv, index=False)
            except OSError as exc:
                # Surface the failure instead of losing it inside the
                # callback; the labels stay in memory and are retried later.
                print(f"✗ Could not save progress: {exc}")
                return
            unsaved[0] = False
            print(f"✓ Progress saved at {idx}")

    def update_display():
        idx = current_index[0]
        if idx >= total:
            # Flush anything a save_every > 1 setting left unwritten.
            if unsaved[0]:
                write_progress(total - 1)
            text_display.value = "<h3>✓ Labeling Complete!</h3>"
            progress_label.value = f"Progress: {total}/{total} (100.0%)"
            set_controls_disabled(True)
            return

        row = df.iloc[idx]
        # Escape the paragraph so characters such as '<' or '&' are shown
        # literally instead of being interpreted as markup.
        safe_text = html.escape(str(row['paragraph']))
        text_display.value = f"""
        <div style='padding: 20px; background: #f5f5f5; border-radius: 10px;'>
            <h3>Paragraph {idx + 1} / {total}</h3>
            <p><strong>Company CIK:</strong> {row['cik']} | <strong>Year:</strong> {row['year']}</p>
            <p style='font-size: 14px; line-height: 1.6;'>{safe_text}</p>
        </div>
        """

        # Tick exactly the categories already stored for this row. Missing
        # values (NaN) and other non-string cells count as "no labels".
        stored = row['labels']
        ticked = set()
        if isinstance(stored, str):
            ticked = {part.strip() for part in stored.split(',')}
        for cat, cb in checkboxes.items():
            cb.value = cat in ticked

        progress_label.value = f"Progress: {idx}/{total} ({idx/total*100:.1f}%)"

    def save_and_next(b):
        idx = current_index[0]
        if idx >= total:
            return

        # Get selected labels
        selected = [cat for cat, cb in checkboxes.items() if cb.value]
        df.iat[idx, labels_col] = ','.join(selected)
        unsaved[0] = True

        # Persist every `save_every` rows (every click with the default of 1)
        if (idx + 1) % save_every == 0:
            write_progress(idx)

        current_index[0] += 1
        update_display()

    def skip(b):
        # Never advance past the end, so repeated clicks cannot grow the index
        if current_index[0] < total:
            current_index[0] += 1
        update_display()

    save_button.on_click(save_and_next)
    skip_button.on_click(skip)

    # Layout
    checkbox_box = widgets.VBox(list(checkboxes.values()))
    buttons = widgets.HBox([save_button, skip_button])

    display(widgets.VBox([
        progress_label,
        text_display,
        widgets.HTML("<h4>Select applicable risk categories:</h4>"),
        checkbox_box,
        buttons,
        status_out
    ]))

    update_display()

# Run interface
create_labeling_interface()

VBox(children=(Label(value=''), HTML(value=''), HTML(value='<h4>Select applicable risk categories:</h4>'), VBo…