In [4]:
import pandas as pd
import gzip
import json

In [5]:
def load_classification(filename):
    """
    Loads a plain-text classification file into a DataFrame.

    Each non-empty line is expected to look like ``<id>: <value>, <value>, ...``.
    The literal ``none`` (any letter case) stands for "no values". Lines that do
    not split into exactly two parts on ':' are reported on stdout and skipped.

    Parameters:
        filename (str): Path of the classification file.

    Returns:
        pandas.DataFrame: One row per parsed line, with the columns 'id' (str)
        and 'values' (list of str). Both columns exist even when no line could
        be parsed, so callers can always access ``df['values']``.
    """
    records = []

    with open(filename, 'r') as file:
        for line in file:
            line = line.strip()
            if not line:
                continue  # skip empty lines
            try:
                id_part, value_part = line.split(':')
            except ValueError as e:
                print(f"Skipping malformed line: {line}. Error: {e}")
                continue

            values = value_part.strip()
            if values.lower() == 'none':
                value_list = []
            else:
                # Drop empty tokens so that "id:" or a trailing comma does not
                # produce a bogus '' value.
                value_list = [v.strip() for v in values.split(',') if v.strip()]
            records.append({'id': id_part.strip(), 'values': value_list})

    # Naming the columns explicitly keeps 'id' and 'values' present for an
    # empty (or fully malformed) file.
    return pd.DataFrame(records, columns=['id', 'values'])

In [6]:
def load_list(filename):
    """
    Loads a gzipped JSON Lines (jsonl) file and returns a list of dictionaries.

    Blank (whitespace-only) lines are skipped instead of being handed to the
    JSON parser, so stray empty lines do not abort the load.

    Parameters:
        filename (str): The filename of the gzipped jsonl file.

    Returns:
        list: The decoded JSON value of every non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    result = []
    with gzip.open(filename, 'rt', encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                continue  # tolerate blank lines between or after records
            result.append(json.loads(line))
    return result

In [7]:
def extract_goals(df):
    """
    Adds a 'goals' column derived from the 'values' column.

    For every row, each entry of 'values' is cut at its first '.', and the
    distinct prefixes are stored as a list in 'goals'. The prefixes keep the
    order in which they first appear, so the result is identical from run to
    run (a plain set would not guarantee any order).

    Parameters:
        df (pandas.DataFrame): Frame with a 'values' column holding lists of
            strings. It is modified in place.

    Returns:
        pandas.DataFrame: The same ``df`` object, now with the 'goals' column.
    """
    def get_unique_as(values_list):
        # dict.fromkeys de-duplicates while preserving first-seen order.
        return list(dict.fromkeys(v.split('.')[0] for v in values_list))

    df['goals'] = df['values'].apply(get_unique_as)
    return df

In [8]:
# Parse the existing classification file with load_classification (one row per
# parsed line, columns 'id' and 'values').
classif_df=load_classification("classification.txt")

In [9]:
# Load the targets table; the widget code below reads its 'Target ID' and
# 'Target Text' columns.
# NOTE(review): unpickling can execute arbitrary code -- only load this file
# if it comes from a trusted source.
filename="sgd_targets.dat"
df_targets = pd.read_pickle(filename)

In [10]:
# Show the targets table as this cell's output.
df_targets

Unnamed: 0,Target ID,Target Text
0,1.1,No Poverty. Eradicate extreme poverty for all ...
1,1.2,No Poverty. Reduce at least by half the propor...
2,1.3,No Poverty. Implement nationally appropriate s...
3,1.4,"No Poverty. Ensure that all men and women, in ..."
4,1.5,No Poverty. Build the resilience of the poor a...
...,...,...
161,17.15,Partnerships for the Goals. Respect each count...
162,17.16,Partnerships for the Goals. Enhance the Global...
163,17.17,Partnerships for the Goals. Encourage and prom...
164,17.18,Partnerships for the Goals. Enhance capacity-b...


In [16]:
import os
import json
import pandas as pd
import ipywidgets as widgets
from IPython.display import display, clear_output

# -- Widgets for file operations --
# Path of the classification file: read by "Load", used as save target after "Create".
classif_file_input = widgets.Text(value='classification.txt', description='Classification Name:')
load_button = widgets.Button(description='Load')

# Path of the gzipped JSON Lines file with the records to classify.
filename_input = widgets.Text(value='', description='Filename:')
create_button = widgets.Button(description='Create')

# -- Display controls --
prev_button = widgets.Button(description='←')
next_button = widgets.Button(description='→')
id_label = widgets.Label(value='')
text_area = widgets.Textarea(value='', description='Text:', layout=widgets.Layout(width='100%', height='100px'))

targets_box = widgets.VBox()
update_button = widgets.Button(description='Update')
# Fixed-height output area that update_display redraws. No `overflow_y='no'`
# is passed here: 'no' is not a valid CSS overflow value.
out = widgets.Output(layout=widgets.Layout(height='1500px'))
# -- Data storage --
classif_file = None  # Path the "Update" button saves to (None until a file is chosen)
data_by_id = {}      # Suggestion entries filled by load_suggestion
classif = []         # List of {id: [target_ids]}, one single-key dict per record
df = pd.DataFrame()  # Loaded data
current_idx = 0      # Index for navigation

# -- Handlers --
def load_classif(b):
    """
    "Load" button handler: read an existing classification file into the UI state.

    The file named in ``classif_file_input`` is parsed with
    ``load_classification`` and converted into the structure the rest of the
    UI works on: a list with one ``{record_id: [target_ids]}`` dict per record.
    The path is also remembered in ``classif_file`` so that the "Update"
    button knows where to save.

    Parameters:
        b: The clicked button (unused; required by the on_click callback API).
    """
    global classif_file, classif
    path = classif_file_input.value
    loaded = load_classification(path)

    # load_classification returns a DataFrame, but navigation and the target
    # check boxes index `classif` as a list of single-key dicts.
    if loaded.empty:
        classif = []
    else:
        classif = [
            {str(record_id): list(targets)}
            for record_id, targets in zip(loaded['id'], loaded['values'])
        ]
    classif_file = path
    reset_navigation()


def create_classif(b):
    """
    "Create" button handler: start a fresh classification for a data file.

    Reads the gzipped JSON Lines file named in ``filename_input`` into ``df``,
    creates one empty ``{record_id: []}`` entry per record in ``classif``,
    remembers ``classif_file_input`` as the save target, loads the
    suggestions and shows the first record.

    Parameters:
        b: The clicked button (unused; required by the on_click callback API).

    Raises:
        KeyError: If the loaded records have no 'id' field.
    """
    global df, classif, classif_file
    filename = filename_input.value
    # load_list returns a list of dicts; the records must carry 'id' and 'text'.
    df = pd.DataFrame(load_list(filename))
    classif_file = classif_file_input.value

    if df.empty:
        classif = []
    else:
        # The classification is keyed by string ids, so store the ids as
        # strings in df as well; otherwise the text lookup by record id finds
        # nothing for numeric ids.
        df['id'] = df['id'].astype(str)
        # Initialize classification list with empty lists
        classif = [{record_id: []} for record_id in df['id']]
    load_suggestion()
    reset_navigation()


def reset_navigation():
    """Move the navigation index back to the first record and redraw the view."""
    global current_idx
    current_idx = 0
    update_display()

def load_suggestion(filename="classif_results.json", threshold=0.20):
    """
    Loads prediction results and indexes them by record id in ``data_by_id``.

    The JSON file must contain a list of objects with an 'id' field and,
    optionally, a 'prediction' list of numeric scores. Every entry gets an
    extra 'scores_str' field: a comma-separated ``"<position>: <score>"`` list
    of the scores above ``threshold``, with positions counted from 1.

    Parameters:
        filename (str): Path of the JSON results file.
        threshold (float): Only scores strictly greater than this are listed.

    Raises:
        KeyError: If an entry has no 'id' field.
    """
    global data_by_id
    with open(filename, "r", encoding="utf-8") as f:
        data = json.load(f)
    for entry in data:
        # A missing or null prediction is treated as "no scores".
        scores = entry.get("prediction") or []
        # Build the filtered score string
        filtered_scores = [
            f"{position}: {score:.2f}"
            for position, score in enumerate(scores, start=1)
            if score > threshold
        ]
        entry["scores_str"] = ", ".join(filtered_scores)
        # Store in the dict under the record id. The id is converted to str
        # because the UI looks suggestions up by the string form of the id.
        data_by_id[str(entry["id"])] = entry

def update_display():
    """
    Redraws the current record inside the ``out`` output widget.

    Shows the navigation buttons, the record id with its suggestion scores,
    the record text, and one check box per row of ``df_targets``. Ticking or
    unticking a box adds or removes that target id in the current record's
    list inside ``classif``.

    ``classif`` must be a list of single-key ``{record_id: [target_ids]}``
    dicts. When it is empty, of another type, or the index is out of range, a
    "No classification data available." label is shown instead. A missing
    suggestion or text for the record is displayed as an empty string.
    """
    with out:
        out.clear_output(wait=True)
        if not isinstance(classif, list) or current_idx >= len(classif):
            display(widgets.Label(value="No classification data available."))
            return

        current = classif[current_idx]
        record_id = next(iter(current))
        # The list stored in classif itself: mutating it records the choice.
        selected = current[record_id]

        # Suggestions may be missing entirely, or keyed by the id in its
        # original (non-string) type.
        suggestion = data_by_id.get(record_id)
        if suggestion is None:
            suggestion = next(
                (entry for key, entry in data_by_id.items() if str(key) == record_id),
                {},
            )

        # Show navigation and text
        id_label.value = f"ID: {record_id} - " + suggestion.get("scores_str", "")
        if {'id', 'text'}.issubset(df.columns):
            # Compare ids as strings: classif keys are always strings.
            matches = df.loc[df['id'].astype(str) == record_id, 'text']
            text_area.value = matches.iloc[0] if not matches.empty else ''
        else:
            text_area.value = ''
        display(widgets.HBox([prev_button, next_button, id_label]))
        display(text_area)

        # Build target checkboxes
        targets_box.children = []
        boxes = []
        if 'df_targets' in globals():
            for tid, txt in zip(df_targets['Target ID'], df_targets['Target Text']):
                # Target ids are kept as strings, like the ones read from a
                # classification file.
                target_id = str(tid)
                cb = widgets.Checkbox(value=(target_id in selected), description=f"{tid}: {txt}",layout=widgets.Layout(width='100%'))
                # Defaults bind this iteration's target and this record's list.
                def on_change(change, target_id=target_id, selected=selected):
                    if change['new']:
                        if target_id not in selected:
                            selected.append(target_id)
                    elif target_id in selected:
                        selected.remove(target_id)
                cb.observe(on_change, names='value')
                boxes.append(cb)
            targets_box.children = boxes
            display(targets_box, update_button)
        else:
            display(widgets.Label(value="df_targets is not defined."))


def on_prev(b):
    """Step back one record and redraw; does nothing on the first record."""
    global current_idx
    if current_idx <= 0:
        return
    current_idx = current_idx - 1
    update_display()

def on_next(b):
    """Step forward one record and redraw; does nothing on the last record."""
    global current_idx
    last_idx = len(classif) - 1
    if current_idx >= last_idx:
        return
    current_idx = current_idx + 1
    update_display()

def on_update(b):
    """
    "Update" button handler: save the classification to ``classif_file``.

    The file is written in the line format that ``load_classification``
    reads, ``<id>: <target>, <target>, ...`` with ``none`` for a record
    without targets, so that a saved file can be opened again with the
    "Load" button. When no file has been chosen, nothing is written and a
    message says so.

    Parameters:
        b: The clicked button (unused; required by the on_click callback API).
    """
    if not classif_file:
        print("No classification file selected; nothing was saved.")
        return

    # Normalise both shapes `classif` can take into (id, targets) pairs.
    if isinstance(classif, pd.DataFrame):
        records = [] if classif.empty else list(zip(classif['id'], classif['values']))
    else:
        records = [
            (record_id, targets)
            for entry in classif
            for record_id, targets in entry.items()
        ]

    with open(classif_file, 'w') as f:
        for record_id, targets in records:
            values = ", ".join(str(t) for t in targets) if len(targets) else "none"
            f.write(f"{record_id}: {values}\n")
    print(f"Saved classification to {classif_file}")

# -- Bind events --
# Each button calls its handler with the clicked button as the only argument.
load_button.on_click(load_classif)
create_button.on_click(create_classif)
prev_button.on_click(on_prev)
next_button.on_click(on_next)
update_button.on_click(on_update)

# -- Initial UI --
# Two input rows (classification file + Load, data file + Create) above the
# output area that update_display redraws.
display(widgets.VBox([widgets.HBox([classif_file_input, load_button]),
                       widgets.HBox([filename_input, create_button]),
                       out]))



VBox(children=(HBox(children=(Text(value='classification.txt', description='Classification Name:'), Button(des…