## Import Library

In [17]:
import pandas as pd
import numpy as np
import joblib
from sklearn.metrics import accuracy_score
from collections import defaultdict

## Read Data

In [18]:
dataset = 'train'

test_path = f"./prediction/ensemble/{dataset}/"

# deepseek-reasoner (r1): one CSV per run, read in run order
r1_csv_paths = [
    f"{test_path}deepseek-reasoner_{dataset}_01.csv",
    f"{test_path}deepseek-reasoner_{dataset}_02.csv",
    f"{test_path}deepseek-reasoner_{dataset}_03.csv",
]
r1_test_1, r1_test_2, r1_test_3 = map(pd.read_csv, r1_csv_paths)

# openai-o1-mini (o1): same layout as the r1 runs
o1_csv_paths = [
    f"{test_path}o1-mini_{dataset}_01.csv",
    f"{test_path}o1-mini_{dataset}_02.csv",
    f"{test_path}o1-mini_{dataset}_03.csv",
]
o1_test_1, o1_test_2, o1_test_3 = map(pd.read_csv, o1_csv_paths)

# Ground-truth label tables; both splits are always loaded, whatever `dataset` is
train_labels, val_labels = (
    pd.read_csv(label_csv)
    for label_csv in (
        "../radnlp_2024_train_val_20240731/en/main_task/train/label.csv",
        "../radnlp_2024_train_val_20240731/en/main_task/val/label.csv",
    )
)


In [19]:
classes = ['t', 'n', 'm']
preds = {}      # class -> {column name: prediction Series}
preds_dfs = {}  # class -> DataFrame of the six prediction columns (+ label if available)

# Label table for the selected split; None for any split other than train/val
split_labels = {'train': train_labels, 'val': val_labels}.get(dataset)

for class_name in classes:
    # One column per (model, run) pair for the current class
    run_columns = {
        f'r1_1_{class_name}': r1_test_1[class_name],
        f'r1_2_{class_name}': r1_test_2[class_name],
        f'r1_3_{class_name}': r1_test_3[class_name],
        f'o1_1_{class_name}': o1_test_1[class_name],
        f'o1_2_{class_name}': o1_test_2[class_name],
        f'o1_3_{class_name}': o1_test_3[class_name],
    }
    preds[class_name] = run_columns

    class_frame = pd.DataFrame(run_columns)
    if split_labels is not None:
        # Appended last, so the label is always the final column of the frame
        class_frame[f'label_{class_name}'] = split_labels[class_name]
    preds_dfs[class_name] = class_frame

    print(class_frame.head())

  r1_1_t r1_2_t r1_3_t o1_1_t o1_2_t o1_3_t label_t
0     T4     T4     T4     T4     T4     T4      T4
1    T1c    T1c    T1c    T1c    T1c    T1c     T1c
2     T3     T3     T3     T3     T3     T3      T3
3     T4     T4     T4     T4     T4     T4      T4
4    T2b    T2b    T2b    T2b    T2b    T2b     T2b
  r1_1_n r1_2_n r1_3_n o1_1_n o1_2_n o1_3_n label_n
0     N3     N3     N3     N3     N3     N3      N3
1     N1     N1     N1     N1     N1     N1      N0
2     N0     N0     N0     N0     N0     N0      N0
3     N2     N2     N2     N2     N2     N2      N2
4     N1     N1     N1     N1     N1     N1      N0
  r1_1_m r1_2_m r1_3_m o1_1_m o1_2_m o1_3_m label_m
0     M0     M0     M0     M0     M0     M0      M0
1     M0     M0     M0     M0     M0     M0      M0
2     M0     M0     M0     M0     M0     M0      M0
3    M1c    M1c    M1c    M1c    M1c    M1c     M1c
4     M0     M0     M0     M0     M0     M0      M0


In [20]:
# Mapping dictionaries
t_mapping = {
    'T0': 0, 'Tis': 1, 'T1mi': 2, 'T1b': 3, 'T1c': 4,
    'T2a': 5, 'T2b': 6, 'T3': 7, 'T4': 8, 'T2': 5, 'T1a': 9
}

n_mapping = {'N0': 0, 'N1': 1, 'N2': 2, 'N3': 3}
m_mapping = {'M0': 0, 'M1a': 1, 'M1b': 2, 'M1c': 3}

mapping_dict = {'t': t_mapping, 'n': n_mapping, 'm': m_mapping}

reverse_mapping = {
    't': {v: k for k, v in t_mapping.items()},
    'n': {v: k for k, v in n_mapping.items()},
    'm': {v: k for k, v in m_mapping.items()},
}
reverse_mapping['t'][5] = 'T2a'

for class_name in classes:
    class_mapping = mapping_dict[class_name]
    preds_dfs[class_name].replace(class_mapping, inplace=True)

    # Print results
    print(f"\n{class_name.upper()} DataFrame after replacement:")
    print(preds_dfs[class_name].head())


T DataFrame after replacement:
   r1_1_t  r1_2_t  r1_3_t  o1_1_t  o1_2_t  o1_3_t  label_t
0       8       8       8       8       8       8        8
1       4       4       4       4       4       4        4
2       7       7       7       7       7       7        7
3       8       8       8       8       8       8        8
4       6       6       6       6       6       6        6

N DataFrame after replacement:
   r1_1_n  r1_2_n  r1_3_n  o1_1_n  o1_2_n  o1_3_n  label_n
0       3       3       3       3       3       3        3
1       1       1       1       1       1       1        0
2       0       0       0       0       0       0        0
3       2       2       2       2       2       2        2
4       1       1       1       1       1       1        0

M DataFrame after replacement:
   r1_1_m  r1_2_m  r1_3_m  o1_1_m  o1_2_m  o1_3_m  label_m
0       0       0       0       0       0       0        0
1       0       0       0       0       0       0        0
2       0       0  

  preds_dfs[class_name].replace(class_mapping, inplace=True)
  preds_dfs[class_name].replace(class_mapping, inplace=True)
  preds_dfs[class_name].replace(class_mapping, inplace=True)


In [21]:
# Ground-truth N labels over train + val, encoded as integers.
# Built as one non-mutating chain: the previous in-place replace() on a column
# pulled out of a temporary frame relied on pandas' deprecated silent downcasting
# (FutureWarning). Labels missing from the mapping are kept unchanged.
n_label_codes = mapping_dict['n']
true_labels = (
    pd.concat([train_labels, val_labels], ignore_index=True)['n']
    .map(lambda label: n_label_codes.get(label, label))
)
true_labels.head()

  true_labels.replace(mapping_dict['n'], inplace=True)


0    3
1    0
2    0
3    2
4    0
Name: n, dtype: int64

## Ensemble Learning

In [22]:
def vote(model_predictions, true_labels, accuracies=None, tie_tol=1e-9):
    """Accuracy-weighted majority vote over the columns of ``model_predictions``.

    Each column is one model's prediction; in every row a model adds its accuracy
    to the class it predicted, and the class with the largest total wins. Classes
    whose totals are within ``tie_tol`` of the maximum count as tied, and the tie
    goes to the class that is most frequent in ``true_labels``. If the tied
    classes are equally frequent, the one that received a vote first wins.

    Parameters
    ----------
    model_predictions : pandas.DataFrame
        One column per model, one row per sample.
    true_labels : pandas.Series
        Reference labels, used only for class frequencies when breaking ties.
    accuracies : dict, optional
        Column name -> voting weight. Defaults to the per-model N-task accuracies
        measured on the training and validation sets (weighted_mjvote.ipynb).
    tie_tol : float, optional
        Absolute tolerance for treating two weight totals as equal. Totals are
        floating-point sums, so mathematically equal totals can differ in the
        last bit; exact ``==`` would then skip the frequency tie-break.

    Returns
    -------
    pandas.Series
        Winning class per row, indexed like ``model_predictions``; a row with no
        votes at all yields ``None``.

    Raises
    ------
    KeyError
        If a column of ``model_predictions`` has no entry in ``accuracies``.
    """
    if accuracies is None:
        # From training and val set (weighted_mjvote.ipynb)
        accuracies = {
            'r1_1_n': 0.9567901234567902,
            'r1_2_n': 0.9506172839506173,
            'r1_3_n': 0.9567901234567902,
            'o1_1_n': 0.9259259259259259,
            'o1_2_n': 0.9382716049382716,
            'o1_3_n': 0.9320987654320988
        }

    # Resolve every column's weight once, up front (raises KeyError for unknown columns)
    columns = list(model_predictions.columns)
    weights = [accuracies[column] for column in columns]

    # With zero rows DataFrame.apply(axis=1) hands back a DataFrame; always return a Series
    if len(model_predictions) == 0:
        return pd.Series(index=model_predictions.index, dtype=object)

    # Class frequencies for tie-breaking
    class_counts = true_labels.value_counts().to_dict()

    def enhanced_vote(row):
        vote_weights = defaultdict(float)
        for column, weight in zip(columns, weights):
            vote_weights[row[column]] += weight
        if not vote_weights:
            return None  # no model columns, hence no votes
        max_weight = max(vote_weights.values())
        candidates = [
            cls for cls, total in vote_weights.items()
            if max_weight - total <= tie_tol
        ]
        if len(candidates) > 1:
            # Use class frequency to break ties
            return max(candidates, key=lambda cls: class_counts.get(cls, 0))
        return candidates[0]

    # Apply the voting function to each row to get ensemble predictions
    return model_predictions.apply(enhanced_vote, axis=1)

In [23]:
# Pre-trained models for the T and M tasks (the N task uses `vote` instead).
# NOTE: joblib.load unpickles its input -- only load model files you trust.
t_XGBoost = joblib.load(f'./ensemble_model/xgboost_model_t.joblib')
m_XGBoost = joblib.load(f'./ensemble_model/xgboost_model_m.joblib')

labelled = dataset in ('train', 'val')

if labelled or dataset == 'test':
    # Labelled splits carry the ground truth as the last column: exclude it from
    # the model inputs. The test split has prediction columns only.
    t_inputs = preds_dfs['t'].iloc[:, :-1] if labelled else preds_dfs['t']
    n_inputs = preds_dfs['n'].iloc[:, :-1] if labelled else preds_dfs['n']
    m_inputs = preds_dfs['m'].iloc[:, :-1] if labelled else preds_dfs['m']

    t_ensemble = t_XGBoost.predict(t_inputs)      # class T
    n_ensemble = vote(n_inputs, true_labels)      # class N
    m_ensemble = m_XGBoost.predict(m_inputs)      # class M

    if labelled:
        acc_t = accuracy_score(preds_dfs['t']['label_t'], t_ensemble)
        acc_n = accuracy_score(preds_dfs['n']['label_n'], n_ensemble)
        acc_m = accuracy_score(preds_dfs['m']['label_m'], m_ensemble)

        print(f"Accuracy(T): {acc_t:.4f}")
        print(f"Accuracy(N): {acc_n}")
        print(f"Accuracy(M): {acc_m:.4f}")
else:
    print('No dataset found.')

Accuracy(T): 0.9259
Accuracy(N): 0.9629629629629629
Accuracy(M): 0.9630


## Save Result

In [24]:
# Decode each task's integer predictions back to staging strings via reverse_mapping
decoded = {
    task: pd.Series(codes).replace(reverse_mapping[task])
    for task, codes in (('t', t_ensemble), ('n', n_ensemble), ('m', m_ensemble))
}
t_series, n_series, m_series = decoded['t'], decoded['n'], decoded['m']

# Assemble the submission table: id first, then t / n / m in that order
submission_columns = {'id': r1_test_1['id']}
submission_columns.update(decoded)
result_df = pd.DataFrame(submission_columns)

print(result_df.head())
result_df.to_csv(f'./prediction/final_submittion/result_{dataset}.csv', index=False)

       id    t   n    m
0   56344   T4  N3   M0
1  133166  T1c  N1   M0
2  165742   T3  N0   M0
3  404886   T4  N2  M1c
4  463397  T2b  N1   M0
