In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.metrics import accuracy_score
import random
pd.set_option('display.max_colwidth', 100)

In [2]:
class_name = 't'

# Training Set
# Load true labels
train_labels = pd.read_csv("../../radnlp_2024_train_val_20240731/en/main_task/train/label.csv")
train_path = "../prediction/ensemble/train/"

# Load predictions
# deepseek-reasoner (r1)
r1_train_1 = pd.read_csv(f"{train_path}deepseek-reasoner_train_01.csv")
r1_train_2 = pd.read_csv(f"{train_path}deepseek-reasoner_train_02.csv")
r1_train_3 = pd.read_csv(f"{train_path}deepseek-reasoner_train_03.csv")

# o1
o1_train_1 = pd.read_csv(f"{train_path}o1-mini_train_01.csv")
o1_train_2 = pd.read_csv(f"{train_path}o1-mini_train_02.csv")
o1_train_3 = pd.read_csv(f"{train_path}o1-mini_train_03.csv")

# Validation Set
val_labels = pd.read_csv("../../radnlp_2024_train_val_20240731/en/main_task/val/label.csv")
val_path = "../prediction/ensemble/val/"
# r1
r1_val_1 = pd.read_csv(f"{val_path}deepseek-reasoner_val_01.csv")
r1_val_2 = pd.read_csv(f"{val_path}deepseek-reasoner_val_02.csv")
r1_val_3 = pd.read_csv(f"{val_path}deepseek-reasoner_val_03.csv")
# o1
o1_val_1 = pd.read_csv(f"{val_path}o1-mini_val_01.csv")
o1_val_2 = pd.read_csv(f"{val_path}o1-mini_val_02.csv")
o1_val_3 = pd.read_csv(f"{val_path}o1-mini_val_03.csv")

In [3]:
# Concatenate labels
labels = pd.concat([train_labels, val_labels], ignore_index=True)

# Concatenate predictions for r1 (deepseek-reasoner)
r1_pred_1 = pd.concat([r1_train_1, r1_val_1], ignore_index=True)
r1_pred_2 = pd.concat([r1_train_2, r1_val_2], ignore_index=True)
r1_pred_3 = pd.concat([r1_train_3, r1_val_3], ignore_index=True)

# Concatenate predictions for o1 (o1-mini)
o1_pred_1 = pd.concat([o1_train_1, o1_val_1], ignore_index=True)
o1_pred_2 = pd.concat([o1_train_2, o1_val_2], ignore_index=True)
o1_pred_3 = pd.concat([o1_train_3, o1_val_3], ignore_index=True)

In [4]:
data = pd.DataFrame({
    f'r1_1_{class_name}': r1_pred_1[class_name],
    f'r1_2_{class_name}': r1_pred_2[class_name],
    f'r1_3_{class_name}': r1_pred_3[class_name],
    f'o1_1_{class_name}': o1_pred_1[class_name],
    f'o1_2_{class_name}': o1_pred_2[class_name],
    f'o1_3_{class_name}': o1_pred_3[class_name],
    'label': labels[class_name],
})
data

Unnamed: 0,r1_1_n,r1_2_n,r1_3_n,o1_1_n,o1_2_n,o1_3_n,label
0,N3,N3,N3,N3,N3,N3,N3
1,N1,N1,N1,N1,N1,N1,N0
2,N0,N0,N0,N0,N0,N0,N0
3,N2,N2,N2,N2,N2,N2,N2
4,N1,N1,N1,N1,N1,N1,N0
...,...,...,...,...,...,...,...
157,N0,N0,N0,N0,N0,N0,N0
158,N0,N0,N0,N0,N0,N0,N0
159,N0,N0,N0,N0,N0,N0,N0
160,N0,N0,N0,N0,N0,N0,N0


In [5]:
def stratified_split(df, label_column, test_size=0.2, random_state=42):
    # Separate classes with only one sample
    class_counts = df[label_column].value_counts()
    single_sample_classes = class_counts[class_counts == 1].index
    multiple_sample_classes = class_counts[class_counts > 1].index
    
    # Assign single-sample classes to the train set
    single_sample_train = df[df[label_column].isin(single_sample_classes)]
    
    # Split the remaining classes
    multiple_sample_data = df[df[label_column].isin(multiple_sample_classes)]
    train_data, test_data = train_test_split(
        multiple_sample_data,
        test_size=test_size,
        stratify=multiple_sample_data[label_column],
        random_state=random_state
    )
    
    # Combine the train sets
    train_data = pd.concat([train_data, single_sample_train], ignore_index=True)
    
    return train_data, test_data

train_df, val_df = stratified_split(data, label_column='label')
print("Training size:", train_df.shape[0])
print("Classes:", train_df['label'].unique())
print("Val size:", val_df.shape[0])
print("Classes:", val_df['label'].unique())
train_df.head()

Training size: 129
Classes: ['N0' 'N2' 'N3' 'N1']
Val size: 33
Classes: ['N3' 'N0' 'N2' 'N1']


Unnamed: 0,r1_1_n,r1_2_n,r1_3_n,o1_1_n,o1_2_n,o1_3_n,label
0,N0,N0,N0,N0,N0,N0,N0
1,N2,N2,N2,N2,N2,N2,N2
2,N0,N0,N0,N0,N0,N0,N0
3,N0,N0,N0,N0,N0,N0,N0
4,N2,N2,N2,N2,N2,N2,N2


In [6]:
X_train = train_df.drop(columns=['label'])
y_train = train_df['label']
X_val = val_df.drop(columns=['label'])
y_val = val_df['label']

In [7]:
# label mapping
t_mapping = {
    'T0': 0,
    'Tis': 1,
    'T1mi': 2, 
    'T1b': 3,
    'T1c': 4,
    'T2a': 5,
    'T2b': 6,
    'T3': 7,
    'T4': 8,
    'T2': 5,
    'T1a': 9,
}

n_mapping = {
    'N0': 0,
    'N1': 1,
    'N2': 2,
    'N3': 3,
}

m_mapping = {
    'M0' : 0,
    'M1a' : 1,
    'M1b' : 2,
    'M1c' : 3,
}
mapping_dict = {
    't': t_mapping,
    'n': n_mapping,
    'm': m_mapping
}

# Ensure the class_name is valid
if class_name in mapping_dict:
    mapping = mapping_dict[class_name]  # Get the correct mapping
    X_train = pd.DataFrame(X_train).replace(mapping).values.tolist()
    y_train = pd.Series(y_train).replace(mapping).tolist()
    X_val = pd.DataFrame(X_val).replace(mapping).values.tolist()
    y_val = pd.Series(y_val).replace(mapping).tolist()
else:
    raise ValueError(f"Invalid class_name: {class_name}. Must be one of {list(mapping_dict.keys())}")
print("X_train:\n", X_train[:5])
print("y_train:\n", y_train[:5])
print("X_val:\n", X_val[:5])
print("y_val:\n", y_val[:5])

X_train:
 [[0, 0, 0, 0, 0, 0], [2, 2, 2, 2, 2, 2], [0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0], [2, 2, 2, 2, 2, 2]]
y_train:
 [0, 2, 0, 0, 2]
X_val:
 [[3, 3, 3, 2, 3, 3], [0, 0, 0, 0, 0, 0], [2, 2, 2, 2, 2, 2], [2, 2, 2, 2, 2, 2], [1, 1, 1, 1, 1, 1]]
y_val:
 [3, 0, 2, 2, 0]


  X_train = pd.DataFrame(X_train).replace(mapping).values.tolist()
  y_train = pd.Series(y_train).replace(mapping).tolist()
  X_val = pd.DataFrame(X_val).replace(mapping).values.tolist()
  y_val = pd.Series(y_val).replace(mapping).tolist()


In [8]:
# Define the parameter distributions to sample from
param_dist = {
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [4, 5, 6, 7],
    'subsample': [0.6, 0.7, 0.8, 0.9, 1.0],
    'colsample_bytree': [0.6, 0.7, 0.8, 0.9, 1.0],
    'reg_alpha': [0, 0.1, 0.5, 1],
    'reg_lambda': [0, 0.1, 0.5, 1],
    'gamma': [0, 0.1, 0.2, 0.5]
}

best_acc = 0
best_params = {}
n_iter = 2000  # Number of random combinations to try

for _ in range(n_iter):
    # Randomly sample hyperparameters
    params = {
        'learning_rate': random.choice(param_dist['learning_rate']),
        'max_depth': random.choice(param_dist['max_depth']),
        'subsample': random.choice(param_dist['subsample']),
        'colsample_bytree': random.choice(param_dist['colsample_bytree']),
        'reg_alpha': random.choice(param_dist['reg_alpha']),
        'reg_lambda': random.choice(param_dist['reg_lambda']),
        'gamma': random.choice(param_dist['gamma'])
    }
    
    # Initialize and train the model
    model = xgb.XGBClassifier(
        objective='multi:softmax',
        num_class=len(X_train[0]),
        eval_metric='mlogloss',
        n_estimators=1000,
        early_stopping_rounds=10,
        random_state=42,
        **params
    )
    model.fit(
        X_train, y_train,
        eval_set=[(X_val, y_val)],
        verbose=False
    )
    
    # Evaluate on validation set
    y_pred = model.predict(X_val)
    acc = accuracy_score(y_val, y_pred)
    
    # Update best parameters if improvement
    if acc >= best_acc:
        best_acc = acc
        best_params = params
        print(f"New Best Accuracy: {best_acc:.4f}")
        print("Parameters:", best_params)

print("\n=== Best Parameters ===")
print(best_params)
print(f"Validation Accuracy: {best_acc:.4f}")

New Best Accuracy: 0.9091
Parameters: {'learning_rate': 0.01, 'max_depth': 7, 'subsample': 0.7, 'colsample_bytree': 0.8, 'reg_alpha': 0.1, 'reg_lambda': 0, 'gamma': 0}
New Best Accuracy: 0.9091
Parameters: {'learning_rate': 0.1, 'max_depth': 5, 'subsample': 0.7, 'colsample_bytree': 0.8, 'reg_alpha': 0, 'reg_lambda': 0.1, 'gamma': 0.1}
New Best Accuracy: 0.9091
Parameters: {'learning_rate': 0.1, 'max_depth': 5, 'subsample': 0.8, 'colsample_bytree': 1.0, 'reg_alpha': 1, 'reg_lambda': 0.5, 'gamma': 0}
New Best Accuracy: 0.9091
Parameters: {'learning_rate': 0.05, 'max_depth': 6, 'subsample': 0.6, 'colsample_bytree': 0.8, 'reg_alpha': 0, 'reg_lambda': 0, 'gamma': 0.2}
New Best Accuracy: 0.9091
Parameters: {'learning_rate': 0.05, 'max_depth': 5, 'subsample': 0.9, 'colsample_bytree': 0.7, 'reg_alpha': 1, 'reg_lambda': 0.5, 'gamma': 0}
New Best Accuracy: 0.9091
Parameters: {'learning_rate': 0.05, 'max_depth': 5, 'subsample': 1.0, 'colsample_bytree': 1.0, 'reg_alpha': 1, 'reg_lambda': 1, 'gamma

In [9]:
try:
    X_train_full = np.concatenate([X_train, X_val], axis=0)
    y_train_full = np.concatenate([y_train, y_val], axis=0)

    final_model = xgb.XGBClassifier(
        objective='multi:softmax',
        num_class=9,
        eval_metric='mlogloss',
        n_estimators=1000, 
        random_state=42,
        **best_params  
    )

    # Train on the full dataset
    final_model.fit(X_train_full, y_train_full, verbose=False)
except AttributeError as e:
    print(f"AttributeError: {e}")

In [10]:
import joblib
joblib.dump(final_model, f'../ensemble_model/xgboost_model_{class_name}.joblib')

['../ensemble_model/xgboost_model_n.joblib']

In [11]:
loaded_model = joblib.load(f'../ensemble_model/xgboost_model_{class_name}.joblib')

# Use the model
y_test_pred = loaded_model.predict(X_val)
acc = accuracy_score(y_val, y_test_pred)
print(f"Accuracy: {acc:.4f}")

Accuracy: 0.9394


    t_best_params = {'learning_rate': 0.01, 'max_depth': 5, 'subsample': 0.7, 'colsample_bytree': 0.6, 'reg_alpha': 1, 'reg_lambda': 0.5, 'gamma': 0}

    n_best_params = 

    m_best_params = {'learning_rate': 0.01, 'max_depth': 6, 'subsample': 0.9, 'colsample_bytree': 0.9, 'reg_alpha': 1, 'reg_lambda': 0.1, 'gamma': 0}