**First we load and merge train, val and test datasets**

In [1]:
import pandas as pd

# 1. Load Training Data
train_args = pd.read_csv("data/arguments-training.tsv", sep='\t')
train_labels = pd.read_csv("data/labels-training.tsv", sep='\t')
df_train = pd.merge(train_args, train_labels, on="Argument ID")

# 2. Load Validation Data
val_args = pd.read_csv("data/arguments-validation.tsv", sep='\t')
val_labels = pd.read_csv("data/labels-validation.tsv", sep='\t')
df_val = pd.merge(val_args, val_labels, on="Argument ID")

# 3. Load Test Data (Crucial for volume!)
test_args = pd.read_csv("data/arguments-test.tsv", sep='\t')
test_labels = pd.read_csv("data/labels-test.tsv", sep='\t')
df_test = pd.merge(test_args, test_labels, on="Argument ID")

# 4. Concatenate EVERYTHING into one giant dataset
trainval_df = pd.concat([df_train, df_val, df_test], ignore_index=True)

# 5. Verify the size (Should be > 8,500)
print(f"Total Examples: {len(trainval_df)}")
print(trainval_df.head(3))

Total Examples: 8865
  Argument ID                                   Conclusion       Stance  \
0      A01002                  We should ban human cloning  in favor of   
1      A01005                      We should ban fast food  in favor of   
2      A01006  We should end the use of economic sanctions      against   

                                             Premise  Self-direction: thought  \
0  we should ban human cloning as it will only ca...                        0   
1  fast food should be banned because it is reall...                        0   
2  sometimes economic sanctions are the only thin...                        0   

   Self-direction: action  Stimulation  Hedonism  Achievement  \
0                       0            0         0            0   
1                       0            0         0            0   
2                       0            0         0            0   

   Power: dominance  ...  Tradition  Conformity: rules  \
0                 0  ...          

**Examples**

In [5]:
import pandas as pd

# 1. Load the Data
# Using validation set because it's cleaner for inspection
df_args = pd.read_csv("data/arguments-validation.tsv", sep='\t')
df_labels = pd.read_csv("data/labels-validation.tsv", sep='\t')

# 2. Merge them
val_df = pd.merge(df_args, df_labels, on="Argument ID")

# 3. Identify the Value Columns (The 19 or 20 labels)
# We exclude the text columns to find just the label columns
metadata_cols = ['Argument ID', 'Conclusion', 'Stance', 'Premise', 'Language']
value_cols = [col for col in val_df.columns if col not in metadata_cols]

# 4. Display 5 Random Examples
# Change random_state to see different examples
samples = val_df.sample(5, random_state=42) 

for idx, row in samples.iterrows():
    print(f"üÜî ID: {row['Argument ID']}")
    print(f"üì¢ CONCLUSION: {row['Conclusion']}")
    print(f"‚öñÔ∏è STANCE: {row['Stance']}")
    print(f"üìù PREMISE: {row['Premise']}")
    print("-" * 30)
    print("üß† ACTUAL HUMAN VALUES (Ground Truth):")
    
    # Iterate through the columns and print only the ones marked as '1'
    has_values = False
    for val in value_cols:
        if row[val] == 1:
            print(f"   ‚úÖ {val}")
            has_values = True
            
    if not has_values:
        print("   (No values annotated)")
        
    print("=" * 80 + "\n")

üÜî ID: A28426
üì¢ CONCLUSION: Payday loans should be banned
‚öñÔ∏è STANCE: in favor of
üìù PREMISE: payday loans should be banned because it causes people to go into debt
------------------------------
üß† ACTUAL HUMAN VALUES (Ground Truth):
   ‚úÖ Power: resources
   ‚úÖ Security: personal

üÜî ID: A21315
üì¢ CONCLUSION: Homeopathy brings more harm than good
‚öñÔ∏è STANCE: in favor of
üìù PREMISE: introducing items that normally produce symptoms of a disease is something that really could do more harm than good in the long run.
------------------------------
üß† ACTUAL HUMAN VALUES (Ground Truth):
   ‚úÖ Security: personal
   ‚úÖ Universalism: objectivity

üÜî ID: A25015
üì¢ CONCLUSION: Payday loans should be banned
‚öñÔ∏è STANCE: in favor of
üìù PREMISE: payday loans allow people to spend money they do not have yet and then they have to pay interest on the loan.  this could cause them to need another loan to get through the next pay period.
------------------------------


**Use iterative-stratification library**

In [9]:
!uv pip install iterative-stratification

[2mUsing Python 3.13.7 environment at: /home/alumno/py313ml/.venv[0m
[2K[2mResolved [1m6 packages[0m [2min 313ms[0m[0m                                         [0m
[2K[2mPrepared [1m1 package[0m [2min 20ms[0m[0m                                               
[2K[2mInstalled [1m1 package[0m [2min 2ms[0m[0mation==0.1.9                      [0m
 [32m+[39m [1miterative-stratification[0m[2m==0.1.9[0m


In [6]:
import numpy as np
import pandas as pd
from iterstrat.ml_stratifiers import MultilabelStratifiedShuffleSplit

# 1. Create the Input Feature (X) and Targets (y) from your NEW trainval_df
# Concatenate Conclusion + Stance + Premise
trainval_df['text'] = trainval_df['Conclusion'] + " " + trainval_df['Stance'] + " " + trainval_df['Premise']

label_cols = [
    'Self-direction: thought', 'Self-direction: action', 'Stimulation',
    'Hedonism', 'Achievement', 'Power: dominance', 'Power: resources',
    'Face', 'Security: personal', 'Security: societal', 'Tradition',
    'Conformity: rules', 'Conformity: interpersonal', 'Humility',
    'Benevolence: caring', 'Benevolence: dependability',
    'Universalism: concern', 'Universalism: nature', 'Universalism: tolerance'
]

# Create the arrays for splitting
X_all = trainval_df['text'].values
y_all = trainval_df[label_cols].values

print(f"Features shape: {X_all.shape}")
print(f"Labels shape:   {y_all.shape}")

# 2. Iterative Stratified Split (Train vs Test)
# We use X_all and y_all here
msss = MultilabelStratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

# FIX: Use 'X_all' and 'y_all' inside the loop
for train_index, test_index in msss.split(X_all, y_all):
    X_train, X_test = X_all[train_index], X_all[test_index]
    y_train, y_test = y_all[train_index], y_all[test_index]

print("-" * 30)
print(f"Final Training Set: {X_train.shape[0]} examples (Use for Cross-Validation)")
print(f"Final Test Set:     {X_test.shape[0]} examples (Use for Report)")

# OPTIONAL: Sanity Check
print("\nLabel Distribution Check (First 3 labels):")
print(f"Train: {np.mean(y_train, axis=0)[:3]}")
print(f"Test:  {np.mean(y_test, axis=0)[:3]}")

Features shape: (8865,)
Labels shape:   (8865, 19)
------------------------------
Final Training Set: 7092 examples (Use for Cross-Validation)
Final Test Set:     1773 examples (Use for Report)

Label Distribution Check (First 3 labels):
Train: [0.15595037 0.25747321 0.05217146]
Test:  [0.15566836 0.2571912  0.05188945]
