**Train and val**

In [1]:
import pandas as pd

# 1. Load the training split.
# Argument text and value labels live in separate TSV files that share an
# 'Argument ID' column.
train_args = pd.read_csv("10564870/arguments-training.tsv", sep='\t')
train_labels = pd.read_csv("10564870/labels-training.tsv", sep='\t')

# 2. Load the validation split.
# It is concatenated with the training split below so that cross-validation
# can draw its folds from a single combined pool.
val_args = pd.read_csv("10564870/arguments-validation.tsv", sep='\t')
val_labels = pd.read_csv("10564870/labels-validation.tsv", sep='\t')

# 3. Attach the labels to each argument via the shared 'Argument ID' key.
# validate="one_to_one" makes pandas raise MergeError if an ID is duplicated in
# either file, instead of silently multiplying rows in the merged frame.
# The join is the default inner join: arguments without a label row (and
# label rows without an argument) are not kept.
df_train_full = pd.merge(
    train_args, train_labels, on="Argument ID", validate="one_to_one"
)
df_val_full = pd.merge(
    val_args, val_labels, on="Argument ID", validate="one_to_one"
)

# 4. Stack both splits into the combined train+validation dataset.
# ignore_index=True renumbers the rows 0..n-1 so the index stays unique.
trainval_df = pd.concat([df_train_full, df_val_full], ignore_index=True)

# 5. Verify the size and peek at the first rows
print(f"Total Examples: {len(trainval_df)}")
print(trainval_df.head(3))

Total Examples: 7289
  Argument ID                                   Conclusion       Stance  \
0      A01002                  We should ban human cloning  in favor of   
1      A01005                      We should ban fast food  in favor of   
2      A01006  We should end the use of economic sanctions      against   

                                             Premise  Self-direction: thought  \
0  we should ban human cloning as it will only ca...                        0   
1  fast food should be banned because it is reall...                        0   
2  sometimes economic sanctions are the only thin...                        0   

   Self-direction: action  Stimulation  Hedonism  Achievement  \
0                       0            0         0            0   
1                       0            0         0            0   
2                       0            0         0            0   

   Power: dominance  ...  Tradition  Conformity: rules  \
0                 0  ...          

In [2]:
# Row counts of the merged training and validation frames, one per line.
print(len(df_train_full), len(df_val_full), sep="\n")

5393
1896


**Test**

In [3]:
import pandas as pd

# 1. Load test arguments and labels
test_args = pd.read_csv("10564870/arguments-test.tsv", sep='\t')
test_labels = pd.read_csv("10564870/labels-test.tsv", sep='\t')

# 2. Merge them on the shared 'Argument ID' key.
# validate="one_to_one" makes pandas raise MergeError if an ID is duplicated in
# either file, instead of silently multiplying rows in the merged frame.
df_test = pd.merge(test_args, test_labels, on="Argument ID", validate="one_to_one")

# 3. Keep only English rows when the file carries a 'Language' column.
# NOTE(review): the train/validation loading cell applies no equivalent
# filter -- confirm that the two pipelines are meant to differ here.
# .copy() yields an independent frame, so columns added to df_test later do
# not raise SettingWithCopyWarning.
if 'Language' in df_test.columns:
    df_test = df_test[df_test['Language'] == 'en'].copy()

# 4. Verify
print(f"Test Set Size: {len(df_test)}")
print(df_test.head(3))

Test Set Size: 1576
  Argument ID                        Conclusion       Stance  \
0      A26004  We should end affirmative action      against   
1      A26010  We should end affirmative action  in favor of   
2      A26016         We should ban naturopathy  in favor of   

                                             Premise  Self-direction: thought  \
0   affirmative action helps with employment equity.                        0   
1  affirmative action can be considered discrimin...                        0   
2  naturopathy is very dangerous for the most vul...                        0   

   Self-direction: action  Stimulation  Hedonism  Achievement  \
0                       0            0         0            1   
1                       0            0         0            1   
2                       0            0         0            1   

   Power: dominance  ...  Tradition  Conformity: rules  \
0                 0  ...          0                  0   
1                 0  

**Examples**

In [6]:
import pandas as pd

# 1. Load the data
# The validation split is used here for manual inspection of a few examples.
df_args = pd.read_csv("10564870/arguments-validation.tsv", sep='\t')
df_labels = pd.read_csv("10564870/labels-validation.tsv", sep='\t')

# 2. Merge them on the shared 'Argument ID' key.
# validate="one_to_one" makes pandas raise MergeError if an ID is duplicated in
# either file, instead of silently multiplying rows in the merged frame.
val_df = pd.merge(df_args, df_labels, on="Argument ID", validate="one_to_one")

# 3. Identify the value (label) columns:
# every column that is not one of the known metadata/text columns.
metadata_cols = ['Argument ID', 'Conclusion', 'Stance', 'Premise', 'Language']
value_cols = [col for col in val_df.columns if col not in metadata_cols]

# 4. Display 5 random examples
# Change random_state to see different examples
samples = val_df.sample(5, random_state=42)

for idx, row in samples.iterrows():
    print(f"🆔 ID: {row['Argument ID']}")
    print(f"📢 CONCLUSION: {row['Conclusion']}")
    print(f"⚖️ STANCE: {row['Stance']}")
    print(f"📝 PREMISE: {row['Premise']}")
    print("-" * 30)
    print("🧠 ACTUAL HUMAN VALUES (Ground Truth):")

    # Print only the value columns that are marked as 1 for this argument
    has_values = False
    for val in value_cols:
        if row[val] == 1:
            print(f"   ✅ {val}")
            has_values = True

    # Make an unlabelled example explicit instead of printing nothing
    if not has_values:
        print("   (No values annotated)")

    print("=" * 80 + "\n")

🆔 ID: A28426
📢 CONCLUSION: Payday loans should be banned
⚖️ STANCE: in favor of
📝 PREMISE: payday loans should be banned because it causes people to go into debt
------------------------------
🧠 ACTUAL HUMAN VALUES (Ground Truth):
   ✅ Power: resources
   ✅ Security: personal

🆔 ID: A21315
📢 CONCLUSION: Homeopathy brings more harm than good
⚖️ STANCE: in favor of
📝 PREMISE: introducing items that normally produce symptoms of a disease is something that really could do more harm than good in the long run.
------------------------------
🧠 ACTUAL HUMAN VALUES (Ground Truth):
   ✅ Security: personal
   ✅ Universalism: objectivity

🆔 ID: A25015
📢 CONCLUSION: Payday loans should be banned
⚖️ STANCE: in favor of
📝 PREMISE: payday loans allow people to spend money they do not have yet and then they have to pay interest on the loan.  this could cause them to need another loan to get through the next pay period.
------------------------------


**Prepare features and targets**

In [7]:
import numpy as np
from sklearn.model_selection import train_test_split

# 1. Build the model input (X)
# Conclusion, stance and premise are joined with single spaces so that every
# example carries its full context in one string. The result is stored as a
# new 'text' column on trainval_df.
trainval_df['text'] = (
    trainval_df['Conclusion']
    + " " + trainval_df['Stance']
    + " " + trainval_df['Premise']
)

# 2. Target label columns (y) -- 19 entries
# NOTE(review): the example output earlier in this notebook shows a
# 'Universalism: objectivity' label column that is not in this list --
# confirm the omission is intentional.
label_cols = [
    'Self-direction: thought',
    'Self-direction: action',
    'Stimulation',
    'Hedonism',
    'Achievement',
    'Power: dominance',
    'Power: resources',
    'Face',
    'Security: personal',
    'Security: societal',
    'Tradition',
    'Conformity: rules',
    'Conformity: interpersonal',
    'Humility',
    'Benevolence: caring',
    'Benevolence: dependability',
    'Universalism: concern',
    'Universalism: nature',
    'Universalism: tolerance',
]

X_raw = trainval_df['text'].values
y_raw = trainval_df[label_cols].values

print(
    f"Features shape: {X_raw.shape}",
    f"Labels shape: {y_raw.shape}",
    sep="\n",
)

# 3. Hold-out split (80/20, NOT stratified)
# The targets are multi-label, so sklearn's `stratify=` is not used here (the
# original author noted it often fails when a label combination is rare).
# An iterative multi-label stratifier would need an extra package; this cell
# uses a plain random split with a fixed seed for reproducibility.
# Note: X_test / y_test are carved out of the combined train+validation data;
# they are not the official test split loaded into df_test earlier.
X_train, X_test, y_train, y_test = train_test_split(
    X_raw,
    y_raw,
    test_size=0.2,
    random_state=42,
)

print(
    f"Training set: {X_train.shape[0]} examples",
    f"Test set: {X_test.shape[0]} examples",
    sep="\n",
)

Features shape: (7289,)
Labels shape: (7289, 19)
Training set: 5831 examples
Test set: 1458 examples
