In [1]:
import pandas as pd
import os
from sklearn.model_selection import train_test_split

In [3]:
# Prepare train/test data
# `task` selects the dataset to load:
#   1 -> every .tsv file under train/orientation, concatenated
#   2 -> the single file train/power/power-es-train.tsv
# Both branches produce X (feature columns) and y (the 'label' column).
task = 1

if task == 1:
    folder_path = "train/orientation"
    all_data = []

    # Iterate over all .tsv files in the folder.
    # os.listdir returns entries in arbitrary order; sorting keeps the
    # concatenated row order (and therefore the seeded split) reproducible.
    for file_name in sorted(os.listdir(folder_path)):
        if file_name.endswith('.tsv'):
            file_path = os.path.join(folder_path, file_name)
            # Load the current TSV file
            df = pd.read_csv(file_path, sep='\t')
            all_data.append(df)

    # Fail with a clear message instead of pd.concat's
    # "No objects to concatenate" when the folder holds no TSV files.
    if not all_data:
        raise FileNotFoundError(f"No .tsv files found in {folder_path!r}")

    # Combine all loaded data into a single DataFrame
    combined_data = pd.concat(all_data, ignore_index=True)

    # Rows without a label cannot be used for a stratified split
    combined_data = combined_data.dropna(subset=['label'])

    X = combined_data[['id', 'speaker', 'sex','text','text_en']]
    y = combined_data['label']

elif task == 2:
    data_path = "train/power/power-es-train.tsv"

    df = pd.read_csv(data_path, sep='\t')

    # Drop rows missing either the label or the English text, then
    # renumber the index so it is contiguous again.
    df = df.dropna(subset=['label', 'text_en'])
    df = df.reset_index(drop=True)

    X = df[['id', 'speaker', 'sex','text','text_en']]
    y = df['label']

else:
    # Without this branch an unsupported value only surfaces later as a
    # NameError on X, far from the actual mistake.
    raise ValueError(f"Unsupported task {task!r}; expected 1 or 2")

# Perform stratified split: hold out 10% of the rows for testing while
# keeping the class proportions of y in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.1,  # 10% for testing
    random_state=42,  # For reproducibility
    stratify=y  # Ensure the proportion of classes is maintained
)

# Re-attach the labels. assign() builds new frames (aligned on the index,
# exactly like column assignment) instead of writing into the frames returned
# by the split, which can trigger pandas' SettingWithCopyWarning.
X_train = X_train.assign(label=y_train)
X_test = X_test.assign(label=y_test)

# Name the output files after the active task so a task 2 (power) run no
# longer writes into the orientation files. Task 1 file names are unchanged.
task_name = {1: "orientation", 2: "power"}.get(task, f"task{task}")
X_train.to_csv(f"train_data_{task_name}.tsv", sep='\t', index=False)
X_test.to_csv(f"test_data_{task_name}.tsv", sep='\t', index=False)