# SPLIT_DATA_TRAIN_VAL_TEST

In [2]:
# Run parameters; all of them are passed to split_trainvaltest() further down.
# (presumably the papermill "parameters" cell -- papermill is imported below; confirm the cell tag)

# CSV file read by split_trainvaltest (loaded with header=None and fixed column names).
input_file = "data/input/training.1600000.processed.noemoticon.utf-8.csv"
# Directory that receives x_/y_ train, val and test CSV files (created if missing).
output_dir = "data/output/preprocessed/train_val_test/"
# Column names below refer to the names hard-coded at load time:
# ["id", "timestamp", "date", "query", "user", "tweet"].
text_column = "tweet"
# NOTE: the loaded column called "id" is the one used as the label
# (its value 4 is remapped to 1 before splitting).
label_column = "id"
# NOTE: the loaded column called "timestamp" is the one used as the row identifier
# (deduplicated, then saved under the name "id").
id_column = "timestamp"
# Fraction held out for the test split.
test_size = 0.005
# Fraction held out for validation; split_trainvaltest rescales it by 1 - test_size.
val_size = 0.07
# Seed passed to both train_test_split calls.
random_state = 42

In [4]:
import papermill as pm
import os
import pandas as pd
from sklearn.model_selection import train_test_split
import sys

# Logger setup
def get_logger(name):
    """Return the logger registered under ``name``, set to INFO and writing to stdout.

    A stdout handler with a timestamped format is attached only when the
    logger has no handler yet, so calling this again for the same name
    (for example when the cell is re-run) does not add a second handler.
    """
    import logging

    log = logging.getLogger(name)
    log.setLevel(logging.INFO)

    # Already configured: nothing more to attach.
    if log.handlers:
        return log

    stdout_handler = logging.StreamHandler(sys.stdout)
    line_format = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
    stdout_handler.setFormatter(line_format)
    log.addHandler(stdout_handler)
    return log

# Module-level logger used by the functions below
logger = get_logger(__name__)

def split_trainvaltest(input_path, output_path, text_column, label_column, id_column, test_size=0.2, val_size=0.1, random_state=42):
    """Split a headerless tweet CSV into stratified train/val/test sets and save them.

    The file is read with the fixed column names
    ``["id", "timestamp", "date", "query", "user", "tweet"]``. Label value 4 is
    remapped to 1, rows with a duplicated ``id_column`` or ``text_column`` are
    dropped, and the remaining rows are split twice with ``train_test_split``,
    stratifying on ``label_column`` each time.

    Six files are written to ``output_path`` (created if needed):
    ``x_train.csv``, ``x_val.csv``, ``x_test.csv`` with columns ``id, feature``
    and ``y_train.csv``, ``y_val.csv``, ``y_test.csv`` with columns ``id, label``.

    Args:
        input_path: Path of the CSV file to read.
        output_path: Directory receiving the partition files.
        text_column: Name of the column holding the text; saved as ``feature``.
        label_column: Name of the column holding the label; saved as ``label``.
        id_column: Name of the column identifying a row; saved as ``id``.
        test_size: Share of the deduplicated data held out for the test set.
        val_size: Share of the deduplicated data held out for validation. It is
            rescaled by ``1 - test_size`` because the second split only sees
            the train+val remainder.
        random_state: Seed passed to both splits.

    Raises:
        ValueError: If one of the three requested columns is not among the
            loaded column names.
    """
    logger.info(f"Chargement des données depuis : {input_path}")
    data = pd.read_csv(input_path, header=None, names=["id", "timestamp", "date", "query", "user", "tweet"])
    logger.info(f"Dataset chargé avec {len(data)} lignes.")

    # Make sure every requested column exists before touching the data.
    for column in (text_column, label_column, id_column):
        if column not in data.columns:
            logger.error(f"La colonne nécessaire '{column}' est absente.")
            raise ValueError(f"La colonne '{column}' est absente du fichier d'entrée.")

    # Remap label 4 to 1. This targets the column the caller designated as the
    # label (previously hard-coded to "id") and uses a vectorised replace
    # instead of a per-row Python lambda.
    data[label_column] = data[label_column].replace(4, 1)

    # Drop rows sharing an identifier, then rows sharing the same text.
    if not data[id_column].is_unique:
        logger.warning(f"La colonne '{id_column}' contient des IDs non uniques. Suppression des doublons...")
        data = data.drop_duplicates(subset=[id_column])

    data = data.drop_duplicates(subset=[text_column])

    # Stratified train/val/test split.
    os.makedirs(output_path, exist_ok=True)
    train_val, test = train_test_split(data, test_size=test_size, random_state=random_state, stratify=data[label_column])
    # val_size is relative to the whole dataset, so rescale it to the
    # train+val remainder that this second split operates on.
    train, val = train_test_split(train_val, test_size=val_size / (1 - test_size), random_state=random_state, stratify=train_val[label_column])

    # Save features and labels separately for each partition; both files carry
    # the identifier under the name "id".
    for split_name, partition in (("train", train), ("val", val), ("test", test)):
        features = partition[[id_column, text_column]].rename(columns={id_column: "id", text_column: "feature"})
        features.to_csv(os.path.join(output_path, f"x_{split_name}.csv"), index=False)

        labels = partition[[id_column, label_column]].rename(columns={id_column: "id", label_column: "label"})
        labels.to_csv(os.path.join(output_path, f"y_{split_name}.csv"), index=False)

    logger.info(f"Partitions sauvegardées dans {output_path}")


# Run the split with the parameters defined in the first cell.
split_trainvaltest(
    input_path=input_file,
    output_path=output_dir,
    text_column=text_column,
    label_column=label_column,
    id_column=id_column,
    test_size=test_size,
    val_size=val_size,
    random_state=random_state,
)

2025-02-02 12:35:34,276 - INFO - Chargement des données depuis : data/input/training.1600000.processed.noemoticon.utf-8.csv


FileNotFoundError: [Errno 2] No such file or directory: 'data/input/training.1600000.processed.noemoticon.utf-8.csv'