In [2]:
import numpy as np
import pandas as pd

In [3]:
# Importation of the dataset
X_train = pd.read_csv("X_train.csv")
y_train = pd.read_csv("y_train.csv")

In [3]:
# Tronquer y_train pour correspondre à X_train
y_train = y_train[:len(X_train)]
y_train = y_train.iloc[:, 0]  # prendre seulement la première colonne

# Vérifier
print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)

X_train shape: (1172086, 307)
y_train shape: (1172086,)


In [4]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [None]:
# 0 : on ne prend que la profession de la mère

# Identifier les lignes sans NaN dans X_test['ma_var']
mask = X_train['OCOD1'].notna()

# Filtrer X_test et y_test
X_train_filtered = X_train[mask]
y_train_filtered = y_train[mask]

X_train_mother = X_train_filtered["OCOD1"].copy()
y_train_mother = y_train_filtered

print("X_test_filtered shape:", X_train_mother.shape)
print("y_test_filtered shape:", y_train_mother.shape)

X_train_mother = X_train_mother.to_frame()

X_test_filtered shape: (823967,)
y_test_filtered shape: (823967,)


In [6]:
# --- 1. Assurer que y_train est 1D ---
if isinstance(y_train_mother, pd.DataFrame):
    y_train_mother = y_train_mother.iloc[:, 0]  # prendre la première colonne si y est un DataFrame
elif isinstance(y_train_mother, np.ndarray) and y_train_mother.ndim > 1:
    y_train_mother = y_train_mother.ravel()     # convertir en 1D si c'est un array NumPy

print("y_train shape:", y_train_mother.shape)  # Vérification

y_train shape: (823967,)


In [7]:
# --- 2. Séparer automatiquement les colonnes numériques et catégorielles ---
cat_columns_mothers = X_train_mother.select_dtypes(include='object').columns.tolist()
num_columns_mothers = X_train_mother.select_dtypes(include=['int64', 'float64']).columns.tolist()

In [10]:
# --- 3. Pipelines de prétraitement ---

if X_tr_mother.shape[1] == 1:
    col_name = X_tr_mother.columns[0]
    if X_tr_mother.dtypes[0] == 'object':
        cat_columns_mothers = [col_name]
        num_columns_mothers = []
    else:
        cat_columns_mothers = []
        num_columns_mothers = [col_name]

num_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

cat_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_transformer, num_columns_mothers),
        ('cat', cat_transformer, cat_columns_mothers)
    ],
    remainder='drop'  # pas de colonne restante
)

  if X_tr_mother.dtypes[0] == 'object':


In [9]:
# --- 4. Split train/test ---
X_tr_mother, X_test_mother, y_tr_mother, y_test_mother = train_test_split(X_train_mother, y_train_mother, test_size=0.2, random_state=42)

In [11]:
# --- 5. Pipeline complète avec LogisticRegression ---
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(max_iter=200))
])

In [12]:
# Si X_tr_mother est une Series
if isinstance(X_tr_mother, pd.Series):
    X_tr_mother = X_tr_mother.to_frame()

In [1]:
print("Type de X_tr_mother:", type(X_tr_mother))
print("Shape de X_tr_mother:", X_tr_mother.shape)
print("Type de y_tr_mother:", type(y_tr_mother))
print("Shape de y_tr_mother:", getattr(y_tr_mother, 'shape', 'N/A'))

print("Nombre de catégories uniques:", X_tr_mother.iloc[:, 0].nunique())

print("NaN dans X_tr_mother:", X_tr_mother.isna().sum())
print("NaN dans y_tr_mother:", y_tr_mother.isna().sum())

NameError: name 'X_tr_mother' is not defined

In [None]:
# --- 6. Entraîner le modèle ---
pipeline.fit(X_tr_mother, y_tr_mother)

In [None]:
# --- 7. Évaluer ---
score = pipeline.score(X_test, y_test)
print("Test set accuracy:", score)