In [63]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression

In [None]:
# --- 1. Importer le dataset ---

X_train = pd.read_csv('X_train.csv')
y_train = pd.read_csv('y_train.csv')


In [84]:
y_train = y_train["MathScore"].copy()

In [92]:
# --- 2. Séparer un échantillon pour test rapide ---

# Ici 10 000 lignes pour tester rapidement

X_small, X_test_small, y_small, y_test_small = train_test_split(
    X_train, y_train,
    train_size=0.8,
    test_size=0.2,
    random_state=42
)

In [93]:
# --- 3. Détecter automatiquement les colonnes numériques et catégorielles ---

cat_columns = X_small.select_dtypes(include='object').columns.tolist()
num_columns = X_small.select_dtypes(include=['int64','float64']).columns.tolist()

In [94]:
# --- 4. Créer les pipelines de prétraitement ---

num_transformer = Pipeline([
('imputer', SimpleImputer(strategy='mean')),
('scaler', StandardScaler())
])

cat_transformer = Pipeline([
('imputer', SimpleImputer(strategy='most_frequent')),
('ordinal', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))
])

preprocessor = ColumnTransformer(
transformers=[
('num', num_transformer, num_columns),
('cat', cat_transformer, cat_columns)
],
remainder='drop'
)



In [95]:
# --- 5. Pipeline complet avec LogisticRegression ---

pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

In [96]:
pipeline.fit(X_small, y_small)
r2_score = pipeline.score(X_test_small, y_test_small)
print("R^2 score:", r2_score)

 'math_q19_average_score' 'math_q20_average_score'
 'math_q21_average_score' 'science_q13_average_score'
 'science_q14_average_score' 'science_q15_average_score'
 'science_q16_average_score' 'science_q17_average_score'
 'science_q18_average_score' 'science_q19_average_score'
 'math_q17_total_timing' 'math_q18_total_timing' 'math_q19_total_timing'
 'math_q20_total_timing' 'math_q21_total_timing'
 'science_q13_total_timing' 'science_q14_total_timing'
 'science_q15_total_timing' 'science_q16_total_timing'
 'science_q17_total_timing' 'science_q18_total_timing'
 'science_q19_total_timing']. At least one non-missing value is needed for imputation with strategy='mean'.
 'math_q19_average_score' 'math_q20_average_score'
 'math_q21_average_score' 'science_q13_average_score'
 'science_q14_average_score' 'science_q15_average_score'
 'science_q16_average_score' 'science_q17_average_score'
 'science_q18_average_score' 'science_q19_average_score'
 'math_q17_total_timing' 'math_q18_total_timing' 'mat

R^2 score: 0.3655630516062762
