In [80]:
import pandas as pd
from plotly.subplots import make_subplots
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_transformer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

In [2]:
# Load the student dataset; the relative path is resolved against the working directory.
DATA_FILE = "student-por.csv"
df = pd.read_csv(DATA_FILE)

In [15]:
# Work on a small sample: the first ten rows, restricted to a few columns.
sample_columns = ["school", "sex", "age", "Medu", "G3"]
dt = df[sample_columns].head(10)

In [89]:
# Display the ten-row sample.
dt

Unnamed: 0,school,sex,age,Medu,G3
0,GP,F,18,4,11
1,GP,F,17,1,11
2,GP,F,15,1,12
3,GP,F,15,4,14
4,GP,F,16,3,13
5,GP,M,16,4,13
6,GP,M,16,2,13
7,GP,F,17,4,13
8,GP,M,15,3,17
9,GP,M,15,3,13


In [24]:
# Split the sample into the target (G3) and the remaining feature columns.
target_column = "G3"
y = dt[target_column]
X = dt.drop(columns=target_column)

In [40]:
# Display the feature columns (the sample without the G3 target).
X

Unnamed: 0,school,sex,age,Medu
0,GP,F,18,4
1,GP,F,17,1
2,GP,F,15,1
3,GP,F,15,4
4,GP,F,16,3
5,GP,M,16,4
6,GP,M,16,2
7,GP,F,17,4
8,GP,M,15,3
9,GP,M,15,3


<h2>First approach</h2>

In [98]:

# Column groups handled by each preprocessing step.
categorical_cols = ["school", "sex"]
numeric_cols = ["age", "Medu"]

ohe = OneHotEncoder()
std = StandardScaler()
ct = make_column_transformer(
    (ohe, categorical_cols),  # one-hot encode the categorical variables only
    (std, numeric_cols),  # standardise the numerical variables only
    remainder="passthrough",  # columns not listed above are passed through unchanged
)
ct.fit(X)

ColumnTransformer(remainder='passthrough',
                  transformers=[('onehotencoder', OneHotEncoder(),
                                 ['school', 'sex']),
                                ('standardscaler', StandardScaler(),
                                 ['age', 'Medu'])])

In [70]:
# Here only age and Medu are standardised; the one-hot columns stay as 0/1.
# The goal, however, is to apply the standard scaler to the whole dataset.
ct.fit_transform(X)

array([[ 1.        ,  1.        ,  0.        ,  2.        ,  0.968496  ],
       [ 1.        ,  1.        ,  0.        ,  1.        , -1.67285672],
       [ 1.        ,  1.        ,  0.        , -1.        , -1.67285672],
       [ 1.        ,  1.        ,  0.        , -1.        ,  0.968496  ],
       [ 1.        ,  1.        ,  0.        ,  0.        ,  0.08804509],
       [ 1.        ,  0.        ,  1.        ,  0.        ,  0.968496  ],
       [ 1.        ,  0.        ,  1.        ,  0.        , -0.79240582],
       [ 1.        ,  1.        ,  0.        ,  1.        ,  0.968496  ],
       [ 1.        ,  0.        ,  1.        , -1.        ,  0.08804509],
       [ 1.        ,  0.        ,  1.        , -1.        ,  0.08804509]])

<h1>With pipeline</h1>

In [78]:
# One-hot encode the categorical columns; every other column passes through unchanged.
onehot_columns = ["school", "sex"]
ctt = make_column_transformer(
    (ohe, onehot_columns),
    remainder="passthrough",
)

In [83]:
# Separate the data into a training set and a testing set (20% held out, fixed seed).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=10,
)
# Print the shape of each split on a single line.
print(*(part.shape for part in (X_train, X_test, y_train, y_test)))

(8, 4) (2, 4) (8,) (2,)


In [97]:
# Chain the two steps: ctt runs first and one-hot encodes school and sex
# (the other columns pass through), then StandardScaler standardises every
# resulting column.
pipe = make_pipeline(ctt, StandardScaler())

# Fit the pipeline on the training split and show the transformed training features.
pipe.fit_transform(X_train)

array([[ 0.        , -1.29099445,  1.29099445, -0.25819889,  0.83074716],
       [ 0.        , -1.29099445,  1.29099445, -0.25819889, -1.06810349],
       [ 0.        ,  0.77459667, -0.77459667, -1.29099445,  0.83074716],
       [ 0.        ,  0.77459667, -0.77459667,  0.77459667, -2.01752882],
       [ 0.        ,  0.77459667, -0.77459667,  1.80739223,  0.83074716],
       [ 0.        ,  0.77459667, -0.77459667,  0.77459667,  0.83074716],
       [ 0.        ,  0.77459667, -0.77459667, -0.25819889, -0.11867817],
       [ 0.        , -1.29099445,  1.29099445, -1.29099445, -0.11867817]])