In [2]:
import pandas as pd
# Names applied positionally to the columns of the data file:
# 20 attribute columns followed by the label column ("target").
cols = [
    "status",
    "duration",
    "credit_history",
    "purpose",
    "amount",
    "savings",
    "employment",
    "installment_rate",
    "personal_status",
    "guarantors",
    "residence",
    "property",
    "age",
    "other_installment",
    "housing",
    "credits",
    "job",
    "dependents",
    "phone",
    "foreign_worker",
    "target",
]

# The file is read as space-delimited with no header row; `names` supplies
# the column labels (header=None is what pandas already infers when `names`
# is given, it is just spelled out here).
df = pd.read_csv("../data/german.data", sep=" ", header=None, names=cols)
df.head()

Unnamed: 0,status,duration,credit_history,purpose,amount,savings,employment,installment_rate,personal_status,guarantors,...,property,age,other_installment,housing,credits,job,dependents,phone,foreign_worker,target
0,A11,6,A34,A43,1169,A65,A75,4,A93,A101,...,A121,67,A143,A152,2,A173,1,A192,A201,1
1,A12,48,A32,A43,5951,A61,A73,2,A92,A101,...,A121,22,A143,A152,1,A173,1,A191,A201,2
2,A14,12,A34,A46,2096,A61,A74,2,A93,A101,...,A121,49,A143,A152,1,A172,2,A191,A201,1
3,A11,42,A32,A42,7882,A61,A74,2,A93,A103,...,A122,45,A143,A153,1,A173,2,A191,A201,1
4,A11,24,A33,A40,4870,A61,A73,3,A93,A101,...,A124,53,A143,A153,2,A173,2,A191,A201,2


In [3]:
# Class balance of the raw label, reported as fractions instead of counts.
target_share = df["target"].value_counts(normalize=True)
target_share

target
1    0.7
2    0.3
Name: proportion, dtype: float64

In [8]:
# Recode the raw label {1, 2} into a 0/1 target.
# NOTE(review): presumably 1 = good credit and 2 = bad credit, so the positive
# class (1) is "bad" — confirm against the dataset documentation.
target_recoded = df["target"].map({1: 0, 2: 1})

# Series.map returns NaN for any value missing from the mapping. Fail loudly
# here rather than letting NaN labels flow into the train/test split.
unmapped = target_recoded.isna()
if unmapped.any():
    unexpected = df.loc[unmapped, "target"].unique().tolist()
    raise ValueError(f"Unexpected values in 'target': {unexpected}")

# All labels are mapped at this point, so the column can be a plain integer.
df["target_binary"] = target_recoded.astype("int64")

In [9]:
# Spot-check the recoding: raw label and binary label side by side.
label_preview_cols = ["target", "target_binary"]
df[label_preview_cols].head()

Unnamed: 0,target,target_binary
0,1,0
1,2,1
2,1,0
3,1,0
4,2,1


In [10]:
# Target vector: the 0/1 recoded label.
y = df["target_binary"]

# Feature matrix: every column except the raw label and its recoded copy,
# so no label information remains among the features.
label_cols = ["target", "target_binary"]
X = df.drop(columns=label_cols)

X.shape, y.shape

((1000, 20), (1000,))

In [11]:
# Split the feature columns by dtype.
#
# Selecting on "number" instead of "object" keeps this working on both
# pandas 2 (string columns are `object`) and pandas 3 (string columns are
# `str`), and avoids the string-migration warning that
# select_dtypes(include="object") raises on pandas 3.
#
# Note: everything that is not numeric lands in cat_cols (this would include
# bool or datetime columns if the frame had any).
num_cols = X.select_dtypes(include="number").columns
cat_cols = X.select_dtypes(exclude="number").columns

cat_cols, num_cols

See https://pandas.pydata.org/docs/user_guide/migration-3-strings.html#string-migration-select-dtypes for details on how to write code that works with pandas 2 and 3.
  cat_cols = X.select_dtypes(include="object").columns


(Index(['status', 'credit_history', 'purpose', 'savings', 'employment',
        'personal_status', 'guarantors', 'property', 'other_installment',
        'housing', 'job', 'phone', 'foreign_worker'],
       dtype='str'),
 Index(['duration', 'amount', 'installment_rate', 'residence', 'age', 'credits',
        'dependents'],
       dtype='str'))

In [12]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [13]:
# Numeric columns are standardised.
numeric_step = ("num", StandardScaler(), num_cols)

# Categorical columns are one-hot encoded. handle_unknown="ignore" makes the
# encoder emit an all-zero block for categories it did not see during fit
# instead of raising at transform time.
categorical_step = ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols)

preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])

In [14]:
# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions the same in both parts; the fixed seed makes the split
# reproducible.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

tuple(part.shape for part in (X_train, X_test))

((800, 20), (200, 20))

In [15]:
# Learn scaling statistics and category vocabularies from the training split
# only, then apply the fitted transformer to the test split, so nothing from
# the test rows influences the preprocessing.
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

tuple(matrix.shape for matrix in (X_train_processed, X_test_processed))

((800, 61), (200, 61))