In [1]:
import pandas as pd


In [None]:
# Load the census table and separate the prediction target from the features.
adult_census = pd.read_csv("../datasets/adult-census.csv")
target = adult_census["class"]

# Keep only integer/float columns, then discard "education-num".
data = (
    adult_census
    .select_dtypes(["integer", "floating"])
    .drop(columns=["education-num"])
)
data

In [None]:
# Numerical-only model: standardize the features, then fit a logistic regression.
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_validate

mdl = Pipeline(
    steps=[
        ("preprocessing", StandardScaler()),
        ("logistic", LogisticRegression()),
    ]
)
# Ask the pipeline's transformers to emit DataFrames instead of arrays.
mdl.set_output(transform="pandas")

# 10-fold cross-validation, keeping the fitted pipeline and train score of each fold.
cv_results = cross_validate(
    mdl,
    data,
    target,
    cv=10,
    return_train_score=True,
    return_estimator=True,
)

pd.DataFrame(cv_results)

In [None]:
# Show the fitted pipelines kept by cross_validate (return_estimator=True).
cv_results["estimator"]

What is the most important feature seen by the logistic regression?

You can use a boxplot to compare the absolute values of the coefficients while
also visualizing the variability induced by the cross-validation resampling.

In [None]:
# Write your code here.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Feature names as recorded by the last pipeline step (the logistic
# regression) of the estimator fitted on the first fold.
model_first_fold = cv_results['estimator'][0]
feature_names = model_first_fold[-1].feature_names_in_

# Collect the coefficient array of the last step of every fold's estimator and
# stack them row-wise: one row per cross-validation fold, one column per feature.
coefs = [est[-1].coef_ for est in cv_results['estimator']]
pd_coefs = pd.DataFrame(np.concatenate(coefs), columns=feature_names)

# Feature importance is judged on coefficient magnitude, so the boxplot shows
# absolute values; `pd_coefs` itself keeps the signed coefficients.
fig, ax = plt.subplots()
pd_coefs.abs().plot.box(vert=False, ax=ax)
ax.set_xlabel("Absolute coefficient value")
ax.set_title("Logistic regression coefficients across cross-validation folds")
plt.show()

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
import numpy as np

# Placeholder column names for a generic DataFrame; replace with real ones.
numerical_features = ["Integer_Col", "Float_Col", "Another_Numeric"]
categorical_nominal = ["Unordered_Char_Col1", "Unordered_Char_Col2"]  # one-hot encoded
categorical_ordinal = ["Ordered_Char_Col"]  # ordinal encoded

# Numerical columns: fill missing values with the column mean, then standardize.
numerical_transformer = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="mean")),
        ("scaler", StandardScaler()),
    ]
)

# Nominal columns: replace missing values with the literal "missing", then
# one-hot encode into a dense array, ignoring categories unseen during fit.
nominal_transformer = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="constant", fill_value="missing")),
        ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
    ]
)

# Ordinal columns: same imputation, then integer codes with -1 for categories
# unseen during fit. The category order is inferred here; pass an explicit
# order such as categories=[['Low', 'Medium', 'High']] when it matters.
ordinal_transformer = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="constant", fill_value="missing")),
        (
            "ordinal",
            OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1),
        ),
    ]
)

In [None]:
from sklearn.compose import ColumnTransformer

# Route each group of columns to its own transformer. Every entry is a
# (name, transformer, columns) tuple; columns not listed in any entry are
# forwarded unchanged because of remainder="passthrough".
column_routes = [
    ("num", numerical_transformer, numerical_features),
    ("cat_nom", nominal_transformer, categorical_nominal),
    ("cat_ord", ordinal_transformer, categorical_ordinal),
]
preprocessor = ColumnTransformer(column_routes, remainder="passthrough")
del column_routes  # helper list only; keep the notebook namespace unchanged

In [None]:
from sklearn.linear_model import LogisticRegression

# Example: Combine preprocessing with a Logistic Regression model
full_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(solver='liblinear'))
])

# The template `preprocessor` is configured with placeholder column names, and
# no X_train / y_train / X_test exist in this notebook. Build a tiny toy frame
# with those placeholder columns so the example executes on a fresh kernel
# instead of raising NameError.
toy_features = pd.DataFrame({
    'Integer_Col': [1, 2, 3, 4, 5, 6, 7, 8],
    'Float_Col': [0.5, 1.5, 2.5, 3.5, 4.5, 5.5, 6.5, 7.5],
    'Another_Numeric': [10.0, 20.0, 30.0, 40.0, 50.0, 60.0, 70.0, 80.0],
    'Unordered_Char_Col1': ['a', 'b', 'a', 'b', 'a', 'b', 'a', 'b'],
    'Unordered_Char_Col2': ['x', 'y', 'y', 'x', 'x', 'y', 'y', 'x'],
    'Ordered_Char_Col': ['Low', 'Medium', 'High', 'Low', 'Medium', 'High', 'Low', 'Medium'],
})
toy_target = pd.Series([0, 1, 0, 1, 0, 1, 0, 1], name='label')

# First six rows for training (both classes present), last two for testing.
X_train, X_test = toy_features.iloc[:6], toy_features.iloc[6:]
y_train = toy_target.iloc[:6]

# Train the entire process on the raw data (X_train, y_train)
full_pipeline.fit(X_train, y_train)

# To make predictions, just pass the raw test data (X_test)
y_pred = full_pipeline.predict(X_test)

Create a predictive model where:
- The numerical data must be scaled.
- The categorical data must be one-hot encoded; set `min_frequency=0.01` to
  group categories that account for less than 1% of the total samples.
- The predictor is a `LogisticRegression`. You may need to increase
  `max_iter`, which is 100 by default.

Use the same 10-fold cross-validation strategy with `return_estimator=True` as
above to evaluate this complex pipeline.

In [None]:
# Write your code here.
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate

# `data` was restricted to integer/float columns when it was built, so the
# categorical columns used below are not in it. Rebuild the feature table from
# the raw frame, dropping the target and "education-num" (as done for `data`).
data_full = adult_census.drop(columns=["class", "education-num"])

categorical_nominal = ['education', 'sex', 'workclass', 'marital-status', 'occupation', 'relationship', 'race', 'native-country']
# Scale every numerical column rather than a hand-picked subset; a numerical
# column left out of this list would reach the classifier unscaled through
# remainder='passthrough'.
numerical_feature = data_full.select_dtypes(["integer", "floating"]).columns.tolist()

col_trans = ColumnTransformer(
    transformers = [
        ('numerical', StandardScaler(), numerical_feature),
        # min_frequency=0.01 groups categories seen in less than 1% of the
        # samples; handle_unknown='ignore' prevents a failure when a category
        # shows up only in the test split of a fold.
        ('categorical', OneHotEncoder(handle_unknown='ignore', min_frequency=0.01), categorical_nominal)
    ],
    remainder='passthrough'
)

# NOTE: `mdl` and `cv_results` are rebound here, replacing the numerical-only
# pipeline and scores computed in the earlier cell.
mdl = Pipeline(
    [
        ('col_trans', col_trans),
        ('logistic', LogisticRegression(max_iter=250))
    ]
)

cv_results = cross_validate(
    estimator = mdl,
    X = data_full,
    y = target,
    cv = 10,
    return_estimator = True,
    return_train_score = True
)


By comparing the cross-validation test scores of both models fold-to-fold,
count the number of times the model using both numerical and categorical
features has a better test score than the model using only numerical features.

In [None]:
# Write your code here.

For the following questions, you can copy and paste the following snippet to
get the feature names from the column transformer here named `preprocessor`.

```python
preprocessor.fit(data)
feature_names = (
    preprocessor.named_transformers_["onehotencoder"].get_feature_names_out(
        categorical_columns
    )
).tolist()
feature_names += numerical_columns
feature_names
```

In [None]:
# Write your code here.