## Data preparation

In [1]:
import pandas as pd

In [2]:
# Load the raw census table; the path is relative to the notebook's folder.
adult_census = pd.read_csv("data/adult-census.csv")

In [3]:
from sklearn import set_config

In [4]:
# Render estimators as an interactive HTML diagram instead of a text repr.
set_config(display="diagram")

In [5]:
# Separate the label column from the feature columns.
target_name = "class"
data = adult_census.drop(columns=[target_name])
target = adult_census[target_name]

In [6]:
# Keep only the numerical features; these are the ones that get scaled below.
numerical_columns = [
    "age",
    "capital-gain",
    "capital-loss",
    "hours-per-week",
]
data_numeric = data[numerical_columns]

In [7]:
from sklearn.model_selection import train_test_split

In [8]:
# Hold out a test set; the fixed random_state keeps the shuffle reproducible.
# No test_size is passed, so scikit-learn's default split proportion applies.
data_train, data_test, target_train, target_test = train_test_split(
    data_numeric,
    target,
    random_state=42,
)

## Model fitting with preprocessing

In [9]:
# Summary statistics per feature. Note how different the ranges are
# (capital-gain goes up to 99999 while age tops out at 90), which is what
# motivates scaling before fitting a linear model.
data_train.describe()

Unnamed: 0,age,capital-gain,capital-loss,hours-per-week
count,36631.0,36631.0,36631.0,36631.0
mean,38.642352,1087.077721,89.665311,40.431247
std,13.725748,7522.692939,407.110175,12.423952
min,17.0,0.0,0.0,1.0
25%,28.0,0.0,0.0,40.0
50%,37.0,0.0,0.0,40.0
75%,48.0,0.0,0.0,45.0
max,90.0,99999.0,4356.0,99.0


In [10]:
from sklearn.preprocessing import StandardScaler

In [11]:
# Learn the per-column statistics from the training set only, so that no
# information from the test set leaks into the preprocessing.
scaler = StandardScaler()
scaler.fit(data_train)

In [12]:
# Per-feature means learned by fit(), in column order
# (age, capital-gain, capital-loss, hours-per-week).
scaler.mean_

array([  38.64235211, 1087.07772106,   89.6653108 ,   40.43124676])

In [13]:
# Per-feature scale learned by fit(); the values are very close to the
# standard deviations reported by data_train.describe().
scaler.scale_

array([  13.72556083, 7522.59025606,  407.10461772,   12.42378265])

In [14]:
# Apply the already-fitted scaler. The result is a plain NumPy array, so the
# column names of data_train are not carried over.
data_train_scaled = scaler.transform(data_train)
data_train_scaled

array([[ 0.17177061, -0.14450843,  5.71188483, -2.28845333],
       [ 0.02605707, -0.14450843, -0.22025127, -0.27618374],
       [-0.33822677, -0.14450843, -0.22025127,  0.77019645],
       ...,
       [-0.77536738, -0.14450843, -0.22025127, -0.03471139],
       [ 0.53605445, -0.14450843, -0.22025127, -0.03471139],
       [ 1.48319243, -0.14450843, -0.22025127, -2.69090725]])

In [15]:
# fit_transform() does the fit and the transform in a single call; the output
# is identical to calling fit() and then transform() as done previously.
data_train_scaled = scaler.fit_transform(data_train)
data_train_scaled

array([[ 0.17177061, -0.14450843,  5.71188483, -2.28845333],
       [ 0.02605707, -0.14450843, -0.22025127, -0.27618374],
       [-0.33822677, -0.14450843, -0.22025127,  0.77019645],
       ...,
       [-0.77536738, -0.14450843, -0.22025127, -0.03471139],
       [ 0.53605445, -0.14450843, -0.22025127, -0.03471139],
       [ 1.48319243, -0.14450843, -0.22025127, -2.69090725]])

In [16]:
# Wrap the scaled NumPy array back into a DataFrame so the original column
# names are available again, then check the summary statistics: after
# standardization every feature should have a mean of ~0 and a std of ~1.
data_train_scaled = pd.DataFrame(
    data_train_scaled,
    columns=data_train.columns,
)
data_train_scaled.describe()

Unnamed: 0,age,capital-gain,capital-loss,hours-per-week
count,36631.0,36631.0,36631.0,36631.0
mean,-1.263553e-16,-1.708425e-15,-1.652358e-15,1.146502e-16
std,1.000014,1.000014,1.000014,1.000014
min,-1.576792,-0.1445084,-0.2202513,-3.173852
25%,-0.7753674,-0.1445084,-0.2202513,-0.03471139
50%,-0.1196565,-0.1445084,-0.2202513,-0.03471139
75%,0.681768,-0.1445084,-0.2202513,0.3677425
max,3.741752,13.14865,10.4797,4.714245


In [17]:
import time
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline

In [18]:
# Chain the scaler and the classifier so both are fitted and applied together.
model = make_pipeline(
    StandardScaler(),
    LogisticRegression(),
)
model

In [19]:
# make_pipeline names each step automatically after its lowercased class name.
model.named_steps

{'standardscaler': StandardScaler(),
 'logisticregression': LogisticRegression()}

In [20]:
# Time the fit with a monotonic, high-resolution clock. time.time() follows
# the system wall clock, which can be adjusted while the cell is running and
# has coarser resolution on some platforms; perf_counter() is the clock
# intended for measuring short intervals such as this one.
start = time.perf_counter()
model.fit(data_train, target_train)
elapsed_time = time.perf_counter() - start

In [21]:
# Predict on the held-out test set (the pipeline applies the scaler first)
# and peek at the first five predicted labels.
predicted_target = model.predict(data_test)
predicted_target[:5]

array([' <=50K', ' <=50K', ' >50K', ' <=50K', ' <=50K'], dtype=object)

In [22]:
# Report test accuracy together with the fit duration and the number of
# solver iterations; model[-1] is the final pipeline step (the classifier).
score = model.score(data_test, target_test)
model_name = type(model).__name__
print(
    f"The accuracy using a {model_name} is {score:.3f} "
    f"with a fitting time of {elapsed_time:.3f} seconds "
    f"in {model[-1].n_iter_[0]} iterations"
)

The accuracy using a Pipeline is 0.807 with a fitting time of 0.076 seconds in 12 iterations


In [23]:
# Baseline for comparison: the same classifier fitted on the unscaled data.
# Timed with perf_counter(), a monotonic high-resolution clock, rather than
# time.time(), which tracks the adjustable system wall clock.
model = LogisticRegression()
start = time.perf_counter()
model.fit(data_train, target_train)
elapsed_time = time.perf_counter() - start

In [24]:
# Same report as for the pipeline, this time for the unscaled classifier, so
# the fitting time and the iteration count can be compared directly.
score = model.score(data_test, target_test)
model_name = type(model).__name__
print(
    f"The accuracy using a {model_name} is {score:.3f} "
    f"with a fitting time of {elapsed_time:.3f} seconds "
    f"in {model.n_iter_[0]} iterations"
)

The accuracy using a LogisticRegression is 0.807 with a fitting time of 0.193 seconds in 59 iterations


## Model evaluation using cross-validation

In [26]:
%%time
from sklearn.model_selection import cross_validate

# Cross-validate the whole pipeline (scaler + classifier) on the full numeric
# dataset with 5 folds, so the scaler is fitted as part of each fold's model.
model = make_pipeline(
    StandardScaler(),
    LogisticRegression(),
)
cv_result = cross_validate(model, data_numeric, target, cv=5)
cv_result

Wall time: 501 ms


{'fit_time': array([0.07867312, 0.07352448, 0.07422781, 0.0729568 , 0.07420874]),
 'score_time': array([0.01301384, 0.01396108, 0.01369619, 0.01303363, 0.01303363]),
 'test_score': array([0.79557785, 0.80049135, 0.79965192, 0.79873055, 0.80436118])}

In [27]:
# Summarise the per-fold test accuracies as mean +/- standard deviation.
scores = cv_result["test_score"]
print(
    "The mean cross-validation accuracy is: "
    f"{scores.mean():.3f} +/- {scores.std():.3f}"
)

The mean cross-validation accuracy is: 0.800 +/- 0.003


In this notebook we have:
<ul>
    <li>seen the importance of scaling numerical variables;</li>
    <li>used a pipeline to chain scaling and logistic regression training;</li>
    <li>assessed the generalization performance of our model via cross-validation.</li>
</ul>