# Preprocessing

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import sklearn
import pandas as pd
# Show only non-default parameters when an estimator is printed.
sklearn.set_config(print_changed_only=True)

In [None]:
from sklearn.model_selection import train_test_split

# Load the Pima diabetes table and preview its first rows.
diabetes = pd.read_csv("data/pima_diabetes.csv")
print(diabetes.head())

# The 'class' column is the target; every other column is a feature.
y = diabetes['class']
X = diabetes.drop(columns='class')

# Hold out a test split; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

In [None]:
# Box plot of each feature column, to compare their raw value ranges.
X.boxplot()

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling parameters from the training split only,
# then apply them to that same split.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)

In [None]:
# Label the scaled values with the original column names, then box-plot them.
scaled_train_frame = pd.DataFrame(X_train_scaled, columns=X_train.columns)
scaled_train_frame.boxplot()


In [None]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours with default settings, trained on the unscaled features.
knr = KNeighborsClassifier()
knr.fit(X_train, y_train)

# Score on the training split itself.
knr.score(X_train, y_train)

In [None]:
# Score of the same k-NN model on the held-out test split.
knr.score(X_test, y_test)

In [None]:
# k-NN trained on the standardized training features. `fit` returns the
# fitted estimator, so a single chained call both creates and trains the model.
knr_scaled = KNeighborsClassifier().fit(X_train_scaled, y_train)

# Score on the (scaled) training split.
knr_scaled.score(X_train_scaled, y_train)

In [None]:
# Scale the test split with the already-fitted scaler (transform only, no refit).
X_test_scaled = scaler.transform(X_test)
# Score of the k-NN model trained on scaled data, evaluated on the scaled test split.
knr_scaled.score(X_test_scaled, y_test)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest on the unscaled features; the fixed seed makes the run repeatable.
rf = RandomForestClassifier(random_state=42).fit(X_train, y_train)
rf.score(X_test, y_test)

In [None]:
# Same forest configuration, trained and scored on the standardized features.
rf_scaled = RandomForestClassifier(random_state=42).fit(X_train_scaled, y_train)
rf_scaled.score(X_test_scaled, y_test)

# Categorical Variables

In [None]:
import pandas as pd

# Toy data set: one numeric column ('salary') and one string column ('boro'),
# written out row by row.
records = [
    (103, 'Manhattan'),
    (89, 'Queens'),
    (142, 'Manhattan'),
    (54, 'Brooklyn'),
    (63, 'Brooklyn'),
    (219, 'Bronx'),
]
df = pd.DataFrame(records, columns=['salary', 'boro'])
df

In [None]:
# One-hot encode the string column with pandas; the numeric column is kept as is.
pd.get_dummies(df)

In [None]:
from sklearn.compose import make_column_transformer
# StandardScaler is imported alongside OneHotEncoder because the column
# transformers built in this section use both.
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Boolean mask over the columns: True where the column dtype is `object`.
categorical = df.dtypes == object
categorical

In [None]:
# Inverted mask: True for the columns whose dtype is not `object`.
~categorical

In [None]:
# Pair each transformer with the boolean column mask it should act on.
encode_categorical = (OneHotEncoder(), categorical)
scale_remaining = (StandardScaler(), ~categorical)

ct = make_column_transformer(encode_categorical, scale_remaining)
ct.fit_transform(df)

In [None]:
# Ask the encoder for a dense array. The keyword is `sparse_output` from
# scikit-learn 1.2 on; the older spelling `sparse` was removed in 1.4, so fall
# back to it only when the new keyword is rejected.
try:
    dense_encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    dense_encoder = OneHotEncoder(sparse=False)

# Only the masked columns are listed, so only they are passed to the encoder.
ct = make_column_transformer((dense_encoder, categorical))
ct.fit_transform(df)

In [None]:
# One-hot encode the masked columns; the remaining columns are passed through.
one_hot_step = (OneHotEncoder(), categorical)
ct = make_column_transformer(one_hot_step, remainder='passthrough')
ct.fit_transform(df)

In [None]:
# One-hot encode the masked columns; the remaining columns go to a StandardScaler.
one_hot_step = (OneHotEncoder(), categorical)
ct = make_column_transformer(one_hot_step, remainder=StandardScaler())
ct.fit_transform(df)

# Exercises

## Exercise 1
Load the "adult" dataset, consisting of income data from the census, including information on whether someone has a salary of less than \$50k or more. Look at the data using the ``head`` method. Our final goal in Exercise 4 will be to classify entries into those making less than \$50k and those that make more.

## Exercise 2
Experiment with visualizing the data. Can you find out which features influence the income the most?

## Exercise 3
Separate the target variable from the features.
Split the data into training and test sets.
Apply dummy encoding and scaling.
How did this change the number of variables?

## Exercise 4
Build and evaluate a LogisticRegression model on the data.



In [None]:
# Read the adult census CSV, using its first column as the row index.
data = pd.read_csv("data/adult.csv", index_col=0)

In [None]:
# %load solutions/load_adult.py