# Basics of Supervised Learning
- Learn from (input, output) pairs
- Generalize to new input, predict unknown output

In [None]:
import matplotlib.pyplot as plt
import numpy as np
%matplotlib notebook

In [None]:
from sklearn.datasets import load_digits
import numpy as np
# Load the digits dataset. The returned object is dict-like, so listing its
# keys shows which fields (data, target, ...) are available to explore.
digits = load_digits()
digits.keys()

In [None]:
# Dimensions of the feature matrix; each row is one flattened image
# (a row is reshaped to 8x8 when it is displayed).
digits.data.shape

In [None]:
# Shape of the label array that accompanies digits.data.
digits.target.shape

In [None]:
# Peek at the raw labels (paired row-by-row with digits.data).
digits.target

In [None]:
# Samples per class: bincount tallies how often each integer label occurs.
np.bincount(digits.target)

In [None]:
# Each sample is stored as a flat vector; reshape the first one to 8x8 to
# view it as an image.
plt.matshow(digits.data[0].reshape(8, 8), cmap=plt.cm.Greys)

In [None]:
# Label stored for sample 0 (the image drawn with matshow).
digits.target[0]

In [None]:
# Show the first 16 digits in a 4x4 grid, each titled with its label.
# zip stops at the shortest input, i.e. after the 16 axes are used up.
fig, axes = plt.subplots(4, 4)
for ax, pixels, label in zip(axes.ravel(), digits.data, digits.target):
    ax.imshow(pixels.reshape(8, 8), cmap="gray_r")
    ax.set_title(label)
    # Tick marks carry no information for an image thumbnail.
    ax.set_xticks(())
    ax.set_yticks(())
plt.tight_layout()

In [None]:
from sklearn.model_selection import train_test_split
# Hold out part of the digits for evaluation. Fixing random_state makes the
# shuffle, and therefore every score computed from this split, repeatable
# across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(digits.data,
                                                    digits.target,
                                                    random_state=0)

Really Simple API
-------------------
0) Import your model class

In [None]:
from sklearn.svm import LinearSVC

1) Instantiate an object and set the parameters

In [None]:
# Build the estimator with default parameters; fitting is a separate step.
svm = LinearSVC()

2) Fit the model

In [None]:
# Learn from the training split only; the test split is kept for evaluation.
svm.fit(X_train, y_train)

3) Apply / evaluate

In [None]:
# Predictions for the training rows, printed above the true labels so the
# two arrays can be compared by eye.
train_predictions = svm.predict(X_train)
print(train_predictions)
print(y_train)

In [None]:
# Score on the data the model was fitted on (mean accuracy for classifiers,
# per the scikit-learn docs). Optimistic; compare with the test-set score.
svm.score(X_train, y_train)

In [None]:
# Score on the held-out test split, which was not used for fitting.
svm.score(X_test, y_test)

And again
---------

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# 50 trees. random_state pins the randomness used while building the forest
# (bootstrap samples, feature subsets) so the fitted model and its score are
# the same on every run.
rf = RandomForestClassifier(n_estimators=50, random_state=0)

In [None]:
# Same fit call as for the SVM; only the estimator object changed.
rf.fit(X_train, y_train)

In [None]:
# Held-out score, computed on the same X_test/y_test used for the LinearSVC.
rf.score(X_test, y_test)

# Exercise
Load the iris dataset from seaborn using

```python
import seaborn as sns
iris = sns.load_dataset("iris")
```
Visualize the dataset. Extract the features (independent variables) and the target (dependent variable).
Split it into training and test set using ``train_test_split``.


Then train and evaluate a classifier of your choice. Try ``sklearn.neighbors.KNeighborsClassifier`` or ``sklearn.ensemble.RandomForestClassifier``, for example.


In [None]:
# your solution

# Dummy encoding of categorical variables

In [None]:
import pandas as pd
# Toy table: one numeric column and one categorical (string) column.
# Borough names are spelled correctly so that any dummy columns derived
# from them (e.g. via pd.get_dummies) get correct names too.
df = pd.DataFrame({'salary': [103, 89, 142, 54, 63, 219],
                   'boro': ['Manhattan', 'Queens', 'Manhattan', 'Brooklyn', 'Brooklyn', 'Bronx']})
df

In [None]:
# Dummy (one-hot) encode the string column 'boro'.
# NOTE(review): the numeric 'salary' column is expected to pass through
# unchanged; confirm in the displayed output.
pd.get_dummies(df)

In [None]:
# Same table as before, but with the borough stored as integer codes
# instead of strings. Rebinds `df`.
salaries = [103, 89, 142, 54, 63, 219]
boro_codes = [0, 1, 0, 2, 2, 3]
df = pd.DataFrame({'salary': salaries, 'boro': boro_codes})
df

In [None]:
# NOTE: 'boro' now holds integers. Per the pandas docs, get_dummies with
# default arguments only converts object/string/category columns, so the
# integer-coded 'boro' is presumably left as-is here (verify in the output).
# Passing columns=['boro'] would force the encoding.
pd.get_dummies(df)

# Exercise
Load the California housing data from data/housing.csv and apply dummy encoding.

In [None]:
# solution

# Scaling data

In [None]:
# seaborn.apionly was a deprecated entry point (seaborn 0.8) and is no longer
# available in later releases; the plain package import is the supported form.
import seaborn as sns
# NOTE(review): load_dataset may need to download the data on first use.
iris = sns.load_dataset("iris")
iris.head()

In [None]:
# Target: the species column. Features: every column except the last one.
# A positional slice is used instead of iris.pop("species") because pop
# would modify `iris` itself.
y = iris["species"]
X = iris.iloc[:, :-1]
X.shape

In [None]:
# Reproducible split (random_state is fixed).
# NOTE: this rebinds X_train/X_test/y_train/y_test, which previously held
# the digits split.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

In [None]:
from sklearn.preprocessing import StandardScaler
# Learn the scaling parameters from the training split only, then apply
# them to that same split.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)

In [None]:
# Inspect the first ten scaled training rows.
X_train_scaled[:10]

In [None]:
# Reuse the scaler that was fitted on the training split; it is not refitted
# on the test data, so both splits go through the same transformation.
X_test_scaled = scaler.transform(X_test)

In [None]:
# Fresh LinearSVC trained on the scaled iris features.
# NOTE: rebinds `svm`, which earlier held the model fitted on the digits.
svm = LinearSVC()
svm.fit(X_train_scaled, y_train)

In [None]:
# Predicted species for the scaled test rows.
svm.predict(X_test_scaled)

In [None]:
# Score on the held-out split; the test features must be scaled with the
# same scaler that produced the training features.
svm.score(X_test_scaled, y_test)

# Exercise
- Load the California housing data and drop columns with missing values
- Separate features and target in the California housing dataset (with dummy encoding)
- use train_test_split to split it into training and test data
- use the StandardScaler to scale training and test data
- Fit the sklearn.linear_model.Ridge model (ridge regression, a linear regression model) and evaluate it on the test data.

Note: the score method computes the R^2 for regression problems

In [None]:
# solution here

In [None]:
# Ridge is not imported in any earlier visible cell, so bring it into scope
# here; without this import the cell fails with a NameError.
from sklearn.linear_model import Ridge

# NOTE: X and y must be the housing features/target produced by the exercise
# solution cell (X as a DataFrame, since its column names are used later).
# random_state fixes the split so the printed scores are reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Fit the scaler on the training split only, then apply it to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

ridge = Ridge()
ridge.fit(X_train_scaled, y_train)

# score() reports R^2 for regression models: test split first, then training.
print(ridge.score(X_test_scaled, y_test))
print(ridge.score(X_train_scaled, y_train))

# Inspecting the ridge model

In [None]:
# Feature names in column order; used to label the coefficients when plotting.
X_train.columns

In [None]:
# Coefficients of the fitted ridge model.
# NOTE(review): presumably one per column of X_train, since they are plotted
# against X_train.columns; confirm.
ridge.coef_

In [None]:
# Horizontal bar chart of the ridge coefficients, one bar per coefficient,
# with the training-frame column names as tick labels.
bar_positions = range(len(ridge.coef_))
plt.figure()
plt.barh(bar_positions, ridge.coef_)
plt.yticks(bar_positions, X_train.columns);