In [None]:
# Execute the local helper script so that the names it defines become available here.
# NOTE(review): later cells use `plt`, `plot_digits_sample` and `plot_confusion_matrix`
# without importing them in this notebook — presumably helper_functions.py provides
# them; confirm.
%run helper_functions.py
import pandas as pd
import seaborn as sns
import numpy as np
import sklearn
# Use seaborn's white-grid theme for every plot below
sns.set_style("whitegrid")
# Print small floats in fixed-point notation instead of scientific notation
np.set_printoptions(suppress=True)

# Example 1-D with random data

In [None]:
# Synthetic 1-D regression problem: y = intercept + coefficient * x + uniform noise
num_random_points = 300   # number of samples to draw
intercept = 20            # intercept of the generating line
coefficient = 0.9         # slope of the generating line
random_spread = 20        # total width of the uniform noise band added to y

# Seeded generator, so the same points are produced on every run
rng = np.random.RandomState(1)

# Features: uniform draws scaled to [0, 100), sorted ascending, one column
X = np.sort(100 * rng.rand(num_random_points, 1), axis=0)

# Targets: points on the line plus noise centred on zero
y = (intercept + coefficient * X) + random_spread * (0.5 - rng.rand(num_random_points, 1))

# Optional: uncomment the two lines below to turn the last sample into an outlier
#y[-1] = -200
#X[-1] = 200

# Scatter plot of the generated samples
plt.figure()
plt.scatter(X, y, c="orange", label="data")
plt.title("Random Data")
plt.xlabel("data")
plt.ylabel("target")
plt.legend()
plt.show()



# Linear regression
http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html

In [None]:
from sklearn.linear_model import LinearRegression

# Fit a linear regression model on the synthetic data (fit returns the estimator itself)
regr = LinearRegression().fit(X, y)

# Report the learned weights and intercept
print('Coefficients: \n', regr.coef_)
print('Intercept   : \n', regr.intercept_)

### Predicting on new data

In [None]:
# New inputs: 500 values drawn uniformly from [0, 100), then predicted with the fitted model
X_test = 100 * rng.rand(500, 1)
y_test_pred = regr.predict(X_test)

# Overlay the predictions (crimson) on the training samples (orange)
plt.figure()
plt.scatter(X, y, c="orange", label="data")
plt.scatter(
    X_test,
    y_test_pred,
    color="crimson",
    label="Linear regression",
    linewidth=1,
)
plt.title("Linear regression")
plt.xlabel("data")
plt.ylabel("target")
plt.legend()
# Restrict both axes to the 0-100 window
plt.xlim(0, 100)
plt.ylim(0, 100)
plt.show()

### Metrics

In [None]:
from sklearn.metrics import mean_squared_error

# Predictions for the very samples the model was trained on
y_pred = regr.predict(X)

# Training error: mean of the squared residuals
print(
    "Mean squared error on training set: %.2f"
    % mean_squared_error(y, y_pred)
)

# Score reported by the estimator itself, printed as the R squared value
print(
    'R squared score: %.2f'
    % regr.score(X, y)
)

# Example with real data 
- Loading the Boston house price dataset
- This dataset has more than one feature (multiple linear regression)


In [None]:
# Load the Boston house-price dataset through scikit-learn's dataset loaders.
# NOTE(review): `load_boston` was deprecated in scikit-learn 1.0 and removed in 1.2,
# so this cell presumably requires scikit-learn < 1.2 — confirm against the
# installed version before running.
from sklearn import datasets
dataset = datasets.load_boston()


In [None]:
from sklearn.utils import shuffle

# Feature matrix and target vector exposed by the loaded dataset
X = dataset.data
y = dataset.target

# The data are not shuffled as loaded, so permute X and y together (rows stay aligned).
# A fixed random_state makes the permutation, and therefore every cross-validation
# score computed from it further down, reproducible across kernel restarts.
X, y = shuffle(X, y, random_state=0)

In [None]:
# Preview the first three samples as a table labelled with the dataset's feature names
pd.DataFrame(data=X, columns=dataset.feature_names).head(n=3)

In [None]:
# Replace X with its normalised version.
# NOTE(review): scikit-learn's Normalizer rescales each sample (row) to unit norm,
# not each feature column — confirm this is the intended preprocessing
# (per-feature scaling would be StandardScaler / MinMaxScaler).
from sklearn.preprocessing import Normalizer
X  = Normalizer().fit_transform(X)

## Cross-validation

In [None]:
from sklearn.model_selection import cross_val_predict
# NOTE: the wildcard import is kept on purpose — the classification cells further
# down use names it brings into scope (classification_report, accuracy_score).
from sklearn.metrics import *
from sklearn.metrics import mean_squared_error, r2_score

# 10-fold cross-validated predictions for a plain linear regression
regr = LinearRegression()
y_pred = cross_val_predict(regr, X, y, cv=10)

# The line is labelled "R2", so report r2_score. explained_variance_score (used
# previously) ignores a constant offset in the residuals and is a different quantity.
print("R2  (cross-val) : %.2f" % r2_score(y, y_pred))
print("MSE (cross-val) : %.2f" % mean_squared_error(y, y_pred))

In [None]:
# Predicted value (x axis) against real value (y axis), drawn as translucent red dots
plt.plot(y_pred, y, 'ro', alpha=0.3)

# Green reference diagonal from the origin up to the largest real target
plt.plot(
    [0, np.max(y)],
    [0, np.max(y)],
    'g-',
    alpha=0.6,
)

plt.ylabel('Real')
plt.xlabel('Predicted')
plt.show()

In [None]:
# Fit on the full data set (fit returns the estimator) and print the weight vector w
print('Coefficients: \n', regr.fit(X, y).coef_, "\n")

In [None]:
# Write the fitted model on one line: the intercept first, then one signed
# "coefficient*feature" term per feature name.
print("\n  F(X) =  %2.3f " % regr.intercept_, end='')
for position, feature in enumerate(dataset.feature_names):
    print("%+2.2f*%s " % (regr.coef_[position], feature), end='')


## Ridge regression
http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Ridge.html

In [None]:
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error, r2_score

# Ridge regression with a very small regularisation strength (alpha)
regr = Ridge(alpha=0.00001)
y_pred = cross_val_predict(regr, X, y, cv=10)

# The line is labelled "R2", so report r2_score rather than explained_variance_score,
# which is a different quantity whenever the residuals have a non-zero mean.
print("R2  (cross-val) : %.2f" % r2_score(y, y_pred))
print("MSE (cross-val) : %.2f" % mean_squared_error(y, y_pred))

# Fit on the full data set so the model coefficients w can be inspected
regr.fit(X, y)
print('Coefficients: \n', regr.coef_, "\n")

## Lasso regression
http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Lasso.html#sklearn.linear_model.Lasso

In [None]:
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error, r2_score

# Lasso regression with a small regularisation strength (alpha)
regr = Lasso(alpha=0.001)
y_pred = cross_val_predict(regr, X, y, cv=10)

# The line is labelled "R2", so report r2_score rather than explained_variance_score,
# which is a different quantity whenever the residuals have a non-zero mean.
print("R2  (cross-val) : %.2f" % r2_score(y, y_pred))
print("MSE (cross-val) : %.2f" % mean_squared_error(y, y_pred))

# Fit on the full data set so the model coefficients w can be inspected
regr.fit(X, y)
print('Coefficients: \n', regr.coef_, "\n")

# Logistic regression
http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html#sklearn.linear_model.LogisticRegression


In [None]:
from sklearn.datasets import load_digits
from sklearn.utils import shuffle

# Load the digits dataset and pull out its feature matrix and label vector
digits = load_digits()
X = digits.data
y = digits.target

# Permute samples and labels together. The fixed random_state keeps the sample shown
# at a given index, and the cross-validation results below, identical on every run.
X, y = shuffle(X, y, random_state=0)

In [None]:
# Tabular view of the first ten samples, one row per sample
pd.DataFrame(X[:10])

In [None]:
# Index of the sample to display
image = 301

# Reshape the sample's row into an 8x8 grid and render it as an image
plt.imshow(X[image].reshape(8, 8))
plt.show()

# Label stored for that same sample
print("label = ", y[image])

In [None]:
# Pass the digit samples and their labels to the plot_digits_sample helper.
# NOTE(review): plot_digits_sample is not defined in this notebook — presumably it
# comes from helper_functions.py; confirm.
plot_digits_sample(X,y)

# Grid search

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

# Candidate values for the C parameter of LogisticRegression
params = {'C': [0.001, 0.01, 0.1, 1]}
model = LogisticRegression()

# Try every candidate in `params`, evaluating each with 10-fold cross-validation
grid = GridSearchCV(model, params, cv=10)
grid.fit(X, y)

# Summary of the search: the winning estimator and its score
print("Best parameters: ", grid.best_estimator_)
print("Best score: ", grid.best_score_)


In [None]:
# Import every metric this cell uses explicitly. classification_report and
# accuracy_score were previously only in scope through a wildcard import in the
# regression section, so this cell failed when that cell had not been run.
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import cross_val_predict

# 10-fold cross-validated predictions from the best estimator found by the grid search
y_pred = cross_val_predict(grid.best_estimator_, X, y, cv=10)

# Per-class report and overall accuracy
print(classification_report(y, y_pred))
print("Accuracy: ", accuracy_score(y, y_pred))

# Confusion matrix over the ten digit labels 0..9, drawn with the plotting helper
cm = confusion_matrix(y_true=y, y_pred=y_pred, labels=range(0, 10))
plot_confusion_matrix(cm, range(0, 10))
