In [None]:
#Overview on Linear Regression

#Overview on the optimization algorithms

#Cost function, objective function, gradient descent

#Understanding of error

#Bias-variance tradeoff


In [None]:
Linear Regression Characteristics
> Low model complexity
> High bias, low variance
> Does not tend to overfit

In [None]:
Regularization
> It is a method for "constraining" or "regularizing" the size of the coefficients, thus "shrinking" them towards zero.
> It reduces model variance and thus minimizes overfitting.
> If the model is too complex (i.e. it overfits), regularization tends to reduce variance more than it increases bias,
resulting in a model that is more likely to generalize.

In [None]:
Ridge regression

L2 regularization > penalty proportional to the sum of the squared coefficients: $\alpha \sum_j \beta_j^2$

Lasso regression

L1 regularization > penalty proportional to the sum of the absolute values of the coefficients: $\alpha \sum_j |\beta_j|$

In [None]:
A larger alpha (towards the left of each diagram) results in more regularization:


In [None]:
Lasso regression shrinks coefficients all the way to zero, thus removing them from the model

Ridge regression shrinks coefficients toward zero, but they rarely reach zero


In [None]:
> Features should be standardised, because the penalty acts on coefficient size and coefficient size depends on each feature's scale

In [None]:
# load the communities data set from the UCI repository URL below;
# header=None gives the columns integer labels, and '?' entries are read as NaN
import pandas as pd
url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/communities/communities.data'
crime = pd.read_csv(url, na_values=['?'], header=None)
crime.head()

In [None]:
# summary statistics of the response variable (column label 127)
crime.loc[:, 127].describe()

In [None]:
# remove categorical features (column labels 0-4)
# Reassign rather than mutate in place, and ignore labels that are already gone,
# so that re-running this cell does not raise KeyError on the second execution.
crime = crime.drop([0, 1, 2, 3, 4], axis=1, errors='ignore')

In [None]:
# keep only fully observed rows: a single NaN anywhere in a row discards it
crime = crime.dropna(axis=0, how='any')

In [None]:
# check the shape: (rows, columns) of the `crime` frame at this point in the notebook
crime.shape

In [None]:
# column 127 is the response; every other remaining column is a feature
y = crime[127]
X = crime.drop([127], axis=1)

In [None]:
# split into training and testing sets
# train_test_split is imported from sklearn.model_selection; the former
# sklearn.cross_validation module was removed in scikit-learn 0.20.
# random_state is fixed so the split is reproducible.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [None]:
# fit an unregularized linear regression on the training split
from sklearn.linear_model import LinearRegression
linreg = LinearRegression().fit(X_train, y_train)
linreg

In [None]:
# examine the coefficients
# print() call form: valid on Python 3 and prints the same on Python 2
print(linreg.coef_)

In [None]:
# make predictions: apply the fitted linear model to the test features
y_pred = linreg.predict(X_test)

In [None]:
# calculate RMSE (square root of the mean squared error on the test set)
from sklearn import metrics
import numpy as np
# print() call form: valid on Python 3 and prints the same on Python 2
print(np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

In [None]:
#Ridge Regression
#alpha: must be non-negative (0 means no regularization), increase for more regularization
#normalize: scales the features (without using StandardScaler)

In [None]:
# alpha=0 is equivalent to linear regression
# NOTE(review): the `normalize` argument was deprecated in scikit-learn 1.0 and
# removed in 1.2; on newer releases scale the features in a pipeline instead
# (the alpha values would then need re-tuning).
from sklearn.linear_model import Ridge
ridgereg = Ridge(alpha=0, normalize=True)
ridgereg.fit(X_train, y_train)
y_pred = ridgereg.predict(X_test)
# print() call form: valid on Python 3 and prints the same on Python 2
print(np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

In [None]:
# try alpha=0.1: refit ridge with a small penalty and report test RMSE
ridgereg = Ridge(alpha=0.1, normalize=True)
ridgereg.fit(X_train, y_train)
y_pred = ridgereg.predict(X_test)
# print() call form: valid on Python 3 and prints the same on Python 2
print(np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

In [None]:
# examine the coefficients of the most recently fitted ridge model
# print() call form: valid on Python 3 and prints the same on Python 2
print(ridgereg.coef_)

In [None]:
# candidate regularization strengths: powers of ten from 10^-2 through 10^2
alpha_range = np.power(10., np.arange(-2, 3))
alpha_range

In [None]:
# select the best alpha with RidgeCV
# Scorer names follow a "greater is better" convention, so MSE is requested as
# 'neg_mean_squared_error'; the bare 'mean_squared_error' name was removed in
# scikit-learn 0.20.
from sklearn.linear_model import RidgeCV
ridgeregcv = RidgeCV(alphas=alpha_range, normalize=True, scoring='neg_mean_squared_error')
ridgeregcv.fit(X_train, y_train)
# alpha chosen by cross-validation from alpha_range
ridgeregcv.alpha_

In [None]:
# predict method uses the best alpha value
y_pred = ridgeregcv.predict(X_test)
# print() call form: valid on Python 3 and prints the same on Python 2
print(np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

In [None]:
Lasso
alpha: must be positive, increase for more regularization
normalize: scales the features (without using StandardScaler)

In [None]:
# try alpha=0.001 and examine coefficients
# NOTE(review): the `normalize` argument was deprecated in scikit-learn 1.0 and
# removed in 1.2; on newer releases scale the features in a pipeline instead
# (the alpha values would then need re-tuning).
from sklearn.linear_model import Lasso
lassoreg = Lasso(alpha=0.001, normalize=True)
lassoreg.fit(X_train, y_train)
# print() call form: valid on Python 3 and prints the same on Python 2
print(lassoreg.coef_)

In [None]:
# try alpha=0.01 and examine coefficients
# This cell was an exact copy of the alpha=0.001 cell, yet the RMSE cell that
# follows reports the result "for alpha=0.01"; use the stronger penalty here so
# the model being scored matches that label.
from sklearn.linear_model import Lasso
lassoreg = Lasso(alpha=0.01, normalize=True)
lassoreg.fit(X_train, y_train)
# print() call form: valid on Python 3 and prints the same on Python 2
print(lassoreg.coef_)

In [None]:
# calculate RMSE (for alpha=0.01)
y_pred = lassoreg.predict(X_test)
# print() call form: valid on Python 3 and prints the same on Python 2
print(np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

In [None]:
#Regularised classification

In [None]:
# read in the dataset
# header=None makes pandas treat every row as data and label the columns 0..n-1
url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data'
wine = pd.read_csv(url, header=None)
# preview the first rows
wine.head()

In [None]:
# class frequencies of the response variable (column label 0)
wine.loc[:, 0].value_counts()

In [None]:
# column 0 is the response; every other column is a feature
y = wine[0]
X = wine.drop([0], axis=1)

In [None]:
# split into training and testing sets
# train_test_split is imported from sklearn.model_selection; the former
# sklearn.cross_validation module was removed in scikit-learn 0.20.
# random_state is fixed so the split is reproducible.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [None]:
# fit a logistic regression with a very large C (C=1e9) on the training split
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression(C=1e9).fit(X_train, y_train)
logreg

In [None]:
# examine the coefficients
# print() call form: valid on Python 3 and prints the same on Python 2
print(logreg.coef_)

In [None]:
# generate predicted probabilities for the test set
y_pred_prob = logreg.predict_proba(X_test)
# print() call form: valid on Python 3 and prints the same on Python 2
print(y_pred_prob)

In [None]:
# calculate log loss of the predicted probabilities against the true test labels
# print() call form: valid on Python 3 and prints the same on Python 2
print(metrics.log_loss(y_test, y_pred_prob))

In [None]:
Logistic regression (regularized)
C: must be positive, decrease for more regularization
penalty: l1 (lasso) or l2 (ridge)


In [None]:
# standardize the features: the scaler is fitted on the training split only,
# and the same fitted transform is then applied to the test split
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# try C=0.1 with L1 penalty
# solver is pinned to 'liblinear': the default solver since scikit-learn 0.22
# ('lbfgs') rejects penalty='l1', while 'liblinear' was the previous default.
# NOTE(review): if the target has more than two classes, recent releases warn
# that liblinear's one-vs-rest handling is deprecated; 'saga' also supports l1.
logreg = LogisticRegression(C=0.1, penalty='l1', solver='liblinear')
logreg.fit(X_train_scaled, y_train)
# print() call form: valid on Python 3 and prints the same on Python 2
print(logreg.coef_)

In [None]:
# generate predicted probabilities and calculate log loss
y_pred_prob = logreg.predict_proba(X_test_scaled)
# print() call form: valid on Python 3 and prints the same on Python 2
print(metrics.log_loss(y_test, y_pred_prob))

In [None]:
# try C=0.1 with L2 penalty
# solver is pinned to 'liblinear' (the default before scikit-learn 0.22) so the
# fit does not depend on the installed version's default solver and stays
# comparable with an L1 fit, which requires a solver that supports l1.
logreg = LogisticRegression(C=0.1, penalty='l2', solver='liblinear')
logreg.fit(X_train_scaled, y_train)
# print() call form: valid on Python 3 and prints the same on Python 2
print(logreg.coef_)

In [None]:
# generate predicted probabilities and calculate log loss
y_pred_prob = logreg.predict_proba(X_test_scaled)
# print() call form: valid on Python 3 and prints the same on Python 2
print(metrics.log_loss(y_test, y_pred_prob))

In [None]:
# pipeline of StandardScaler and LogisticRegression
# Scaling inside the pipeline means it is re-fitted on each training fold when
# the pipeline is cross-validated. The solver is pinned to 'liblinear' because
# the grid searched later includes penalty='l1', which the default solver since
# scikit-learn 0.22 ('lbfgs') does not support.
from sklearn.pipeline import make_pipeline
pipe = make_pipeline(StandardScaler(), LogisticRegression(solver='liblinear'))

In [None]:
# grid search for best combination of C and penalty
from sklearn.grid_search import GridSearchCV
C_range = 10.**np.arange(-2, 3)
penalty_options = ['l1', 'l2']
param_grid = dict(logisticregression__C=C_range, logisticregression__penalty=penalty_options)
grid = GridSearchCV(pipe, param_grid, cv=10, scoring='log_loss')
grid.fit(X, y)

In [None]:
# print all log loss scores
grid.grid_scores_

In [None]:
# examine the best model: best cross-validated score and the parameters that produced it
# print() call form: valid on Python 3 and prints the same on Python 2
print(grid.best_score_)
print(grid.best_params_)