# Linear Models
In this notebook, I focus on linear models and their applications using the scikit-learn toolkit.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn import datasets
from sklearn.model_selection import train_test_split

In [None]:
## First example: logistic regression on the scikit-learn digits dataset
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

digits = datasets.load_digits()
print(type(digits.data))
print(digits.data[0])

data, target = digits.data, digits.target

# Standardize the features only; the targets are class labels and stay unscaled.
scaler = StandardScaler()
data_scaled = scaler.fit_transform(data)

X_train, X_test, y_train, y_test = train_test_split(data_scaled, target)

# Fit the classifier, then report accuracy on the training split followed by the test split.
lr = LogisticRegression().fit(X_train, y_train)
for features, labels in ((X_train, y_train), (X_test, y_test)):
    print(lr.score(features, labels))

# predict() returns the final predicted class label of each test sample ...
y_pred = lr.predict(X_test)
# ... while predict_proba() returns the per-class probability estimates behind that decision.
y_prob_pred = lr.predict_proba(X_test)

In [None]:
# Fit a support vector classifier with default settings on the split created above,
# then report accuracy on the training split followed by the test split.
from sklearn.svm import SVC

svm = SVC().fit(X_train, y_train)
for features, labels in ((X_train, y_train), (X_test, y_test)):
    print(svm.score(features, labels))

In [None]:
# Study the effect of the regularization strength on logistic regression.
# data_scaled holds the standardized features and target holds the class labels.
random_state = 23
X_1, X_test, y1, y_test = train_test_split(data_scaled, target, test_size=0.2, random_state=random_state)
# X_1 / y1 contain both the training and the validation samples. The validation set should be
# 0.2 of the original dataset, i.e. 0.25 of the remaining (train + validation) part.
X_train, X_val, y_train, y_val = train_test_split(X_1, y1, test_size=0.25, random_state=random_state)

# scikit-learn parameterizes the penalty with C = 1 / lambda, where lambda is the
# regularization strength. Draw 20 lambdas from a seeded generator so the sweep is reproducible.
# random() samples from [0, 1), so 1 - random() lies in (0, 1]: every lambda is strictly
# positive and 1 / lambda below can never divide by zero.
rng = np.random.default_rng(random_state)
lambda_reg = np.sort((1.0 - rng.random(20)) * 0.100005)  # 20 values in (0, 0.100005]
print(lambda_reg)

val_errors = []
train_errors = []
models = []
solver = 'liblinear'
for lam in lambda_reg:
    # A larger lambda means a smaller C, i.e. stronger regularization.
    log_reg = LogisticRegression(C=1 / lam, solver=solver).fit(X_train, y_train)
    models.append(log_reg)  # keep the fitted model for later use
    train_errors.append(1 - log_reg.score(X_train, y_train))  # error = 1 - accuracy
    val_errors.append(1 - log_reg.score(X_val, y_val))

# Plot both error curves against the regularization strength.
fig, ax = plt.subplots()
ax.plot(lambda_reg, train_errors, '-b', label='train error')
ax.plot(lambda_reg, val_errors, '--r', label='validation error')
ax.set_xlabel('lambda (regularization strength)')
ax.set_ylabel('error (1 - accuracy)')
ax.legend()
plt.show()

In [None]:
# Select the model with the lowest validation error and report its accuracy
# on the held-out test set.
index_min = np.argmin(val_errors)
best_model = models[index_min]
print(best_model.score(X_test, y_test))

In [None]:
# The cells above rely on the default penalty, "l2", which penalizes the sum of the squared
# coefficients. The other common penalty for linear models is "l1", which penalizes the sum
# of the absolute values of the coefficients.
# "l1" is usually associated with feature selection, while "l2" is referred to as shrinkage.

from sklearn.model_selection import GridSearchCV

# Candidate values for the inverse regularization strength C.
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10]}
lr = LogisticRegression(penalty='l1', solver=solver)

# Cross-validated search over the grid.
searcher = GridSearchCV(lr, param_grid).fit(X_train, y_train)

# Report the best parameters
print("Best CV params", searcher.best_params_)

# Count the nonzero coefficients of the best estimator (the "selected" features).
# NOTE: coef_ is a 2-D array; for a multiclass fit the counts below presumably run over
# every (class, feature) coefficient rather than over features alone - confirm when reading the output.
best_lr = searcher.best_estimator_
coefs = best_lr.coef_
print("Total number of features:", coefs.size)
print("Number of selected features:", np.count_nonzero(coefs))

In [None]:
## Multiclass classification
## There are two main techniques to extend logistic regression to multiclass problems.
## The first is one-vs-rest (OvR): one binary model is trained per class k on the problem (y == k),
## and the final prediction is the class whose model is the most confident.
## The second is the multinomial (softmax) approach, which tackles the problem directly by
## modifying the cost function.

from sklearn.multiclass import OneVsRestClassifier

# Fit one-vs-rest logistic regression classifier.
# In current scikit-learn a bare LogisticRegression() (lbfgs solver) already fits the multinomial
# model on multiclass data, so OvR has to be requested explicitly through the wrapper.
lr_ovr = OneVsRestClassifier(LogisticRegression())
lr_ovr.fit(X_train, y_train)

print("OVR training accuracy:", lr_ovr.score(X_train, y_train))
print("OVR test accuracy    :", lr_ovr.score(X_test, y_test))

# Fit softmax classifier.
# The lbfgs solver uses the multinomial loss by default on multiclass data; the `multi_class`
# argument is deprecated in recent scikit-learn releases, so it is not passed here.
lr_mn = LogisticRegression(solver="lbfgs")
lr_mn.fit(X_train, y_train)

print("Softmax training accuracy:", lr_mn.score(X_train, y_train))
print("Softmax test accuracy    :", lr_mn.score(X_test, y_test))

In [None]:
## It is important to note that logistic regression is not the ideal tool for multi-class classification.
## Assume we have a classifier log_reg: if the per-class classifier log_reg_class_k = log_reg.fit(X_train, y_train == k)
## classifies the class k poorly, then the complete model will have great difficulty classifying any example correctly.

In [None]:
# Show that a linear SVM is determined by its support vectors: a model refit on the
# support vectors alone should make the same predictions as the model fit on all samples.
from sklearn.svm import SVC

wine = datasets.load_wine()

X = wine.data
y = wine.target
X = X[:, :2]  # consider only the first two features

svm = SVC(kernel='linear')
svm.fit(X, y)

print("total number of samples {}".format(len(X)))
print("number of support vectors {}".format(len(svm.support_)))

# support_ holds the indices of the support vectors within X
print(svm.support_)
X_small = X[svm.support_, :]
y_small = y[svm.support_]

# Refit using only the support vectors of the first model.
svm_small = SVC(kernel='linear').fit(X_small, y_small)

# Draw the probe points uniformly inside the observed range of each feature so that they
# actually fall around the decision boundaries (points in [0, 1)^2 lie outside the data).
X_random = np.random.uniform(X.min(axis=0), X.max(axis=0), size=(100, 2))

# Compare the full model against the model trained on the support vectors only.
y_pred = svm.predict(X_random)
y_pred_small = svm_small.predict(X_random)
print((y_pred == y_pred_small).all())

# The support vector machine is expected to learn the same decision boundaries for the two
# datasets (up to the numerical tolerance of the solver).

In [None]:
## Non-linear SVMs are more powerful than linear ones. One way to think about the RBF SVM
## (the default setting) is that it applies a complex transformation to the data before
## fitting a linear boundary.

# Instantiate an RBF SVM
svm = SVC()

# gamma controls the complexity of the decision boundary: larger values make the model more
# sensitive to individual samples and therefore more likely to overfit.
parameters = {'gamma': [0.00001, 0.0001, 0.001, 0.01, 0.1]}

# Cross-validated search over the gamma candidates.
searcher = GridSearchCV(svm, parameters).fit(X, y)

# Report the best parameters
print("Best CV params", searcher.best_params_)
 

In [35]:
# scikit-learn offers an additional class called SGDClassifier: a linear classifier that uses
# stochastic gradient descent as its optimization algorithm, which by design scales well to
# large datasets. Both SVM and logistic regression models can be trained with SGD through it:
#   SVM with SGD                 -> SGDClassifier(loss='hinge')
#   logistic regression with SGD -> SGDClassifier(loss='log_loss')

from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import RandomizedSearchCV

linear_classifier = SGDClassifier(random_state=0)

# Search space: regularization strength, loss (model family) and penalty type.
parameters = {
    'alpha': [0.00001, 0.0001, 0.001, 0.01, 0.1, 1],
    'loss': ["hinge", "log_loss"],
    'penalty': ["l1", "l2"],
}

# Randomized (not exhaustive) search over the space, with 10-fold cross-validation.
searcher = RandomizedSearchCV(linear_classifier, parameters, cv=10).fit(X_train, y_train)

# Report the best parameters and the corresponding score
print("Best CV params", searcher.best_params_)
print("Best CV accuracy", searcher.best_score_)
print("Test accuracy of best grid search hypers:", searcher.score(X_test, y_test))

Best CV params {'penalty': 'l2', 'loss': 'log_loss', 'alpha': 1e-05}
Best CV accuracy 0.9414676358601592
Test accuracy of best grid search hypers: 0.95
