# Linear models for classification

In [17]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.svm import LinearSVC
from sklearn.svm import SVC

sns.set_style('darkgrid')

%matplotlib inline

## Logistic Regression

In [31]:
from sklearn.datasets import load_breast_cancer
data = load_breast_cancer()
feature_names = data.feature_names
bc_df = pd.DataFrame(data.data, columns = feature_names)
target = pd.Series(data.target)
print('Class distribution:')
print(target.value_counts())
print("Dataset size:")
print(bc_df.shape)
dev_X, test_X, dev_y, test_y = train_test_split(bc_df, target,
                                                 test_size=0.2, random_state=42)
preprocess = make_column_transformer((StandardScaler(), feature_names))
pipe = make_pipeline(preprocess, LogisticRegression(C=1e6))
scores = cross_val_score(pipe, dev_X, dev_y, cv=10, error_score="raise")
print(scores)
print(np.mean(scores))

Class distribution:
1    357
0    212
dtype: int64
Dataset size:
(569, 30)
[0.97826087 0.95652174 0.95652174 0.91304348 1.         1.
 0.95555556 0.97777778 0.93333333 0.95555556]
0.962657004830918


## Lasso Logistic Regression 

In [35]:
from sklearn.datasets import load_breast_cancer
data = load_breast_cancer()
feature_names = data.feature_names
bc_df = pd.DataFrame(data.data, columns = feature_names)
target = pd.Series(data.target)
print('Class distribution:')
print(target.value_counts())
print("Dataset size:")
print(bc_df.shape)
dev_X, test_X, dev_y, test_y = train_test_split(bc_df, target,
                                                 test_size=0.2, random_state=42)
preprocess = make_column_transformer((StandardScaler(), feature_names))
pipe = make_pipeline(preprocess, LogisticRegression('l1', C=0.1, solver= 'liblinear'))
scores = cross_val_score(pipe, dev_X, dev_y, cv=10, error_score="raise")
print(scores)
print(np.mean(scores))

Class distribution:
1    357
0    212
dtype: int64
Dataset size:
(569, 30)
[1.         0.97826087 0.97826087 0.93478261 0.97826087 0.97777778
 0.97777778 0.95555556 0.95555556 0.95555556]
0.9691787439613526


## Lasso Logistic Regression - hyperparameter tuning

In [40]:
from sklearn.datasets import load_breast_cancer
data = load_breast_cancer()
feature_names = data.feature_names
bc_df = pd.DataFrame(data.data, columns = feature_names)
target = pd.Series(data.target)
print('Class distribution:')
print(target.value_counts())
print("Dataset size:")
print(bc_df.shape)
dev_X, test_X, dev_y, test_y = train_test_split(bc_df, target,
                                                 test_size=0.2, random_state=42)
preprocess = make_column_transformer((StandardScaler(), feature_names))

pipe = make_pipeline(preprocess,
                     GridSearchCV(LogisticRegression(),
                                  param_grid = [{"penalty":['l1'],
                                                 "solver":["liblinear"],
                                                 "C":np.logspace(-3, 3, 10)}],
                                  return_train_score=True))

pipe.fit(dev_X, dev_y)
grid_search_results = pipe.named_steps["gridsearchcv"]
print(f'Best score:', grid_search_results.best_score_)
print(f'Best alpha:', grid_search_results.best_params_)    
print(f"Test score:", pipe.score(test_X, test_y))

Class distribution:
1    357
0    212
dtype: int64
Dataset size:
(569, 30)
Best score: 0.9758241758241759
Best alpha: {'C': 0.46415888336127775, 'penalty': 'l1', 'solver': 'liblinear'}
Test score: 0.9736842105263158


## Ridge Logistic Regression

In [2]:
from sklearn.datasets import load_breast_cancer
data = load_breast_cancer()
feature_names = data.feature_names
bc_df = pd.DataFrame(data.data, columns = feature_names)
target = pd.Series(data.target)
print('Class distribution:')
print(target.value_counts())
print("Dataset size:")
print(bc_df.shape)
dev_X, test_X, dev_y, test_y = train_test_split(bc_df, target,
                                                 test_size=0.2, random_state=42)
preprocess = make_column_transformer((StandardScaler(), feature_names))
pipe = make_pipeline(preprocess, LogisticRegression('l2', C=0.1, solver= 'liblinear'))
scores = cross_val_score(pipe, dev_X, dev_y, cv=10, error_score="raise")
print(scores)
print(np.mean(scores))

Class distribution:
1    357
0    212
dtype: int64
Dataset size:
(569, 30)
[1.         0.97826087 0.97826087 0.95652174 0.97826087 1.
 0.97777778 0.97777778 0.97777778 0.97777778]
0.9802415458937197


## Ridge Logistic Regression - hyperparameter tuning

In [3]:
from sklearn.datasets import load_breast_cancer
data = load_breast_cancer()
feature_names = data.feature_names
bc_df = pd.DataFrame(data.data, columns = feature_names)
target = pd.Series(data.target)
print('Class distribution:')
print(target.value_counts())
print("Dataset size:")
print(bc_df.shape)
dev_X, test_X, dev_y, test_y = train_test_split(bc_df, target,
                                                 test_size=0.2, random_state=42)
preprocess = make_column_transformer((StandardScaler(), feature_names))

pipe = make_pipeline(preprocess,
                     GridSearchCV(LogisticRegression(),
                                  param_grid = [{"penalty":['l2'],
                                                 "solver":["liblinear"],
                                                 "C":np.logspace(-3, 3, 10)}],
                                  return_train_score=True))

pipe.fit(dev_X, dev_y)
grid_search_results = pipe.named_steps["gridsearchcv"]
print(f'Best score:', grid_search_results.best_score_)
print(f'Best alpha:', grid_search_results.best_params_)    
print(f"Test score:", pipe.score(test_X, test_y))

Class distribution:
1    357
0    212
dtype: int64
Dataset size:
(569, 30)
Best score: 0.9780219780219781
Best alpha: {'C': 0.1, 'penalty': 'l2', 'solver': 'liblinear'}
Test score: 0.9912280701754386


## Elastic Net Logistic Regression

In [5]:
from sklearn.datasets import load_breast_cancer
data = load_breast_cancer()
feature_names = data.feature_names
bc_df = pd.DataFrame(data.data, columns = feature_names)
target = pd.Series(data.target)
print('Class distribution:')
print(target.value_counts())
print("Dataset size:")
print(bc_df.shape)
dev_X, test_X, dev_y, test_y = train_test_split(bc_df, target,
                                                 test_size=0.2, random_state=42)
preprocess = make_column_transformer((StandardScaler(), feature_names))
pipe = make_pipeline(preprocess, LogisticRegression('elasticnet', C=0.1, solver= 'saga', l1_ratio=0.5))
scores = cross_val_score(pipe, dev_X, dev_y, cv=10, error_score="raise")
print(scores)
print(np.mean(scores))

Class distribution:
1    357
0    212
dtype: int64
Dataset size:
(569, 30)
[1.         0.97826087 0.97826087 0.95652174 0.97826087 1.
 0.97777778 0.97777778 0.97777778 0.91111111]
0.9735748792270531


## Elastic Net Logistic Regression - hyperparameter tuning

In [12]:
from sklearn.datasets import load_breast_cancer
data = load_breast_cancer()
feature_names = data.feature_names
bc_df = pd.DataFrame(data.data, columns = feature_names)
target = pd.Series(data.target)
print('Class distribution:')
print(target.value_counts())
print("Dataset size:")
print(bc_df.shape)
dev_X, test_X, dev_y, test_y = train_test_split(bc_df, target,
                                                 test_size=0.2, random_state=42)
preprocess = make_column_transformer((StandardScaler(), feature_names))

pipe = make_pipeline(preprocess,
                     GridSearchCV(LogisticRegression(),
                                  param_grid = [{"penalty":['elasticnet'],
                                                 "solver":["saga"],
                                                 "C":np.logspace(-3, 3, 10),
                                                "l1_ratio": np.linspace(0, 1, 10)}],
                                  return_train_score=True))

pipe.fit(dev_X, dev_y)
grid_search_results = pipe.named_steps["gridsearchcv"]
print(f'Best score:', grid_search_results.best_score_)
print(f'Best alpha:', grid_search_results.best_params_)    
print(f"Test score:", pipe.score(test_X, test_y))

Class distribution:
1    357
0    212
dtype: int64
Dataset size:
(569, 30)
Best score: 0.9780219780219781
Best alpha: {'C': 2.154434690031882, 'l1_ratio': 0.1111111111111111, 'penalty': 'elasticnet', 'solver': 'saga'}
Test score: 0.9824561403508771


## Soft-Margin SVMs (primal)

In [15]:
from sklearn.datasets import load_breast_cancer
data = load_breast_cancer()
feature_names = data.feature_names
bc_df = pd.DataFrame(data.data, columns = feature_names)
target = pd.Series(data.target)
print('Class distribution:')
print(target.value_counts())
print("Dataset size:")
print(bc_df.shape)
dev_X, test_X, dev_y, test_y = train_test_split(bc_df, target,
                                                 test_size=0.2, random_state=42)
preprocess = make_column_transformer((StandardScaler(), feature_names))
pipe = make_pipeline(preprocess, LinearSVC(C=0.1, loss="hinge"))
scores = cross_val_score(pipe, dev_X, dev_y, cv=10, error_score="raise")
print(scores)
print(np.mean(scores))

Class distribution:
1    357
0    212
dtype: int64
Dataset size:
(569, 30)
[0.97826087 0.97826087 0.97826087 0.95652174 0.97826087 1.
 0.97777778 0.97777778 0.95555556 1.        ]
0.9780676328502415


## Soft-Margin SVMs (dual)

In [18]:
from sklearn.datasets import load_breast_cancer
data = load_breast_cancer()
feature_names = data.feature_names
bc_df = pd.DataFrame(data.data, columns = feature_names)
target = pd.Series(data.target)
print('Class distribution:')
print(target.value_counts())
print("Dataset size:")
print(bc_df.shape)
dev_X, test_X, dev_y, test_y = train_test_split(bc_df, target,
                                                 test_size=0.2, random_state=42)
preprocess = make_column_transformer((StandardScaler(), feature_names))
pipe = make_pipeline(preprocess, SVC(C=0.1, kernel='linear'))
scores = cross_val_score(pipe, dev_X, dev_y, cv=10, error_score="raise")
print(scores)
print(np.mean(scores))

Class distribution:
1    357
0    212
dtype: int64
Dataset size:
(569, 30)
[0.97826087 0.97826087 0.97826087 0.95652174 0.97826087 1.
 0.97777778 0.97777778 0.95555556 1.        ]
0.9780676328502415


## Soft-Margin SVMs - hyperparameter tuning

In [20]:
from sklearn.datasets import load_breast_cancer
data = load_breast_cancer()
feature_names = data.feature_names
bc_df = pd.DataFrame(data.data, columns = feature_names)
target = pd.Series(data.target)
print('Class distribution:')
print(target.value_counts())
print("Dataset size:")
print(bc_df.shape)
dev_X, test_X, dev_y, test_y = train_test_split(bc_df, target,
                                                 test_size=0.2, random_state=42)
preprocess = make_column_transformer((StandardScaler(), feature_names))

pipe = make_pipeline(preprocess,
                     GridSearchCV(SVC(),
                                  param_grid = {"kernel":['linear'],
                                                 "C":np.logspace(-3, 3, 20)},                                                          
                                  return_train_score=True))

pipe.fit(dev_X, dev_y)
grid_search_results = pipe.named_steps["gridsearchcv"]
print(f'Best score:', grid_search_results.best_score_)
print(f'Best alpha:', grid_search_results.best_params_)    
print(f"Test score:", pipe.score(test_X, test_y))

Class distribution:
1    357
0    212
dtype: int64
Dataset size:
(569, 30)
Best score: 0.9758241758241759
Best alpha: {'C': 0.0379269019073225, 'kernel': 'linear'}
Test score: 0.9824561403508771


In [21]:
grid_search_results.best_estimator_.dual_coef_

array([[-0.0379269 , -0.0379269 , -0.0379269 , -0.0379269 , -0.0379269 ,
        -0.03177964, -0.0379269 , -0.0379269 , -0.0379269 , -0.0379269 ,
        -0.0379269 , -0.0379269 , -0.03527029, -0.0379269 , -0.0379269 ,
        -0.0379269 , -0.0379269 , -0.0379269 , -0.0379269 , -0.0379269 ,
        -0.0379269 , -0.0379269 , -0.0379269 , -0.0379269 , -0.0379269 ,
        -0.0379269 , -0.0379269 , -0.00737361, -0.0379269 , -0.0379269 ,
        -0.0379269 , -0.0379269 , -0.0379269 , -0.0379269 ,  0.01497148,
         0.00997104,  0.0379269 ,  0.0379269 ,  0.0379269 ,  0.02522742,
         0.0379269 ,  0.0379269 ,  0.0379269 ,  0.02246217,  0.0379269 ,
         0.0379269 ,  0.0379269 ,  0.0379269 ,  0.0379269 ,  0.0379269 ,
         0.0379269 ,  0.0379269 ,  0.00028438,  0.0379269 ,  0.0379269 ,
         0.0379269 ,  0.0379269 ,  0.0379269 ,  0.0379269 ,  0.0379269 ,
         0.0379269 ,  0.03464645,  0.0379269 ,  0.0379269 ,  0.0379269 ,
         0.0379269 ,  0.02857075,  0.0379269 ,  0.0

In [22]:
grid_search_results.best_estimator_.support_vectors_

array([[ 0.32910211,  0.75680107,  0.28967589, ...,  0.12899967,
         0.42798088,  0.77697663],
       [ 0.27813844,  0.66528001,  0.22134988, ...,  0.59486965,
        -0.36464082, -0.28818525],
       [ 0.28663239,  2.49335462,  0.19871174, ..., -0.74220784,
         0.51992499, -1.24621394],
       ...,
       [ 0.24699397,  0.68640025,  0.23246315, ...,  0.25466197,
         0.40895795,  0.48474423],
       [ 1.05675009, -1.397464  ,  0.93506863, ..., -0.47157416,
        -1.77867793, -1.41112052],
       [-0.27679932,  0.36490318, -0.24293775, ...,  0.06003866,
        -0.54218808, -0.11935232]])

In [24]:
grid_search_results.best_estimator_.support_vectors_.shape

(71, 30)

## Hard-margin SVMs

In [25]:
from sklearn.datasets import load_breast_cancer
data = load_breast_cancer()
feature_names = data.feature_names
bc_df = pd.DataFrame(data.data, columns = feature_names)
target = pd.Series(data.target)
print('Class distribution:')
print(target.value_counts())
print("Dataset size:")
print(bc_df.shape)
dev_X, test_X, dev_y, test_y = train_test_split(bc_df, target,
                                                 test_size=0.2, random_state=42)
preprocess = make_column_transformer((StandardScaler(), feature_names))
pipe = make_pipeline(preprocess, SVC(C=1e8, kernel='linear'))
scores = cross_val_score(pipe, dev_X, dev_y, cv=10, error_score="raise")
print(scores)
print(np.mean(scores))

Class distribution:
1    357
0    212
dtype: int64
Dataset size:
(569, 30)
[0.97826087 0.95652174 0.93478261 0.93478261 1.         1.
 0.95555556 0.93333333 0.91111111 0.95555556]
0.9559903381642514


## Kernel SVMs

In [26]:
from sklearn.datasets import load_breast_cancer
data = load_breast_cancer()
feature_names = data.feature_names
bc_df = pd.DataFrame(data.data, columns = feature_names)
target = pd.Series(data.target)
print('Class distribution:')
print(target.value_counts())
print("Dataset size:")
print(bc_df.shape)
dev_X, test_X, dev_y, test_y = train_test_split(bc_df, target,
                                                 test_size=0.2, random_state=42)
preprocess = make_column_transformer((StandardScaler(), feature_names))
pipe = make_pipeline(preprocess, SVC(C=1.0, kernel= 'rbf', gamma=1.0))
scores = cross_val_score(pipe, dev_X, dev_y, cv=10, error_score="raise")
print(scores)
print(np.mean(scores))

Class distribution:
1    357
0    212
dtype: int64
Dataset size:
(569, 30)
[0.63043478 0.63043478 0.63043478 0.63043478 0.63043478 0.64444444
 0.62222222 0.62222222 0.62222222 0.62222222]
0.6285507246376811


In [27]:
from sklearn.datasets import load_breast_cancer
data = load_breast_cancer()
feature_names = data.feature_names
bc_df = pd.DataFrame(data.data, columns = feature_names)
target = pd.Series(data.target)
print('Class distribution:')
print(target.value_counts())
print("Dataset size:")
print(bc_df.shape)
dev_X, test_X, dev_y, test_y = train_test_split(bc_df, target,
                                                 test_size=0.2, random_state=42)
preprocess = make_column_transformer((StandardScaler(), feature_names))
pipe = make_pipeline(preprocess, SVC(C=1.0, kernel= 'rbf', gamma=0.1))
scores = cross_val_score(pipe, dev_X, dev_y, cv=10, error_score="raise")
print(scores)
print(np.mean(scores))

Class distribution:
1    357
0    212
dtype: int64
Dataset size:
(569, 30)
[0.93478261 0.97826087 0.97826087 0.95652174 0.97826087 0.97777778
 0.97777778 0.95555556 0.97777778 0.86666667]
0.9581642512077295


## Kernel SVMs - hyperparameter tuning

In [28]:
from sklearn.datasets import load_breast_cancer
data = load_breast_cancer()
feature_names = data.feature_names
bc_df = pd.DataFrame(data.data, columns = feature_names)
target = pd.Series(data.target)
print('Class distribution:')
print(target.value_counts())
print("Dataset size:")
print(bc_df.shape)
dev_X, test_X, dev_y, test_y = train_test_split(bc_df, target,
                                                 test_size=0.2, random_state=42)
preprocess = make_column_transformer((StandardScaler(), feature_names))

pipe = make_pipeline(preprocess,
                     GridSearchCV(SVC(),
                                  param_grid = {"kernel":['rbf'],
                                                 "C":np.logspace(-3, 3, 20),
                                               "gamma":np.logspace(-3, 1, 10)},                                                          
                                  return_train_score=True))

pipe.fit(dev_X, dev_y)
grid_search_results = pipe.named_steps["gridsearchcv"]
print(f'Best score:', grid_search_results.best_score_)
print(f'Best alpha:', grid_search_results.best_params_)    
print(f"Test score:", pipe.score(test_X, test_y))

Class distribution:
1    357
0    212
dtype: int64
Dataset size:
(569, 30)
Best score: 0.9780219780219781
Best alpha: {'C': 2.976351441631316, 'gamma': 0.007742636826811269, 'kernel': 'rbf'}
Test score: 0.9824561403508771


## Multiclass classification 

In [32]:
def quality_to_label(x):
    if x <= 4:
        return 'poor'
    elif x > 4 and x <=6:
        return 'good'
    elif x > 6 and x <=8:
        return 'better'
    else:
        return 'best'

data_path = "Data/winequality-red.csv"
wine_df = pd.read_csv(data_path)
wine_df["label"] = wine_df ["quality" ].apply (lambda x: quality_to_label(x))
print (wine_df.shape)
print (wine_df.columns)
wine_df.label.value_counts()

(1599, 13)
Index(['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
       'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
       'pH', 'sulphates', 'alcohol', 'quality', 'label'],
      dtype='object')


good      1319
better     217
poor        63
Name: label, dtype: int64

In [37]:
feature_names = ['fixed acidity',
                 'volatile acidity',
                 'citric acid',
                 'residual sugar', 
                 'chlorides',
                 'free sulfur dioxide',
                 'total sulfur dioxide',
                 'density',
                 'pH',
                 'sulphates',
                 'alcohol',
]
wine_feature_df = wine_df[feature_names]
wine_target = wine_df["label"]
print("Dataset size:")
print(wine_feature_df.shape)

Dataset size:
(1599, 11)


## OVR v.s. OVO

In [39]:
dev_X, test_X, dev_y, test_y = train_test_split(wine_feature_df, wine_target,
                                                test_size=0.2, random_state=42)
preprocess = make_column_transformer((StandardScaler(), feature_names))
pipe = make_pipeline(preprocess, SVC(C=0.1, kernel='linear', decision_function_shape='ovr'))
scores = cross_val_score(pipe, dev_X, dev_y, cv=10, error_score="raise")
print(scores)
print(np.mean(scores))

[0.828125   0.828125   0.828125   0.828125   0.828125   0.828125
 0.828125   0.8203125  0.8203125  0.82677165]
0.8264271653543307


In [40]:
dev_X, test_X, dev_y, test_y = train_test_split(wine_feature_df, wine_target,
                                                test_size=0.2, random_state=42)
preprocess = make_column_transformer((StandardScaler(), feature_names))
pipe = make_pipeline(preprocess, SVC(C=0.1, kernel='linear', decision_function_shape='ovo'))
scores = cross_val_score(pipe, dev_X, dev_y, cv=10, error_score="raise")
print(scores)
print(np.mean(scores))

[0.828125   0.828125   0.828125   0.828125   0.828125   0.828125
 0.828125   0.8203125  0.8203125  0.82677165]
0.8264271653543307


## Multinomial Logistic Regression

In [41]:
dev_X, test_X, dev_y, test_y = train_test_split(wine_feature_df, wine_target,
                                                test_size=0.2, random_state=42)
preprocess = make_column_transformer((StandardScaler(), feature_names))
pipe = make_pipeline(preprocess, LogisticRegression('l2', C=1.0, multi_class='multinomial'))
scores = cross_val_score(pipe, dev_X, dev_y, cv=10, error_score="raise")
print(scores)
print(np.mean(scores))

[0.8515625  0.8359375  0.8515625  0.875      0.8515625  0.8046875
 0.8515625  0.8359375  0.84375    0.85826772]
0.8459830216535433
