# TME4: The goal of the TME is to learn various techniques of feature selection.

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import ElasticNet
from sklearn.svm import LinearSVC
from sklearn.feature_selection import SelectFromModel
from sklearn import linear_model
from sklearn.model_selection import train_test_split

from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectFdr, chi2

In [2]:
# Golub dataset
X_golub = pd.read_csv('data/Golub_X',sep=' ', header=None).to_numpy() # Observations
y_golub = pd.read_csv('data/Golub_y',sep=' ', header=None).to_numpy()  # Classes
y_golub = np.squeeze(y_golub, axis=1)
X_train_golub, X_test_golub, y_train_golub, y_test_golub = train_test_split(X_golub, y_golub, test_size=0.2, random_state=42, shuffle=True)

In [3]:
# Breast Cancer dataset
data_breast_cancer = pd.read_table('data/BreastDiagnostic.txt',sep=',',header=None)
breast_cancer_X = data_breast_cancer.iloc[:, 2:].to_numpy()
breast_cancer_y = data_breast_cancer.iloc[:, 1].to_numpy()
X_train_cancer, X_test_cancer, y_train_cancer, y_test_cancer = train_test_split(breast_cancer_X, breast_cancer_y, test_size=0.2, random_state=42, shuffle=True)

## A class with different kinds of feature selection methods

In [4]:
class FS():
    def __init__(self, X_train, X_test, y_train, y_test):
        # dataset initialisation
        self.X_train = X_train
        self.X_test = X_test
        self.y_train = y_train
        self.y_test = y_test
        self.new_X_train = None
        self.new_X_test = None
        self.model = None
        
    def feature_selection(self, method, param=0):
        if method == None:
            print("No feature selection")
            self.new_X_train = self.X_train
            self.new_X_test = self.X_test
            
        if method == "VT":
            print(f"TV feature selection with threshold {param['VT_threshold']}")
            sel = VarianceThreshold(threshold=(param["VT_threshold"]))
            self.new_X_train = sel.fit_transform(self.X_train)
            self.new_X_test = sel.transform(self.X_test)
        if method == "univariate":
            sel = SelectFdr(chi2, alpha=param["alpha"])
            self.new_X_train = sel.fit_transform(self.X_train, self.y_train)
            self.new_X_test = sel.transform(self.X_test)

    
    def fit(self, model):
        self.model = model
        self.model.fit(self.new_X_train, self.y_train)
        print(f"Train score: {self.model.score(self.new_X_train, self.y_train)}")
    
    def test(self):
        print(f"Test score: {self.model.score(self.new_X_test, self.y_test)}")
        
    def get_original_feature_number(self):
        print(f"Original feature number: {self.X_train.shape[1]}")
    
    def get_selected_feature_number(self):
        print(f"Selected feature number: {self.new_X_train.shape[1]}")
        
        

In [5]:
golub_fs = FS(X_train_golub, X_test_golub, y_train_golub, y_test_golub)
cancer_fs = FS(X_train_cancer, X_test_cancer, y_train_cancer, y_test_cancer)

# 1. A simple heuristic approach is to delete features whose variance is less then a threshold.

## Golub dataset

In [71]:
cls = linear_model.LogisticRegression(random_state=0, penalty=None)

In [43]:
# With no threshold
golub_fs.feature_selection(method=None)
golub_fs.fit(cls)
golub_fs.test()

No feature selection
Train score: 1.0
Test score: 1.0


In [44]:
# With threshold 0.03
golub_fs.feature_selection(method="VT", param={"VT_threshold":0.03})
golub_fs.fit(cls)
golub_fs.test()

TV feature selection with threshold 0.03
Train score: 1.0
Test score: 1.0


In [45]:
# With threshold 0.05
golub_fs.feature_selection(method="VT", param={"VT_threshold":0.05})
golub_fs.fit(cls)
golub_fs.test()

TV feature selection with threshold 0.05
Train score: 1.0
Test score: 1.0


In [78]:
# With threshold 0.06
golub_fs.feature_selection(method="VT", param={"VT_threshold":0.06})
golub_fs.fit(cls)
golub_fs.test()

TV feature selection with threshold 0.06
Train score: 1.0
Test score: 0.8666666666666667


In [79]:
golub_fs.get_original_feature_number()

Original feature number: 3562


In [80]:
golub_fs.get_selected_feature_number()

Selected feature number: 120


## Breast Cancer dataset

In [72]:
cancer_fs.feature_selection(method=None)
cancer_fs.fit(cls)
cancer_fs.test()

No feature selection
Train score: 0.9516483516483516
Test score: 0.956140350877193


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [73]:
cancer_fs.feature_selection(method="VT", param={"VT_threshold":0.03})
cancer_fs.fit(cls)
cancer_fs.test()

TV feature selection with threshold 0.03
Train score: 0.9538461538461539
Test score: 0.956140350877193


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [81]:
cancer_fs.feature_selection(method="VT", param={"VT_threshold":0.05})
cancer_fs.fit(cls)
cancer_fs.test()

TV feature selection with threshold 0.05
Train score: 0.9384615384615385
Test score: 0.956140350877193


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [82]:
cancer_fs.get_original_feature_number()

Original feature number: 30


In [83]:
cancer_fs.get_selected_feature_number()

Selected feature number: 12


## Analyse:
By using this method based on variable variance, we can significantly reduce the parameters number without having a great degradation of the performance.

# 2. Univariate feature selection 
Use statistical tests to get rid of features which are not statistically significant with respect to the vector of class.

Try the SelectFdr function that computes p-values for an estimated false discovery rate.

### For dataset golub, when we increase alpha it is easily that all features are not selected
Maybe it is due to the fact that information on genes are too noise.

In [123]:
golub_fs.feature_selection(method="univariate", param={"alpha":1})
golub_fs.fit(cls)
golub_fs.test()
golub_fs.get_original_feature_number()
golub_fs.get_selected_feature_number()

Train score: 1.0
Test score: 1.0
Original feature number: 3562
Selected feature number: 3562


In [125]:
golub_fs.feature_selection(method="univariate", param={"alpha":0.9999})
golub_fs.fit(cls)
golub_fs.test()
golub_fs.get_original_feature_number() 
golub_fs.get_selected_feature_number()

Train score: 1.0
Test score: 1.0
Original feature number: 3562
Selected feature number: 3560


### For cancer dataset, we can do a feature selection with this method.

In [127]:
cancer_fs.feature_selection(method="univariate", param={"alpha":1})
cancer_fs.fit(cls)
cancer_fs.test()
cancer_fs.get_original_feature_number() 
cancer_fs.get_selected_feature_number()

Train score: 0.9516483516483516
Test score: 0.956140350877193
Original feature number: 30
Selected feature number: 30


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [128]:
cancer_fs.feature_selection(method="univariate", param={"alpha":0.5})
cancer_fs.fit(cls)
cancer_fs.test()
cancer_fs.get_original_feature_number() 
cancer_fs.get_selected_feature_number()

Train score: 0.9428571428571428
Test score: 0.9649122807017544
Original feature number: 30
Selected feature number: 18


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


# 3. L1 -based feature selection is designed to find an optimal solution.
The sparsity parameter is important (since it controls the number of non-zero parameters: if too many parameters
are kept, no really feature selection; if too few parameters are chosen, it is possible that the
accuracy is very poor).

## Lasso

In [28]:
# golub
linear_regression_lasso = linear_model.LogisticRegression(random_state=0, penalty='l1', solver='saga', C=1)
linear_regression_lasso.fit(X_train_golub, y_train_golub)
print(f"linear logistic regression lasso train score:{linear_regression_lasso.score(X_train_golub, y_train_golub)}")
print(f"linear logistic regression lasso test score:{linear_regression_lasso.score(X_test_golub, y_test_golub)}")

linear logistic regression lasso train score:0.9824561403508771
linear logistic regression lasso test score:1.0




In [29]:
linear_regression_lasso.coef_.sum()

-0.7110541602046173

In [23]:
# with stronger regularization, C Like in support vector machines, smaller values specify stronger regularization.
linear_regression_lasso = linear_model.LogisticRegression(random_state=0, penalty='l1', solver='saga', C=0.1)
linear_regression_lasso.fit(X_train_golub, y_train_golub)
print(f"linear logistic regression lasso train score:{linear_regression_lasso.score(X_train_golub, y_train_golub)}")
print(f"linear logistic regression lasso test score:{linear_regression_lasso.score(X_test_golub, y_test_golub)}")

linear logistic regression lasso train score:0.631578947368421
linear logistic regression lasso test score:0.7333333333333333


In [26]:
linear_regression_lasso.coef_.sum()

0.0

In [8]:
# cancer
linear_regression_lasso = linear_model.LogisticRegression(random_state=0, penalty='l1', solver='saga', C=1)
linear_regression_lasso.fit(X_train_cancer, y_train_cancer)
print(f"linear logistic regression lasso train score:{linear_regression_lasso.score(X_train_cancer, y_train_cancer)}")
print(f"linear logistic regression lasso test score:{linear_regression_lasso.score(X_test_cancer, y_test_cancer)}")

linear logistic regression lasso train score:0.8989010989010989
linear logistic regression lasso test score:0.9473684210526315




In [9]:
linear_regression_lasso = linear_model.LogisticRegression(random_state=0, penalty='l1', solver='saga', C=0.1)
linear_regression_lasso.fit(X_train_cancer, y_train_cancer)
print(f"linear logistic regression lasso train score:{linear_regression_lasso.score(X_train_cancer, y_train_cancer)}")
print(f"linear logistic regression lasso test score:{linear_regression_lasso.score(X_test_cancer, y_test_cancer)}")

linear logistic regression lasso train score:0.8989010989010989
linear logistic regression lasso test score:0.9473684210526315




In [43]:
(linear_regression_lasso.coef_ != 0).sum()

273

# SVM

In [36]:
# golub
model = LinearSVC(C=0.5, penalty="l1", dual=False)
model.fit(X_train_golub, y_train_golub)
print(f"train score:{model.score(X_train_golub, y_train_golub)}")
print(f"test score:{model.score(X_test_golub, y_test_golub)}")

train score:1.0
test score:0.9333333333333333


In [37]:
model = LinearSVC(C=0.1, penalty="l1", dual=False)
model.fit(X_train_golub, y_train_golub)
print(f"train score:{model.score(X_train_golub, y_train_golub)}")
print(f"test score:{model.score(X_test_golub, y_test_golub)}")

train score:0.9824561403508771
test score:0.8666666666666667


In [41]:
(model.coef_ != 0).sum()

7

In [12]:
# cancer
model = LinearSVC(C=0.5, penalty="l1", dual=False)
model.fit(X_train_cancer, y_train_cancer)
print(f"train score:{model.score(X_train_cancer, y_train_cancer)}")
print(f"test score:{model.score(X_test_cancer, y_test_cancer)}")

train score:0.9604395604395605
test score:0.956140350877193




In [49]:
(model.coef_ != 0).sum()

11

In [50]:
model = LinearSVC(C=0.1, penalty="l1", dual=False)
model.fit(X_train_cancer, y_train_cancer)
print(f"train score:{model.score(X_train_cancer, y_train_cancer)}")
print(f"test score:{model.score(X_test_cancer, y_test_cancer)}")

train score:0.9516483516483516
test score:0.956140350877193




In [51]:
(model.coef_ != 0).sum()

11

# Elastic Net

In [44]:
# golub
model = linear_model.LogisticRegression(random_state=0, penalty='elasticnet', C=1, l1_ratio=0.7,  solver='saga')
model.fit(X_train_golub, y_train_golub)
print(f"train score:{model.score(X_train_golub, y_train_golub)}")
print(f"test score:{model.score(X_test_golub, y_test_golub)}")

train score:0.9824561403508771
test score:1.0




In [45]:
(model.coef_!=0).sum()

431

In [46]:
# cancet
model = linear_model.LogisticRegression(random_state=0, penalty='elasticnet', C=1, l1_ratio=0.7,  solver='saga')
model.fit(X_train_cancer, y_train_cancer)
print(f"train score:{model.score(X_train_cancer, y_train_cancer)}")
print(f"test score:{model.score(X_test_cancer, y_test_cancer)}")

train score:0.8989010989010989
test score:0.9473684210526315




In [47]:
(model.coef_!=0).sum()

24

# Conclusion
I find the SVM has relatively better performance on the breast cancer dataset.

two types of elastic Net has a better performance on golub dataset.

I use (model.coef_ != 0).sum() to show the final selected number