# Feature selection

- Three embedded methods and one wrapper method are implemented in this notebook
- Embedded Methods
    - Random Forest
    - Extra Tree classifier
    - Select from K model
    
- Wrapper methods
    - Permutation Importance

In [None]:
# import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import time
from configparser import ConfigParser
import os
import json

from sklearn.metrics import accuracy_score, confusion_matrix

In [None]:
# Load the project settings from config.ini (looked up relative to the working directory)
config = ConfigParser()
config.read("config.ini")

# [DATASETS] section: data folders and file names
dataset_info = config["DATASETS"]
fm_data_path = dataset_info["fm_path"]
rf_data_path = dataset_info["rf_path"]
train_set_filename = dataset_info["train_filename"]
test_set_filename = dataset_info["test_filename"]
description_filename = dataset_info["description_filename"]
path = dataset_info["ori_path"]

# [MODELS] section: model folder and file names
model_info = config["MODELS"]
model_path = model_info["model_path"]
fm_filename = model_info["fm_model_filename"]
rf_filename = model_info["rf_model_filename"]

# [FEATURE_SELECTION] section: output locations for the feature-selection results
fs_info = config["FEATURE_SELECTION"]
fs_fm_path = fs_info["fs_fm_path"]
fs_rf_path = fs_info["fs_rf_path"]
fs_final_path = fs_info["fs_final_path"]

# Genre names: first '|'-separated field of every row in u.genre
genre_table = pd.read_csv(path + 'ml-100k/u.genre', sep='|', header=None)
genre_cols = genre_table[0].to_numpy().tolist()

### NOTES:
- Final outputs are saved in this format: 
    
```python
    x_final_dict = {
        "k": None,
        "columns": [], # list type
        "importances": [], # list type, in percentage, and should be sorted
    }
```
- x - model name
- the `columns` list and the `importances` list have the same length
- importances are in descending order, and columns follow accordingly
- movie_id and user_id columns are excluded before doing feature selection

In [None]:
# Create every output folder this notebook writes to, if it doesn't exist.
# Results are dumped under fs_final_path (including its "sfm" subfolder) and
# fs_fm_path as well as fs_rf_path; open(..., "w") does not create missing
# parent directories, so a missing folder would fail at save time.
# NOTE(review): assumes the fs_* config values are directory paths, as the
# original makedirs(fs_rf_path) call already did for fs_rf_path.
for out_dir in (
    fs_rf_path,
    fs_fm_path,
    fs_final_path,
    os.path.join(fs_final_path, "sfm"),
):
    os.makedirs(out_dir, exist_ok=True)

In [None]:
# Load the pickled train / test sets. The context managers close the file
# handles as soon as loading finishes.
# NOTE: pickle.load can execute arbitrary code — only load files produced by
# this project's own pipeline.
with open(fm_data_path + train_set_filename, 'rb') as train_file:
    train = pickle.load(train_file)
with open(fm_data_path + test_set_filename, 'rb') as test_file:
    test = pickle.load(test_file)

### Training data

The cumulative mean rate and the mean rate are dropped here. These features generalize poorly: the test data is not properly imputed, and they gave the worst performance with the myfm model, so they are removed.

In [None]:
# Feature matrix: everything except the target and the two mean-rate columns
X = train.drop(columns=['rating', 'cum_mean_rate', 'mean_rate'])
# Target: the rating column
y = train['rating']

In [None]:
# Snapshot of the feature matrix before any columns are removed below.
# Kept for later reference; can be removed after all experiments.
X_original = X.copy()  # DataFrame.copy() is a deep copy by default

### Test data

In [None]:
# Same column removal as for the training set, applied to the test set
X_test = test.drop(columns=['rating', 'cum_mean_rate', 'mean_rate'])
# Test target: the rating column
y_test = test['rating']

---
Drop the user_id and movie_id columns before feature selection. They are important features for the baseline model, so we cannot remove them even if they get a low score. Feature selection is done only on the additional features.

In [None]:
# Remove every column whose name contains 'movie_id' or 'user_id' in one pass
id_columns = X.filter(regex='movie_id|user_id').columns
X.drop(columns=id_columns, inplace=True)

In [None]:
# Remove every column whose name contains 'movie_id' or 'user_id' from the test features
test_id_columns = X_test.filter(regex='movie_id|user_id').columns
X_test.drop(columns=test_id_columns, inplace=True)

---
# Feature Importance
# Feature importance using Random Forest

We train a random forest and then compute how much each feature contributes to the mean decrease in impurity. Features used near the top of the trees are more important than those used near the leaves, as they have higher information gain.

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# initialize and fit the model
# random_state is fixed so the fitted forest, and therefore the importances
# and rankings derived from it, are reproducible across kernel restarts.
forest = RandomForestClassifier(n_estimators=100, random_state=0)
forest.fit(X, y)

### Testing the model

In [None]:
from sklearn import metrics

# Predictions of the fitted forest on the held-out test features; the error
# metrics below compare them with the true test ratings.
y_pred = forest.predict(X_test)

mse = metrics.mean_squared_error(y_test, y_pred)
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))
print('Mean Squared Error:', mse)
print('Root Mean Squared Error:', np.sqrt(mse))

In [None]:
# Names of the feature columns, in the column order of X
feature_names = X.columns.tolist()

# Spread of each feature's importance across the individual trees of the forest
per_tree_importances = [tree.feature_importances_ for tree in forest.estimators_]
std = np.std(per_tree_importances, axis=0)

# One row per feature: its name and the forest-level importance
rf_feature_importances = pd.DataFrame({
    "feature": feature_names,
    "importance": forest.feature_importances_,
})

In [None]:
# Order the rows from most to least important (sorted in place; index labels are kept)
rf_feature_importances.sort_values(by="importance", ascending=False, inplace=True)

In [None]:
# Output record for the random forest: all features are kept, so "k" is None
rf_final_dict = dict(
    k=None,
    columns=rf_feature_importances['feature'].tolist(),
    importances=rf_feature_importances['importance'].tolist(),
)

In [None]:
# Write the random-forest record as JSON under fs_final_path
rf_json_path = fs_final_path + "random-forest-feature-importance_v3.json"
with open(rf_json_path, "w") as fp:
    json.dump(rf_final_dict, fp)

In [None]:
# Visualization is nice to have, but not necessary, might be very large for more number of columns
# visualize the importance of each feature
fig, ax = plt.subplots(figsize=(12,6))
# ax.set_ylim([0, 30])

# forest.feature_importances_ are normalised fractions, while the axis is
# labelled in percent: scale a plotting copy by 100 so bars and label agree.
# The frame that gets saved to JSON is left untouched.
rf_importances_pct = rf_feature_importances.assign(
    importance=rf_feature_importances['importance'] * 100
)
rf_importances_pct.plot.bar(x='feature', y='importance', ax=ax, legend=False, stacked = True)
ax.set_title("Feature importances")
ax.set_ylabel("Importance in %")
fig.tight_layout()

---

# Feature importance using Extra Trees Classifier

It is the same as a random forest, except that it does not use bootstrapping by default and the split thresholds are drawn at random rather than optimized.

In [None]:
from sklearn.ensemble import ExtraTreesClassifier

In [None]:
# Fit an extra-trees ensemble on the same features as the random forest.
# random_state is fixed so the importances derived from it are reproducible.
extra_tree = ExtraTreesClassifier(random_state=0)
extra_tree.fit(X,y)

In [None]:
# Spread of each feature's importance across the individual extra-trees estimators
per_tree_importances = [tree.feature_importances_ for tree in extra_tree.estimators_]
std = np.std(per_tree_importances, axis=0)

# One row per feature: its name and the ensemble-level importance
et_feature_importances = pd.DataFrame({
    "feature": feature_names,
    "importance": extra_tree.feature_importances_,
})

In [None]:
# Order the rows from most to least important (sorted in place; index labels are kept)
et_feature_importances.sort_values(by="importance", ascending=False, inplace=True)

In [None]:
# visualize the importance of each feature, might be very large for more number of columns
fig, ax = plt.subplots(figsize=(12,6))

# `std` is in the original column order, but the frame has been sorted by
# importance. Its index still holds each row's original position, so use it to
# put every error bar on the bar of the feature it belongs to.
std_sorted = np.asarray(std)[et_feature_importances.index.to_numpy()]

et_feature_importances.plot.bar(x='feature', y='importance', yerr=std_sorted, ax=ax, legend=False)
ax.set_title("Feature importances")
ax.set_ylabel("Mean decrease in impurity")
fig.tight_layout()

In [None]:
# Output record for the extra-trees ranking: all features are kept, so "k" is None
et_final_dict = dict(
    k=None,
    columns=et_feature_importances['feature'].tolist(),
    importances=et_feature_importances['importance'].tolist(),
)

# Write the record as JSON under fs_final_path
et_json_path = fs_final_path + "extra-tree-classifier-feature-importance.json"
with open(et_json_path, "w") as fp:
    json.dump(et_final_dict, fp)

---
# Select from K Model

It wraps a base model, and the magnitude of the model's weights gives the importance of each feature. If a weight is large, the feature is important; if it is close to 0, the feature is not important.

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Lasso, LogisticRegression
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import StandardScaler

In [None]:
# Standardise the features: learn the scaling on X, then apply it to X
scaler = StandardScaler().fit(X)
scaled = scaler.transform(X)

In [None]:
# Folder for the per-size SelectFromModel outputs. It is created here because
# open(..., "w") does not create missing directories.
sfm_dir = os.path.join(fs_final_path, "sfm")
os.makedirs(sfm_dir, exist_ok=True)

for i in range(0, 550, 50):
    # L2-penalised logistic regression as the base estimator; keep at most i features.
    # NOTE(review): with SelectFromModel's default threshold, fewer than i
    # features can be selected; pass threshold=-np.inf if exactly the top i
    # are wanted — TODO confirm intent.
    # NOTE(review): the first iteration uses max_features=0 — confirm it is wanted.
    sel_ = SelectFromModel(LogisticRegression(C=1, penalty='l2'), max_features=i)
    sel_.fit(scaled, y)

    # Names of the columns the selector kept
    selected_feat = X.columns[sel_.get_support()]
    print('total no of features: {}'.format((X.shape[1])))
    print('No of selected features: {}'.format(len(selected_feat)))
    print('features with coefficients shrank to zero: {}'.format(
        np.sum(sel_.estimator_.coef_ == 0)))

    # save the info (own variable, so the extra-trees record is not overwritten)
    sfm_final_dict = {
        "k": None,
        "columns": selected_feat.tolist(),
        "importances": None,
    }

    sfm_json_path = os.path.join(sfm_dir, "select_from_model_" + str(i) + ".json")
    with open(sfm_json_path, "w") as fp:
        json.dump(sfm_final_dict, fp)

In [None]:
# Summary for the selector currently bound to `sel_`
support_mask = sel_.get_support()
selected_feat = X.columns[support_mask]
zero_coef_count = np.sum(sel_.estimator_.coef_ == 0)

print('total no of features: {}'.format(X.shape[1]))
print('No of selected features: {}'.format(len(selected_feat)))
print('features with coefficients shrank to zero: {}'.format(zero_coef_count))

In [None]:
# Output record for the current `selected_feat`: names only, no importances
et_final_dict = dict(
    k=None,
    columns=selected_feat.tolist(),
    importances=None,
)

# Write the record as JSON under fs_final_path
sfm_500_json_path = fs_final_path + "select_from_model_500.json"
with open(sfm_500_json_path, "w") as fp:
    json.dump(et_final_dict, fp)

___
# Permutation Importance

A single feature is randomly shuffled each time and we check how it affects the performance. If the performance does not decrease, that feature is not contributing to the prediction. This is done for all columns, and the columns that do not contribute are eliminated.

In [None]:
from sklearn.inspection import permutation_importance

start_time = time.time()
# random_state is fixed so the shuffles, and hence the ranking, are reproducible.
# NOTE(review): importances are measured on the training data (X, y) and with a
# single shuffle per feature (n_repeats=1), so result.importances_std is all
# zeros — TODO confirm both choices are intended.
result = permutation_importance(
    forest, X, y, n_repeats=1, random_state=0
)
elapsed_time = time.time() - start_time
print(f"Elapsed time to compute the importances: {elapsed_time:.3f} seconds")

# Mean importance per feature, labelled by feature name, highest first
forest_importances = pd.Series(result.importances_mean, index=feature_names)
forest_importances.sort_values(ascending=False,inplace=True)

In [None]:
# Mean permutation importance per feature, labelled by feature name, highest first
forest_importances = (
    pd.Series(result.importances_mean, index=feature_names)
    .sort_values(ascending=False)
)

In [None]:
# Output record for the permutation ranking: all features are kept, so "k" is None
et_final_dict = dict(
    k=None,
    columns=forest_importances.index.tolist(),
    importances=forest_importances.tolist(),
)

# Write the record as JSON under fs_final_path
perm_json_path = fs_final_path + "permutation_importance.json"
with open(perm_json_path, "w") as fp:
    json.dump(et_final_dict, fp)

In [None]:
fig, ax = plt.subplots(figsize=(12,6))

# result.importances_std is in the original feature order, while
# forest_importances has been sorted. Label the std values by feature name and
# reorder them to the sorted order so each error bar matches its bar.
# (assumes feature names are unique — TODO confirm)
importances_std = pd.Series(result.importances_std, index=feature_names)
std_sorted = importances_std.reindex(forest_importances.index).to_numpy()

forest_importances.plot.bar(yerr=std_sorted, ax=ax)
ax.set_title("Feature importances using permutation on full model")
ax.set_ylabel("Mean accuracy decrease")
fig.tight_layout()
plt.show()

---

# Mutual Info Regression

In [None]:
from sklearn.feature_selection import mutual_info_regression

In [None]:
# Estimated mutual information between every feature and the rating.
# random_state is fixed because the estimator adds small random noise to
# continuous variables; without it the scores differ from run to run.
mutual_info = mutual_info_regression(X, y, random_state=0)
mutual_info

In [None]:
# Label every score with the name of its feature column
mutual_info = pd.Series(mutual_info, index=X.columns)
# Show the scores, highest first
mutual_info.sort_values(ascending=False)

In [None]:
# Bar chart of the mutual-information scores, highest first
mutual_info.sort_values(ascending=False).plot(kind="bar", figsize=(15, 5))

In [None]:
# Scores ordered from highest to lowest; sort_values returns a new Series,
# so `mutual_info` itself keeps its current order.
mutual_info_sort = mutual_info.sort_values(ascending=False)

In [None]:
# One JSON file per cut-off k: the first k entries of the sorted scores
for k in range(50, 550, 50):
    top_k = mutual_info_sort.iloc[:k]
    mi_final_dict = dict(
        k=k,
        columns=top_k.index.tolist(),
        importances=top_k.tolist(),
    )

    mi_json_path = fs_fm_path + "mutual-importance-regression_" + str(k) + ".json"
    with open(mi_json_path, "w") as fp:
        json.dump(mi_final_dict, fp)

---

# Forward Elimination

In [None]:
from sklearn.feature_selection import SequentialFeatureSelector as SFS
from sklearn.svm import LinearSVR

In [None]:
# Base estimator whose cross-validated score drives the selection
model = LinearSVR(random_state=0, tol=1e-05, max_iter=2000)

for k in range(50, 550, 50):
    # Forward sequential selection (direction='forward') down to k features.
    # LinearSVR is a regressor, so it is scored with a regression metric:
    # 'accuracy' is a classification metric and cannot score its continuous
    # predictions. Scorers follow "greater is better", hence the negated RMSE.
    sfs = SFS(model,
              n_features_to_select = k,
              direction = 'forward',
              scoring = 'neg_root_mean_squared_error',
              n_jobs = -1,
              cv=5)

    sfs = sfs.fit(X, y)
    # Names of the columns the selector kept
    selected_feat_fw = X.columns[(sfs.support_)]
    # save the info (this method gives no per-feature importances)
    fw_final_dict = {
        "k": k,
        "columns": selected_feat_fw.tolist(),
        "importances": None,
    }

    with open(fs_fm_path + "forward-elimination_" + str(k) + ".json", "w") as fp:
        json.dump(fw_final_dict, fp)

---

# Filter Methods

In [None]:
from sklearn.feature_selection import SelectKBest, f_classif, chi2, f_regression, mutual_info_regression, r_regression

# Univariate scoring functions to try, keyed by the name used in the output file
score_funcs = {
    'f-classif': f_classif,
    'chi2': chi2,
    'f-regression': f_regression,
    'mutual-info-regression': mutual_info_regression,
    'r-regression': r_regression,
}

for name, func in score_funcs.items():
    for k in range(50, 550, 50):
        selector = SelectKBest(func, k=k)
        selector.fit(X, y)

        # Names and scores of the selected features only, taken from this
        # selector, so both sequences have the same length and stay aligned.
        support_mask = selector.get_support()
        selected_columns = selector.get_feature_names_out().tolist()
        selected_scores = selector.scores_[support_mask].tolist()

        # Rank the selected features by score, highest first
        ranked = sorted(zip(selected_scores, selected_columns), reverse=True)
        scores, columns = zip(*ranked)

        # Same record layout as the other methods: k, columns, importances
        final_dict = {
            "k": k,
            "columns": list(columns),
            "importances": list(scores),
        }

        with open(fs_fm_path + name + "_" + str(k) + ".json", "w") as fp:
            json.dump(final_dict, fp)

In [None]:
from sklearn.feature_selection import VarianceThreshold

# Fit the selector to obtain one variance per feature column
selector = VarianceThreshold()
selector.fit(X)

# Pair every variance with its column name and rank them, largest variance first
ranked = sorted(zip(selector.variances_, X.columns), reverse=True)
scores, columns = zip(*ranked)

# One JSON file per cut-off k: the first k ranked columns and their variances
for k in range(50, 550, 50):
    final_dict = dict(
        k=k,
        columns=columns[:k],
        importances=scores[:k],
    )

    vt_json_path = fs_fm_path + "variance-threshold_" + str(k) + ".json"
    with open(vt_json_path, "w") as fp:
        json.dump(final_dict, fp)