In [None]:
# FEATURE SELECTION
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_selection import SelectKBest, chi2, RFE, SelectFromModel
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

In [None]:
# DATASET
# --- Import Dataset 1
dataset = pd.read_csv('11-df_coffee/dataset.csv')
# Expressions in the middle of a cell are not displayed, so print explicitly.
print(dataset.head(10))
print(dataset.shape)
# --- Split the frame by position: the first 85 columns are the features (X),
# --- the column at position 85 is the target (y). Both stay pandas objects;
# --- y is kept 2-D (one column) because OneHotEncoder expects 2-D input.
X = dataset.iloc[:, :85]
y = dataset.iloc[:, 85:86]
# Avoid unpacking into `_`: in IPython that name holds the last cell output.
Xcol = X.shape[1]
num_features = Xcol
print(num_features)
# --- Normalizing the data
# StandardScaler returns a bare numpy array; wrap it back into a DataFrame so
# the column names survive for the feature-selection helpers, which read
# `X.columns` / `X.loc`.
sc = StandardScaler()
X = pd.DataFrame(sc.fit_transform(X), columns=X.columns, index=X.index)
# --- Determine classes (y variable): one-hot encode the target column
ohe = OneHotEncoder()
y = ohe.fit_transform(y).toarray()
# --- Separating the dataset into training and validation set
# A fixed seed makes the 70/30 split reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=10
)

In [None]:
# Select K-Best Features
def k_best_features(X, y, num_features):
    """Score every feature with the chi-squared test and print the ten best.

    Parameters
    ----------
    X : pandas.DataFrame or array-like of shape (n_samples, n_features)
        Feature matrix. Names come from the DataFrame columns; any other
        array-like is labelled positionally (0, 1, ...).
    y : array-like
        Target, handed to ``SelectKBest.fit`` unchanged.
    num_features : int
        Accepted so all selectors share one call signature; not used here.

    Returns
    -------
    pandas.DataFrame
        Up to ten rows with columns ``'Spec'`` (feature label) and ``'Score'``
        (chi-squared statistic), highest score first. The same frame is
        printed.
    """
    features = pd.DataFrame(X)
    # chi2 only accepts non-negative inputs, and standardised data contains
    # negative values. Min-max rescale each column to [0, 1] in that case;
    # inputs that are already non-negative are scored exactly as given.
    if (features < 0).any().any():
        lowest = features.min()
        # A constant column has zero spread; divide by 1 instead of 0.
        spread = (features.max() - lowest).replace(0, 1)
        features = (features - lowest) / spread
    # k may not exceed the number of available columns.
    bestFeatures = SelectKBest(score_func=chi2, k=min(10, features.shape[1]))
    fit = bestFeatures.fit(features.to_numpy(), y)
    featureScores = pd.DataFrame({'Spec': features.columns, 'Score': fit.scores_})
    top_scores = featureScores.nlargest(10, 'Score')
    print(top_scores)
    return top_scores
# Feature Importance
def feature_importance(X,y,num_features):
    """Fit an extra-trees classifier and plot its ten largest importances.

    Parameters
    ----------
    X : pandas.DataFrame or array-like of shape (n_samples, n_features)
        Feature matrix. Bars are labelled with the DataFrame column names, or
        with positional labels (0, 1, ...) for any other array-like.
    y : array-like
        Target, handed to ``ExtraTreesClassifier.fit`` unchanged.
    num_features : int
        Accepted so all selectors share one call signature; not used here.

    Returns
    -------
    pandas.Series
        The (up to) ten largest importances indexed by feature label, i.e.
        the values shown in the bar chart.
    """
    feature_names = pd.DataFrame(X).columns
    # Seeded (same seed as the RFE estimator in this file) so the ranking is
    # reproducible between runs.
    modelFI = ExtraTreesClassifier(random_state=10)
    modelFI.fit(X, y)
    featureImportances = pd.Series(modelFI.feature_importances_, index=feature_names)
    top_importances = featureImportances.nlargest(10)
    # Draw on a dedicated axes so a figure left open by earlier code is not
    # overdrawn.
    _, ax = plt.subplots()
    top_importances.plot(kind='barh', ax=ax)
    ax.set_xlabel('Importance')
    ax.set_title('Top feature importances (ExtraTreesClassifier)')
    plt.show()
    return top_importances
# Correlation Matrix with Heatmap
def cormat_heatmap(X,y, num_features):
    """Plot the correlation heatmap of the module-level ``dataset`` and print
    the columns strongly correlated with its ``'Class'`` column.

    NOTE: this function reads the global ``dataset`` frame; the ``X``, ``y``
    and ``num_features`` arguments are accepted only so that all selectors
    share one call signature and are not used.

    Returns
    -------
    pandas.Series
        Absolute correlation with ``'Class'`` for every column whose value
        exceeds 0.5 (this includes ``'Class'`` itself). The same series is
        printed.

    Raises
    ------
    KeyError
        If ``dataset`` has no numeric ``'Class'`` column.
    """
    # Correlate numeric/bool columns only; other dtypes cannot be correlated
    # and make DataFrame.corr() raise on recent pandas versions.
    cormat = dataset.select_dtypes(include=['number', 'bool']).corr()
    if 'Class' not in cormat.columns:
        raise KeyError("dataset has no numeric 'Class' column to correlate against")
    plt.figure(figsize=(20,20))
    # plot heat map (the matrix is computed once and reused)
    sns.heatmap(cormat, annot=True, cmap="RdYlGn")
    # Correlation with output variable
    cor_target = cormat["Class"].abs()
    # Selecting highly correlated features
    relevant_features = cor_target[cor_target > 0.5]
    print(relevant_features)
    plt.show()
    return relevant_features
# Recursive Feature Elimination
def recrusive_feature_elimination(X, y, num_features):
    """Select ten features by recursive feature elimination and print them.

    The misspelt function name is kept because callers use it.

    Parameters
    ----------
    X : pandas.DataFrame or array-like of shape (n_samples, n_features)
        Feature matrix. Selected features are reported by DataFrame column
        name, or by position (0, 1, ...) for any other array-like.
    y : array-like
        Target, handed to ``RFE.fit`` unchanged.
    num_features : int
        Accepted so all selectors share one call signature; not used here.

    Returns
    -------
    list
        Labels of the features kept by the selector (also printed, followed
        by their count).
    """
    # Take the labels from a DataFrame view so an ndarray input works too;
    # the estimator itself is still fitted on X exactly as passed in.
    feature_names = pd.DataFrame(X).columns
    rfe_selector = RFE(
        estimator=RandomForestClassifier(n_estimators=10, random_state=10),
        n_features_to_select=10,
        step=2,
    )
    rfe_selector.fit(X, y)
    rfe_features = feature_names[rfe_selector.get_support()].tolist()
    print(rfe_features)
    print(str(len(rfe_features)), 'selected features')
    return rfe_features
# Logistic regression (L2 penalty): SelectFromModel
# NOTE(review): this section was labelled "Lasso", but the estimator uses
# penalty='l2'. A true L1/Lasso selector would need penalty='l1' together with
# a solver that supports it -- confirm which one is intended.
def select_from_model(X, y, num_features):
    """Select at most ten features from a fitted logistic regression and
    print them.

    Parameters
    ----------
    X : pandas.DataFrame or array-like of shape (n_samples, n_features)
        Feature matrix. Selected features are reported by DataFrame column
        name, or by position (0, 1, ...) for any other array-like.
    y : array-like
        Class labels (1-D or a single column), or a 2-D matrix with one
        column per class. A multi-column ``y`` is assumed to be one-hot
        encoded and is collapsed to the index of the largest entry per row.
    num_features : int
        Accepted so all selectors share one call signature; not used here.

    Returns
    -------
    list
        Labels of the features kept by the selector (also printed, followed
        by their count).
    """
    feature_names = pd.DataFrame(X).columns
    # LogisticRegression needs a 1-D target, so flatten whatever was passed.
    labels = pd.DataFrame(y).to_numpy()
    if labels.shape[1] > 1:
        labels = labels.argmax(axis=1)
    else:
        labels = labels.ravel()
    selector = SelectFromModel(LogisticRegression(penalty='l2'), max_features=10)
    selector.fit(X, labels)
    embedded_linear_features = feature_names[selector.get_support()].tolist()
    print(embedded_linear_features)
    print(str(len(embedded_linear_features)), 'selected features')
    return embedded_linear_features
# Tree-based: SelectFromModel
def treebased_SelectFromModel(X, y, num_features):
    """Select at most ten features from a fitted random forest and print them.

    Parameters
    ----------
    X : pandas.DataFrame or array-like of shape (n_samples, n_features)
        Feature matrix. Selected features are reported by DataFrame column
        name, or by position (0, 1, ...) for any other array-like.
    y : array-like
        Target, handed to ``SelectFromModel.fit`` unchanged.
    num_features : int
        Accepted so all selectors share one call signature; not used here.

    Returns
    -------
    list
        Labels of the features kept by the selector (also printed, followed
        by their count).
    """
    # Take the labels from a DataFrame view so an ndarray input works too;
    # the forest itself is still fitted on X exactly as passed in.
    feature_names = pd.DataFrame(X).columns
    # Seeded (same seed as the RFE estimator in this file) so the selection
    # is reproducible between runs.
    forest = RandomForestClassifier(n_estimators=100, random_state=10)
    selector = SelectFromModel(forest, max_features=10)
    selector.fit(X, y)
    embedded_randomforest_features = feature_names[selector.get_support()].tolist()
    print(embedded_randomforest_features)
    print(str(len(embedded_randomforest_features)), 'selected features')
    return embedded_randomforest_features


In [None]:
if __name__ == '__main__':
    # Run each feature-selection routine in turn on the full (X, y) data;
    # all of them share the (X, y, num_features) call signature.
    selection_routines = (
        k_best_features,
        feature_importance,
        recrusive_feature_elimination,
        select_from_model,
        treebased_SelectFromModel,
    )
    for run_selection in selection_routines:
        run_selection(X, y, num_features)
