In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Load the raw dataset; the path is relative to the notebook's working directory.
data = pd.read_csv('data/housing.csv')

In [None]:
# (rows, columns) of the loaded frame.
data.shape

# Constant Features

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for evaluation; SalePrice is the target and the
# fixed random_state keeps the split reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    data.drop(columns=['SalePrice']),
    data['SalePrice'],
    test_size=0.3,
    random_state=0,
)

In [None]:
# Numeric-only view of the training features (input for the variance filter).
numerical_x_train = x_train.select_dtypes(include=[np.number])

In [None]:
from sklearn.feature_selection import VarianceThreshold
# With threshold=0 the selector keeps only features whose training variance is non-zero.
vs_constant = VarianceThreshold(threshold=0)

In [None]:
# Learn the per-column variances from the numeric training features only.
vs_constant.fit(numerical_x_train)

In [None]:
# Number of numeric columns the variance filter retains (support mask is True).
len(numerical_x_train.columns[vs_constant.get_support()])

In [None]:
# Numeric columns rejected by the variance filter: invert the support mask
# instead of testing membership column by column.
constant_columns = numerical_x_train.columns[~vs_constant.get_support()].tolist()

In [None]:
# Number of numeric columns flagged as constant.
len(constant_columns)

In [None]:
# Object-dtype columns holding a single distinct value; a missing value counts
# as a value of its own (dropna=False), matching len(Series.unique()).
constant_cat_columns = [
    name
    for name, values in x_train.items()
    if values.dtype == "O" and values.nunique(dropna=False) == 1
]

In [None]:
# Constant categorical columns followed by constant numeric columns.
all_constant_columns = constant_cat_columns + constant_columns

In [None]:
# Drop every constant column -- categorical as well as numeric -- from both
# splits. The previous version dropped only `constant_columns`, so constant
# object-dtype columns collected in `all_constant_columns` were never removed.
# Reassigning (instead of inplace=True) avoids mutating the split frames in place.
x_train = x_train.drop(columns=all_constant_columns)
x_test = x_test.drop(columns=all_constant_columns)

# Quasi-Constant Features

In [None]:
def constant_feature_detect(data,threshold=0.98):
    """Return the columns of ``data`` that are dominated by a single value.

    For each column, the share of rows taken by its most frequent non-missing
    value is computed; the column is reported when that share is at least
    ``threshold``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose columns are inspected.
    threshold : float, default 0.98
        Minimum share (0-1) of rows the most frequent value must cover.

    Returns
    -------
    list
        Column labels, in ``data.columns`` order, flagged as quasi-constant.
    """
    # Built-in float: the `np.float` alias was removed in NumPy 1.24.
    n_rows = float(len(data))

    quasi_constant_feature = []
    for feature in data.columns:
        # value_counts() skips missing values, but the share is still taken
        # over all rows, so NaN-heavy columns get a lower share.
        shares = data[feature].value_counts() / n_rows
        # max() of an empty Series (all-NaN column or empty frame) is NaN, and
        # NaN >= threshold is False, so such columns are skipped rather than
        # raising IndexError the way `.values[0]` did.
        predominant = shares.max()
        if predominant >= threshold:
            quasi_constant_feature.append(feature)
    return quasi_constant_feature

In [None]:
# Detect quasi-constant columns ONCE on the training split and remove the same
# columns from both splits. Previously detection was re-run after x_train had
# already been pruned, so the second call found nothing and x_test kept them.
quasi_constant_columns = constant_feature_detect(x_train)
x_train = x_train.drop(columns=quasi_constant_columns)
x_test = x_test.drop(columns=quasi_constant_columns)

# Duplicated Features

In [None]:
# Transpose so that every original column becomes a row; row-wise duplicate
# detection then finds duplicated features.
train_features_T = x_train.transpose()
train_features_T.head()

In [None]:
# Number of transposed rows (i.e. original columns) that repeat an earlier row.
print(train_features_T.duplicated().sum())

In [None]:
# Labels of the features that duplicate an earlier feature (first occurrence is kept).
duplicated_columns = train_features_T.index[train_features_T.duplicated().to_numpy()].to_numpy()

In [None]:
# Remove the duplicated features from both splits, in place.
for split in (x_train, x_test):
    split.drop(columns=duplicated_columns, inplace=True)

In [None]:
# NOTE(review): `corr_feature_detect` is not defined or imported in any cell
# visible in this notebook, so this call raises NameError on a fresh kernel
# unless it is defined elsewhere -- confirm, or remove this cell.
corr_feature_detect(data)

# Correlation methods

In [None]:
# Collector for the features found to be highly correlated with an earlier one.
correlated_features = set()
# Correlate numeric columns only: from pandas 2.0 on, DataFrame.corr() no
# longer silently drops object columns and raises on them instead.
# select_dtypes works on every pandas version (numeric_only= needs >= 1.5).
correlation_matrix = x_train.select_dtypes(include=[np.number]).corr()

In [None]:
# Heatmap of the pairwise feature correlations on an explicit 11x11 inch figure.
fig, ax = plt.subplots(figsize=(11, 11))
sns.heatmap(correlation_matrix, ax=ax)

In [None]:
# Walk the strictly-lower triangle row by row: a feature is flagged when its
# absolute correlation with ANY earlier feature exceeds 0.8, so the first
# member of each correlated group is the one that survives.
for position, colname in enumerate(correlation_matrix.columns):
    with_earlier = correlation_matrix.iloc[position, :position]
    if (with_earlier.abs() > 0.8).any():
        correlated_features.add(colname)

In [None]:
# Features flagged as highly correlated with an earlier column.
correlated_features

In [None]:
# Remove the highly correlated features from both splits, in place.
for split in (x_train, x_test):
    split.drop(labels=correlated_features, axis=1, inplace=True)

# Statistical Measures

In [None]:
from sklearn.feature_selection import mutual_info_classif, chi2
from sklearn.feature_selection import SelectKBest, SelectPercentile
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.metrics import roc_auc_score, mean_squared_error

### Mutual Information

In [None]:
def mutual_info(X,y,select_k=10):
    """Select columns of ``X`` by mutual information with ``y``.

    Scoring uses ``mutual_info_classif``.

    Parameters
    ----------
    X : pandas.DataFrame
        Candidate features.
    y : array-like
        Target values.
    select_k : int or float, default 10
        ``>= 1``: keep that many top-scoring features (``SelectKBest``).
        Strictly between 0 and 1: keep that fraction of the features
        (``SelectPercentile`` with ``select_k * 100``).

    Returns
    -------
    pandas.Index
        Labels of the selected columns.

    Raises
    ------
    ValueError
        If ``select_k`` is not a positive number.
    """
    if select_k >= 1:
        selector = SelectKBest(mutual_info_classif, k=select_k)
    elif 0 < select_k < 1:
        selector = SelectPercentile(mutual_info_classif, percentile=select_k*100)
    else:
        raise ValueError("select_k must be a positive number")

    selector.fit(X, y)
    return X.columns[selector.get_support()]

In [None]:
# NOTE(review): no visible cell encodes object columns or imputes missing
# values before this call, and y_train (SalePrice) is scored with
# mutual_info_classif -- confirm both are intended.
mutual_info(x_train, y_train)

### Chi Squared Score

In [None]:
def chi_square_test(X,y,select_k=10):
    """Select columns of ``X`` by their chi-squared score against ``y``.

    Parameters
    ----------
    X : pandas.DataFrame
        Candidate features.
    y : array-like
        Target values.
    select_k : int or float, default 10
        ``>= 1``: keep that many top-scoring features (``SelectKBest``).
        Strictly between 0 and 1: keep that fraction of the features
        (``SelectPercentile`` with ``select_k * 100``).

    Returns
    -------
    pandas.Index
        Labels of the selected columns.

    Raises
    ------
    ValueError
        If ``select_k`` is not a positive number.
    """
    if select_k >= 1:
        selector = SelectKBest(chi2, k=select_k)
    elif 0 < select_k < 1:
        selector = SelectPercentile(chi2, percentile=select_k*100)
    else:
        raise ValueError("select_k must be a positive number")

    selector.fit(X, y)
    return X.columns[selector.get_support()]

In [None]:
# NOTE(review): scikit-learn's chi2 expects non-negative feature values and a
# class-label target; no visible cell guarantees either for x_train / y_train
# (SalePrice) -- confirm before relying on this result.
chi_square_test(x_train, y_train)

### Univariate RMSE

In [None]:
def univariate_rmse(threshold=None):
    """Score each feature with a single-feature decision-tree regressor.

    Reads the notebook-level ``x_train``, ``y_train``, ``x_test`` and
    ``y_test``. For every column of ``x_train`` a ``DecisionTreeRegressor`` is
    fit on that column alone and evaluated on the matching ``x_test`` column.
    Despite the function name, the reported values are mean squared errors
    (no square root is taken).

    Parameters
    ----------
    threshold : float, optional
        When given, only features whose error is strictly greater than
        ``threshold`` are returned. When ``None`` (default) every feature is
        returned. The original body referenced an undefined ``threshold`` and
        always raised ``NameError``.
        NOTE(review): for an error metric one would usually keep features
        BELOW a cut-off; the original ``>`` comparison is preserved -- confirm.

    Returns
    -------
    pandas.Series
        Test-set MSE indexed by feature name, filtered as described above.
    """
    mse_values = []
    for feature in x_train.columns:
        # Fixed seed so repeated runs give the same scores.
        clf = DecisionTreeRegressor(random_state=0)
        clf.fit(x_train[feature].to_frame(), y_train)
        y_scored = clf.predict(x_test[feature].to_frame())
        mse_values.append(mean_squared_error(y_test, y_scored))
    mse_values = pd.Series(mse_values, index=x_train.columns, dtype=float)
    print(mse_values.sort_values(ascending=False))

    if threshold is None:
        keep_col = mse_values
    else:
        keep_col = mse_values[mse_values > threshold]
    print(len(keep_col),'out of the %s featues are kept'% len(x_train.columns))
    return keep_col

In [None]:
# Score every remaining feature with a single-feature decision tree
# (fit on x_train, evaluated on x_test).
univariate_rmse()

### Univariate ROC-AUC

In [None]:
def univariate_roc_auc(threshold=None):
    """Score each feature with a single-feature decision-tree classifier.

    Reads the notebook-level ``x_train``, ``y_train``, ``x_test`` and
    ``y_test``. For every column of ``x_train`` a ``DecisionTreeClassifier``
    is fit on that column alone, and the ROC-AUC of its predicted probability
    for the second class (``predict_proba(...)[:, 1]``) is computed on the
    matching ``x_test`` column.

    Parameters
    ----------
    threshold : float, optional
        When given, only features whose ROC-AUC is strictly greater than
        ``threshold`` are returned. When ``None`` (default) every feature is
        returned. The original body referenced an undefined ``threshold``
        (and an undefined ``X_train``) and always raised ``NameError``.

    Returns
    -------
    pandas.Series
        ROC-AUC indexed by feature name, filtered as described above.
    """
    roc_values = []
    for feature in x_train.columns:
        # Fixed seed so repeated runs give the same scores.
        clf = DecisionTreeClassifier(random_state=0)
        clf.fit(x_train[feature].to_frame(), y_train)
        y_scored = clf.predict_proba(x_test[feature].to_frame())
        # Column 1 is the probability of the second class.
        roc_values.append(roc_auc_score(y_test, y_scored[:, 1]))
    # Index by the notebook's `x_train` (the original used an undefined `X_train`).
    roc_values = pd.Series(roc_values, index=x_train.columns, dtype=float)
    print(roc_values.sort_values(ascending=False))

    if threshold is None:
        keep_col = roc_values
    else:
        keep_col = roc_values[roc_values > threshold]
    print(len(keep_col),'out of the %s featues are kept'% len(x_train.columns))
    return keep_col

In [None]:
# Use this only for a classification task: it relies on predict_proba and ROC-AUC.
# univariate_roc_auc()