In [1]:
def reliefF(X, y, k_neighbors=3):
    """Score features with the ReliefF algorithm.

    Every sample is used exactly once as the reference instance. For each
    reference, the ``k_neighbors`` nearest samples of the same class (hits)
    lower a feature's weight by their range-normalised difference on that
    feature, and the ``k_neighbors`` nearest samples of *each* other class
    (misses) raise it, weighted by ``P(C) / (1 - P(class(reference)))``.
    Both contributions are divided by ``k_neighbors * n_samples``.

    Neighbours are ranked by Manhattan distance on the raw (unscaled)
    feature values. Because all samples are visited, the visiting order
    does not matter and the function is deterministic (no RNG is used).

    Args:
        X: 2-D numeric array-like of shape ``(n_samples, n_features)``.
        y: 1-D array-like of class labels, one per row of ``X``. Any
            labels that ``np.unique`` can sort are accepted.
        k_neighbors: Number of nearest hits, and of nearest misses per
            class, to use for each reference sample. If fewer are
            available, the ones that exist are used.

    Returns:
        ``numpy.ndarray`` of shape ``(n_features,)`` with one weight per
        feature; larger means the feature separates the classes better.

    Raises:
        ValueError: If ``k_neighbors`` is smaller than 1 or ``y`` does not
            have one label per row of ``X``.
    """
    if k_neighbors < 1:
        raise ValueError("k_neighbors must be a positive integer")

    X = np.asarray(X, dtype=float)
    y = np.asarray(y).ravel()
    n_samples, n_features = X.shape
    if y.shape[0] != n_samples:
        raise ValueError("X and y must contain the same number of samples")

    weights = np.zeros(n_features)
    if n_samples == 0:
        return weights

    # Map labels to 0..n_classes-1 so arbitrary label values work and no
    # phantom zero-probability classes are created.
    _, class_of, class_counts = np.unique(
        y, return_inverse=True, return_counts=True
    )
    class_of = class_of.ravel()
    priors = class_counts / n_samples

    # Per-feature range used to scale differences into [0, 1]; constant
    # features get a range of 1 to avoid dividing by zero.
    spans = X.max(axis=0) - X.min(axis=0)
    spans[spans == 0] = 1.0

    norm = k_neighbors * n_samples

    for ref in range(n_samples):
        raw_diff = np.abs(X - X[ref])
        # Stable sort keeps tie-breaking reproducible.
        by_distance = np.argsort(raw_diff.sum(axis=1), kind="stable")
        by_distance = by_distance[by_distance != ref]
        neighbour_class = class_of[by_distance]
        scaled_diff = raw_diff / spans

        ref_class = class_of[ref]

        # Nearest hits: same-class neighbours pull the weights down.
        hits = by_distance[neighbour_class == ref_class][:k_neighbors]
        weights -= scaled_diff[hits].sum(axis=0) / norm

        # Nearest misses, taken separately from every other class and
        # weighted by that class's prior relative to all other classes.
        for other in range(len(priors)):
            if other == ref_class:
                continue
            misses = by_distance[neighbour_class == other][:k_neighbors]
            miss_prob = priors[other] / (1.0 - priors[ref_class])
            weights += miss_prob * scaled_diff[misses].sum(axis=0) / norm

    return weights


In [2]:
# inputs:
#    X: pandas.DataFrame, features
#    y: pandas.Series, target variable
#    k: number of features to select
from sklearn.feature_selection import f_regression
def min_redun_max_relev(X, y, k):
    """Greedily pick ``k`` features by an mRMR-style relevance/redundancy ratio.

    Relevance of a feature is the first array returned by
    ``f_regression(X, y)``. Redundancy of a candidate is the mean absolute
    correlation between it and the features chosen so far, floored at
    ``1e-5`` (and set to ``1e-5`` while nothing has been chosen yet). At each
    step the candidate with the largest relevance / redundancy is selected.

    Args:
        X: pandas.DataFrame of features.
        y: target variable passed straight to ``f_regression``.
        k: number of selection steps to run.

    Returns:
        A 6-tuple ``(scores, selected, scores_ith, score_df, relevancy,
        redundancy)``:
        ``scores`` is a list of ``(feature, winning score)`` per step;
        ``selected`` is the list of chosen feature names in order;
        ``scores_ith`` holds the Series of candidate scores for every step;
        ``score_df`` is ``scores`` as a DataFrame with columns ``'mRMR'`` and
        ``'Highest_score_each_iteration'``;
        ``relevancy`` / ``redundancy`` hold the per-step Series used as the
        numerator / denominator of the score.
    """
    floor = .00001
    feature_names = X.columns

    # Relevance per feature, plus a correlation table pre-filled with the floor.
    f_stat = pd.Series(f_regression(X, y)[0], index=feature_names)
    abs_corr = pd.DataFrame(floor, index=feature_names, columns=feature_names)

    chosen = []
    remaining = feature_names.to_list()
    scores, scores_ith, relevancy, redundancy = [], [], [], []

    for _ in range(k):
        # Only the column of the most recently chosen feature needs filling in;
        # earlier columns were filled on previous steps.
        if chosen:
            newest = chosen[-1]
            abs_corr.loc[remaining, newest] = (
                X[remaining].corrwith(X[newest]).abs().clip(floor)
            )

        relevance = f_stat.loc[remaining]
        # Mean over an empty set of chosen columns is NaN -> replaced by floor.
        redundance = abs_corr.loc[remaining, chosen].mean(axis=1).fillna(floor)
        step_score = relevance / redundance

        relevancy.append(relevance)
        redundancy.append(redundance)
        scores_ith.append(step_score)

        # Move the top-scoring candidate from the remaining pool to the result.
        winner = step_score.index[step_score.argmax()]
        chosen.append(winner)
        remaining.remove(winner)
        scores.append((winner, step_score[winner]))

    score_df = pd.DataFrame(
        scores, columns=['mRMR', 'Highest_score_each_iteration']
    )
    return scores, chosen, scores_ith, score_df, relevancy, redundancy
    