<a href="https://colab.research.google.com/github/ayushi15092002/mental-health-workload/blob/main/Fucntional_Connectivity_Feature_Selection_Feature_Shuffling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Bring in Colab's Google Drive helper.
from google.colab import drive

# Attach Drive under /content/drive (this prompts for authorization).
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
import numpy as np
# import seaborn as sns
# import matplotlib.pyplot as plt
import os
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier
# plt.style.use('seaborn-colorblind')
# %matplotlib inline
from sklearn.metrics import roc_auc_score


## Load Dataset

In [None]:
# Read the functional-connectivity feature table from the mounted Drive.
data = pd.read_excel(
    io='/content/drive/My Drive/drdo/features/min_max_fun_conn.xlsx'
)

In [None]:
# Preview the first five rows of the loaded table.
data.head(5)

Unnamed: 0,Coherence,Imaginary Coherence,Phase Locking Value,corrected imaginary PLV,Pairwise Phase Consistency,Phase Lag Index (PLI),Directed Phase Lag Index (DPLI),Weighted Phase Lag Index (WPLI),Debiased estimator of squared WPLI,y
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0.0,0.291424,0.0,0.0,0.000636,0.0,0.0,0.0,0.069174,1.0
2,0.0,0.291424,0.0,0.0,0.000636,0.0,0.0,0.0,0.069174,1.0
3,0.0,0.291424,0.0,0.0,0.000636,0.0,0.0,0.0,0.069174,1.0
4,0.0,0.291424,0.0,0.0,0.000636,0.0,0.0,0.0,0.069174,1.0


In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
# Every column except 'y' is a feature, 'y' is the target.
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(columns=['y']),
    data['y'],
    test_size=0.2,
    random_state=0,
)
X_train.shape, X_test.shape

((1880, 9), (471, 9))

In [None]:
# Inspect the training labels returned by the split.
y_train

2108    3.0
436     1.0
1538    2.0
342     1.0
713     1.0
       ... 
1033    2.0
1731    3.0
763     1.0
835     2.0
1653    3.0
Name: y, Length: 1881, dtype: float64

##  Feature Shuffling
Permute the values of each feature, one at a time, and measure how much the permutation degrades the model's performance — i.e. how much it lowers the accuracy or ROC AUC, or raises the MSE.
If a variable is important, that is, highly predictive, randomly permuting its values will degrade these metrics dramatically.

In [None]:
# NOTE(review): as far as this notebook shows, feature_shuffle_rf is defined
# only in a later cell, so that cell has to be executed before this one
# (a fresh "Restart & Run All" would hit a NameError here).
# The call returns the per-feature AUC-drop table and the names of the
# features whose drop is positive.
auc_drop, selected_features = feature_shuffle_rf(X_train=X_train,y_train=y_train,random_state=0)

[0.08       0.34117574 0.34117574 ... 0.18       0.34117574 0.1       ]
0.8270189952517566
0.8376249537015961
0.844167088212791
0.8300759715249278
0.8324521564703139
0.8384596585710463
0.8263559243028987
0.8339155372341067
0.8407233809513391


In [None]:
# Drop in ROC AUC per feature, sorted from largest to smallest.
# Features with auc_drop > 0 are the ones that get selected.
auc_drop

Unnamed: 0,feature,auc_drop
6,Directed Phase Lag Index (DPLI),0.040014
0,Coherence,0.039351
3,corrected imaginary PLV,0.036294
4,Pairwise Phase Consistency,0.033917
7,Weighted Phase Lag Index (WPLI),0.032454
1,Imaginary Coherence,0.028745
5,Phase Lag Index (PLI),0.02791
8,Debiased estimator of squared WPLI,0.025646
2,Phase Locking Value,0.022202


In [None]:
# Names of the features whose auc_drop is strictly positive.
selected_features

6       Directed Phase Lag Index (DPLI)
0                             Coherence
3               corrected imaginary PLV
4            Pairwise Phase Consistency
7       Weighted Phase Lag Index (WPLI)
1                   Imaginary Coherence
5                 Phase Lag Index (PLI)
8    Debiased estimator of squared WPLI
2                   Phase Locking Value
Name: feature, dtype: object

In [None]:
def feature_shuffle_rf(X_train,y_train,max_depth=None,class_weight=None,top_n=15,n_estimators=50,random_state=0,verbose=False):
    """Rank features by the ROC AUC lost when each one is randomly permuted.

    A random forest is fitted on ``X_train``/``y_train`` and scored on that
    same data. Each column is then shuffled on its own (all other columns are
    left untouched), the already-fitted model is re-scored, and
    ``baseline_auc - shuffled_auc`` is recorded for the column.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Feature matrix; every column is evaluated.
    y_train : pandas.Series
        Class labels, row-aligned with ``X_train``.
    max_depth, class_weight, n_estimators
        Forwarded unchanged to ``RandomForestClassifier``.
    top_n : int
        Currently unused; kept so that existing callers keep working.
    random_state : int
        Seed for the forest and for the permutation applied to each column
        (the same seed, hence the same permutation, is used for every column).
    verbose : bool
        When true, print the shuffled ROC AUC of every feature.

    Returns
    -------
    auc_drop : pandas.DataFrame
        Columns ``feature`` and ``auc_drop``, sorted by decreasing drop.
    selected_features : pandas.Series
        Names of the features whose drop is strictly positive.

    Notes
    -----
    The scores are computed on the data the forest was fitted on.
    NOTE(review): a held-out set would normally give a less optimistic
    importance estimate -- confirm whether training-set scoring is intended.
    """
    model = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth,
                                   random_state=random_state, class_weight=class_weight,
                                   n_jobs=-1)
    model.fit(X_train, y_train)

    # Work on a positional (0..n-1) index so that a shuffled column, once its
    # own index is reset, lines up with the rows by position rather than label.
    X_base = X_train.reset_index(drop=True)
    y_base = y_train.reset_index(drop=True)

    def _auc(features):
        """ROC AUC of the fitted model on `features` against the true labels."""
        proba = model.predict_proba(features)
        if proba.shape[1] == 2:
            # Binary problem: roc_auc_score expects 1-D scores for the class
            # with the greater label, not the full two-column matrix.
            return roc_auc_score(y_base, proba[:, 1])
        return roc_auc_score(y_base, proba, multi_class='ovr', average='weighted')

    train_auc = _auc(X_base)
    feature_dict = {}

    for feature in X_base.columns:
        # Fresh copy per feature so only one column is permuted at a time.
        X_shuffled = X_base.copy()
        X_shuffled[feature] = X_base[feature].sample(
            frac=1, random_state=random_state).reset_index(drop=True)

        shuff_auc = _auc(X_shuffled)
        if verbose:
            print(shuff_auc)

        # Positive value == the model got worse without this feature's signal.
        feature_dict[feature] = train_auc - shuff_auc

    auc_drop = pd.DataFrame({'feature': list(feature_dict.keys()),
                             'auc_drop': list(feature_dict.values())})
    auc_drop = auc_drop.sort_values(by='auc_drop', ascending=False)
    selected_features = auc_drop.loc[auc_drop['auc_drop'] > 0, 'feature']

    return auc_drop, selected_features