# FEATURE SELECTION WITH RANDOM FOREST

In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt 

In [2]:
# Reading feature vector and removing unnamed column
df = pd.read_csv('__Random_Features\\features_random_bn-mouse_brain_1_final.csv').drop(columns='Unnamed: 0')

# Removing filenames
df = df.iloc[:, 1:]

# Removing not connected patterns
df = df.drop(['f1', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7', 'f8', 'f9', 'f10', 'f11', 'f12', 'f13'], 1)

In [3]:
# Prepare data to train. 
# X: Independent values
# y: Target values (Categories)
def prepareData(df, normalize=False):
    X = np.array(df.iloc[:, :-1])
    if normalize:
        return (X - X.mean()) / (X.max() - X.min()), np.array(df['Categories'])
    return X, np.array(df['Categories'])

In [4]:
X, y = prepareData(df)

In [5]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3)

In [6]:
# Creating a random forest classifier with 100 estimator
# to select the most important features

sel = SelectFromModel(RandomForestClassifier(n_estimators = 100))
sel.fit(X_train, y_train)

SelectFromModel(estimator=RandomForestClassifier(bootstrap=True,
                                                 class_weight=None,
                                                 criterion='gini',
                                                 max_depth=None,
                                                 max_features='auto',
                                                 max_leaf_nodes=None,
                                                 min_impurity_decrease=0.0,
                                                 min_impurity_split=None,
                                                 min_samples_leaf=1,
                                                 min_samples_split=2,
                                                 min_weight_fraction_leaf=0.0,
                                                 n_estimators=100, n_jobs=None,
                                                 oob_score=False,
                                                 random_state=None, verbose=0,
  

In [7]:
# The feature columns which has importance more than mean has
# True value, others has False

sel.get_support()

array([False, False,  True,  True,  True,  True,  True, False, False,
        True, False, False,  True, False, False,  True,  True,  True,
        True, False,  True,  True,  True, False,  True,  True,  True,
        True,  True,  True, False, False])

In [8]:
# The number of the important features (which has importance more than mean)

selected_feat= df.iloc[:,:-1].columns[(sel.get_support())]
len(selected_feat)

20

In [9]:
# Names of that features
print(selected_feat)

Index(['Density', 'Max. Degree', 'Avg. Degree', 'Max. k-core',
       'Avg. Clustering Coeff.', 'Total Triangles', 'f15', 'f18', 'f19', 'f20',
       'f21', 'f23', 'f24', 'f25', 'f27', 'f28', 'f29', 'f30', 'f31', 'f32'],
      dtype='object')


In [10]:
# Feature importance values.
pd.Series(sel.estimator_.feature_importances_.ravel(), index=df.iloc[:,:-1].columns)

Vertex                         0.000000
Edges                          0.022745
Density                        0.046892
Max. Degree                    0.032027
Avg. Degree                    0.036589
Max. k-core                    0.033479
Avg. Clustering Coeff.         0.043733
Diameter                       0.000000
Average Path Length            0.022821
Total Triangles                0.042836
Avg. Eigenvector Centrality    0.020120
f14                            0.030186
f15                            0.035945
f16                            0.006362
f17                            0.019533
f18                            0.040176
f19                            0.039967
f20                            0.036269
f21                            0.049282
f22                            0.013587
f23                            0.046479
f24                            0.033717
f25                            0.054705
f26                            0.020306
f27                            0.033322


## Result

With random forest classifier we built a tree and train that tree with our feature vector. As a result, there are some features that our classifier defined them with smaller coefficients. These features are relatively 'weak' features. 

They are:

- Vertex
- Edges
- Diameter
- Average Path Length
- Avg. Eigenvector Centrality
- f14 (4-star)
- f16 (4-path)
- f17 (Forktailed-tri)
- f22 (Hourglass)
- f26 (3-wedge-col	)
- f33 (Almost-5-clique)
- f34 (5-clique)