# Feature selection with random forests

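Feature importance is measured as the decrease in node impurity attributed to each feature (Gini impurity by default; information gain when `criterion='entropy'` is used), averaged over all decision trees in the random forest.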
The algorithm also works with data that is not linearly separable.

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sbn
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

## Load the dataset

In [2]:
# Load the Iris CSV into a DataFrame. The path is relative, so it is resolved
# against the current working directory.
df = pd.read_csv('../Datasets/Iris.csv')

## Explore the dataset

In [3]:
# Preview the first rows to check column names and value formats.
df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
count,150.0,150.0,150.0,150.0
mean,5.843333,3.054,3.758667,1.198667
std,0.828066,0.433594,1.76442,0.763161
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


In [5]:
# (rows, columns) of the loaded frame.
df.shape

(150, 5)

In [6]:
# Distinct class labels in the target column.
df['species'].unique()

array(['setosa', 'versicolor', 'virginica'], dtype=object)

In [7]:
# Class proportions; normalize=True returns fractions instead of raw counts.
df['species'].value_counts(normalize=True)

setosa        0.333333
versicolor    0.333333
virginica     0.333333
Name: species, dtype: float64

## Define features and target variables

In [8]:
# The first four columns are the measurements used as features; the fifth
# column ('species' in this dataset) is the class label to predict.
colnames = df.columns.values
feature_cols, target_col = colnames[:4], colnames[4]
x = df[feature_cols]
y = df[target_col]

## Split into train and test sets

In [9]:
# Hold out part of the data for evaluation (train_test_split's default test
# size is used). random_state makes the split repeatable across runs, and
# stratify=y keeps the species in the same proportions in both splits.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, random_state=42, stratify=y
)

## Train the model

In [10]:
# Build a forest of 100 trees and fit it on the training split.
# NOTE(review): random_state is not set, so the fitted forest (and the score
# and importances derived from it) can differ between runs.
rfc = RandomForestClassifier(n_estimators=100)
rfc.fit(X=x_train, y=y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=100, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

## Evaluate the model

In [11]:
# Mean accuracy of the fitted forest on the held-out test split.
rfc.score(x_test,y_test)

0.94736842105263153

## Feature importance

In [12]:
# Pair each importance with the column it belongs to. The names are taken from
# x_train (the frame the forest was fitted on) instead of a second hard-coded
# slice, so labels and values stay aligned if the feature selection changes.
fi = pd.DataFrame({
    'Feature': list(x_train.columns),
    'Importance': rfc.feature_importances_,
})
# Most important feature first; as the last expression, the sorted frame is displayed.
fi.sort_values('Importance', ascending=False)

Unnamed: 0,Feature,Importance
2,petal_length,0.433013
3,petal_width,0.408682
0,sepal_length,0.120495
1,sepal_width,0.037809
