# Feature selection with random forest

## Import libraries

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier

from google.colab import files # For downloading images and files

## Functions

In [None]:
def Count_Members(df):
    """Count how many times each non-null value occurs across a dataframe.

    Every cell of every column is tallied into one shared count, so a value
    that appears in several columns is counted once per appearance.

    Args:
        df: A pandas DataFrame whose cell values are hashable.

    Returns:
        A DataFrame indexed by the distinct values (index name ``'Member'``)
        with a single column, ``'Quantity'``, holding the occurrence count.
        Rows appear in order of first occurrence, scanning column by column.
    """
    # Imported locally so the function does not depend on the import cell,
    # which does not bring in `collections`.
    from collections import Counter

    # `df.items()` yields each column as a Series even when column labels are
    # duplicated; `df[label]` would return a DataFrame in that case, and
    # iterating it would count the labels instead of the cell values.
    member_counts = Counter(
        value
        for _, column_values in df.items()
        for value in column_values
        if pd.notnull(value)  # skip missing values (NaN / None)
    )

    # Counter preserves insertion order, so the index keeps first-seen order.
    result_df = pd.DataFrame.from_dict(dict(member_counts), orient='index', columns=['Quantity'])
    result_df.index.name = 'Member'

    return result_df

## Import data

In [None]:
# Load the feature table from the CSV file
file_name = (
    'Duck data after a filter of mean, variance and skewness in three different sizes'
    '.csv'
)
data = pd.read_csv(file_name)

# The first column of the file holds the saved row indexes rather than a
# feature, so discard it
data = data.drop(columns=data.columns[0])

# Target: the segmented image layer
y = data['segmented_image']

# Features: every remaining column
X = data.drop(columns='segmented_image')

## Feature Scaling

In [None]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the features and standardize them in one step
scaler = StandardScaler()
X_scale = scaler.fit_transform(X)

# Wrap the scaled values in a dataframe again so the original column names are kept
X = pd.DataFrame(data=X_scale, columns=X.columns)

## Random forest feature importances

In [None]:
# Train a 100-tree random forest with a fixed seed; `fit` returns the fitted estimator
rf = RandomForestClassifier(random_state=0, n_estimators=100).fit(X, y)

### Visual feature importances

In [None]:
# Rank the features from most to least important
feature_imp = pd.Series(data=rf.feature_importances_, index=X.columns)
feature_imp = feature_imp.sort_values(ascending=False)

# Bar plot of importance score (x) against feature name (y)
fig, ax = plt.subplots(figsize=(17, 15))
sns.barplot(x=feature_imp, y=feature_imp.index, ax=ax)
ax.set_xlabel('Feature Importance Score', fontsize=15)
ax.set_ylabel('Features', fontsize=15)
ax.set_title("Visualizing Important Features", fontsize=25, pad=15)
plt.show()

## Find the best features across different random states

In [None]:
# One column per random state, each holding that run's top-ranked feature names
most_important_features = pd.DataFrame()

for i in range(10):
    # Refit the forest with a different seed on every pass
    rf = RandomForestClassifier(random_state=i, n_estimators=100)
    rf.fit(X, y)

    # Rank the features and keep the names of the first 20
    feature_imp = pd.Series(rf.feature_importances_, index=X.columns)
    feature_imp = feature_imp.sort_values(ascending=False)
    most_important_features[f'state {i}'] = feature_imp.index[:20]

    # Progress indicator
    print(f'i = {i}')

i = 0
i = 1
i = 2
i = 3
i = 4
i = 5
i = 6
i = 7
i = 8
i = 9


In [None]:
# Tally how often each feature appears across the per-state top lists,
# ordered from most to least frequent
features_count = Count_Members(most_important_features).sort_values(by='Quantity', ascending=False)

In [None]:
# Bare expression: in a notebook cell this renders the sorted counts table as the cell output
features_count

In [None]:
# Write the feature-importance counts to a CSV file, then pass that file to
# Colab's `files.download`
file_name = (
    'Duck random forest feature importances'
    '.csv'
)

features_count.to_csv(path_or_buf=file_name)
files.download(file_name)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>