In [None]:
import pandas as pd
import matplotlib.pyplot as plt

from sklearn import datasets
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

%matplotlib inline

## load dataset

In [None]:
# Load the iris dataset that ships with scikit-learn.
iris = datasets.load_iris()


def add_label_names(i):
    """Return the entry of ``iris.target_names`` at index ``i``."""
    return iris.target_names[i]


# One column per feature name, plus the integer class code ('target') and
# the matching name looked up from iris.target_names ('label').
df = pd.DataFrame(data=iris.data, columns=iris.feature_names)
df['target'] = iris.target
df['label'] = df['target'].map(add_label_names)

## initial data exploration 

In [None]:
# Peek at the first rows to check the feature, target and label columns.
df.head()

In [None]:
# Inspect a random 10% of the rows. random_state pins the draw so the rows
# shown are the same on every Restart & Run All.
df.sample(frac=.1, random_state=42)

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
df.describe()

## comparing features

In [None]:
# Scatter every ordered pair of distinct feature columns (the first four
# columns of df). Each unordered pair is therefore drawn twice, once per
# axis orientation.
feature_cols = df.columns[:4]
axis_pairs = [
    (x_col, y_col)
    for x_col in feature_cols
    for y_col in feature_cols
    if y_col != x_col
]

for x_col, y_col in axis_pairs:
    df.plot.scatter(x=x_col, y=y_col)

## classification

In [None]:
# Features are the first four columns of df; the response is the integer
# class code stored in 'target'.
X = df[df.columns[:4]]
y = df.loc[:, 'target']

# Hold out 25% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.25,
)

In [None]:
# Initialize and fit the model.
#
# random_state seeds the forest's randomness (bootstrap sampling and the
# per-split feature subsets) so the fitted model -- and the feature
# importances read from it later in the notebook -- are the same on every
# run instead of changing with each kernel restart.
rfc = RandomForestClassifier(random_state=42)
rfc.fit(X_train, y_train)

In [None]:
# Build a Series of feature importances and plot it.
# The importances are the values; the feature names (first four columns of
# df) are the index, so they show up as the bar labels on the plot.
importances = rfc.feature_importances_
features = pd.Series(importances, index=df.columns[:4]).sort_values()

# Sorted ascending, so the most important feature is the right-most bar.
features.plot.bar()

In [None]:
# Show the column names; the scatter plots that follow pass them as x/y.
df.columns

Petal features are more important, according to our classifier. So what happens if we plot the sepal variables and the petal variables?

**Note:** If the categories overlap considerably, plot them separately.

In [None]:
# Plot sepal length against sepal width, one colour per species, all on the
# same axes.
species_colours = [
    ('setosa', '#7fc97f'),
    ('virginica', '#beaed4'),
    ('versicolor', '#fdc086'),
]

ax = None  # the first species creates the axes; the rest draw onto it
for species, colour in species_colours:
    ax = df[df['label'] == species].plot.scatter(x='sepal length (cm)',
                                                 y='sepal width (cm)',
                                                 color=colour,
                                                 label=species,
                                                 ax=ax)

In [None]:
# Plot petal length against petal width, one colour per species, all on the
# same axes.
species_colours = [
    ('setosa', '#7fc97f'),
    ('virginica', '#beaed4'),
    ('versicolor', '#fdc086'),
]

ax = None  # the first species creates the axes; the rest draw onto it
for species, colour in species_colours:
    ax = df[df['label'] == species].plot.scatter(x='petal length (cm)',
                                                 y='petal width (cm)',
                                                 color=colour,
                                                 label=species,
                                                 ax=ax)

In [None]:
# Plot petal length against sepal length, one colour per species, all on the
# same axes.
species_colours = [
    ('setosa', '#7fc97f'),
    ('virginica', '#beaed4'),
    ('versicolor', '#fdc086'),
]

ax = None  # the first species creates the axes; the rest draw onto it
for species, colour in species_colours:
    ax = df[df['label'] == species].plot.scatter(x='petal length (cm)',
                                                 y='sepal length (cm)',
                                                 color=colour,
                                                 label=species,
                                                 ax=ax)

This again confirms the feature importances determined by the model. The classes are pretty well separated by petal length, but not nearly as much by sepal length.