In [1]:
# standard library
import sys

# pydata stack
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
from sklearn import svm

from IPython.display import set_matplotlib_formats
set_matplotlib_formats('retina')

%matplotlib inline

ModuleNotFoundError: No module named 'matplotlib'

In [2]:
print(f'Python version: {sys.version}')
print(f'pandas version: {pd.__version__}')
print(f'seaborn version: {sns.__version__}')
print(f'scikit-learn: {sklearn.__version__}')

Python version: 3.9.12 (main, Apr  5 2022, 06:56:58) 
[GCC 7.5.0]
pandas version: 1.4.2


NameError: name 'sns' is not defined

---

## Load Dataset

In [3]:
iris = pd.read_csv('data/iris.csv')

---

## Explore with Pandas

In [4]:
iris.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [5]:
iris.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal_length  150 non-null    float64
 1   sepal_width   150 non-null    float64
 2   petal_length  150 non-null    float64
 3   petal_width   150 non-null    float64
 4   species       150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 6.0+ KB


In [6]:
iris['species'] = iris['species'].astype('category')

In [7]:
iris.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype   
---  ------        --------------  -----   
 0   sepal_length  150 non-null    float64 
 1   sepal_width   150 non-null    float64 
 2   petal_length  150 non-null    float64 
 3   petal_width   150 non-null    float64 
 4   species       150 non-null    category
dtypes: category(1), float64(4)
memory usage: 5.1 KB


In [8]:
values = ['count', 'min', 'max', 'mean', 'std']
iris.groupby(by='species').agg(values)

Unnamed: 0_level_0,sepal_length,sepal_length,sepal_length,sepal_length,sepal_length,sepal_width,sepal_width,sepal_width,sepal_width,sepal_width,petal_length,petal_length,petal_length,petal_length,petal_length,petal_width,petal_width,petal_width,petal_width,petal_width
Unnamed: 0_level_1,count,min,max,mean,std,count,min,max,mean,std,count,min,max,mean,std,count,min,max,mean,std
species,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2
setosa,50,4.3,5.8,5.006,0.35249,50,2.3,4.4,3.418,0.381024,50,1.0,1.9,1.464,0.173511,50,0.1,0.6,0.244,0.10721
versicolor,50,4.9,7.0,5.936,0.516171,50,2.0,3.4,2.77,0.313798,50,3.0,5.1,4.26,0.469911,50,1.0,1.8,1.326,0.197753
virginica,50,4.9,7.9,6.588,0.63588,50,2.2,3.8,2.974,0.322497,50,4.5,6.9,5.552,0.551895,50,1.4,2.5,2.026,0.27465


---

## Visualization

In [9]:
iris.plot.hist()

ImportError: matplotlib is required for plotting when the default backend "matplotlib" is selected.

In [None]:
iris['sepal_length'].plot.hist(by='species')

In [None]:
iris['sepal_width'].plot.hist()

In [None]:
iris['petal_length'].plot.hist()

In [None]:
iris['petal_width'].plot.hist()

---

In [None]:
iris.boxplot(by='species', figsize=(20, 15))

---

## Multidimensional Plots

In [None]:
from pandas.plotting import scatter_matrix

In [None]:
scatter_matrix(iris, figsize=(12, 12), diagonal='kde')

In [None]:
from pandas.plotting import andrews_curves

In [None]:
plt.figure(figsize=(10, 10))
andrews_curves(iris, 'species')

In [None]:
from pandas.plotting import parallel_coordinates

In [None]:
plt.figure(figsize=(10, 10))
parallel_coordinates(iris, 'species')

In [None]:
from pandas.plotting import radviz

In [None]:
plt.figure(figsize=(10, 10))
radviz(iris, 'species')

---

## Seaborn Visualization

In [None]:
sns.set(color_codes=True)

### Distributions

In [None]:
sns.distplot(iris['sepal_width'])

In [None]:
sns.distplot(iris['sepal_width'],
             hist=True,
             kde=True,
             rug=True,
             bins=5)

In [None]:
# Kernel Density Estimate... can specify the kind f kernel
sns.kdeplot(iris['sepal_width'],
            shade=True,
            kernel='gau',
            bw=.13)

In [None]:
from scipy import stats

In [None]:
sns.distplot(iris['petal_width'],
             hist=True,
             kde=True,
             fit=stats.expon) ## pass in random variable object

In [None]:
# Jointplot (bivariate distributions)
sns.jointplot(data=iris,
              x='sepal_length',
              y='sepal_width',
              kind='scatter')

In [None]:
# pairwise relationship
sns.pairplot(iris,
             hue='species')

In [None]:
g = sns.PairGrid(iris)
g.map_diag(sns.kdeplot, shade=True)
g.map_offdiag(sns.kdeplot, cmap="Blues_r", n_levels=5)

### Categorical Data

In [None]:
iris['above_sepal_width_mean'] = iris['sepal_width'] > iris['sepal_width'].mean()

#### Scatterplot

In [None]:
sns.stripplot(data=iris,
              x='species',
              y='sepal_width',
              hue='above_sepal_width_mean',
              jitter=True)

#### Distributions

In [None]:
sns.violinplot(data=iris,
               x='species',
               y='sepal_width',
               hue='above_sepal_width_mean',
               inner='box')

---

## Build and plot Support Vector Machine classifers

Adapted from [sklearn docs](http://scikit-learn.org/stable/auto_examples/svm/plot_iris.html)

In [None]:
# convert to sklearn format, grab first 2 columns
X = np.array(iris[['sepal_length', 'sepal_width', 'petal_length', 'petal_width']])[:, :2]

# convert strings to categorical
le = sklearn.preprocessing.LabelEncoder()
le.fit(iris['species'])
y = le.transform(iris['species'])

In [None]:
h = .02  # step size in the mesh

# we create an instance of SVM and fit out data. We do not scale our
# data since we want to plot the support vectors
C = 1.0  # SVM regularization parameter
svc = sklearn.svm.SVC(kernel='linear', C=C).fit(X, y)
rbf_svc = sklearn.svm.SVC(kernel='rbf', gamma=0.7, C=C).fit(X, y)
poly_svc = sklearn.svm.SVC(kernel='poly', degree=3, C=C).fit(X, y)
lin_svc = sklearn.svm.LinearSVC(C=C).fit(X, y)

# create a mesh to plot in
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                     np.arange(y_min, y_max, h))

# title for the plots
titles = ['SVC with linear kernel',
          'LinearSVC (linear kernel)',
          'SVC with RBF kernel',
          'SVC with polynomial (degree 3) kernel']


for i, clf in enumerate((svc, lin_svc, rbf_svc, poly_svc)):
    # Plot the decision boundary. For that, we will assign a color to each
    # point in the mesh [x_min, x_max]x[y_min, y_max].
    plt.subplot(2, 2, i + 1)
    plt.subplots_adjust(wspace=0.4, hspace=0.4)

    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])

    # Put the result into a color plot
    Z = Z.reshape(xx.shape)
    plt.contourf(xx, yy, Z, cmap=plt.cm.coolwarm, alpha=0.8)

    # Plot also the training points
    plt.scatter(X[:, 0], X[:, 1], c=y, cmap=plt.cm.coolwarm)
    plt.xlabel('Sepal length')
    plt.ylabel('Sepal width')
    plt.xlim(xx.min(), xx.max())
    plt.ylim(yy.min(), yy.max())
    plt.xticks(())
    plt.yticks(())
    plt.title(titles[i])

plt.show()