In [None]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
from sklearn import datasets

In [None]:
# Load the iris measurements from a CSV path relative to the kernel's working directory.
iris = pd.read_csv('iris.csv')

In [None]:
# Preview the first rows of the loaded frame.
iris.head()

In [None]:
# Summary statistics of the frame's columns.
iris.describe()

In [None]:
# Distribution of petal length across the whole data set.
petal_lengths = iris['petal_length']
plt.hist(petal_lengths)

In [None]:
# Which species occur among flowers with petals of at most 2.5 cm?
is_short = iris['petal_length'] <= 2.5
shortpetals = iris.loc[is_short]
shortpetals['species'].unique()

In [None]:
# Complement of the short-petal group: strictly longer than 2.5 cm, so a flower
# measuring exactly 2.5 cm is not counted in both groups.
longpetals = iris.loc[iris['petal_length'] > 2.5]
longpetals['species'].unique()

In [None]:
# Pairwise scatter matrix of every numeric column, coloured by species.
sns.pairplot(data=iris, hue='species')

In [None]:
# Split the frame into the four measurement columns and the species label.
x = iris[['sepal_length', 'sepal_width', 'petal_length', 'petal_width']]
y = iris['species']

# Box plot of petal width for each petal-length value, split by species.
# `factorplot` was renamed to `catplot` (the next cell already uses it), and the
# x/y/hue arguments are passed by keyword because newer seaborn releases reject
# them positionally.
with sns.axes_style('ticks'):
    g = sns.catplot(x='petal_length', y='petal_width', hue='species', data=iris, kind='box')
    g.set_axis_labels('Petal Length', 'Petal Width')

In [None]:
# Box plot of sepal width for each petal-length value, split by species.
# x/y/hue are passed by keyword: newer seaborn releases reject them positionally.
with sns.axes_style('ticks'):
    g = sns.catplot(x='petal_length', y='sepal_width', hue='species', data=iris, kind='box')
    g.set_axis_labels('Petal Length', 'Sepal Width')

In [None]:
# Add a whole-number version of each measurement so the box plots get a
# manageable number of categories.
rounded_sources = (
    ('plround', 'petal_length'),
    ('pwround', 'petal_width'),
    ('slround', 'sepal_length'),
    ('swround', 'sepal_width'),
)
for rounded_name, source_name in rounded_sources:
    iris[rounded_name] = iris[source_name].round()
iris.head()

In [None]:
# Petal width per rounded petal length, split by species, with hand-placed
# guide lines. x/y/hue are passed by keyword: newer seaborn releases reject them
# positionally.
with sns.axes_style('ticks'):
    g = sns.catplot(x='plround', y='petal_width', hue='species', data=iris, kind='box')
    g.set_axis_labels('Petal Length', 'Petal Width')
    # NOTE(review): the x axis is categorical, so 2.5 is presumably a position
    # between boxes rather than a length in cm -- confirm the intended placement.
    plt.axvline(2.5, color = 'k', linestyle = '--')
    plt.axhline(0.75, color = 'k', linestyle = '--')
    plt.axhline(1.6, color = 'k', linestyle = '--')
    plt.text(3.0, 1.75, 'virginica', size = 8, color = 'green')

In [None]:
# One independent frame per species. The explicit copy matters: later cells add
# columns to these frames, and writing into a bare .loc selection triggers
# pandas' SettingWithCopyWarning.
setosas = iris.loc[iris['species'] == 'setosa'].copy()
virginica = iris.loc[iris['species'] == 'virginica'].copy()
versicolor = iris.loc[iris['species'] == 'versicolor'].copy()

# Hexagonal density of setosa petals on the axis window shared by the species plots.
setosa_widths = setosas['petal_width']
setosa_lengths = setosas['petal_length']
plt.hexbin(setosa_widths, setosa_lengths, gridsize=5, cmap='Blues')
plt.xlim(0, 2.6)
plt.ylim(0, 7)
plt.colorbar()

In [None]:
# Hexagonal density of virginica petals on the axis window shared by the species plots.
virginica_widths = virginica['petal_width']
virginica_lengths = virginica['petal_length']
plt.hexbin(virginica_widths, virginica_lengths, gridsize=5, cmap='Greens')
plt.xlim(0, 2.6)
plt.ylim(0, 7)
plt.colorbar()

In [None]:
# Hexagonal density of versicolor petals on the axis window shared by the species plots.
versicolor_widths = versicolor['petal_width']
versicolor_lengths = versicolor['petal_length']
plt.hexbin(versicolor_widths, versicolor_lengths, gridsize=5, cmap='Oranges')
plt.xlim(0, 2.6)
plt.ylim(0, 7)
plt.colorbar()

In [None]:
# Combined figure: the three per-species hexbin layers drawn on one set of axes,
# with text labels, a dash-dot guide line and hand-placed "species mean" markers.
# `hfont` is also passed to text calls in later cells.
hfont = {'fontname':'Times New Roman'}
import matplotlib.patheffects as pe

# Layers are drawn in this order; the later ones are semi-transparent so the
# earlier ones stay visible underneath.
plt.hexbin(setosas['petal_width'], setosas['petal_length'], gridsize = 3, cmap='Blues')
plt.hexbin(virginica['petal_width'], virginica['petal_length'], gridsize = 5, cmap='Greens', alpha = 0.7)
plt.hexbin(versicolor['petal_width'], versicolor['petal_length'], gridsize = 5, cmap='Oranges', alpha = 0.7)
# NOTE(review): four synthetic points at alpha 0.01 -- presumably only there to
# lay a faint hexagon grid over the whole window; confirm intent.
plt.hexbin([0, 1, 2, 3], [0,2,5,7], gridsize = 15, cmap='coolwarm',alpha = 0.01)
plt.xlim(0, 2.6)
plt.ylim(0, 7)
plt.xlabel('Petal Width (cm)', **hfont)
plt.ylabel('Petal Length (cm)', **hfont)
plt.title('Petal Dimensions by Iris Species', size = 20, **hfont)
plt.text(0.32, 0.9, 'Setosa', size = 12, color = 'darkblue', **hfont)
plt.text(1.4, 2.9, 'Versicolor', size = 12, color = 'brown',**hfont)
plt.text(2.2, 4.35, 'Virginica', size = 12, color = 'darkgreen',**hfont)
plt.axhline(2.5, color = 'k', linestyle = '-.', alpha = 0.2)
# Hard-coded marker coordinates (y = petal length, x = petal width).
# NOTE(review): the first three pairs look like the per-species means printed by
# the following cell, and the fourth like a legend marker beside the
# '= Species Mean' text -- confirm, and consider computing them from the data.
ctry = np.array([1.464, 4.26, 5.552, 1.5])
ctry2 = ctry - 0.05
ctrx = np.array([0.244, 1.326, 2.026, 1.6])
ctrx2 = ctrx + 0.01
plt.text(1.65, 1.35, '= Species Mean', size = 12, color = 'k', **hfont)
# White dots on top (zorder 2) with slightly offset dark dots beneath (zorder 1).
plt.scatter(ctrx, ctry, s=150, color='w', marker = '.', zorder = 2)
plt.scatter(ctrx2, ctry2, s=150, color='k', marker = '.', alpha=0.8, zorder = 1)
plt.show()


In [None]:
# Report the mean petal length and width of each species.
mean_reports = (
    ("Average Setosa:", setosas),
    ("Average Versicolor:", versicolor),
    ("Average Virginica:", virginica),
)
for heading, frame in mean_reports:
    mean_length = frame['petal_length'].mean()
    mean_width = frame['petal_width'].mean()
    print(heading, mean_length, 'cm long,', mean_width, 'cm wide.')

In [None]:
from mpl_toolkits import mplot3d

In [None]:
# Give each species a numeric level (1-3) to use as the z coordinate of the 3D
# plot. `assign` returns a new frame, so nothing is written into a .loc slice of
# `iris` (which raises SettingWithCopyWarning).
setosas = setosas.assign(zaxisval=1)
versicolor = versicolor.assign(zaxisval=2)
virginica = virginica.assign(zaxisval=3)
iris2 = pd.concat([setosas, versicolor, virginica])
iris2.tail()

In [None]:
import matplotlib

# Three-stop colour map; with values 1..3 the ends and the midpoint land on these colours.
species_palette = ["blue", "darkorange", "green"]
cmap = matplotlib.colors.LinearSegmentedColormap.from_list("", species_palette)

In [None]:
from matplotlib.patches import Rectangle, PathPatch
from matplotlib.transforms import Affine2D
import mpl_toolkits.mplot3d.art3d as art3d
from mpl_toolkits.mplot3d import Axes3D

# Coordinates for the 3D scatter: petal length, petal width, species level.
X = iris2['petal_length']
y = iris2['petal_width']
z = iris2['zaxisval']
xs, ys, zs = X, y, z

fig = plt.figure()
ax1 = plt.axes(projection='3d')
ax1.set_zticks([1,2,3])
ax1.set_zticklabels(['Setosa','Versicolor','Virginica'])

# One translucent rectangle per species, lifted to that species' z level.
species_planes = (
    (1, 'lightblue', 'blue', 'Setosa'),
    (2, 'orange', 'darkorange', 'Versicolor'),
    (3, 'lightgreen', 'green', 'Virginica'),
)
for level, face, edge, name in species_planes:
    plane = Rectangle((0, 0), 8, 3, facecolor=face, edgecolor=edge, label=name, alpha=0.1)
    ax1.add_patch(plane)
    art3d.pathpatch_2d_to_3d(plane, z=level, zdir="z")

ax1.scatter(xs, ys, zs, marker='.', c=z, cmap=cmap, depthshade=True)
ax1.set_xlabel("Petal Length (cm)")
ax1.set_ylabel("Petal Width (cm)")
ax1.set_title("Petal Length vs. Petal Width by Iris Species", size = 20, **hfont)
plt.tight_layout()

# Apply the title font to the tick labels on all three axes.
for tick_getter in (ax1.get_xticklabels, ax1.get_yticklabels, ax1.get_zticklabels):
    for tick in tick_getter():
        tick.set_fontname("Times New Roman")


In [None]:
#fig = plt.figure()
#ax = fig.add_subplot(111, projection='3d')
#ax.set_zticks([1,2,3])
#ax.set_zticklabels(['Setosa','Versicolor','Virginica'])
#p = Rectangle((0, 0), 8, 3, color='lightblue', label='Setosa', alpha=0.4)
#ax.add_patch(p)
#art3d.pathpatch_2d_to_3d(p, z=1, zdir="z")
#p2 = Rectangle((0, 0), 8, 3, color='orange', label='Versicolor', alpha=0.4)
#ax.add_patch(p2)
#art3d.pathpatch_2d_to_3d(p2, z=2, zdir="z")
#p3 = Rectangle((0, 0), 8, 3, color='lightgreen', label='Virginica', alpha=0.4)
#ax.add_patch(p3)
#art3d.pathpatch_2d_to_3d(p3, z=3, zdir="z")
#p4 = Rectangle((0, 0), 2.5, 3.5, facecolor='white', edgecolor='black', alpha=0.3)
#ax.add_patch(p4)
#art3d.pathpatch_2d_to_3d(p4, z=2.5, zdir="x")
#p5 = Rectangle((0, 0), 8, 3.5, facecolor='white', edgecolor='black', alpha=0.3)
#ax.add_patch(p5)
#art3d.pathpatch_2d_to_3d(p5, z=0.8, zdir="y")
#ax.scatter(xs, ys, zs, marker='.', c=z, cmap=cmap, depthshade=True)
#ax.set_xlabel("Petal Length (cm)")
#ax.set_ylabel("Petal Width (cm)")
#ax.set_title("Petal Length vs. Petal Width by Iris Species")
#plt.tight_layout()



In [None]:
# Re-check the frame, which by now also carries the rounded columns added above.
iris.head()

In [None]:
# Shape features: length-to-width ratio of the sepals and of the petals.
iris['sepalratio'] = iris['sepal_length'].div(iris['sepal_width'])
iris['petalratio'] = iris['petal_length'].div(iris['petal_width'])
iris.head()

In [None]:
# Summary statistics of the sepal length/width ratio over all species.
iris['sepalratio'].describe()

In [None]:
# Kernel density estimate of the sepal ratio over all species. The y axis of a
# KDE is a density, not a count, so it is labelled the same way as the
# per-species KDE panels further down.
# NOTE(review): newer seaborn releases deprecate `shade` in favour of `fill`;
# kept as-is so the cell still runs on the older seaborn API.
fig, ax = plt.subplots()
sns.kdeplot(iris['sepalratio'], shade=True, ax=ax)
ax.set_xlabel('Sepal Length/Width Ratio', **hfont)
ax.set_ylabel('Kernel Density Estimate', **hfont)

In [None]:
# Summary statistics of the petal length/width ratio over all species.
iris['petalratio'].describe()

In [None]:
# Kernel density estimate of the petal ratio over all species. The y axis of a
# KDE is a density, not a count, so it is labelled the same way as the
# per-species KDE panels further down. Labels are set after plotting so they
# are the last word on the axes.
fig, ax = plt.subplots()
sns.kdeplot(iris['petalratio'], ax=ax)
ax.set_xlabel('Petal Length/Width Ratio', **hfont)
ax.set_ylabel('Kernel Density Estimate', **hfont)

In [None]:
# Sepal length/width ratio for setosa; `assign` builds a new frame instead of
# writing into a selection of `iris` (avoids SettingWithCopyWarning).
setosas = setosas.assign(sepalratio=setosas['sepal_length'] / setosas['sepal_width'])
setosas.head()

In [None]:
# Sepal length/width ratio for versicolor; `assign` builds a new frame instead
# of writing into a selection of `iris` (avoids SettingWithCopyWarning).
versicolor = versicolor.assign(sepalratio=versicolor['sepal_length'] / versicolor['sepal_width'])
versicolor.head()

In [None]:
# Sepal length/width ratio for virginica; `assign` builds a new frame instead
# of writing into a selection of `iris` (avoids SettingWithCopyWarning).
virginica = virginica.assign(sepalratio=virginica['sepal_length'] / virginica['sepal_width'])
virginica.head()

In [None]:
def _with_ratio_features(frame):
    """Return a copy of `frame` with the petalratio, splratio and ldiff columns added.

    petalratio = petal length / petal width
    splratio   = petal length / sepal length
    ldiff      = sepal length - petal length

    Working on a new frame (rather than assigning columns in place) keeps the
    three species frames in step and avoids SettingWithCopyWarning.
    """
    return frame.assign(
        petalratio=frame['petal_length'] / frame['petal_width'],
        splratio=frame['petal_length'] / frame['sepal_length'],
        ldiff=frame['sepal_length'] - frame['petal_length'],
    )


setosas = _with_ratio_features(setosas)
versicolor = _with_ratio_features(versicolor)
virginica = _with_ratio_features(virginica)


In [None]:
# Four side-by-side panels, one per derived feature, each overlaying the
# per-species kernel density estimates on a shared y axis.
fig, (ax1, ax2, ax3, ax4) = plt.subplots(ncols=4, sharey=True, figsize=(15,5))

species_frames = (
    ('Setosa', setosas),
    ('Versicolor', versicolor),
    ('Virginica', virginica),
)
kde_panels = (
    (ax1, 'sepalratio', 'Sepal Length/Width Ratio'),
    (ax2, 'petalratio', 'Petal Length/Width Ratio'),
    (ax3, 'splratio', 'Petal vs. Sepal Length Ratio'),
    (ax4, 'ldiff', 'Sepal Length - Petal Length'),
)
for panel, column, x_caption in kde_panels:
    for species_name, frame in species_frames:
        sns.kdeplot(frame[column], shade=True, label=species_name, ax=panel)
    panel.set_xlabel(x_caption)
    panel.set_ylabel('Kernel Density Estimate')

plt.suptitle('KDEs of Petal and Sepal Length/Width Ratios')

Versicolor and virginica are evidently more closely related to each other than to setosa. As such, they are nearly indistinguishable when using kernel density estimates of dimension relationships (i.e., the length/width ratios of the petals and sepals, respectively). According to the third graph, the petal-to-sepal length ratio cleanly separates setosa from the other two species, and shows somewhat less overlap between versicolor and virginica. Thus, when building a model, this feature (put plainly, how long the petal is compared to the sepal) would be a strong choice.

In [None]:
# The full frame again, now including the sepalratio and petalratio columns.
iris.head()

In [None]:
# Stack the three per-species frames (setosa, versicolor, virginica, in that row order).
irises = pd.concat([setosas, versicolor, virginica])

In [None]:
# Preview the stacked frame.
irises.head()

In [None]:
# Petal-to-sepal width ratio, then the reduced feature frame used for modelling.
irises['spwratio'] = irises['petal_width']/irises['sepal_width']
irisesc = irises.copy()
# Explicit copy: later cells add columns to `ir`, and writing into a bare column
# selection raises SettingWithCopyWarning.
ir = irisesc[['sepalratio', 'petalratio', 'splratio', 'sepal_length','sepal_width','petal_width','spwratio','species']].copy()
ir.head()

In [None]:
# Pairwise scatter matrix of the engineered features, coloured by species.
sns.pairplot(data=ir, hue='species')

To my eye, the most effective separation of data points occurs when comparing splratio (petal-to-sepal length ratio) and petal_width (petal width). Given these two observations, you could probably predict the species of an iris with a fairly high level of confidence. As in the 3-dimensional plot above, it's clear that petal length and width are both required to make a quality prediction. You could imagine particular length-to-width ratios as different petal shapes -- put this way, it becomes somewhat intuitive.

In [None]:
# Colour each point by its own species label (1 = setosa, 2 = versicolor,
# 3 = virginica) instead of borrowing the `z` series left over from the 3D
# cell, which only lines up while both frames happen to share row order.
species_level = ir['species'].map({'setosa': 1, 'versicolor': 2, 'virginica': 3})
plt.scatter(ir['petal_width'], ir['splratio'], c=species_level, cmap=cmap)
plt.colorbar()

In [None]:
# Both features drawn as lines against the row index.
line = ir.loc[:, ['petal_width', 'splratio']]
line.plot.line()

In [None]:
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LinearRegression
from sklearn import metrics

# Regress splratio on petal width; both are shaped as single-column 2D arrays.
X = ir[['petal_width']].values
y = ir[['splratio']].values

# Hold out 20% of the rows, with a fixed seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

reg = LinearRegression().fit(X_train, y_train)

# Fitted intercept first, then the slope.
print(reg.intercept_)
print(reg.coef_)

In [None]:
# Compare held-out targets with the model's predictions, one row per test sample.
y_pred = reg.predict(X_test)
comparison = {'Actual': y_test.ravel(), 'Predicted': y_pred.ravel()}
df = pd.DataFrame(comparison)
df

In [None]:
# Held-out points in gray with the fitted regression line in red.
plt.scatter(X_test, y_test,  color='gray')
plt.plot(X_test, y_pred, color='red', linewidth=2)
plt.show()

In [None]:
from sklearn.decomposition import PCA

# Two-feature frame shared by the PCA here and the clustering cells below.
X = ir[['splratio','petal_width']]

model = PCA(n_components=2)
model.fit(X)
X_2D = model.transform(X)

# `assign` returns a new frame, so the components are not written into a column
# selection of another frame (avoids SettingWithCopyWarning).
ir = ir.assign(PCA1=X_2D[:, 0], PCA2=X_2D[:, 1])
# x/y are passed by keyword: newer seaborn releases reject them positionally.
sns.lmplot(x='PCA1', y='PCA2', hue='species', data=ir, fit_reg=False)

In [None]:
from sklearn.mixture import GaussianMixture

# Fit the mixture on the two-feature frame X (splratio, petal_width). The
# X_train left over from the regression cell is a single-column ndarray: it
# cannot take a 'cluster' column, and a model fitted on it cannot predict on
# two-feature inputs.
model2 = GaussianMixture(n_components=3, covariance_type='full', random_state=0)
model2.fit(X)
y_gmm = model2.predict(X)

# Rebind X_train to a labelled copy of X so it can be indexed by column name.
X_train = X.assign(cluster=y_gmm)

# lmplot facets on a column of `data`, so hand it a copy of `ir` carrying the
# cluster ids; x/y are keywords because newer seaborn rejects them positionally.
sns.lmplot(x='PCA1', y='PCA2', data=ir.assign(cluster=y_gmm), hue='species', col='cluster', fit_reg=False)



In [None]:
# Inspect X_train as it stands after the previous cell.
X_train

In [None]:
# Predicted mixture component for every row of X.
# NOTE: model2 must have been fitted on these same two columns.
species = model2.predict(X)
# Read the coordinates from X itself, so the points and their colours always
# come from the same rows.
ratio = np.array(X['splratio'])
width = np.array(X['petal_width'])
# splratio vs. petal width; colour is the predicted component
plt.scatter(ratio, width, c=species, s=40, cmap='viridis')

In [None]:
# Per-component membership probabilities; show a ten-row window of them.
probs = model2.predict_proba(X)
print(np.round(probs[95:105], 3))
# Marker area grows with the square of the winning component's probability.
top_probability = probs.max(axis=1)
size = 50 * top_probability ** 2

In [None]:
plt.style.use('fivethirtyeight')
# Coordinates and colours are taken from X directly: x0/x1 are only defined in a
# later cell (NameError on a fresh Run All), and the colour array must have one
# entry per row of X to match `size`.
plt.scatter(X['splratio'], X['petal_width'], c=model2.predict(X), cmap='Dark2', s=size)

In [None]:
import pickle

# Persist the fitted mixture model. The context manager closes the file handle
# even if pickling fails.
filename = 'irisgmm.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(model2, model_file)

In [None]:
# Predict the component of one hand-written sample: [splratio, petal_width].
Xnew = [[2, 0.8]]
ynew = model2.predict(Xnew)
sample, predicted = Xnew[0], ynew[0]
print("X=%s, Predicted=%s" % (sample, predicted))

In [None]:
from sklearn.cluster import KMeans

# Fix the seed (and the number of restarts) so cluster ids are the same on
# every run; code that maps ids 0/1/2 to species names depends on that.
# NOTE(review): the ids are still arbitrary -- check which id corresponds to
# which species before trusting any hard-coded mapping.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=0)
kmeans.fit(X)
y_kmeans = kmeans.predict(X)

In [None]:
# Plain arrays of the two clustering features, for plotting.
x0, x1 = (np.array(X[column]) for column in ('splratio', 'petal_width'))

In [None]:
# Points coloured by k-means cluster, with the cluster centres overlaid.
plt.scatter(x0, x1, c=y_kmeans, s=50, cmap='viridis')

centers = kmeans.cluster_centers_
center_ratio, center_width = centers[:, 0], centers[:, 1]
plt.scatter(center_ratio, center_width, c='black', s=200, alpha=0.5);

In [None]:
# Persist the fitted k-means model. The context manager closes the file handle
# even if pickling fails.
filename2 = 'iriskmeans.sav'
with open(filename2, 'wb') as model_file:
    pickle.dump(kmeans, model_file)

In [None]:
import string
tryAgain = 'yes'
def speciespredict(model=None, read=None, cluster_names=("Setosa", "Versicolor", "Virginica")):
    """Prompt for one flower's features and print the predicted species name.

    Parameters
    ----------
    model : object with ``predict(rows)``, optional
        Fitted clusterer; defaults to the notebook-level ``kmeans``.
    read : callable(prompt) -> str, optional
        Source of user answers; defaults to the built-in ``input``, looked up
        at call time.
    cluster_names : sequence of str, optional
        Name printed for cluster id 0, 1, 2, ... The default keeps the
        notebook's original mapping.
        NOTE(review): k-means ids are arbitrary, so confirm this order against
        the fitted model.

    Returns
    -------
    bool
        True if the user asked to try again (or typed a non-numeric value and
        should be re-prompted), False otherwise.

    Side effects
    ------------
    Sets the notebook-level ``tryAgain`` flag to ``'no'`` when the user declines
    to continue, which ends a ``while tryAgain == 'yes'`` loop without calling
    ``exit()`` (that would shut down the kernel). Re-run the cell that sets the
    flag back to ``'yes'`` before starting another session.
    """
    global tryAgain
    if model is None:
        model = kmeans
    if read is None:
        read = input

    # input() yields strings; the model needs numbers.
    try:
        rat = float(read("Petal to Sepal Length Ratio:"))
        wid = float(read("Petal Width (cm):"))
    except ValueError:
        print("Please enter numeric values, e.g. 0.8 and 1.7.")
        return True

    Xnew = [[rat, wid]]
    # predict returns one label per row; take the single label as a plain int.
    ynew = int(model.predict(Xnew)[0])
    if 0 <= ynew < len(cluster_names):
        print(cluster_names[ynew])

    inp = read("Enter 'yes' to try again, or anything else to exit: ")
    if inp.strip().lower() != 'yes':
        tryAgain = 'no'
        return False
    return True

In [None]:
# Keep prompting while the notebook-level flag is 'yes'; this loop never changes the flag itself.
while tryAgain == 'yes':
    speciespredict()

Petal to Sepal Length Ratio: 0.8
Petal Width (cm): 1.7


Virginica
