## Content 

 <b>1 | Preprocessing Steps</b>
 
    1.1 Import Libraries
    1.2 Configure Settings
    1.3 Import Files

 <b>2 | Structure Investigation</b>
 
    2.1 Basic Steps
    2.2 Null Value Check
    2.3 Structure of non-numerical features
    2.4 Structure of numerical features
    
    
 <b>3 | Univariate Analysis</b>
 
    3.1 Summary for Columns (Describe)
    3.2 Individual Column Shape Observation
    3.3 Individual Column Value Observation (Min, Max and Outlier)

 <b>4 | Multivariate Analysis</b>
 
    4.1 Creating Relation Matrix and Graph
    4.2 Relation With Species
    4.3 Other Interesting Relationships
    
 <b>5 | Preprocessing</b>
 
 <b>6 | Model Selection</b>
 
 <b>7 | Prediction</b>

### 1 | Preprocessing Steps

#### 1.1 | Import Libraries

In [None]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import os
import math
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler

#### 1.2 | Configure Settings

In [None]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Suppress all Python warnings for the rest of the session.
warnings.filterwarnings("ignore")
pd.set_option('display.max_columns',100)          # Show up to 100 columns when displaying a DataFrame
pd.set_option('display.max_rows',100)          # Show up to 100 rows when displaying a DataFrame

#### 1.3 | Import Files

In [None]:
# Load the CSV once; keep the raw frame untouched and do all work on a copy.
df_raw = pd.read_csv(filepath_or_buffer="/kaggle/input/iris/Iris.csv")
df = df_raw.copy(deep=True)

### 2 | Structure Investigation

#### 2.1 | Basic Steps

In [None]:
# Dataset dimensions as a (rows, columns) tuple.
df.shape 

In [None]:
# Column labels of the working DataFrame.
df.columns

In [None]:
# Preview the first rows of the working DataFrame.
df.head()

In [None]:
# Print the per-column dtype and non-null count summary.
df.info()

In [None]:
# Count how many columns there are of each dtype.
# Series.value_counts replaces the deprecated top-level pd.value_counts helper.
df.dtypes.value_counts()

***
#### 2.2 | Null Value Check
* No null values observed
***

In [None]:
# Combined plot: the left panel is a cell-by-cell map of missing values, the
# right panel is the share of missing values in each column.
fig, (ax_map, ax_ratio) = plt.subplots(1, 2, figsize=(13, 3))

# imshow renders the boolean mask as an image; any boolean condition on df
# can be inspected the same way.
ax_map.imshow(df.isna(), aspect="auto", interpolation="nearest", cmap="gray")
ax_map.set_xlabel("Column Number")
ax_map.set_ylabel("Sample Number")
ax_map.set_title("DATASET VIEW - Black (Not Null) & White (Null)")

# Draw on the second axes explicitly instead of relying on the "current axes".
df.isna().mean().sort_values().plot(
    kind="bar",
    ax=ax_ratio,
    title="Null Value Ratio per Feature",
    ylabel="Ratio of missing values per feature",
)
plt.show()

***
#### 2.3 | Structure of non-numerical features
* Only 1 non numerical column
* Column value is nominal type
* Has 3 distinct values
***


In [None]:
# Preview only the non-numerical columns (every dtype except numeric ones).
df.select_dtypes(exclude="number").head()

In [None]:
# Summary statistics for the non-numerical columns only.
df.describe(exclude="number")

In [None]:
# Number of samples for each distinct Species value.
df["Species"].value_counts()

#### 2.4 | Structure of numerical features

In [None]:
# Side-by-side bar charts of the number of distinct values per column:
# non-numerical columns on the left, numerical columns on the right.
fig = make_subplots(
    rows=1,
    cols=2,
    subplot_titles=("Unique values per Categorical feature", "Unique values per Numerical feature"),
)

# Each selector is a set of keyword arguments for DataFrame.select_dtypes.
dtype_selectors = ({"exclude": "number"}, {"include": "number"})
for panel, selector in enumerate(dtype_selectors, start=1):
    unique_counts = df.select_dtypes(**selector).nunique().sort_values()
    fig.add_trace(
        go.Bar(x=unique_counts.index, y=unique_counts.values),
        row=1,
        col=panel,
    )

fig.show()

***
#### 2.5 | Value Behaviour

* Each point in this figure is a sample (i.e. a row) in our dataset and each subplot represents a different feature. 
* The y-axis shows the feature value, while the x-axis is the sample index
***

* Observation

    * Values in each subplot appear to be within a plausible range
    * No extreme or erroneous values observed
***

In [None]:
# Plot every numerical column against the row index, one subplot per column.
# Line width 0 with a dot marker turns the default line plot into a scatter.
point_style = dict(lw=0, marker=".", markersize=5)
df.plot(subplots=True, layout=(-1, 3), figsize=(12, 5), **point_style)
plt.tight_layout()

In [None]:
# Remove the 'Id' column from the working frame.
df = df.drop(columns=['Id'])

### 3 | Univariate Content Analysis

#### 3.1 | Summary for Columns (Describe)

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the frame.
df.describe()

***
#### 3.2 | Individual Column Shape 

***

In [None]:
# One 10-bin histogram per numerical column, laid out three to a row.
df.hist(bins=10, edgecolor="black", layout=(-1, 3), figsize=(12, 5))
plt.tight_layout()

In [None]:
# Skewness of each numerical column. numeric_only=True is required because
# the frame still holds the string-valued Species column, which cannot be
# reduced and raises a TypeError on pandas >= 2.0.
df.skew(numeric_only=True)

***
#### 3.3 | Individual Column Value Observation (Min, Max and Outlier)

* We used min/max to find the extremes, box plots to visualise the distributions, and the IQR method to list the outlier values
* SepalWidthCm is the only column with outliers
* Below is the min max and outlier for all columns    
 
<b> SepalLengthCm </b>
* Min Value -> 4.3 | Species ->  Iris-setosa
* Max Value -> 7.9 | Species ->  Iris-virginica
* Outliers ->  

***

<b>SepalWidthCm</b>
Column ->  SepalWidthCm 
* Min Value -> 2.0 | Species ->  Iris-versicolor
* Max Value -> 4.4 | Species ->  Iris-setosa
* Outliers ->  4.4, 4.1, 4.2, 2.0
 **********

<b>PetalLengthCm</b> 
* Min Value -> 1.0 | Species ->  Iris-setosa
* Max Value -> 6.9 | Species ->  Iris-virginica
* Outliers -> 

 **********

<b>PetalWidthCm</b> 
* Min Value -> 0.1 | Species ->  Iris-setosa
* Max Value -> 2.5 | Species ->  Iris-virginica
* Outliers -> 

 ************ 

In [None]:
# One box plot per measurement column, side by side in a single row.
# The list is named numeric_cols rather than `vars` so that the builtin
# vars() is not shadowed for the rest of the notebook session.
numeric_cols = ['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm']
fig = make_subplots(rows=1, cols=len(numeric_cols))
for position, column in enumerate(numeric_cols, start=1):
    # Subplot columns are 1-based, hence start=1.
    fig.add_trace(go.Box(y=df[column], name=column), row=1, col=position)

# Overlay every individual sample next to its box, with horizontal jitter.
fig.update_traces(boxpoints='all', jitter=.3)
fig.update_layout(showlegend=False)
fig.show()

In [None]:
# Report, for every measurement column, its minimum and maximum (with the
# species they occur in) and the values outside the 1.5 * IQR fences.
columns_to_be_checked = ['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm']

# Compute the quartiles on the measurement columns only: the frame also holds
# the string-valued Species column, which DataFrame.quantile cannot handle on
# pandas >= 2.0. Each quartile is computed once and reused.
measurements = df[columns_to_be_checked]
q1 = measurements.quantile(0.25)
q3 = measurements.quantile(0.75)
iqr = q3 - q1
upper = q3 + 1.5 * iqr
lower = q1 - 1.5 * iqr

for col in columns_to_be_checked:
    values = df[col]
    min_value = values.min()
    max_value = values.max()

    # Sort the species names so the report is deterministic when an extreme
    # value occurs in more than one species.
    min_species = ', '.join(sorted(set(df.loc[values == min_value, 'Species'])))
    max_species = ', '.join(sorted(set(df.loc[values == max_value, 'Species'])))

    print('\nColumn -> ', col, '\nMin Value ->', min_value, '| Species -> ', min_species)
    print('Max Value ->', max_value, '| Species -> ', max_species)

    # The fences are rounded to 2 decimals before comparison, as before.
    is_outlier = (values < round(lower[col], 2)) | (values > round(upper[col], 2))
    outliers = values[is_outlier].to_list()
    print('Outliers -> ', ', '.join(str(v) for v in outliers))
    print('\n', '*' * 10)

### 4 | Multivariate Investigation

#### 4.1 | Creating Relation Matrix and Graph

In [None]:
# Pairwise correlation matrix of the numerical columns. The string-valued
# Species column is excluded explicitly, because DataFrame.corr raises on
# non-numeric data on pandas >= 2.0.
temp_corr = df.select_dtypes(include="number").corr()
temp_corr

In [None]:
# Heatmap of the correlation matrix, rounded to 2 decimals for the cell
# labels. Only numerical columns are correlated: DataFrame.corr raises on the
# string-valued Species column on pandas >= 2.0.
corr_matrix = df.select_dtypes(include="number").corr().round(2)
fig = px.imshow(
    corr_matrix,
    zmin=-1,
    zmax=1,
    text_auto=True,
    width=1200,
    height=650,
    aspect=None,
    color_continuous_scale='rdbu',
)
fig.show()

#### 4.2 | Relation With Species

In [None]:
# Plot each measurement (x axis) against the Species label (y axis) to see
# how the measurements separate the classes.
feature_columns = ['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm']
sns.pairplot(data=df, x_vars=feature_columns, y_vars='Species')
plt.show()


In [None]:
# Parallel-coordinates plot of the frame, with lines grouped by the "Species" column.
pd.plotting.parallel_coordinates(df, "Species")
plt.show()

In [None]:
# Box plots of the frame's columns, grouped by the "Species" column.
df.boxplot(by="Species", figsize=(14, 5))
plt.show()

#### 4.3 | Other Interesting Relationships
* SepalLengthCm shares strong relationship with PetalLengthCm and PetalWidthCm
* PetalWidthCm and PetalLengthCm have strong relationship among them.
* SepalLengthCm and SepalWidthCm have no relationship

In [None]:
# Scatterplot matrix over every numerical column, coloured by species.
# The figure size grows with the number of plotted columns.
numeric_columns = df.select_dtypes(include='number').columns
column_count = len(numeric_columns)

fig = px.scatter_matrix(df, dimensions=numeric_columns, color="Species")
fig.update_layout(
    title='Scatterplot Matrix for all numerical Column',
    width=column_count * 225,
    height=column_count * 150,
)
fig.show()

In [None]:
# Bucket every unordered pair of correlated columns by the absolute size of
# its correlation coefficient: >= 0.75 strong, >= 0.50 moderate,
# >= 0.25 weak, anything else no relationship.
labels = list(temp_corr.index)
column_pairs = [
    (first, second)
    for position, first in enumerate(labels)
    for second in labels[position + 1:]
]

strong, moderate, weak, no_relationship = [], [], [], []
for first, second in column_pairs:
    magnitude = abs(temp_corr[first][second])
    if magnitude >= 0.75:
        bucket = strong
    elif magnitude >= 0.50:
        bucket = moderate
    elif magnitude >= 0.25:
        bucket = weak
    else:
        bucket = no_relationship
    bucket.append((first, second))

print('\nStrong -> ', strong)
print('\nModerate -> ', moderate)
print('\nWeak -> ', weak)
print('\nNo_relationship -> ', no_relationship)

### 5 | Preprocessing

In [None]:
# Split out a validation dataset (80 % train / 20 % validation).
# Features and label are selected by column name rather than by position in
# df.values: positional slicing silently picks the wrong columns if the 'Id'
# column has not been dropped, and df.values on a mixed-type frame yields an
# object-dtype matrix instead of floats.
feature_columns = ['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm']
X = df[feature_columns].to_numpy(dtype=float)
Y = df['Species'].to_numpy()

# A fixed random_state keeps the split reproducible between runs.
X_train, X_validation, Y_train, Y_validation = train_test_split(
    X, Y, test_size=0.20, random_state=5
)

In [None]:
# Fit the scaler on the training features only; the validation features are
# not used here.
scaler = StandardScaler()
scaler.fit(X_train)

In [None]:
# Apply the already-fitted scaler to both splits, overwriting the unscaled arrays.
X_train = scaler.transform(X_train)
X_validation = scaler.transform(X_validation)

### 6 | Models Selection

In [None]:
# Candidate classifiers, all with their default hyper-parameters.
models = [
    ('LR', LogisticRegression()),
    ('LDA', LinearDiscriminantAnalysis()),
    ('KNN', KNeighborsClassifier()),
    ('CART', DecisionTreeClassifier()),
    ('NB', GaussianNB()),
    ('SVM', SVC()),
]

# Evaluate each model in turn with 10-fold cross-validation on the training
# split, keeping the per-fold accuracies for the comparison plot.
kfold = KFold(n_splits=10)
results = []
names = []

for name, model in models:
    cv_results = cross_val_score(model, X_train, Y_train, cv=kfold, scoring='accuracy')
    results.append(cv_results)
    names.append(name)
    # Report mean accuracy with the standard deviation in parentheses.
    print(f"{name}: {cv_results.mean():f} ({cv_results.std():f})")

In [None]:
# Compare the cross-validation accuracy distributions of the models.
# The boxes are labeled with the model names collected during evaluation;
# without them the x axis only shows 1..N and the models cannot be told apart.
fig, ax = plt.subplots()
ax.boxplot(results)
ax.set_xticklabels(names)
ax.set_ylabel("Accuracy")
ax.set_title("Algorithm Comparison")
plt.show()

### 7 | Predictions

In [None]:
# Train LDA on the training split and report its accuracy on the validation split.
lda = LinearDiscriminantAnalysis().fit(X_train, Y_train)
predictions = lda.predict(X_validation)
lda_accuracy = accuracy_score(Y_validation, predictions)
print(lda_accuracy)

In [None]:
# Train k-nearest neighbours on the training split and report its accuracy on
# the validation split.
knn = KNeighborsClassifier().fit(X_train, Y_train)
predictions = knn.predict(X_validation)
knn_accuracy = accuracy_score(Y_validation, predictions)
print(knn_accuracy)