In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt # for data visualization
import seaborn as sns # for statistical data visualization
%matplotlib inline
import warnings

warnings.filterwarnings('ignore')
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively list every file that Kaggle mounted under the input directory.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

![](https://s3.amazonaws.com/assets.datacamp.com/blog_assets/Machine+Learning+R/iris-machinelearning.png)

**About the IRIS dataset**



**Source**: The Iris dataset was introduced by the British biologist and statistician Ronald A. Fisher in 1936 as an example of discriminant analysis.

**Data**: The dataset contains measurements of 150 iris flowers, with four features each: sepal length, sepal width, petal length, and petal width.

**Classes**: The flowers belong to three different species: Setosa, Versicolor, and Virginica.

**Balance**: Each class in the dataset has an equal number of 50 samples.

**Usage**: The Iris dataset is commonly used as a beginner's dataset for practicing machine learning classification algorithms.

**Exploratory Data Analysis (EDA)**: EDA is often performed on the Iris dataset to understand the distribution of features, relationships between variables, and class distributions.

**Visualization**: Visualizations like pair plots, box plots, and scatter plots are frequently used to explore the dataset.

**Machine Learning**: The dataset is popular for testing and demonstrating classification algorithms due to its simplicity and clear class separations.

**Versatility**: Despite its simplicity, the Iris dataset remains relevant and is used in various machine learning tutorials, courses, and competitions.

**Contribution**: Fisher's work on the Iris dataset laid the foundation for modern statistical techniques and machine learning, making it a historically significant dataset in the field.

**Import the data**

In [None]:
# Read the Iris CSV from the Kaggle input directory into a DataFrame.
df = pd.read_csv('/kaggle/input/iris-flower-dataset/IRIS.csv')
# Cell output: (number of rows, number of columns).
df.shape

**Check head of data**

In [None]:
# Preview the first rows (DataFrame.head() shows five by default).
df.head()

**check column names**

In [None]:
# Show the column labels; later cells refer to 'species' and the four measurement columns by name.
df.columns

**check target column values**

In [None]:
# Count the rows per value of the 'species' column (the classification target).
df['species'].value_counts()

**Summary statistics**

In [None]:
# Print the per-column dtype and non-null count, plus overall memory usage.
df.info()

**Missing values check**

![](https://images.squarespace-cdn.com/content/v1/5e714fa1f7d532536a90c6e0/ec3a30a5-f850-497e-9cb0-774f29cb5553/missing+values.png)

In [None]:
# Number of missing entries in each column (isna is the same check as isnull).
missing_per_column = df.isna().sum()
missing_per_column

In [None]:
# Descriptive statistics of the numeric columns, rounded to two decimals.
df.describe().round(2)

**EDA**

![](https://editor.analyticsvidhya.com/uploads/61798ti2.png)

In [None]:
# Pairwise scatter plots of all features, coloured by species with one marker shape each.
species_markers = ['o', 's', 'D']
sns.pairplot(data=df, hue='species', markers=species_markers)
plt.show()

In [None]:
# One violin plot per measurement column, split by species, on a 2x2 grid.
plt.figure(figsize=(14, 8))
feature_columns = df.columns[:-1]  # every column except the last one
for panel, feature in enumerate(feature_columns, start=1):
    plt.subplot(2, 2, panel)
    sns.violinplot(data=df, x='species', y=feature, inner='quartile')
    plt.title(f'{feature} distribution by species')
plt.show()

In [None]:
# One swarm plot per measurement column, split by species, on a 2x2 grid.
plt.figure(figsize=(12, 8))
feature_columns = df.columns[:-1]  # every column except the last one
for panel, feature in enumerate(feature_columns, start=1):
    plt.subplot(2, 2, panel)
    sns.swarmplot(data=df, x='species', y=feature)
    plt.title(f'{feature} distribution by class')
plt.show()

In [None]:
# Draw one box plot per measurement to visualize outliers.
#
# The four near-identical stanzas are folded into a single loop, the object
# returned by DataFrame.boxplot is named `ax` because it is a matplotlib Axes
# (not a Figure), and the cell ends with plt.show() like the other plotting
# cells so no stray Text(...) repr is emitted as cell output.

plt.figure(figsize=(15,15))

# The panels occupy the first four slots of a 4x2 grid, as before.
for slot, feature in enumerate(
        ['sepal_length', 'sepal_width', 'petal_length', 'petal_width'], start=1):
    plt.subplot(4, 2, slot)
    ax = df.boxplot(column=feature)
    ax.set_title('')
    ax.set_ylabel(feature)

plt.show()

In [None]:
# Plot a 10-bin histogram per measurement to check its distribution.
#
# Fixes the first panel's x-axis label, which read 'IP Mean' (a leftover from
# another dataset) instead of the plotted column. Each panel is now labelled
# from the same name that selects the column, so the two cannot drift apart.

plt.figure(figsize=(24,20))

# The panels occupy the first four slots of a 4x2 grid, as before.
for slot, feature in enumerate(
        ['sepal_length', 'sepal_width', 'petal_length', 'petal_width'], start=1):
    plt.subplot(4, 2, slot)
    ax = df[feature].hist(bins=10)  # Series.hist returns the Axes it drew on
    ax.set_xlabel(feature)

# End with show() like the other plotting cells, so no Text(...) repr is printed.
plt.show()


**Get X and Y**

In [None]:
# Target vector: the species label of every flower.
y = df['species']

# Feature matrix: everything except the species column.
X = df.drop(columns=['species'])

![](https://miro.medium.com/v2/resize:fit:580/1*OECM6SWmlhVzebmSuvMtBg.png)

In [None]:
# Hold out 25% of the rows for testing; the fixed random_state makes the split repeatable.

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=10,
)

In [None]:
# Show the (rows, columns) of the training and test feature sets side by side.
X_train.shape, X_test.shape

**Feature Scaling**

Why feature scaling with StandardScaler matters for SVM modeling on the Iris dataset:

Motivation: 💡 Standard Scaler is a key player in ML pipelines, harmonizing dataset features. For SVM models, particularly those finicky about feature scales, scaling is a game-changer.

Standard Scaler: 📏 StandardScaler rescales each feature to zero mean and unit standard deviation, z = (x − mean) / std, using statistics learned from the training set only. This puts all features on a comparable scale, so that no feature dominates simply because of its units or range.

Effect on SVM: 🤖 Support Vector Machines (SVMs) are choosy about feature scales. In SVM, the decision boundary sways with the feature scale, and having features on a similar scale unleashes the algorithm's full potential.

Preprocessing Step: 🚀 Scaling takes the stage as a crucial preprocessing step, especially in the SVM arena, where the algorithm's magic relies on the distance between data points. 📊

In [None]:
# Remember the feature labels before scaling: the scaled data is re-wrapped
# in DataFrames afterwards and needs these labels again.
cols = X_train.columns

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training features only, then apply
# that same transformation to both the training and the test features.
scaler = StandardScaler().fit(X_train)

X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

In [None]:
# Re-wrap the scaled training array in a DataFrame with the original feature labels.
# `cols` is passed directly: wrapping it in a list (`[cols]`) makes pandas build
# MultiIndex columns, so the labels would stop being plain string column names.
X_train = pd.DataFrame(X_train, columns=cols)

In [None]:
# Re-wrap the scaled test array in a DataFrame with the original feature labels.
# `cols` is passed directly: wrapping it in a list (`[cols]`) makes pandas build
# MultiIndex columns, so the labels would stop being plain string column names.
X_test = pd.DataFrame(X_test, columns=cols)

In [None]:
# Preview the scaled training features.
X_train.head()

Default hyperparameters: C=1.0, kernel='rbf' and gamma='scale'

![](https://i.ytimg.com/vi/ny1iZ5A8ilA/maxresdefault.jpg)

In [None]:
# Train an SVC with its default hyperparameters and score it on the held-out test set.
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

svc = SVC()

# fit() returns the fitted estimator, so prediction can be chained onto it.
y_pred = svc.fit(X_train, y_train).predict(X_test)

test_accuracy = accuracy_score(y_test, y_pred)
print('Model accuracy score with default hyperparameters: {0:0.2f}'.format(test_accuracy))

**compare the train-set and test-set accuracy to check for overfitting**

In [None]:
# Predict on the training features themselves; the next cell scores these
# predictions so train accuracy can be compared with test accuracy.
y_pred_train = svc.predict(X_train)

# Echo the predicted labels as the cell output.
y_pred_train

In [None]:
# Accuracy of the fitted model on the data it was trained on.
train_accuracy = accuracy_score(y_train, y_pred_train)
print('Training-set accuracy score: {0:0.4f}'.format(train_accuracy))


**Overfitting:**
Overfitting occurs when a model performs very well on the training data but poorly on new, unseen data.
This can happen when the model is too complex and captures noise or specific patterns in the training data that don't generalize well.

**Underfitting:**
Underfitting occurs when a model performs poorly on both the training and test data.
It suggests that the model is too simple to capture the underlying patterns in the data.

Ideally, you want the test-set accuracy to be close to the train-set accuracy. If the model generalizes well, both accuracies should be similar.
A significant drop in test-set accuracy compared to train-set accuracy may indicate overfitting. The scores here are similar and satisfactory, so there is no sign of overfitting.

**Compare model accuracy with null accuracy**

So, the model accuracy is 1.00. But we cannot say that our model is perfect based on this accuracy alone; we should compare it with the null accuracy. Null accuracy is the accuracy that could be achieved by always predicting the most frequent class. In our dataset the three classes are balanced (50 samples each), so the null accuracy is only about 0.33 (roughly one third), and the model clearly outperforms this baseline.

Using Confusion Matrix

In [None]:
# Build the confusion matrix and report one-vs-rest counts for every class.
#
# The target has three species, so the matrix is 3x3 and there is no single
# TP/TN/FP/FN quadruple as in binary classification. Rows are true labels and
# columns are predicted labels, so for class k:
#   TP = cm[k, k]
#   FP = rest of column k (predicted k, actually another class)
#   FN = rest of row k    (actually k, predicted as another class)
#   TN = everything else

from sklearn.metrics import confusion_matrix
from sklearn.utils.multiclass import unique_labels

# Same label set and ordering confusion_matrix derives when `labels` is omitted;
# made explicit here so each row/column can be named in the printout.
class_labels = unique_labels(y_test, y_pred)

cm = confusion_matrix(y_test, y_pred, labels=class_labels)

print('Confusion matrix\n\n', cm)

true_pos = np.diag(cm)
false_pos = cm.sum(axis=0) - true_pos
false_neg = cm.sum(axis=1) - true_pos
true_neg = cm.sum() - (true_pos + false_pos + false_neg)

for label, tp, tn, fp, fn in zip(class_labels, true_pos, true_neg, false_pos, false_neg):
    print('\n{}: True Positives(TP) = {}, True Negatives(TN) = {}, '
          'False Positives(FP) = {}, False Negatives(FN) = {}'.format(label, tp, tn, fp, fn))

In [None]:
# Render the confusion matrix as a heat map with the integer count written in each cell.

sns.heatmap(data=cm, cmap='YlGnBu', fmt='d', annot=True)

Classification Report

The classification report is another way to evaluate a classification model's performance. For each class it displays the precision, recall and F1-score, together with the support (the number of true samples of that class).

In [None]:
from sklearn.metrics import classification_report

# Text table of precision, recall, f1-score and support for the test predictions.
report_text = classification_report(y_test, y_pred)
print(report_text)

In [None]:
# One-vs-rest counts for the first class (row/column 0 of the confusion matrix).
# Rows of cm are true labels and columns are predicted labels, so the other
# entries of column 0 are false positives and the other entries of row 0 are
# false negatives. Previously cm[0,1]/cm[1,0]/cm[1,1] were used, which swaps
# FP and FN and ignores the third class entirely.
TP = cm[0,0]
FP = cm[:,0].sum() - TP
FN = cm[0,:].sum() - TP
TN = cm.sum() - TP - FP - FN

# Overall (multiclass) accuracy: correctly classified samples, i.e. the
# diagonal of the matrix, divided by the total number of samples.
classification_accuracy = np.trace(cm) / float(cm.sum())

print('Classification accuracy : {0:0.3f}'.format(classification_accuracy))

In [None]:
# Classification error: the share of misclassified samples, i.e. every
# off-diagonal entry of the confusion matrix divided by the total sample count.
# Computed from the full matrix so that all three classes are included.

total_samples = cm.sum()
misclassified = total_samples - np.trace(cm)

classification_error = misclassified / float(total_samples)

print('Classification error : {0:0.4f}'.format(classification_error))

In [None]:
# Macro-averaged precision over all classes, computed from the confusion matrix.
# Rows of cm are true labels and columns are predictions, so the precision of
# class k is cm[k, k] divided by the total of column k (everything predicted as k).

correct_per_class = np.diag(cm)
predicted_per_class = cm.sum(axis=0)

# A class that was never predicted gets precision 0 instead of a division by zero.
precision_per_class = np.divide(
    correct_per_class,
    predicted_per_class,
    out=np.zeros(len(correct_per_class), dtype=float),
    where=predicted_per_class > 0,
)

precision = precision_per_class.mean()

print('Precision : {0:0.4f}'.format(precision))

In [None]:
# Macro-averaged recall (sensitivity) over all classes, computed from the
# confusion matrix. Rows of cm are true labels, so the recall of class k is
# cm[k, k] divided by the total of row k (every sample that truly is class k).
correct_per_class = np.diag(cm)
actual_per_class = cm.sum(axis=1)

# A class with no true samples gets recall 0 instead of a division by zero.
recall_per_class = np.divide(
    correct_per_class,
    actual_per_class,
    out=np.zeros(len(correct_per_class), dtype=float),
    where=actual_per_class > 0,
)

recall = recall_per_class.mean()

print('Recall or Sensitivity : {0:0.4f}'.format(recall))

In [None]:
# 5-fold cross-validation of the default SVC on the full dataset.
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

# Shuffle before splitting and fix the seed so the folds are reproducible.
kfold = KFold(n_splits=5, shuffle=True, random_state=0)

# NOTE: despite the historical variable names, this estimator is SVC() with its
# default hyperparameters, i.e. the RBF kernel, not a linear kernel. The names
# are kept unchanged; the printed messages below now describe what is actually
# being evaluated.
linear_svc = SVC()

# X is the unscaled feature frame (df without 'species'), so these scores are
# not directly comparable with the hold-out model trained on scaled features.
# TODO(review): consider cross-validating a StandardScaler + SVC pipeline instead.
linear_scores = cross_val_score(linear_svc, X, y, cv=kfold)

print('cross-validation scores with default (RBF) kernel:\n\n{}'.format(linear_scores))
print('mean cross-validation score with default (RBF) kernel:\n\n{}'.format(linear_scores.mean()))

*Conclusion*

Our standard model of SVM with default parameters of SVC(C=1.0, kernel='rbf', degree=3, gamma='scale', coef0=0.0, shrinking=True, probability=False, tol=1e-3, cache_size=200, class_weight=None, verbose=False, max_iter=-1, decision_function_shape='ovr', break_ties=False, random_state=None)
gave us 100 percent accuracy on the test set. We also checked for overfitting, which is not present. Finally, we applied the K-fold cross-validation technique, which gave us a mean score of 0.94.

![](https://t3.ftcdn.net/jpg/02/91/52/22/360_F_291522205_XkrmS421FjSGTMRdTrqFZPxDY19VxpmL.jpg)