In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# 1. Cleaning the data

In [None]:
# Load the Iris CSV from the Kaggle input directory and preview the first rows.
DATA_PATH = '/kaggle/input/iris-flower-dataset/IRIS.csv'
df = pd.read_csv(DATA_PATH)
df.head()

In [None]:
# Print the column names, dtypes, and non-null counts of the loaded frame.
df.info()

In [None]:
# Count how many rows belong to each value of the 'species' column.
df['species'].value_counts()

In [None]:
# Remove the literal 'Iris-' text from every species label. The pattern is
# matched as a plain string (no regex metacharacters are involved).
df['species'] = df['species'].str.replace(pat='Iris-', repl='', regex=False)

In [None]:
# Re-check the per-species row counts now that 'Iris-' has been removed from the labels.
df['species'].value_counts()

In [None]:
# Summarise null entries per column: absolute count and share of all rows (in %).
missing_values = df.isna().sum()
percentage_missing = missing_values.div(len(df)).mul(100)
pd.DataFrame(
    {
        'missing_values': missing_values,
        'percentage_missing': percentage_missing,
    }
)

**The data is already clean (there are no missing values), so no further cleaning is needed.**

# 2. Data Visualization

In [None]:
# Pairwise relationships between all numeric columns, coloured by species.
# The palette takes three colours from the "viridis" colormap
# (presumably one per species - confirm against the class counts).
green_palette = sns.color_palette(palette="viridis", n_colors=3)
sns.pairplot(data=df, hue='species', palette=green_palette)
plt.show()

**The following graphs show the distribution of each feature for every species**

In [None]:
# Draw one histogram (with a KDE overlay) per float column, split by species.
num_columns = list(df.select_dtypes(include=['float']).columns)

# Use the smallest square grid that can hold every feature. The previous
# int(len / 2) sizing silently dropped columns for some counts
# (e.g. 5 features produced a 2x2 grid, so the fifth was never plotted).
num = max(int(np.ceil(np.sqrt(len(num_columns)))), 1)

# squeeze=False keeps `ax` two-dimensional even for a 1x1 grid.
fig, ax = plt.subplots(num, num, figsize=(12, 10), squeeze=False)
axes = ax.ravel()

# Pair each feature with its own axis; plotting errors now surface instead of
# being swallowed by a bare `except` and hidden behind a deleted axis.
for axis, column in zip(axes, num_columns):
    sns.histplot(data=df, x=column, kde=True, bins=20, hue='species', ax=axis)

# Remove the panels left over when the grid has more cells than features.
for axis in axes[len(num_columns):]:
    fig.delaxes(axis)

fig.suptitle('Histograms of features', fontsize=16)
plt.show()

**The following graphs show that the width is less than the length for both petals and sepals**

In [None]:
# Two side-by-side panels, each plotting a length column and a width column
# against the row index: sepals on the left, petals on the right.
panels = {
    'Sepal length vs width': ['sepal_length', 'sepal_width'],
    'Petal length vs width': ['petal_length', 'petal_width'],
}
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(15, 6))
for axis, (title, columns) in zip(ax, panels.items()):
    for column in columns:
        axis.plot(df[column])
    axis.set_title(title)
    # Legend entries follow the order in which the lines were drawn.
    axis.legend(columns)
plt.show()

**The following graphs show the variability of each feature for every species**

In [None]:
# Draw one violin plot per float column, with one violin per species.
num_columns = list(df.select_dtypes(include=['float']).columns)

# Use the smallest square grid that can hold every feature. The previous
# int(len / 2) sizing silently dropped columns for some counts
# (e.g. 5 features produced a 2x2 grid, so the fifth was never plotted).
num = max(int(np.ceil(np.sqrt(len(num_columns)))), 1)

# squeeze=False keeps `ax` two-dimensional even for a 1x1 grid.
fig, ax = plt.subplots(num, num, figsize=(12, 10), squeeze=False)
axes = ax.ravel()

# Pair each feature with its own axis; plotting errors now surface instead of
# being swallowed by a bare `except` and hidden behind a deleted axis.
for axis, column in zip(axes, num_columns):
    sns.violinplot(data=df, x=column, y='species', ax=axis)

# Remove the panels left over when the grid has more cells than features.
for axis in axes[len(num_columns):]:
    fig.delaxes(axis)

# The title previously said "Boxplots", but the figure contains violin plots.
fig.suptitle('Violin plots of features', fontsize=16)
plt.show()

# 3. Model Building

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Encode the species labels as integers. The fitted encoder is kept so that
# predictions can be mapped back with label_encoder.inverse_transform(...).
# NOTE: this overwrites df['species'] in place; re-running the cell refits the
# encoder on the already-encoded integers, so the original names are lost then.
label_encoder = LabelEncoder()
df['species'] = label_encoder.fit_transform(df['species'])

# Features are every column except the target.
X = df.drop(['species'], axis=1)
y = df['species']

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

lg = LogisticRegression()
# Seed the tree as well: without random_state its fit (and therefore the
# reported accuracy) can differ from run to run.
dt = DecisionTreeClassifier(random_state=42)
knn = KNeighborsClassifier()

**All the models perform exceptionally well, but the test set is small (only 20% of the data), so these scores should be interpreted with caution**

In [None]:
# Fit each classifier on the training split and report its accuracy on the test split.
models = [lg, dt, knn]
for model in models:
    # fit() returns the estimator itself, so the prediction can be chained.
    y_pred = model.fit(X_train, y_train).predict(X_test)
    score = accuracy_score(y_test, y_pred)
    print(f'Accuracy score of {model} is {score}')
