# Iris Dataset: EDA and Classification

## Iris Dataset: Simple Exploratory Data Analysis (EDA)

#### Import Modules

In [None]:
import numpy as np # operasi numerik
import pandas as pd # olah dan analisis data
import matplotlib.pyplot as plt # visualisasi data
import seaborn as sns # visualisasi data
%matplotlib notebook 

#### Load dataset

In [None]:
# Read the Iris CSV file into a DataFrame.
iris_df = pd.read_csv(filepath_or_buffer='./input/iris/Iris.csv')
# Show the first 10 rows as the cell output.
iris_df.head(n=10)

#### Drop column 'Id'

In [None]:
# Remove the column named 'Id' from the DataFrame in place.
# Equivalent non-in-place form: iris_df = iris_df.drop(columns='Id')
# errors='ignore' keeps this cell re-runnable: once 'Id' is gone, a second
# execution would otherwise raise KeyError.
iris_df.drop(columns='Id', inplace=True, errors='ignore')
iris_df.head()  # show the first 5 rows

#### Identify the shape of the dataset

In [None]:
iris_df.shape # shape/dimensions of the dataset

#### Get the list of columns

In [None]:
# Alternative spelling: iris_df.keys()
iris_df.columns # list of column names

#### Identify data types for each column

In [None]:
iris_df.dtypes # data type of each column

#### Get basic dataset information

In [None]:
iris_df.info() # dataset information

#### Identify missing values

In [None]:
# Detect whether any cell in the DataFrame is missing.
# Alternative spelling: iris_df.isnull().values.any()
missing_mask = iris_df.isna()
missing_mask.values.any()

#### Identify duplicate entries/rows

In [None]:
# Show only the secondary duplicates: rows that repeat an earlier row.
# Use iris_df.duplicated(keep=False) instead to list every row involved
# in a duplication, first occurrences included.
repeated_rows = iris_df.duplicated(keep='first')
iris_df.loc[repeated_rows]

In [None]:
# Tally how many rows are flagged as duplicates (True) and how many are not (False).
duplicate_flags = iris_df.duplicated()
duplicate_flags.value_counts()

#### Drop duplicate entries/rows

In [None]:
# Remove repeated rows in place, keeping the first occurrence of each.
iris_df.drop_duplicates(keep='first', inplace=True)

#### Describe the dataset

In [None]:
iris_df.describe() # descriptive statistics of the data

#### Correlation

In [None]:
# Pairwise correlation between the numeric columns.
# Non-numeric columns (such as 'Species') are excluded explicitly: since
# pandas 2.0, DataFrame.corr() no longer drops them silently and raises
# when it cannot convert their values to float.
iris_df.select_dtypes(include='number').corr()

## Iris Dataset: Data Visualisation

#### Heatmap

In [None]:
# Heatmap of the pairwise correlations between the numeric columns.
# Non-numeric columns (such as 'Species') are excluded first because
# DataFrame.corr() raises on them with pandas >= 2.0.
sns.heatmap(data=iris_df.select_dtypes(include='number').corr())

#### Bar Plot

In [None]:
# Number of rows recorded for each distinct value of 'Species'.
species_column = iris_df['Species']
species_column.value_counts()

In [None]:
# Bar chart of the number of rows per species, drawn through the pandas plotting API.
species_counts = iris_df["Species"].value_counts()
species_counts.plot(kind="bar")
plt.tight_layout()
plt.show()

In [None]:
# Per-species row counts again, this time drawn by seaborn directly from the DataFrame.
sns.countplot(data=iris_df, x='Species')
plt.tight_layout()

#### Pie Chart

In [None]:
# Pie chart of the species distribution. Slice labels are suppressed
# (labels=None) and a legend is shown instead; autopct prints each share
# as a percentage with one decimal.
species_share = iris_df['Species'].value_counts()
species_share.plot(kind='pie', autopct='%1.1f%%', labels=None, legend=True)
plt.tight_layout()

#### Line Plot

In [None]:
# Plot each of the four measurements against the row index, one panel each,
# on a 2x2 grid.
fig, ax = plt.subplots(nrows=2, ncols=2, figsize=(8, 8))

# (column name, panel title), listed in the row-by-row order of the grid.
panels = [
    ('SepalLengthCm', 'Sepal Length'),
    ('SepalWidthCm', 'Sepal Width'),
    ('PetalLengthCm', 'Petal Length'),
    ('PetalWidthCm', 'Petal Width'),
]

# ax.flat walks the 2x2 array of axes row by row, matching `panels`.
for axes, (column, title) in zip(ax.flat, panels):
    iris_df[column].plot.line(ax=axes)
    axes.set_title(title)

# Keep titles and tick labels of neighbouring panels from overlapping.
fig.tight_layout()

In [None]:
# Histograms of the DataFrame's columns on a single 8x8 figure.
iris_df.hist(figsize=(8,8))
plt.tight_layout()

In [None]:
# Box plot of the DataFrame's columns.
# Uses `iris_df`, the DataFrame loaded at the top of the notebook; the bare
# name `iris` is never defined and raised NameError.
iris_df.boxplot(figsize=(10, 10))

In [None]:
# Default pandas plot of the DataFrame on one 12x12 figure.
# Uses `iris_df`, the DataFrame loaded at the top of the notebook; the bare
# name `iris` is never defined and raised NameError.
iris_df.plot(figsize=(12, 12))

In [None]:
# Box plots of the DataFrame's columns, grouped by the 'Species' column.
# Uses `iris_df`, the DataFrame loaded at the top of the notebook; the bare
# name `iris` is never defined and raised NameError.
iris_df.boxplot(by="Species", figsize=(12, 12))

In [None]:
# Pairwise plots of the columns, coloured by 'Species'.
# Uses `iris_df`, the DataFrame loaded at the top of the notebook; the bare
# name `iris` is never defined and raised NameError.
sns.pairplot(iris_df, hue="Species")

In [None]:
# Sepal length against sepal width, coloured by 'Species'.
# Uses `iris_df`, the DataFrame loaded at the top of the notebook; the bare
# name `iris` is never defined and raised NameError.
sns.scatterplot(x="SepalLengthCm", y="SepalWidthCm", data=iris_df, hue="Species")

## Iris Dataset: Classification Models

**Import the Libraries**

In [None]:

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score,confusion_matrix, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cluster import KMeans
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

In [None]:
# Features: every column except the last one.
# Target: the last column, kept as a single-column DataFrame.
# Uses `iris_df`, the DataFrame loaded at the top of the notebook; the bare
# name `iris` is never defined and raised NameError.
X = iris_df.iloc[:, :-1]
Y = iris_df.iloc[:, -1:]

In [None]:
# NOTE(review): `ohe` is not used by any cell visible here; it is kept so
# that cells relying on the name keep working.
ohe = OneHotEncoder()
LE = LabelEncoder()
# LabelEncoder expects a 1-D array of labels. Y is a single-column
# DataFrame at this point, so flatten it first; passing the column vector
# directly triggers scikit-learn's DataConversionWarning.
Y = LE.fit_transform(np.ravel(Y))

In [None]:
# Hold out 25% of the rows for testing; random_state=2 fixes the shuffle.
x_train, x_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.25, random_state=2
)

# Report the shapes of the feature and target arrays of each partition.
for label, features, target in (
    ("Train data size", x_train, y_train),
    ("Test data size", x_test, y_test),
):
    print(label, features.shape, target.shape)

**Logistic Regression**

In [None]:
# Fit a logistic-regression classifier with default settings (fit returns
# the estimator itself), then predict the labels of the held-out rows.
model = LogisticRegression().fit(x_train, y_train)
pred = model.predict(x_test)


In [None]:
# Evaluate the logistic-regression predictions on the test set.
# scikit-learn metrics take (y_true, y_pred) in that order. Swapping them
# leaves accuracy unchanged but transposes the confusion matrix and
# exchanges precision and recall in the classification report.
print("accuracy Score:\n", accuracy_score(y_test, pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, pred))
print("Classification Report:\n", classification_report(y_test, pred))

**Support Vector Classifier**

In [None]:
# Support vector classifier with default hyperparameters: train on the
# training partition, then predict the labels of the test partition.
model1 = SVC()
model1.fit(X=x_train, y=y_train)
pred1 = model1.predict(X=x_test)

In [None]:
# Evaluate the SVC predictions (`pred1`) on the test set. The previous code
# scored `pred`, i.e. the logistic-regression predictions, by mistake.
# scikit-learn metrics take (y_true, y_pred) in that order.
print("accuracy Score:\n", accuracy_score(y_test, pred1))
print("Confusion Matrix:\n", confusion_matrix(y_test, pred1))
print("Classification Report:\n", classification_report(y_test, pred1))

**GridSearchCV**

Tune the SVC hyperparameters with a cross-validated grid search to increase the accuracy of the model.

In [None]:
# Candidate values tried for both C and gamma.
search_values = [0.1, 0.8, 0.9, 1, 1.1, 1.2, 1.3, 1.4, 1.5]
param = dict(
    C=list(search_values),
    kernel=["linear", "rbf"],
    gamma=list(search_values),
)

# Exhaustive search over every combination in `param`, scored by accuracy
# with 10-fold cross-validation on the training partition.
grid_svc = GridSearchCV(estimator=model1, param_grid=param, scoring="accuracy", cv=10)
grid_svc.fit(x_train, y_train)

In [None]:
# Show the parameter combination the grid search selected as best.
grid_svc.best_params_

In [None]:
# Train an SVC with the hyperparameters selected by the grid search.
# They are read from grid_svc.best_params_ rather than copied by hand, so
# this cell stays correct if the grid, the data or the split changes.
gridsearch_svc = SVC(**grid_svc.best_params_)
gridsearch_svc.fit(x_train, y_train)
pred_grid = gridsearch_svc.predict(x_test)

# scikit-learn metrics take (y_true, y_pred) in that order. Swapping them
# transposes the confusion matrix and exchanges precision and recall.
print("accuracy Score:\n", accuracy_score(y_test, pred_grid))
print("Confusion Matrix:\n", confusion_matrix(y_test, pred_grid))
print("Classification Report:\n", classification_report(y_test, pred_grid))

**Decision Tree Classifier**

In [None]:
# Decision tree with default settings: fit on the training partition
# (fit returns the estimator itself) and predict the test partition.
model_dt = DecisionTreeClassifier().fit(x_train, y_train)
pred_dt = model_dt.predict(x_test)

In [None]:
# Evaluate the decision-tree predictions on the test set.
# scikit-learn metrics take (y_true, y_pred) in that order. Swapping them
# transposes the confusion matrix and exchanges precision and recall.
print("accuracy Score:\n", accuracy_score(y_test, pred_dt))
print("Confusion Matrix:\n", confusion_matrix(y_test, pred_dt))
print("Classification Report:\n", classification_report(y_test, pred_dt))

**Random Forest Classifier**

In [None]:
# Random forest that may use up to 3 parallel jobs (n_jobs=3): fit on the
# training partition, then predict the test partition.
model_rf = RandomForestClassifier(n_jobs=3)
model_rf.fit(X=x_train, y=y_train)
pred_rf = model_rf.predict(X=x_test)

In [None]:
# Evaluate the random-forest predictions on the test set.
# scikit-learn metrics take (y_true, y_pred) in that order. Swapping them
# transposes the confusion matrix and exchanges precision and recall.
print("accuracy Score:\n", accuracy_score(y_test, pred_rf))
print("Confusion Matrix:\n", confusion_matrix(y_test, pred_rf))
print("Classification Report:\n", classification_report(y_test, pred_rf))

**K Nearest Neighbors**

In [None]:
# k-nearest-neighbours classifier voting over the 3 closest training
# samples: fit (which returns the estimator itself), then predict.
model_knn = KNeighborsClassifier(n_neighbors=3).fit(x_train, y_train)
pred_knn = model_knn.predict(x_test)

In [None]:
# Evaluate the k-nearest-neighbours predictions (`pred_knn`) on the test
# set. The classification report previously used `pred`, which holds a
# different model's predictions.
# scikit-learn metrics take (y_true, y_pred) in that order.
print("accuracy Score:\n", accuracy_score(y_test, pred_knn))
print("Confusion Matrix:\n", confusion_matrix(y_test, pred_knn))
print("Classification Report:\n", classification_report(y_test, pred_knn))

Accuracy comparison of the various models.

In [None]:
# Score every fitted classifier on the held-out test set and compare the
# accuracies in a bar chart.
model_names = ['LogReg', 'SVM', 'GridSVC', 'DT', 'RF', 'KNN']
models = [model, model1, gridsearch_svc, model_dt, model_rf, model_knn]

# Predictions are computed inside the comprehension so the notebook-level
# `pred` (the logistic-regression predictions) is not overwritten.
# accuracy_score takes (y_true, y_pred) in that order.
accuracy_scores = [accuracy_score(y_test, clf.predict(x_test)) for clf in models]
print(accuracy_scores)

plt.bar(model_names, accuracy_scores)
# Zoom in on the top of the range, but lower the floor when a model scores
# below 0.90 so that its bar stays visible.
plt.ylim(min(0.90, min(accuracy_scores) - 0.02), 1.01)
plt.title("Accuracy comparision for various models", fontsize=15, color='r')
plt.xlabel("Models", fontsize=18, color='g')
plt.ylabel("Accuracy Score", fontsize=18, color='g')
plt.show()
    

> Updating...

**Thank you for visiting the kernel. If you have any suggestions, please comment. If you find the kernel helpful, please upvote.**