In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score

# Understanding the Problem

* Input: SepalLengthCm, SepalWidthCm, PetalLengthCm, PetalWidthCm
* Output: Species (three classes, so this is a multi-class classification problem)
* Logistic Regression will be used as our model

In [None]:
# Load the Iris CSV from the Kaggle input directory and preview the first rows.
iris_csv_path = '/kaggle/input/iriscsv/Iris.csv'
data = pd.read_csv(iris_csv_path)
data.head()

In [None]:
# Number of rows and columns in the raw frame.
n_rows, n_cols = data.shape
(n_rows, n_cols)

In [None]:
# How many samples there are for each species (class balance check).
species_counts = data['Species'].value_counts()
species_counts

**so data is not imbalanced**

In [None]:
# Count of missing values in every column.
missing_per_column = data.isna().sum()
missing_per_column

In [None]:
# 'Id' is not one of the four input features, so remove it before analysis.
# errors='ignore' keeps this cell re-runnable: without it, running the cell a
# second time raises KeyError because 'Id' has already been dropped.
data = data.drop(columns=['Id'], errors='ignore')
data.describe()

# Label Encoding of the Species column

In [None]:
# Replace the species names with integer codes.
# fit_transform() both learns the classes and encodes the column, so a
# separate fit() call beforehand is redundant and has been removed.
encoder = LabelEncoder()
data['Species'] = encoder.fit_transform(data['Species'])

# encoder.classes_ is indexed by the integer code, so enumerating it gives the
# code -> species-name lookup used later to restore readable labels in plots.
dictionary = dict(enumerate(encoder.classes_))
print(dictionary)
data.head()

# Visualising Data

In [None]:
# Pairwise correlation of all columns, annotated with the coefficient values.
corr_matrix = data.corr()
sns.heatmap(corr_matrix, annot=True)

In [None]:
# Build a plotting copy whose Species column holds the original names again
# (via the code -> name lookup), leaving `data` itself untouched.
df = data.assign(Species=data['Species'].map(dictionary))
sns.pairplot(df, hue='Species', diag_kind='kde')

In [None]:
# Draw one histogram per feature onto the same axes, each in its own colour.
hist_colors = {
    'SepalLengthCm': 'green',
    'SepalWidthCm': 'yellow',
    'PetalLengthCm': 'blue',
    'PetalWidthCm': 'red',
}
for feature_name, colour in hist_colors.items():
    data[feature_name].hist(color=colour, label=feature_name)

plt.title('Histogram of DataSet Features')
plt.xlabel('Features')
plt.ylabel('Frequency')
plt.legend()
plt.show()

# Analysis as per above plots
**Histogram:** <br>
1. PetalWidthCm has the lowest values.
2. SepalLengthCm seems to have higher values than any other feature.
3. PetalLengthCm seems to have the highest range.
<br>
**PairPlot**<br>
* **Virginica** seems to have higher SepalLengthCm per SepalWidthCm and PetalWidthCm, and higher PetalLengthCm per SepalWidthCm and PetalWidthCm.
* **Versicolor** seems to have higher PetalLengthCm per PetalWidthCm and SepalWidthCm, and higher SepalLengthCm per SepalWidthCm and PetalWidthCm; however, its maximum values for all features are lower than those of the Virginica species.
* **Setosa** seems to have higher SepalLengthCm and SepalWidthCm values within a smaller range of PetalLengthCm (0-2) and PetalWidthCm (0-1).

# Scaling and Splitting of the Data

In [None]:
# Print the spread (max - min) of every feature in its original units.
print('Range of Features before:')
for feature_name in ['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm']:
    column = data[feature_name]
    print(column.max() - column.min())

In [None]:
# Separate the four measurement columns (inputs) from the encoded species (target).
feature_cols = ['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm']
X = data[feature_cols].values
Y = data['Species'].values.flatten()

# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=36,
)
x_train[:5], y_train[:5]

In [None]:
# Standardise the features.
# The scaler is fitted on the training split ONLY; the test split is then
# transformed with the mean/std learned from training data. Calling
# fit_transform() on the test split would refit the scaler, so train and test
# would be scaled with different statistics and the evaluation would use
# inputs the model was not trained to expect.
scaler = StandardScaler()
x_train_scale = scaler.fit_transform(x_train)
x_test_scale = scaler.transform(x_test)

x_train_scale[:5], x_test_scale[:5]

# Model Training

In [None]:
def accuracy_of_model(y_test, y_pred):
    """Print the confusion matrix and the accuracy score of a set of predictions.

    Parameters
    ----------
    y_test : array-like
        True labels.
    y_pred : array-like
        Predicted labels to compare against ``y_test``.

    Returns
    -------
    None
        The results are only printed.
    """
    matrix = confusion_matrix(y_test, y_pred)
    print("Confusion Matrix =>\n{}".format(matrix))

    score = accuracy_score(y_test, y_pred)
    print('Accuracy Score => {}'.format(score))

In [None]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic-regression classifier on the scaled training features.
mlogreg = LogisticRegression().fit(x_train_scale, y_train)

# Predict on both splits so training and test performance can be compared.
y_train_pred = mlogreg.predict(x_train_scale)
y_test_pred = mlogreg.predict(x_test_scale)

print('Model evaluation for training data: ')
accuracy_of_model(y_train, y_train_pred)

print('\nModel evaluation for test data: ')
accuracy_of_model(y_test, y_test_pred)

# Plot of Predicted to Actual Values

In [None]:
# Show the code -> species lookup so the y-axis values can be interpreted.
print(dictionary)

features = ['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm']
fig, ax = plt.subplots(1, 4, figsize=(14, 4), sharey=True)

# One panel per feature: scaled test feature on x, species code on y,
# with the true labels and the model's predictions overlaid.
for idx, (panel, feature_name) in enumerate(zip(ax, features)):
    feature_values = x_test_scale[:, idx]
    panel.scatter(feature_values, y_test, color='blue', label='Actual')
    panel.scatter(feature_values, y_test_pred, color='orange', label='Predicted', marker='x')
    panel.set_xlabel(feature_name)
    panel.legend(framealpha=1, frameon=True)

# Shared y-axis label and overall title placed in figure coordinates.
fig.text(0.07, 0.5, 'SPECIES', va='center', rotation='vertical', fontsize=12)
fig.text(0.35, 0.95, 'Predicted VS Actual Outputs of Test Data', va='center', rotation='horizontal', fontsize=12)
plt.show()
plt.close()