In [59]:
# importing libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix


## Begin by reading the Iris dataset (Iris.csv) into a Jupyter notebook and name it iris_logistic_regression.ipynb.

In [44]:
# Import the dataset
df = pd.read_csv("Iris.csv")

In [45]:
#BASIC CHECKS#

#check no of rows and columns
print(f"\nSize of rows/columns: {df.shape}\n")

#check details of the dataset
print(df.info(),"\n")

#checking for null values
print(f"\nChecking for if there are null values: \n{df.isnull().sum()}")

# Check the 1st 5 values in the dataset
print(f"\nChecking the first 5 Rows and all columns of data: \n{df.head(150)}\n")

a= df['Species'].unique()
print(a)


Size of rows/columns: (150, 6)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             150 non-null    int64  
 1   SepalLengthCm  150 non-null    float64
 2   SepalWidthCm   150 non-null    float64
 3   PetalLengthCm  150 non-null    float64
 4   PetalWidthCm   150 non-null    float64
 5   Species        150 non-null    object 
dtypes: float64(4), int64(1), object(1)
memory usage: 7.2+ KB
None 


Checking for if there are null values: 
Id               0
SepalLengthCm    0
SepalWidthCm     0
PetalLengthCm    0
PetalWidthCm     0
Species          0
dtype: int64

Checking the first 5 Rows and all columns of data: 
      Id  SepalLengthCm  SepalWidthCm  PetalLengthCm  PetalWidthCm  \
0      1            5.1           3.5            1.4           0.2   
1      2            4.9           3.0            1.4           0.2   
2      3   

#### Observation:
##### 1 - There are 150 rows and 6 columns
##### 2 - The datatype for all columns is numerical except 'Species' which seems to be categorical data(Object)
##### 3 - There is no null values in the data
##### 4 - There is no obvious errors in the dataframe

### The dataset consists of three classes of irises. The objective is to create a classifier that will predict whether an iris belongs to the ‘Iris-setosa' class or not. This means that we have two classes: ‘Iris-setosa' and not-‘Iris-setosa’ (which includes ' Iris-versicolour' and 'Iris-virginica').
#### ○ Identify your independent variable x.
#### ○ Encode your dependent variable y such that ‘Iris-setosa' is encoded as 0, and 'Iris-versicolour' and 'Iris-virginica' are both encoded as
##### (0 corresponds to the 'Iris-setosa' class, and 1 corresponds to the not-‘Iris-setosa' class.)

In [46]:
#Independent variables for x


#transforming the type object to intergers for logistic regression
df['Species'] = LabelEncoder().fit_transform(df['Species'] != "Iris-setosa")

#gettign the unique values in Species
encode_worked = df['Species'].unique()

#checking if the encoding worked
print(encode_worked)

[0 1]


In [47]:
#check details of the dataset
print(df.info(),"\n")

#checking if LabelEncoder has worked
df.head(150)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             150 non-null    int64  
 1   SepalLengthCm  150 non-null    float64
 2   SepalWidthCm   150 non-null    float64
 3   PetalLengthCm  150 non-null    float64
 4   PetalWidthCm   150 non-null    float64
 5   Species        150 non-null    int64  
dtypes: float64(4), int64(2)
memory usage: 7.2 KB
None 



Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,0
1,2,4.9,3.0,1.4,0.2,0
2,3,4.7,3.2,1.3,0.2,0
3,4,4.6,3.1,1.5,0.2,0
4,5,5.0,3.6,1.4,0.2,0
...,...,...,...,...,...,...
145,146,6.7,3.0,5.2,2.3,1
146,147,6.3,2.5,5.0,1.9,1
147,148,6.5,3.0,5.2,2.0,1
148,149,6.2,3.4,5.4,2.3,1


In [48]:
#Seperated the Data into X and y for futher processing
X = df.drop('Species', axis=1).values
y = df['Species'].values
#Checking data is getting stored properly in x
X[:4]

array([[1. , 5.1, 3.5, 1.4, 0.2],
       [2. , 4.9, 3. , 1.4, 0.2],
       [3. , 4.7, 3.2, 1.3, 0.2],
       [4. , 4.6, 3.1, 1.5, 0.2]])

In [53]:
# Select and reshape data
y = y.reshape(-1, 1)
X = X.reshape(-1, X.shape[1])

print ("data:",X.shape, y.shape)

data: (150, 5) (150, 1)


## Split the data into a training and test set.

In [54]:
#TRAINING AND TESTS

# Split data into training and test sets
rseed = 23 # Use the same random seed for learning purposes to get the same result
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20,  random_state=rseed)

#Confirming if the data has been split correctly
print ("Training data:",X_train.shape, y_train.shape)
print ("Test data:",X_test.shape, y_test.shape)

Training data: (120, 5) (120, 1)
Test data: (30, 5) (30, 1)


## Use sklearn’s logistic regression function to fit a model and make predictions on the test set.

In [90]:
# fit a model
log_reg = LogisticRegression()
log_reg.fit(X_train, y_train.ravel())

# make predictions on test data
y_pred = log_reg.predict(X_test).reshape(-1,1)

## Use sklearn to generate a confusion matrix, which compares the predicted labels to the actual labels (gold labels).

In [94]:
classes = np.unique(y_test)

#comparing the predicted lables with actual lables
conf_mat = confusion_matrix(y_test, y_pred)
#Creating a dataframe for output with Labels 0("Iris-setosa") and 1('Iris-versicolour' and 'Iris-virginica')
matrix_df = pd.DataFrame(conf_mat, columns=classes, index=classes)
print(f"Comparison:\n{matrix_df}\nAbove comparison 0 ('Iris-setosa') and 1 ('Iris-versicolour' or 'Iris-virginica')")

Comparison:
    0   1
0  12   0
1   0  18
Above comparison 0 ('Iris-setosa') and 1 ('Iris-versicolour' or 'Iris-virginica')


## Analyse the confusion matrix and provide a prediction, in a comment, on whether the model is likely to have higher precision, higher recall, or similar precision and recall.

Based on the data and since its a dataframe of 2 columns with limited data, and based on the confusion matrix having values in a diagonal line which seems to be generally correct. the model seem to be correct

## Write your own code to calculate the accuracy, precision, and recall, and check whether your prediction was right

In [89]:
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score

# Accuracy, precision and recall for Iris-setosa  
acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred)
rec = recall_score(y_test, y_pred)
print('Accuracy:', acc)
print('Precision:', prec)
print('Recall:', rec)


accuracy: 1.0
Precision: 1.0
Recall: 1.0


The above data confirms that the predictions was correct