In [2]:


import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

In [3]:
# Location of the raw Iris CSV file on the local machine.
IRIS_CSV_PATH = "D:\\iris_dataset\\Iris.csv"
# Load the raw dataset into the working frame used by the cells below.
df = pd.read_csv(IRIS_CSV_PATH)

In [4]:
# Preview the first ten rows to confirm the columns and values loaded as expected.
first_rows = df.head(10)
print(first_rows)


   Id  SepalLengthCm  SepalWidthCm  PetalLengthCm  PetalWidthCm      Species
0   1            5.1           3.5            1.4           0.2  Iris-setosa
1   2            4.9           3.0            1.4           0.2  Iris-setosa
2   3            4.7           3.2            1.3           0.2  Iris-setosa
3   4            4.6           3.1            1.5           0.2  Iris-setosa
4   5            5.0           3.6            1.4           0.2  Iris-setosa
5   6            5.4           3.9            1.7           0.4  Iris-setosa
6   7            4.6           3.4            1.4           0.3  Iris-setosa
7   8            5.0           3.4            1.5           0.2  Iris-setosa
8   9            4.4           2.9            1.4           0.2  Iris-setosa
9  10            4.9           3.1            1.5           0.1  Iris-setosa


In [5]:
# Report the frame's dimensions as a (rows, columns) tuple.
n_rows, n_cols = df.shape
print((n_rows, n_cols))

(150, 6)


In [6]:
# Count the null entries in each column.
missing_per_column = df.isnull().sum()
print("Missing Values: ", missing_per_column)

Missing Values:  Id               0
SepalLengthCm    0
SepalWidthCm     0
PetalLengthCm    0
PetalWidthCm     0
Species          0
dtype: int64


In [7]:
# Show the dtype pandas inferred for each column before any preprocessing.
column_dtypes = df.dtypes
print("Initial Datatype:", column_dtypes)

Initial Datatype: Id                 int64
SepalLengthCm    float64
SepalWidthCm     float64
PetalLengthCm    float64
PetalWidthCm     float64
Species           object
dtype: object


In [8]:
# Remove the 'Id' column when it is present. errors='ignore' makes this a
# no-op if the column is already gone, so the cell can be re-run safely, and
# rebinding `df` avoids mutating the frame in place.
df = df.drop(columns=['Id'], errors='ignore')

In [9]:

# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
numeric_summary = df.describe()
print("Descriptive statistics: ", numeric_summary)


Descriptive statistics:         SepalLengthCm  SepalWidthCm  PetalLengthCm  PetalWidthCm
count     150.000000    150.000000     150.000000    150.000000
mean        5.843333      3.054000       3.758667      1.198667
std         0.828066      0.433594       1.764420      0.763161
min         4.300000      2.000000       1.000000      0.100000
25%         5.100000      2.800000       1.600000      0.300000
50%         5.800000      3.000000       4.350000      1.300000
75%         6.400000      3.300000       5.100000      1.800000
max         7.900000      4.400000       6.900000      2.500000


In [10]:
# IQR-based outlier removal, applied to one numeric column at a time.
# NOTE: rows removed for one column are already gone when the quartiles of the
# next column are computed, so the result depends on the column order.
IQR_MULTIPLIER = 1.5  # fence distance from the quartiles, in units of IQR
print("Outlier detection using IQR: ")
outlier_report = {}  # column name -> number of rows flagged as outliers
for col in df.select_dtypes(include=['float64', 'int64']).columns:
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - IQR_MULTIPLIER * IQR
    upper_bound = Q3 + IQR_MULTIPLIER * IQR
    # Rows strictly outside the fences count as outliers for this column.
    outliers = df[(df[col] < lower_bound) | (df[col] > upper_bound)]
    outlier_report[col] = outliers.shape[0]
    print(f"{col}: {outlier_report[col]} outliers")
    print("Shape before removal:",df.shape)
    # Keep only rows inside the fences. The explicit .copy() makes `df` an
    # independent frame instead of a slice of the previous one, so later
    # column assignments do not trigger SettingWithCopyWarning.
    df = df[(df[col] >= lower_bound) & (df[col] <= upper_bound)].copy()
    print("Shape after removal:", df.shape)

Outlier detection using IQR: 
SepalLengthCm: 0 outliers
Shape before removal: (150, 5)
Shape after removal: (150, 5)
SepalWidthCm: 4 outliers
Shape before removal: (150, 5)
Shape after removal: (146, 5)
PetalLengthCm: 0 outliers
Shape before removal: (146, 5)
Shape after removal: (146, 5)
PetalWidthCm: 0 outliers
Shape before removal: (146, 5)
Shape after removal: (146, 5)


In [11]:
# Min-max scale every numeric column, leaving non-numeric columns untouched.
scaler = MinMaxScaler()
numeric_cols = df.select_dtypes(include=['float64', 'int64']).columns
numeric_block = df[numeric_cols]
# Fit on the numeric columns, then write the scaled values back in place of them.
df[numeric_cols] = scaler.fit(numeric_block).transform(numeric_block)
scaled_preview = df.head()
print(scaled_preview)

   SepalLengthCm  SepalWidthCm  PetalLengthCm  PetalWidthCm      Species
0       0.222222      0.722222       0.067797      0.041667  Iris-setosa
1       0.166667      0.444444       0.067797      0.041667  Iris-setosa
2       0.111111      0.555556       0.050847      0.041667  Iris-setosa
3       0.083333      0.500000       0.084746      0.041667  Iris-setosa
4       0.194444      0.777778       0.067797      0.041667  Iris-setosa


In [12]:
# Replace the 'Species' column with one integer indicator column per species,
# each named with the 'Species' prefix.
df = pd.get_dummies(
    data=df,
    prefix='Species',
    columns=['Species'],
    dtype=int,
)

# Show the first rows of the encoded frame.
dummies_preview = df.head()
print(dummies_preview)

   SepalLengthCm  SepalWidthCm  PetalLengthCm  PetalWidthCm  \
0       0.222222      0.722222       0.067797      0.041667   
1       0.166667      0.444444       0.067797      0.041667   
2       0.111111      0.555556       0.050847      0.041667   
3       0.083333      0.500000       0.084746      0.041667   
4       0.194444      0.777778       0.067797      0.041667   

   Species_Iris-setosa  Species_Iris-versicolor  Species_Iris-virginica  
0                    1                        0                       0  
1                    1                        0                       0  
2                    1                        0                       0  
3                    1                        0                       0  
4                    1                        0                       0  


In [13]:
# Categorical to numeric: label-encode the 'Species' column.
# NOTE: this reloads the raw CSV into `df`, so the preprocessing applied to
# `df` in earlier cells is not carried into this frame.
df = pd.read_csv("D:\\iris_dataset\\Iris.csv").assign(
    # Keep the original string labels next to the encoded ones for comparison.
    Species_original=lambda frame: frame['Species']
)

# Fit the encoder on the string labels, then overwrite 'Species' with the integer codes.
le = LabelEncoder().fit(df['Species_original'])
df['Species'] = le.transform(df['Species_original'])

original_labels = df['Species_original'].unique()
print("Original species values (before encoding): ", original_labels)

encoded_labels = df['Species'].unique()
print("\nEncoded species values (after encoding): ", encoded_labels)




Original species values (before encoding):  ['Iris-setosa' 'Iris-versicolor' 'Iris-virginica']

Encoded species values (after encoding):  [0 1 2]


In [14]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the 'Species' column. sparse_output=False makes the encoder
# return a dense array instead of a sparse matrix.
encoder = OneHotEncoder(sparse_output=False)

# Fit on the single-column frame and transform it in one step.
encoded_values = encoder.fit_transform(df[['Species']])

# Wrap the encoded array in a DataFrame with the encoder's feature names.
# Reusing df's index keeps the rows aligned in the concat below; without it
# encoded_df would get a fresh 0..n-1 index and any non-default index on `df`
# would produce misaligned rows filled with NaN.
encoded_df = pd.DataFrame(
    encoded_values,
    columns=encoder.get_feature_names_out(['Species']),
    index=df.index,
)

# Append the indicator columns to the right of the existing columns.
df_encoded = pd.concat([df, encoded_df], axis=1)

print(df_encoded)


      Id  SepalLengthCm  SepalWidthCm  PetalLengthCm  PetalWidthCm  Species  \
0      1            5.1           3.5            1.4           0.2        0   
1      2            4.9           3.0            1.4           0.2        0   
2      3            4.7           3.2            1.3           0.2        0   
3      4            4.6           3.1            1.5           0.2        0   
4      5            5.0           3.6            1.4           0.2        0   
..   ...            ...           ...            ...           ...      ...   
145  146            6.7           3.0            5.2           2.3        2   
146  147            6.3           2.5            5.0           1.9        2   
147  148            6.5           3.0            5.2           2.0        2   
148  149            6.2           3.4            5.4           2.3        2   
149  150            5.9           3.0            5.1           1.8        2   

    Species_original  Species_0  Species_1  Species