## 1. Import all required Python libraries

In [None]:
!pip install pandas numpy matplotlib seaborn

In [None]:
from pathlib import Path

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
!pip install kagglehub

In [None]:
import kagglehub

## 2. Locate an open-source dataset

In [None]:
# Fetch the 'uciml/iris' dataset through kagglehub; the returned value is used
# below as the local folder that contains Iris.csv.
path = kagglehub.dataset_download('uciml/iris')

In [None]:
# Show the location returned by the download step.
print("path to dataset: ", path) 

## 3. Load the dataset into a pandas DataFrame

In [None]:
# Build the CSV location with pathlib instead of string concatenation so the
# path separator is handled for us, then load the file into a DataFrame.
df = pd.read_csv(Path(path) / "Iris.csv")

## 4. Display the initial statistics

In [None]:
# Preview the first rows. Leaving the frame as the cell's last expression lets
# Jupyter render it as a table instead of print()'s plain-text dump.
df.head()

In [None]:
# Per-column overview: dtype, non-null count and memory usage.
df.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

In [None]:
# (number of rows, number of columns)
df.shape

## 5. Scanning for inconsistencies

In [None]:
# Number of missing entries in each column (all zeros means nothing is missing).
df.isna().sum()

In [None]:
# include='all' extends the summary to the non-numeric columns as well, which
# helps spot unexpected category values.
df.describe(include='all')

## The Iris dataset has no missing values or inconsistencies.

Below are a few methods for dealing with inconsistencies in a dataset when they are present.

1. Imputation: the process of filling in missing values in a dataset.
    - Simple Imputation: filling in missing values with the mean, median or mode.
    - KNN Imputation: filling in missing values with the average of the k nearest neighbours.

### Simple Imputation using sklearn SimpleImputer

In [None]:
!pip install scikit-learn

In [None]:
from sklearn.impute import SimpleImputer

# Toy 3x3 matrix in which every column has at least one missing entry.
data = np.array([
    [1, 2, np.nan],
    [4, np.nan, 6],
    [np.nan, 8, np.nan],
])

# strategy='mean': each NaN is replaced by the mean of the observed values
# in its column.
imputer = SimpleImputer(strategy='mean')
imputed_data = imputer.fit(data).transform(data)

# Show the matrix before and after imputation.
for label, matrix in (("Original data:\n", data), ("Imputed data:\n", imputed_data)):
    print(label, matrix)

### KNN imputation using sklearn KNNImputer

In [None]:
from sklearn.impute import KNNImputer

# Fill each NaN with the average of that feature over the 2 nearest neighbours.
# `data` is the NumPy matrix built in the SimpleImputer example.
knn_imputer = KNNImputer(n_neighbors=2)
knn_imputed_data = knn_imputer.fit(data).transform(data)

print("KNN Imputed Data:\n", knn_imputed_data)

## 6. Outliers detection and handling

In [None]:
# IQR rule: a value is an outlier when it lies more than 1.5 * IQR below the
# first quartile or above the third quartile of its column.
nc = df.drop(columns=['Id', 'Species'])  # nc is numerical columns
Q1 = nc.quantile(0.25)
Q3 = nc.quantile(0.75)
IQR = Q3 - Q1

# Boolean mask: True for rows that have an outlying value in any column.
outliers = ((nc < Q1 - 1.5 * IQR) | (nc > Q3 + 1.5 * IQR)).any(axis=1)

# Printing a long boolean mask is truncated to its first and last rows, which
# can hide flagged rows in the middle. Report the count and display the
# flagged rows themselves instead.
print("rows with at least one outlier:", outliers.sum())
nc[outliers]

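### Checking the Iris data for outliers

A long boolean mask is truncated when printed, so count the flagged rows with `outliers.sum()` rather than reading the printout. With the 1.5 × IQR rule, a few `SepalWidthCm` values in the Iris data fall outside the fences, so the dataset is not entirely free of outliers.

Below is a sample outlier detection example.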

In [None]:
# Small frame with one deliberately extreme value in each column.
sample_df = pd.DataFrame({
    'col1': [10, 20, 3000, 25],
    'col2': [23, 4000, 30, 20]
})

# Quartiles and inter-quartile range, computed column by column.
q1, q3 = sample_df.quantile(0.25), sample_df.quantile(0.75)
iqr = q3 - q1

# Fences of the 1.5 * IQR rule.
lowerbound = q1 - 1.5 * iqr
upperbound = q3 + 1.5 * iqr

# A row is flagged when any of its values falls outside the fences.
below_fence = sample_df.lt(lowerbound)
above_fence = sample_df.gt(upperbound)
outliers = (below_fence | above_fence).any(axis=1)
print(outliers)

## Transformation / Scaling
1. Z-score normalization
2. Decimal scaling
3. Min-max scaling

In [None]:
# NOTE: this rebinds `data`, which the imputation cells above used for a NumPy
# array; re-run those cells before going back to them.
data = pd.DataFrame({
    'feature': [20, 40, 60, 80, 100]
})

# z score normalization: (x - mean) / std
# pandas' Series.std() is the sample standard deviation (ddof=1).
mean = data['feature'].mean()
std = data['feature'].std()
data['z_score_normalized'] = ((data['feature']-mean)/std)

# decimal scaling: x / 10^j, where j is the smallest integer for which
# max(|x / 10^j|) < 1. Deriving j from the data (instead of hard-coding 2)
# keeps the largest value strictly below 1; here max = 100, so j = 3.
abs_max = data['feature'].abs().max()
j = 0
while abs_max / 10 ** j >= 1:
    j += 1
data['decimal_scaled'] = data['feature'] / 10 ** j

# min max scaling: (x - min) / (max - min), mapping the column onto [0, 1]
x_min = data['feature'].min()
x_max = data['feature'].max()
data['min_max_scaled'] = (data['feature'] - x_min)/(x_max - x_min)

print(data)

## 7. Transformation on one column of iris dataset

In [None]:
# Work on a copy so the loaded frame `df` stays untouched.
df_transformed = df.copy()

# Min-max scale SepalLengthCm: (x - min) / (max - min).
sepal_length = df_transformed['SepalLengthCm']
sepal_min = sepal_length.min()
sepal_max = sepal_length.max()
df_transformed["min_max_scaled_sepal_length"] = (sepal_length - sepal_min) / (sepal_max - sepal_min)

df_transformed.head()

## 8. Turn Categorical Variables into Quantitative Variables in Python
1. One Hot Encoding - creates binary columns for each unique category.
2. Label Encoding - assigns a unique integer to each category.

In [None]:
from sklearn.preprocessing import LabelEncoder

# Toy frame with two categorical columns.
new_df = pd.DataFrame({
    'color': ['Red', 'Blue', 'Green', 'Blue', 'Red'],
    'size' : ['S', 'M', 'L', 'M', 'S']
})

# One hot encoding: one indicator column per distinct value of each listed column.
one_hot_encoded = pd.get_dummies(new_df, columns=['color', 'size'])

# Label encoding: one integer per category. A separate encoder is fitted for
# each column; refitting a single encoder on the next column would discard the
# classes learned for the previous one, so its codes could no longer be mapped
# back with inverse_transform.
label_encoders = {}
new_df_label_encoded = new_df.copy()
for column in ['color', 'size']:
    label_encoders[column] = LabelEncoder()
    new_df_label_encoded[column] = label_encoders[column].fit_transform(new_df[column])

# Keep the original name bound, as before, to the encoder fitted on 'size'.
label_encoder = label_encoders['size']

print("original df:\n", new_df)
print("one hot encoded df:\n", one_hot_encoded)
print("label encoded:\n", new_df_label_encoded)