In [1]:
from sklearn.datasets import load_iris
import pandas as pd
import numpy as np
from sklearn.datasets import load_iris
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline


In [2]:
# Fetch the bundled Iris dataset and unpack it into a feature matrix and a label vector.
iris = load_iris()
X, y = iris.data, iris.target


Splitting the Data into Training and Test Sets

In [3]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for testing; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


 Training a KNN Model

In [4]:
from sklearn.neighbors import KNeighborsClassifier

# Build a k-nearest-neighbours classifier and fit it on the training split.
N_NEIGHBORS = 3  # the "k" in KNN
knn = KNeighborsClassifier(n_neighbors=N_NEIGHBORS)
knn.fit(X_train, y_train)


Evaluate Using Accuracy and F1 Score

In [5]:
from sklearn.metrics import accuracy_score, f1_score

# Label the held-out samples with the fitted classifier.
y_pred = knn.predict(X=X_test)

# Compare predictions against the true labels.
# average='weighted' is passed to f1_score because the target is multiclass.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
f1 = f1_score(y_true=y_test, y_pred=y_pred, average='weighted')

print(f"Accuracy: {accuracy:.2f}")
print(f"F1 Score: {f1:.2f}")


Accuracy: 1.00
F1 Score: 1.00


Simulate a categorical feature for one-hot encoding

In [6]:
# Reload Iris and wrap the feature matrix in a DataFrame with named columns.
# Note: this rebinds X, replacing the plain array used by the KNN cells earlier.
iris = load_iris()
X = pd.DataFrame(data=iris.data, columns=iris.feature_names)

In [7]:
# Bucket petal length into one bin per label to get a categorical column
# that can later be one-hot encoded.
size_labels = ['small', 'medium', 'large']
X['petal size'] = pd.cut(X['petal length (cm)'], bins=len(size_labels), labels=size_labels)


Simulate missing values in 'sepal length (cm)'

In [8]:
# Blank out sepal length for row labels 0 through 10.
# .loc slicing is label-based and includes the end label, so 11 rows are affected.
missing_rows = slice(0, 10)
X.loc[missing_rows, 'sepal length (cm)'] = np.nan

Define numerical and categorical columns

In [9]:
# Columns handled by the numeric branch of the preprocessing pipeline.
numeric_features = [
    'sepal length (cm)',
    'sepal width (cm)',
    'petal length (cm)',
    'petal width (cm)',
]

# Columns handled by the categorical (one-hot) branch.
categorical_features = ['petal size']

Impute missing values and standardize the numeric features

In [10]:
# Numeric branch: fill missing entries with the column mean, then standardize.
numeric_steps = [
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
]
numeric_transformer = Pipeline(steps=numeric_steps)


one-hot encode 'petal size'

In [11]:
# Categorical branch: one-hot encode, with drop='first' so the first level
# of each feature is omitted from the output columns.
onehot = OneHotEncoder(drop='first')
categorical_transformer = Pipeline(steps=[('encoder', onehot)])

Full transformer

In [12]:
# Route each column group through its own sub-pipeline; the output columns
# follow the order of the transformers list (numeric first, then categorical).
#
# OneHotEncoder produces a sparse matrix by default, and ColumnTransformer
# returns a sparse result whenever the stacked output's density falls below
# sparse_threshold (default 0.3). The result is later wrapped in a pandas
# DataFrame, which needs a dense array, so sparse_threshold=0 is set to
# always get a dense ndarray regardless of the data.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
    ],
    sparse_threshold=0,
)

Apply transformations

In [13]:
# Fit the preprocessor on the whole frame and transform it in one pass.
# The fitted `preprocessor` is kept so its encoder can be queried for column names afterwards.
X_transformed = preprocessor.fit_transform(X)


In [14]:
# Ask the fitted encoder step for the names of the one-hot columns it produced,
# then list them after the numeric names.
fitted_encoder = preprocessor.named_transformers_['cat'].named_steps['encoder']
encoded_cat_names = fitted_encoder.get_feature_names_out(categorical_features)
all_feature_names = [*numeric_features, *encoded_cat_names]

Build the final DataFrame using the collected feature names

In [15]:
# Wrap the transformed matrix in a DataFrame with the assembled column names.
X_final = pd.DataFrame(data=X_transformed, columns=all_feature_names)

Save the final DataFrame with updated features to CSV

In [16]:
# Write the preprocessed features to disk; index=False leaves the row index out of the file.
output_path = "iris_updated.csv"
X_final.to_csv(output_path, index=False)
print("Transformation complete. File saved as 'iris_updated.csv'.")

Transformation complete. File saved as 'iris_updated.csv'.
