In [2]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Load dataset
df = pd.read_csv('data/iris.csv')
df.head(5)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [3]:
# a. Insert missing and duplicate rows (for practice)
df.loc[0, 'sepal_length'] = None
df = pd.concat([df, df.iloc[[1]]])  # add duplicate

In [4]:
# Check missing and duplicates
missing = df.isnull().sum()
duplicates = df.duplicated().sum()
missing, duplicates

(sepal_length    1
 sepal_width     0
 petal_length    0
 petal_width     0
 species         0
 dtype: int64,
 np.int64(4))

In [5]:
# Handle missing and duplicate
df = df.drop_duplicates()
df = df.fillna(df.mean(numeric_only=True))

df.head()  # → after cleaning

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.861644,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [6]:

# b. Merge with external species characteristics
species_info = pd.DataFrame({
    'species': ['setosa', 'versicolor', 'virginica'],
    'color': ['blue', 'purple', 'yellow'],
    'blooming_time': ['spring', 'summer', 'autumn']
})


In [7]:
df = pd.merge(df, species_info, on='species')
df.head()  # → after merge

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species,color,blooming_time
0,5.861644,3.5,1.4,0.2,setosa,blue,spring
1,4.9,3.0,1.4,0.2,setosa,blue,spring
2,4.7,3.2,1.3,0.2,setosa,blue,spring
3,4.6,3.1,1.5,0.2,setosa,blue,spring
4,5.0,3.6,1.4,0.2,setosa,blue,spring


In [8]:
# c. Normalize sepal/petal measurements
scaler = StandardScaler()
df[['sepal_length', 'sepal_width', 'petal_length', 'petal_width']] = scaler.fit_transform(
    df[['sepal_length', 'sepal_width', 'petal_length', 'petal_width']]
)

In [9]:
df[['sepal_length', 'sepal_width', 'petal_length', 'petal_width']].head()  # → after normalization

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
0,0.0,1.019971,-1.357737,-1.3357
1,-1.167185,-0.128082,-1.357737,-1.3357
2,-1.409933,0.331139,-1.414778,-1.3357
3,-1.531307,0.101529,-1.300696,-1.3357
4,-1.045811,1.249582,-1.357737,-1.3357


In [10]:
# d. Add size_ratio = petal_length / sepal_length
df['size_ratio'] = df['petal_length'] / df['sepal_length']
df[['petal_length', 'sepal_length', 'size_ratio']].head()  # → with size_ratio

Unnamed: 0,petal_length,sepal_length,size_ratio
0,-1.357737,0.0,-inf
1,-1.357737,-1.167185,1.163257
2,-1.414778,-1.409933,1.003436
3,-1.300696,-1.531307,0.849402
4,-1.357737,-1.045811,1.298262
