In [1]:
import pandas as pd
import numpy as np

# Toy roster of five people. 'Age' has two missing entries and 'Score' has
# one, so the frame can be used to demonstrate missing-value handling.
data = dict(
    Name=['Anna', 'Ben', 'Cara', 'Dylan', 'Ella'],
    Age=[22, np.nan, 25, 29, np.nan],
    Score=[88, 92, np.nan, 79, 85],
)
df = pd.DataFrame(data=data)
print("Original Data:\n", df)


Original Data:
     Name   Age  Score
0   Anna  22.0   88.0
1    Ben   NaN   92.0
2   Cara  25.0    NaN
3  Dylan  29.0   79.0
4   Ella   NaN   85.0


In [2]:
# Count the missing entries in each column of `df`.
print("Missing values:\n", df.isnull().sum())

# Strategy 1: drop every row that contains at least one missing value.
df_dropna = df.dropna()
print("Drop rows with NaNs:\n", df_dropna)

# Strategy 2: replace every missing value with the constant 0.
df_fill0 = df.fillna(0)
print("Filled with 0:\n", df_fill0)

# Strategy 3: replace missing 'Age' / 'Score' values with that column's mean.
# Passing a Series of per-column means to fillna fills each column with its
# own mean and returns a new frame, so `df` itself is left untouched.
df_mean = df.fillna(df[['Age', 'Score']].mean())
print("Filled with mean:\n", df_mean)

# Strategy 4: forward-fill, i.e. carry the last valid value in each column
# down into the missing rows that follow it.
# DataFrame.ffill() is used instead of fillna(method='ffill'), because the
# `method` keyword of fillna is deprecated and emits a FutureWarning.
df_ffill = df.ffill()
print("Forward fill:\n", df_ffill)


Missing values:
 Name     0
Age      2
Score    1
dtype: int64
Drop rows with NaNs:
     Name   Age  Score
0   Anna  22.0   88.0
3  Dylan  29.0   79.0
Filled with 0:
     Name   Age  Score
0   Anna  22.0   88.0
1    Ben   0.0   92.0
2   Cara  25.0    0.0
3  Dylan  29.0   79.0
4   Ella   0.0   85.0
Filled with mean:
     Name        Age  Score
0   Anna  22.000000   88.0
1    Ben  25.333333   92.0
2   Cara  25.000000   86.0
3  Dylan  29.000000   79.0
4   Ella  25.333333   85.0
Forward fill:
     Name   Age  Score
0   Anna  22.0   88.0
1    Ben  22.0   92.0
2   Cara  25.0   92.0
3  Dylan  29.0   79.0
4   Ella  29.0   85.0


  df_ffill = df.fillna(method='ffill')


In [3]:
# Reproducible sample: 100 draws from a normal distribution (mean 70, sd 10)
# under a fixed seed.
np.random.seed(42)
normal_scores = np.random.normal(70, 10, 100)
# Overwrite the last five draws with values far above the rest of the sample.
normal_scores[-5:] = [150, 160, 170, 180, 190]

df_outliers = pd.DataFrame({'Score': normal_scores})

# Interquartile-range rule: anything further than 1.5 * IQR below the first
# quartile or above the third quartile is treated as an outlier.
Q1 = df_outliers['Score'].quantile(0.25)
Q3 = df_outliers['Score'].quantile(0.75)
IQR = Q3 - Q1
lower, upper = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

print("Outlier bounds:", lower, upper)

# Keep only the rows whose score lies within [lower, upper], bounds included.
df_no_outliers = df_outliers[df_outliers['Score'].between(lower, upper)]
print("Cleaned data:\n", df_no_outliers.describe())


Outlier bounds: 45.647371286123644 95.33770600551577
Cleaned data:
            Score
count  94.000000
mean   69.294777
std     8.862589
min    50.124311
25%    64.089491
50%    68.730437
75%    75.091291
max    88.522782
