In [1]:
import pandas as pd
import numpy as np

In [2]:
# Fetch the Iris measurements as a CSV straight from the seaborn-data repository
url = "https://raw.githubusercontent.com/mwaskom/seaborn-data/master/iris.csv"
df = pd.read_csv(url)
print("Original Data:", df.head(), sep="\n")

# Demo setup: blank out 'sepal_length' in 10 distinct, randomly chosen rows.
# Seeding the global NumPy RNG first keeps the selection repeatable.
np.random.seed(42)
missing_idx = np.random.choice(df.index, 10, replace=False)
df.loc[missing_idx, 'sepal_length'] = np.nan


Original Data:
   sepal_length  sepal_width  petal_length  petal_width species
0           5.1          3.5           1.4          0.2  setosa
1           4.9          3.0           1.4          0.2  setosa
2           4.7          3.2           1.3          0.2  setosa
3           4.6          3.1           1.5          0.2  setosa
4           5.0          3.6           1.4          0.2  setosa


In [3]:
# Structure and summary statistics.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() emitted a stray "None" line.
df.info()
print(df.describe())

# Per-column count of missing values. sep="\n" puts the Series on its own
# line without the space that print's default separator placed in front of
# the first row.
print("Missing values:", df.isnull().sum(), sep="\n")


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal_length  140 non-null    float64
 1   sepal_width   150 non-null    float64
 2   petal_length  150 non-null    float64
 3   petal_width   150 non-null    float64
 4   species       150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 6.0+ KB
None
       sepal_length  sepal_width  petal_length  petal_width
count    140.000000   150.000000    150.000000   150.000000
mean       5.816429     3.057333      3.758000     1.199333
std        0.831451     0.435866      1.765298     0.762238
min        4.300000     2.000000      1.000000     0.100000
25%        5.100000     2.800000      1.600000     0.300000
50%        5.750000     3.000000      4.350000     1.300000
75%        6.400000     3.300000      5.100000     1.800000
max        7.900000     4.400000      6.900000     2.500000

In [8]:
# 4.1 Handle missing values
# Impute the gaps in 'sepal_length' with the median of the observed values
sepal_length_median = df['sepal_length'].median()
df['sepal_length'] = df['sepal_length'].fillna(sepal_length_median)

# 4.2 Remove duplicates (if any)
# The first occurrence of each row is kept; the index is not renumbered.
df = df.drop_duplicates(keep='first')

# 4.3 Convert categorical variable 'species' to numeric codes
# The text labels are overwritten by their integer category codes.
species_categorical = df['species'].astype('category')
df['species'] = species_categorical.cat.codes


In [10]:
# Confirm the cleaning result: no nulls should remain in any column.
# sep="\n" keeps the first Series row aligned with the rest (the default
# separator put a space in front of it).
print("After cleaning, missing values:", df.isnull().sum(), sep="\n")
print(df.head())
# DataFrame.info() prints its own report and returns None; calling it
# directly avoids the stray "None" line that print(df.info()) produced.
df.info()

# Written answers to the exercise questions
print("1) Main issues found: missing values in 'sepal_length' and categorical variable 'species'.")
print("2) Importance: Handling missing values prevents errors, biases, and unreliable results.")
print("3) Risks if skipped: Incorrect model predictions, misleading analysis, and failed algorithms.")


After cleaning, missing values:
 sepal_length    0
sepal_width     0
petal_length    0
petal_width     0
species         0
dtype: int64
   sepal_length  sepal_width  petal_length  petal_width  species
0           5.1          3.5           1.4          0.2        0
1           4.9          3.0           1.4          0.2        0
2           4.7          3.2           1.3          0.2        0
3           4.6          3.1           1.5          0.2        0
4           5.0          3.6           1.4          0.2        0
<class 'pandas.core.frame.DataFrame'>
Index: 149 entries, 0 to 149
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal_length  149 non-null    float64
 1   sepal_width   149 non-null    float64
 2   petal_length  149 non-null    float64
 3   petal_width   149 non-null    float64
 4   species       149 non-null    int8   
dtypes: float64(4), int8(1)
memory usage: 6.0 KB
None
1) Main issues found