In [126]:
import numpy as np
import matplotlib as plt
import pandas as pd

In [127]:
# Read the raw iris measurements from the CSV file into a DataFrame.
data = pd.read_csv(
    'data_section2/iris_to_clean.csv',
)
# Row count noted by the author at this stage: 154.

In [128]:
# Count duplicates two ways: rows repeating a value of the id-like column,
# and rows that are identical across every column.
col = 'measurement.number'
print(
    f'Number of duplicates in {col}: {data.duplicated(col).sum()}',
    f'Number of duplicates in entire df: {data.duplicated().sum()}',
    sep='\n',
)

Number of duplicates in measurement.number: 4
Number of duplicates in entire df: 4


In [129]:
# Remove fully duplicated rows, keeping the first occurrence of each.
# The defaults are spelled out: compare all columns, keep the first row.
data = data.drop_duplicates(subset=None, keep='first')
# Row count noted by the author after this step: 150.

In [130]:
# Boolean frame with the same shape as `data`: True where a cell is missing.
missing_values = data.isna()

In [131]:
# Number of missing cells per column, most affected columns first.
(
    missing_values
    .sum()
    .sort_values(ascending=False)
)

petal.length          2
sepal.width           1
sepal.length          0
measurement.number    0
petal.width           0
variety               0
dtype: int64

In [132]:
# Preview (first five) of the rows that have a missing value in at least one column.
data.loc[missing_values.any(axis='columns')].head(5)

Unnamed: 0,measurement.number,sepal.length,sepal.width,petal.length,petal.width,variety
26,27,5.0,,1.6,0.4,Setosa
36,37,5.5,3.5,,0.2,Setosa
134,131,7.4,2.8,,1.9,Virginica


In [133]:
# Share of missing cells per column, largest first.
# The mean of a boolean column is a fraction in [0, 1], not a percentage.
(
    missing_values
    .mean()
    .sort_values(ascending=False)
)

petal.length          0.013333
sepal.width           0.006667
sepal.length          0.000000
measurement.number    0.000000
petal.width           0.000000
variety               0.000000
dtype: float64

In [134]:
# Report the share of missing cells for every column, formatted as a percentage.
print('Percentage of missing values per column :')
# The per-column mean of the null mask is computed once for the whole frame,
# then walked in column order.
for col, pct_missing in data.isnull().mean().items():
    print(f'{col} - {pct_missing:.2%}')

Percentage of missing values per column :
measurement.number - 0.00%
sepal.length - 0.00%
sepal.width - 0.67%
petal.length - 1.33%
petal.width - 0.00%
variety - 0.00%


In [135]:
# Drop every row that has at least one missing value.
# Reassign instead of using inplace=True: `data` came from a filtering step
# (drop_duplicates), and mutating such a frame in place can raise
# SettingWithCopyWarning on older pandas. Reassignment also matches the
# style of the drop_duplicates cell.
data = data.dropna()
# Row count noted by the author after this step: 147.

In [136]:
# Find outliers in each numeric column with the 1.5 * IQR rule, then drop
# every row flagged in at least one column.
#
# Each column's bounds are computed on the same, unfiltered `data`, so the
# result does not depend on column order. The per-column masks are OR-ed
# together and the frame is filtered once after the loop. Filtering inside
# the loop would overwrite `data_cleaned` on each pass and keep only the
# last column's filter.
numeric_cols = ['sepal.length', 'sepal.width', 'petal.length', 'petal.width']

# True for rows flagged as an outlier in any column seen so far
any_outlier = pd.Series(False, index=data.index)

for col in numeric_cols:
    # first and third quartiles
    q1 = data[col].quantile(0.25)
    q3 = data[col].quantile(0.75)
    iqr = q3 - q1
    # lower and upper bounds
    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr

    # find outliers for this column
    outliers = (data[col] < lower_bound) | (data[col] > upper_bound)
    print(f'Column {col} has {outliers.sum()} outliers')

    any_outlier = any_outlier | outliers

# remove rows that are an outlier in at least one column
data_cleaned = data[~any_outlier]

Column sepal.length has 1 outliers
Column sepal.width has 5 outliers
Column petal.length has 1 outliers
Column petal.width has 1 outliers
