# Data Cleaning
    1. Handling duplicated entries
        - drop the duplicated rows
        
    2. Handling missing data
        - drop the rows having more than 60% of data missing
        - drop the columns having more than 80% of data missing
        - for the columns having less than 10% of data missing - perform statistical imputation
            - for categorical - nominal = fill by mode
                              - ordinal = fill by median
            - for numeric - skewed - fill by median
                          - not skewed - fill by mean
                          
        - for the columns having more than 10% of data missing - ML based imputations
        - In case missing values are natural (not caused by system/human errors) - convert the column into categorical / binary
        - for columns having more than 30% of data missing - convert to categorical
    3. Handling unwanted columns
    4. Handling outliers

### Handling duplicated entries

In [26]:
import pandas as pd

In [27]:
# Load the raw sensor dataset from the local CSV file and report its
# dimensions as (rows, columns).
csv_path = r"C:\Users\aspdi\Downloads\datasets-1\datasets-1\datawh_missing.csv"
df = pd.read_csv(csv_path)
df.shape

(23, 7)

In [28]:
df.head()

Unnamed: 0,Dates,Temperature,Humidity,Pressure,Air Quality,Day id,Vibration
0,30-04-2018,218,182,4.0,2.0,1,45
1,01-05-2018,?,182,3.0,2.0,2,56
2,02-05-2018,.,439,,0.0,3,45
3,03-05-2018,2439,53,5.0,1.0,4,23
4,04-05-2018,824,444,5.0,,5,35


In [29]:
# to check the count of duplicates
df.duplicated().sum()

2

In [30]:
# Show every copy of each duplicated row; keep=False flags all occurrences,
# not just the repeats after the first one.
duplicate_mask = df.duplicated(keep=False)
df.loc[duplicate_mask]

Unnamed: 0,Dates,Temperature,Humidity,Pressure,Air Quality,Day id,Vibration
19,19-05-2018,766,535,3,2,20,39
20,19-05-2018,766,535,3,2,20,39
21,19-05-2018,766,535,3,2,20,39


In [31]:
# Remove duplicated rows in place, retaining the last occurrence of each
# duplicate group, then report the new (rows, columns) shape.
df.drop_duplicates(keep='last', inplace=True)
df.shape

(21, 7)

In [32]:
df.duplicated().sum()

0

### Handling missing data

In [33]:
#check for missing values
df.isnull().sum()

Dates          0
Temperature    0
Humidity       0
Pressure       2
Air Quality    1
Day id         0
Vibration      0
dtype: int64

In [34]:
# Convert the measurement columns to numeric dtype. Placeholder strings such
# as '?' or '.' cannot be parsed, so errors='coerce' turns them into NaN and
# they are then counted as missing values.
# DataFrame.apply hands each column to pd.to_numeric as a whole Series, which
# replaces four copy-pasted element-wise Series.apply calls with one
# vectorised conversion.
numeric_columns = ['Temperature', 'Humidity', 'Pressure', 'Air Quality']
df[numeric_columns] = df[numeric_columns].apply(pd.to_numeric, errors='coerce')

In [35]:
#check for missing values
df.isnull().sum()

Dates          0
Temperature    7
Humidity       3
Pressure       7
Air Quality    2
Day id         0
Vibration      0
dtype: int64

In [36]:
df

Unnamed: 0,Dates,Temperature,Humidity,Pressure,Air Quality,Day id,Vibration
0,30-04-2018,218.0,182.0,4.0,2.0,1,45
1,01-05-2018,,182.0,3.0,2.0,2,56
2,02-05-2018,,439.0,,0.0,3,45
3,03-05-2018,2439.0,53.0,5.0,1.0,4,23
4,04-05-2018,824.0,444.0,5.0,,5,35
5,05-05-2018,1744.0,,5.0,1.0,6,26
6,06-05-2018,786.0,,5.0,1.0,7,25
7,07-05-2018,1326.0,309.0,,1.0,8,26
8,08-05-2018,1804.0,188.0,,2.0,9,25
9,09-05-2018,,420.0,0.0,1.0,10,35


In [37]:
# drop the rows having more than 60% of data missing, i.e. rows with less than 40% of their values present (40% of the 7 columns):
0.4*7

2.8000000000000003

In [38]:
# Drop, in place, every row that has fewer than 4 non-null values out of the
# 7 columns, printing the shape before and after to show how many rows went.
# NOTE(review): 0.4 * 7 = 2.8 rounds up to 3, so thresh=4 is stricter than the
# stated "more than 60% missing" rule - confirm this threshold is intended.
print(df.shape)
df.dropna(axis=0, thresh=4, inplace=True)
print(df.shape)

(21, 7)
(20, 7)


In [39]:
df

Unnamed: 0,Dates,Temperature,Humidity,Pressure,Air Quality,Day id,Vibration
0,30-04-2018,218.0,182.0,4.0,2.0,1,45
1,01-05-2018,,182.0,3.0,2.0,2,56
2,02-05-2018,,439.0,,0.0,3,45
3,03-05-2018,2439.0,53.0,5.0,1.0,4,23
4,04-05-2018,824.0,444.0,5.0,,5,35
5,05-05-2018,1744.0,,5.0,1.0,6,26
6,06-05-2018,786.0,,5.0,1.0,7,25
7,07-05-2018,1326.0,309.0,,1.0,8,26
8,08-05-2018,1804.0,188.0,,2.0,9,25
9,09-05-2018,,420.0,0.0,1.0,10,35


In [40]:
# Skewness of each numeric column, used to choose mean vs. median imputation.
# numeric_only=True is required on pandas >= 2.0: the default became False,
# so the string 'Dates' column would otherwise raise a TypeError.
df.skew(numeric_only=True)

Temperature    0.047677
Humidity      -0.469442
Pressure      -0.780891
Air Quality   -0.410217
Day id         0.108418
Vibration      2.506968
dtype: float64

    - if skewness is between -0.1 and +0.1 = almost normal distribution - not skewed - fill by mean
    - else - skewed - fill by median

In [41]:
# Statistical imputation of the remaining missing values.
# Temperature is nearly symmetric (skewness within +/-0.1), so it is filled
# with the mean; the other three columns are skewed, so they are filled with
# the median.
# The fill goes through the DataFrame itself rather than
# `df.col.fillna(..., inplace=True)`: that chained form acts on a temporary
# Series, is deprecated in pandas 2.x, and leaves `df` unchanged once
# Copy-on-Write is enabled.
fill_values = {
    'Temperature': df['Temperature'].mean(),
    'Humidity': df['Humidity'].median(),
    'Pressure': df['Pressure'].median(),
    'Air Quality': df['Air Quality'].median(),
}
df.fillna(value=fill_values, inplace=True)

In [42]:
df.isnull().sum()

Dates          0
Temperature    0
Humidity       0
Pressure       0
Air Quality    0
Day id         0
Vibration      0
dtype: int64

### Handling unwanted columns

In [43]:
df.head()

Unnamed: 0,Dates,Temperature,Humidity,Pressure,Air Quality,Day id,Vibration
0,30-04-2018,218.0,182.0,4.0,2.0,1,45
1,01-05-2018,1579.714286,182.0,3.0,2.0,2,56
2,02-05-2018,1579.714286,439.0,3.5,0.0,3,45
3,03-05-2018,2439.0,53.0,5.0,1.0,4,23
4,04-05-2018,824.0,444.0,5.0,1.0,5,35


In [44]:
# Remove the 'Day id' column in place and report the resulting
# (rows, columns) shape.
df.drop('Day id', axis=1, inplace=True)
df.shape

(20, 6)

In [45]:
df.head()

Unnamed: 0,Dates,Temperature,Humidity,Pressure,Air Quality,Vibration
0,30-04-2018,218.0,182.0,4.0,2.0,45
1,01-05-2018,1579.714286,182.0,3.0,2.0,56
2,02-05-2018,1579.714286,439.0,3.5,0.0,45
3,03-05-2018,2439.0,53.0,5.0,1.0,23
4,04-05-2018,824.0,444.0,5.0,1.0,35


### Handling outliers

    - if skewness is between -1 and +1 = no heavy outliers
    - if skewness > +1 = extremely high value outliers are present
        - if the volume of data is high and the proportion of outliers is small - drop the rows
        - otherwise - capping - replace outliers by the nearest inliers
    - if skewness < -1 = extremely low value outliers are present
        - if the volume of data is high and the proportion of outliers is small - drop the rows
        - otherwise - capping - replace outliers by the nearest inliers

In [46]:
# Skewness of each numeric column, used to detect heavy outliers
# (values beyond +/-1).
# numeric_only=True is required on pandas >= 2.0: the default became False,
# so the string 'Dates' column would otherwise raise a TypeError.
df.skew(numeric_only=True)

Temperature    0.054894
Humidity      -0.582281
Pressure      -1.048203
Air Quality   -0.372134
Vibration      2.506968
dtype: float64

In [47]:
# Drop the rows holding extremely high Vibration outliers: keep only rows
# whose Vibration is strictly below the column's 95th percentile, printing
# the shape before and after the filter.
print(df.shape)
vibration_cutoff = df['Vibration'].quantile(0.95)
df = df.loc[df['Vibration'] < vibration_cutoff]
print(df.shape)

(20, 6)
(19, 6)


In [48]:
# Re-check the skewness of each numeric column after removing the Vibration
# outliers.
# numeric_only=True is required on pandas >= 2.0: the default became False,
# so the string 'Dates' column would otherwise raise a TypeError.
df.skew(numeric_only=True)

Temperature    0.032589
Humidity      -0.498286
Pressure      -0.969746
Air Quality   -0.410217
Vibration      0.002301
dtype: float64