# 9. Data Cleansing

In [1]:
import pandas as pd
import numpy as np
import random

In [2]:
PATH_DATA = '../mydata/employee_data/'

## Get data

In [3]:
df_unclean = pd.read_csv(PATH_DATA+'unclean_data.csv')
df_unclean.head()

Unnamed: 0,x_id,date_day,x_value
0,134.0,2016-12-23,3.0
1,217.0,2017-02-20,2.0
2,173.0,2016-11-18,2.4
3,65.0,2016-04-21,3.0
4,102.0,2016-12-07,2.75


In [4]:
df_unclean0 = df_unclean.copy()

# Calculate and Filter missing data

In [5]:
## null count
df_unclean.isnull().sum()

x_id         356
date_day     100
x_value     9398
dtype: int64

In [6]:
## not null count
df_unclean.count()

x_id        139644
date_day    139900
x_value     130602
dtype: int64

In [7]:
df_unclean[df_unclean['x_id'].isnull()]

Unnamed: 0,x_id,date_day,x_value
260,,2016-08-11,4.0
1283,,2016-09-10,2.0
1379,,2016-05-06,3.5
1491,,2016-12-22,4.0
2204,,2017-02-22,2.0
...,...,...,...
138601,,2016-07-31,4.0
138667,,2016-09-23,3.0
139220,,2016-09-12,4.0
139407,,2016-10-30,4.0


In [8]:
df_unclean[~(df_unclean['x_id'].isnull())]

Unnamed: 0,x_id,date_day,x_value
0,134.0,2016-12-23,3.00
1,217.0,2017-02-20,2.00
2,173.0,2016-11-18,2.40
3,65.0,2016-04-21,3.00
4,102.0,2016-12-07,2.75
...,...,...,...
139995,43.0,2016-10-17,3.00
139996,663.0,2016-10-07,2.00
139997,143.0,2015-06-07,1.00
139998,130.0,2016-07-18,3.00


In [9]:
## drop all null from all columns
df_unclear2 = df_unclean.dropna()

In [10]:
df_unclear2.isnull().sum()

x_id        0
date_day    0
x_value     0
dtype: int64

# Data Imputation

In [11]:
df_unclean

Unnamed: 0,x_id,date_day,x_value
0,134.0,2016-12-23,3.00
1,217.0,2017-02-20,2.00
2,173.0,2016-11-18,2.40
3,65.0,2016-04-21,3.00
4,102.0,2016-12-07,2.75
...,...,...,...
139995,43.0,2016-10-17,3.00
139996,663.0,2016-10-07,2.00
139997,143.0,2015-06-07,1.00
139998,130.0,2016-07-18,3.00


In [12]:
# df_unclean['x_value'] = df_unclean['x_value'].fillna(df_unclean['x_id'])
df_unclean['x_value'] = df_unclean['x_value'].fillna(-1)

In [13]:
df_unclean.isnull().sum()

x_id        356
date_day    100
x_value       0
dtype: int64

# Forward and backwards filling of missing values

In [14]:
df_unclean = df_unclean0.copy()

In [15]:
df_unclean[df_unclean.x_value.isnull()].x_id.value_counts()

38.0     74
194.0    60
219.0    59
101.0    58
226.0    58
         ..
333.0     1
429.0     1
306.0     1
396.0     1
495.0     1
Name: x_id, Length: 505, dtype: int64

In [16]:
df_unclean_3 = df_unclean[(df_unclean['x_id'] == 59) & 
                          (~(df_unclean['date_day'].isnull()))]

In [17]:
df_unclean_3 = df_unclean_3.sort_values('date_day')
df_unclean_3.tail(10)

Unnamed: 0,x_id,date_day,x_value
57213,59.0,2017-03-09,2.25
91293,59.0,2017-03-10,2.25
9542,59.0,2017-03-11,2.5
91537,59.0,2017-03-12,4.0
60896,59.0,2017-03-13,
7260,59.0,2017-03-15,2.0
122862,59.0,2017-03-16,3.0
28334,59.0,2017-03-16,3.0
49441,59.0,2017-03-18,
105380,59.0,2017-03-18,


In [18]:
df_unclean_3.isnull().sum()

x_id         0
date_day     0
x_value     24
dtype: int64

In [19]:
df_unclean_3['x_value'].fillna(method="ffill").tail(10)

57213     2.25
91293     2.25
9542      2.50
91537     4.00
60896     4.00
7260      2.00
122862    3.00
28334     3.00
49441     3.00
105380    3.00
Name: x_value, dtype: float64

In [20]:
df_unclean_3['x_value'].fillna(method="bfill").tail(10)

57213     2.25
91293     2.25
9542      2.50
91537     4.00
60896     2.00
7260      2.00
122862    3.00
28334     3.00
49441      NaN
105380     NaN
Name: x_value, dtype: float64

## Duplicate

In [21]:
df_unclean

Unnamed: 0,x_id,date_day,x_value
0,134.0,2016-12-23,3.00
1,217.0,2017-02-20,2.00
2,173.0,2016-11-18,2.40
3,65.0,2016-04-21,3.00
4,102.0,2016-12-07,2.75
...,...,...,...
139995,43.0,2016-10-17,3.00
139996,663.0,2016-10-07,2.00
139997,143.0,2015-06-07,1.00
139998,130.0,2016-07-18,3.00


In [22]:
df_unclean.drop_duplicates()

Unnamed: 0,x_id,date_day,x_value
0,134.0,2016-12-23,3.000000
1,217.0,2017-02-20,2.000000
2,173.0,2016-11-18,2.400000
3,65.0,2016-04-21,3.000000
4,102.0,2016-12-07,2.750000
...,...,...,...
139982,190.0,2017-02-08,2.333333
139983,225.0,2016-09-18,3.500000
139991,827.0,2016-11-08,2.000000
139996,663.0,2016-10-07,2.000000


In [23]:
df_unclean_unique = df_unclean.drop_duplicates()

## Outliers

simple outliers formula

In [24]:
df_unclean_unique.isnull().sum()

x_id         240
date_day      73
x_value     6848
dtype: int64

In [25]:
df_unclean_unique_null = df_unclean_unique[df_unclean_unique.x_value.isnull()]
df_unclean_unique_nonnull = df_unclean_unique[~(df_unclean_unique.x_value.isnull())]

In [26]:
Mx = df_unclean_unique_nonnull['x_value'].mean()
Sx = df_unclean_unique_nonnull['x_value'].std()

In [27]:
Mx-(2*Sx), Mx+(2*Sx)

(1.00757346810413, 4.643254477692857)

In [28]:
def is_outlier(x):
    if (x >= Mx-(2*Sx)) & (x <= Mx+(2*Sx)) :
        return False
    else:
        return True

In [29]:
df_unclean_unique_nonnull['x_value'].apply(is_outlier)

0         False
1         False
2         False
3         False
4         False
          ...  
139982    False
139983    False
139991    False
139996    False
139997     True
Name: x_value, Length: 95330, dtype: bool

In [30]:
df_unclean_unique_nonnull['detect_outlier'] = df_unclean_unique_nonnull['x_value'].apply(is_outlier)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [31]:
df_unclean_unique_nonnull[df_unclean_unique_nonnull['detect_outlier'] == 1]

Unnamed: 0,x_id,date_day,x_value,detect_outlier
8,205.0,2016-05-23,1.0,True
9,161.0,2016-04-28,1.0,True
10,862.0,2016-12-16,1.0,True
13,293.0,2016-07-29,1.0,True
26,2.0,2015-08-26,1.0,True
...,...,...,...,...
139853,825.0,2016-05-21,1.0,True
139866,418.0,2017-02-25,1.0,True
139933,375.0,2016-09-19,1.0,True
139958,871.0,2017-03-16,1.0,True


In [32]:
## Handling Outlier (one options)
max_acceptable_value = Mx+(2*Sx)
min_acceptable_value = Mx-(2*Sx)
min_acceptable_value,max_acceptable_value

(1.00757346810413, 4.643254477692857)

In [33]:
df_unclean_unique_nonnull['x_value_handle'] = np.where(df_unclean_unique_nonnull['x_value'] > max_acceptable_value, max_acceptable_value,
         np.where(df_unclean_unique_nonnull['x_value'] < min_acceptable_value, min_acceptable_value,
                 df_unclean_unique_nonnull['x_value']))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [34]:
df_unclean_unique_nonnull[df_unclean_unique_nonnull['detect_outlier'] == 1]

Unnamed: 0,x_id,date_day,x_value,detect_outlier,x_value_handle
8,205.0,2016-05-23,1.0,True,1.007573
9,161.0,2016-04-28,1.0,True,1.007573
10,862.0,2016-12-16,1.0,True,1.007573
13,293.0,2016-07-29,1.0,True,1.007573
26,2.0,2015-08-26,1.0,True,1.007573
...,...,...,...,...,...
139853,825.0,2016-05-21,1.0,True,1.007573
139866,418.0,2017-02-25,1.0,True,1.007573
139933,375.0,2016-09-19,1.0,True,1.007573
139958,871.0,2017-03-16,1.0,True,1.007573
