# Detecting and Replacing Outliers

In [23]:
import pandas as pd
import numpy as np
import os 

In [24]:
# Locate the census CSV relative to the current working directory:
# two levels up, then into the "data" folder.
filename = os.path.join(
    os.getcwd(),
    "..",
    "..",
    "data",
    "censusData.csv",
)

# Row 0 of the file supplies the column names.
df = pd.read_csv(filename, header=0)

###  Get the Dimensions of the Dataset

In [25]:
# Display the (rows, columns) tuple of the loaded DataFrame.
df.shape

(7000, 15)

### Glance at the Data

In [26]:
# Preview the first rows of the DataFrame (head() defaults to five rows).
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex_selfID,capital-gain,capital-loss,hours-per-week,native-country,income
0,36,State-gov,112074,Doctorate,16,Never-married,Prof-specialty,Not-in-family,White,Non-Female,0,0,45,United-States,<=50K
1,35,Private,32528,HS-grad,9,Married-civ-spouse,Handlers-cleaners,Husband,White,Non-Female,0,0,45,United-States,<=50K
2,21,Private,270043,Some-college,10,Never-married,Other-service,Own-child,White,Female,0,0,16,United-States,<=50K
3,45,Private,168837,Some-college,10,Married-civ-spouse,Adm-clerical,Wife,White,Female,0,0,24,Canada,>50K
4,39,Private,297449,Bachelors,13,Married-civ-spouse,Prof-specialty,Husband,White,Non-Female,0,0,40,United-States,>50K


## Step 1: Compute the n-th Percentile of a Given Column

In [27]:
# Value below which 99.9% of the 'hours-per-week' observations fall.
hpw_999 = np.percentile(
    df['hours-per-week'],
    q=99.9,
)
hpw_999

99.0

In [28]:
# 90th percentile of the 'education-num' column.
edu_90 = np.percentile(
    df['education-num'],
    q=90.0,
)
edu_90

13.0

## Step 2: Add a Column With the Winsorized Version of the Original Column.

In [29]:
import scipy.stats as stats

In [30]:
# Winsorize 'education-num': the lowest 1% and highest 1% of values are
# replaced by the nearest value that remains inside those limits. The result
# is stored in a new column so the original column is left untouched.
df['education-num-win'] = stats.mstats.winsorize(
    df['education-num'],
    limits=(0.01, 0.01),  # (lower fraction, upper fraction)
)
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex_selfID,capital-gain,capital-loss,hours-per-week,native-country,income,education-num-win
0,36,State-gov,112074,Doctorate,16,Never-married,Prof-specialty,Not-in-family,White,Non-Female,0,0,45,United-States,<=50K,16
1,35,Private,32528,HS-grad,9,Married-civ-spouse,Handlers-cleaners,Husband,White,Non-Female,0,0,45,United-States,<=50K,9
2,21,Private,270043,Some-college,10,Never-married,Other-service,Own-child,White,Female,0,0,16,United-States,<=50K,10
3,45,Private,168837,Some-college,10,Married-civ-spouse,Adm-clerical,Wife,White,Female,0,0,24,Canada,>50K,10
4,39,Private,297449,Bachelors,13,Married-civ-spouse,Prof-specialty,Husband,White,Non-Female,0,0,40,United-States,>50K,13


#### Calculate z-scores for All Values in All Numeric Columns

In [31]:
# Keep only the numeric columns, then standardise each one independently:
# DataFrame.apply passes every column to stats.zscore in turn.
df_zscores = (
    df
    .select_dtypes(include=['number'])
    .apply(stats.zscore)
)
df_zscores.head(10)

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week,education-num-win
0,-0.188926,-0.755763,2.305545,-0.153909,-0.212365,0.397049,2.322443
1,-0.261682,-1.503876,-0.406796,-0.153909,-0.212365,0.397049,-0.413127
2,-1.280263,0.7299,-0.019319,-0.153909,-0.212365,-1.956262,-0.022331
3,0.465876,-0.22192,-0.019319,-0.153909,-0.212365,-1.307073,-0.022331
4,0.029341,0.987647,1.143113,-0.153909,-0.212365,-0.008694,1.150056
5,-0.843728,0.385478,-0.019319,-0.153909,-0.212365,-1.631667,-0.022331
6,0.102097,0.263583,-0.406796,-0.153909,-0.212365,-0.008694,-0.413127
7,2.357526,-0.273195,-0.019319,-0.153909,-0.212365,-0.414438,-0.022331
8,-1.353018,0.00924,-0.019319,-0.153909,-0.212365,-0.008694,-0.022331
9,0.174853,-0.715163,-0.019319,-0.153909,-0.212365,-0.008694,-0.022331
