# Exploratory Data Analysis¶
Demo 1: Detecting and Removing Outliers
In this demo, you will be shown how to detect and remove outliers using Z-score and IQR score.

In [1]:
#Step1: Import the required libraries
import pandas as pd
from sklearn import datasets
from scipy import stats
import numpy as np

In [2]:
#Step2: Load the Boston House Pricing Dataset which is included in the sklearn dataset API
from sklearn.datasets import load_boston
boston = load_boston()
x = boston.data
y = boston.target
columns = boston.feature_names

ImportError: 
`load_boston` has been removed from scikit-learn since version 1.2.

The Boston housing prices dataset has an ethical problem: as
investigated in [1], the authors of this dataset engineered a
non-invertible variable "B" assuming that racial self-segregation had a
positive impact on house prices [2]. Furthermore the goal of the
research that led to the creation of this dataset was to study the
impact of air quality but it did not give adequate demonstration of the
validity of this assumption.

The scikit-learn maintainers therefore strongly discourage the use of
this dataset unless the purpose of the code is to study and educate
about ethical issues in data science and machine learning.

In this special case, you can fetch the dataset from the original
source::

    import pandas as pd
    import numpy as np

    data_url = "http://lib.stat.cmu.edu/datasets/boston"
    raw_df = pd.read_csv(data_url, sep="\s+", skiprows=22, header=None)
    data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
    target = raw_df.values[1::2, 2]

Alternative datasets include the California housing dataset and the
Ames housing dataset. You can load the datasets as follows::

    from sklearn.datasets import fetch_california_housing
    housing = fetch_california_housing()

for the California housing dataset and::

    from sklearn.datasets import fetch_openml
    housing = fetch_openml(name="house_prices", as_frame=True)

for the Ames housing dataset.

[1] M Carlisle.
"Racist data destruction?"
<https://medium.com/@docintangible/racist-data-destruction-113e3eff54a8>

[2] Harrison Jr, David, and Daniel L. Rubinfeld.
"Hedonic housing prices and the demand for clean air."
Journal of environmental economics and management 5.1 (1978): 81-102.
<https://www.researchgate.net/publication/4974606_Hedonic_housing_prices_and_the_demand_for_clean_air>


In [3]:
data_url = "http://lib.stat.cmu.edu/datasets/boston"
boston = pd.read_csv(data_url, sep="\s+", skiprows=22, header=None)
x = np.hstack([boston.values[::2, :], boston.values[1::2, :2]])
y = boston.values[1::2, 2]

boston


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,0.00632,18.00,2.31,0.0,0.538,6.575,65.2,4.0900,1.0,296.0,15.3
1,396.90000,4.98,24.00,,,,,,,,
2,0.02731,0.00,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8
3,396.90000,9.14,21.60,,,,,,,,
4,0.02729,0.00,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8
...,...,...,...,...,...,...,...,...,...,...,...
1007,396.90000,5.64,23.90,,,,,,,,
1008,0.10959,0.00,11.93,0.0,0.573,6.794,89.3,2.3889,1.0,273.0,21.0
1009,393.45000,6.48,22.00,,,,,,,,
1010,0.04741,0.00,11.93,0.0,0.573,6.030,80.8,2.5050,1.0,273.0,21.0


In [14]:
#Step3: Create the dataframe
boston_df = pd.DataFrame(boston)
boston_df.columns = ['CRIM','ZN','INDUS','CHAS','NOX','RM','AGE','DIS','RAD','TAX','PTRATIO']
boston_df = boston_df.fillna(value=0)
boston_df.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3
1,396.9,4.98,24.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8
3,396.9,9.14,21.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8


# Using Z-Score


In [15]:
#Step1: Use Z-score function defined in scipy library to detect the outliers
boston_df_z = boston_df
z = np.abs(stats.zscore(boston_df))
print(z)

          CRIM        ZN     INDUS      CHAS       NOX        RM       AGE  \
0     0.957975  0.347499  1.466008  0.189272  0.901366  1.079028  0.779908   
1     1.152714  0.407625  0.723195  0.189272  0.959099  0.987754  0.865055   
2     0.957863  0.696451  0.985574  0.189272  0.662756  1.030619  1.125552   
3     1.152714  0.166356  0.480959  0.189272  0.959099  0.987754  0.865055   
4     0.957863  0.696451  0.985574  0.189272  0.662756  1.270775  0.676467   
...        ...       ...       ...       ...       ...       ...       ...   
1007  1.152714  0.369347  0.713101  0.189272  0.959099  0.987754  0.865055   
1008  0.957426  0.696451  0.495048  0.189272  1.022400  1.147868  1.387938   
1009  1.134367  0.320629  0.521332  0.189272  0.959099  0.987754  0.865055   
1010  0.957756  0.696451  0.495048  0.189272  1.022400  0.907713  1.173488   
1011  1.152714  0.239433  0.498076  0.189272  0.959099  0.987754  0.865055   

           DIS       RAD       TAX   PTRATIO  
0     0.909342  

In [None]:
#Looking at the code and the output above, it is difficult to say which data point is an outlier. So let’s define a threshold to identify an outlier.

In [16]:
#Step2: Define a threshold
threshold = 3
print(np.where(z > threshold))

(array([ 78,  80, 108, 110, 112, 112, 114, 128, 130, 132, 284, 304, 308,
       310, 320, 323, 324, 325, 326, 327, 333, 373, 390, 391, 392, 394,
       396, 398, 400, 402, 404, 406, 407, 408, 409, 416, 418, 420, 422,
       424, 432, 436, 438, 440, 442, 444, 451, 457, 467, 468, 472, 508,
       508, 510, 510, 512, 515, 525, 535, 538, 546, 548, 552, 554, 564,
       566, 566, 567, 568, 572, 580, 582, 584, 596, 598, 600, 694, 696,
       702, 704, 706, 706, 708, 708, 710, 710, 712, 714, 716, 726, 728,
       737, 738, 739, 740, 741, 743, 744, 745], dtype=int64), array([1, 1, 1, 1, 1, 7, 1, 7, 1, 1, 3, 3, 3, 3, 3, 2, 3, 2, 3, 2, 2, 2,
       1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 2, 2, 2, 3, 3, 1, 7, 1, 7, 1, 2, 2, 2, 3, 3, 3, 3, 3, 3, 1,
       3, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 7, 7, 1, 7, 1, 7, 1, 7, 3, 3,
       3, 3, 3, 2, 3, 2, 3, 2, 2, 3, 2], dtype=int64))


In [None]:
#The first array contains the list of row numbers and second array contains the respective column numbers, which means that z[55][1] has a z-score higher than 3.

In [17]:
#Step3: Remove the outliers using the z-score
boston_df_z = boston_df_z[(z < threshold).all(axis=1)]

print("The no. of rows before outlier filtering was: ", boston_df.shape)
print("The no. of rows after outlier filtering is: ", boston_df_z.shape)

The no. of rows before outlier filtering was:  (1012, 11)
The no. of rows after outlier filtering is:  (920, 11)


# Using IQR Score

In [18]:
#Step1: Calculate the IQR
boston_df_iqr = boston_df
Q1 = boston_df_iqr.quantile(0.25)
Q3 = boston_df_iqr.quantile(0.75)
IQR = Q3 - Q1
print(IQR)

CRIM       391.177170
ZN          16.780000
INDUS       13.515000
CHAS         0.000000
NOX          0.538000
RM           6.208250
AGE         77.400000
DIS          3.203325
RAD          5.000000
TAX        330.000000
PTRATIO     19.025000
dtype: float64


In [19]:
#Step2: Detect the outliers
print(boston_df_iqr < (Q1 - 1.5 * IQR)) |(boston_df_iqr > (Q3 + 1.5 * IQR))

       CRIM     ZN  INDUS   CHAS    NOX     RM    AGE    DIS    RAD    TAX  \
0     False  False  False  False  False  False  False  False  False  False   
1     False  False  False  False  False  False  False  False  False  False   
2     False  False  False  False  False  False  False  False  False  False   
3     False  False  False  False  False  False  False  False  False  False   
4     False  False  False  False  False  False  False  False  False  False   
...     ...    ...    ...    ...    ...    ...    ...    ...    ...    ...   
1007  False  False  False  False  False  False  False  False  False  False   
1008  False  False  False  False  False  False  False  False  False  False   
1009  False  False  False  False  False  False  False  False  False  False   
1010  False  False  False  False  False  False  False  False  False  False   
1011  False  False  False  False  False  False  False  False  False  False   

      PTRATIO  
0       False  
1       False  
2       False  

TypeError: Cannot perform 'ror_' with a dtyped [bool] array and scalar of type [NoneType]

In [20]:
#The data point where we have False means that these values are valid whereas True indicates presence of an outlier.

In [21]:
#Step3: Remove the outliers using the IQR score
boston_df_out = boston_df_iqr[~((boston_df_iqr < (Q1 - 1.5 * IQR)) |(boston_df_iqr > (Q3 + 1.5 * IQR))).any(axis=1)]

print("The no. of rows before outlier filtering was: ", boston_df_iqr.shape)
print("The no. of rows after outlier filtering is: ", boston_df_out.shape)

The no. of rows before outlier filtering was:  (1012, 11)
The no. of rows after outlier filtering is:  (765, 11)


In [22]:
#Hence, the outliers have been removed.