In [1]:
# importing libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import pandas_profiling
%matplotlib inline

In [2]:
# Load the Boston house-prices sample dataset bundled with scikit-learn.
# `boston` is the object returned by load_boston(); the cells below read its
# keys(), data, feature_names, DESCR and target members.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in
# 1.2, so this cell presumably needs scikit-learn < 1.2 -- confirm the pinned
# version before re-running.
from sklearn.datasets import load_boston
boston = load_boston()

In [3]:
# Print a short overview of the dataset bundle, one labelled line per item:
# the available keys, the shape of the feature matrix, the feature names and
# the full description text.
for _label, _value in (
    ("keys:", boston.keys()),
    ("shape:", boston.data.shape),
    ("feature names:", boston.feature_names),
    ("Description:", boston.DESCR),
):
    print(_label, _value)

keys: dict_keys(['data', 'target', 'feature_names', 'DESCR', 'filename'])
shape: (506, 13)
feature names: ['CRIM' 'ZN' 'INDUS' 'CHAS' 'NOX' 'RM' 'AGE' 'DIS' 'RAD' 'TAX' 'PTRATIO'
 'B' 'LSTAT']
Description: .. _boston_dataset:

Boston house prices dataset
---------------------------

**Data Set Characteristics:**  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive. Median Value (attribute 14) is usually the target.

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        

In [12]:
# Build the working DataFrame in one expression: one column per feature
# (labelled with boston.feature_names) plus the target values appended as a
# trailing PRICE column.
bos = pd.DataFrame(
    data=boston.data,
    columns=boston.feature_names,
).assign(PRICE=boston.target)

In [13]:
# Exploratory data analysis with plain pandas: describe() is the cell's last
# expression, so the notebook renders its per-column summary table (count,
# mean, std, min, quartiles, max) as the cell output.
bos.describe()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,PRICE
count,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0
mean,3.613524,11.363636,11.136779,0.06917,0.554695,6.284634,68.574901,3.795043,9.549407,408.237154,18.455534,356.674032,12.653063,22.532806
std,8.601545,23.322453,6.860353,0.253994,0.115878,0.702617,28.148861,2.10571,8.707259,168.537116,2.164946,91.294864,7.141062,9.197104
min,0.00632,0.0,0.46,0.0,0.385,3.561,2.9,1.1296,1.0,187.0,12.6,0.32,1.73,5.0
25%,0.082045,0.0,5.19,0.0,0.449,5.8855,45.025,2.100175,4.0,279.0,17.4,375.3775,6.95,17.025
50%,0.25651,0.0,9.69,0.0,0.538,6.2085,77.5,3.20745,5.0,330.0,19.05,391.44,11.36,21.2
75%,3.677083,12.5,18.1,0.0,0.624,6.6235,94.075,5.188425,24.0,666.0,20.2,396.225,16.955,25.0
max,88.9762,100.0,27.74,1.0,0.871,8.78,100.0,12.1265,24.0,711.0,22.0,396.9,37.97,50.0


In [11]:
# Exploratory data analysis with pandas-profiling: build a ProfileReport for
# `bos` and leave it as the last expression so the notebook renders it.
# NOTE(review): this cell's execution count (11) is lower than that of the
# cell that builds `bos` (12), so the saved report was produced from an
# earlier definition of `bos` -- restart the kernel and run all cells to
# confirm the output still matches.
# NOTE(review): the pandas_profiling package appears to have been renamed
# ydata-profiling upstream -- confirm which version this notebook is pinned to.
profile = pandas_profiling.ProfileReport(bos)
profile

0,1
Number of variables,14
Number of observations,506
Total Missing (%),0.0%
Total size in memory,55.4 KiB
Average record size in memory,112.2 B

0,1
Numeric,12
Categorical,0
Boolean,1
Date,0
Text (Unique),0
Rejected,1
Unsupported,0

0,1
Distinct count,356
Unique (%),70.4%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,68.575
Minimum,2.9
Maximum,100
Zeros (%),0.0%

0,1
Minimum,2.9
5-th percentile,17.725
Q1,45.025
Median,77.5
Q3,94.075
95-th percentile,100.0
Maximum,100.0
Range,97.1
Interquartile range,49.05

0,1
Standard deviation,28.149
Coef of variation,0.41048
Kurtosis,-0.96772
Mean,68.575
MAD,24.611
Skewness,-0.59896
Sum,34699
Variance,792.36
Memory size,4.0 KiB

Value,Count,Frequency (%),Unnamed: 3
100.0,43,8.5%,
96.0,4,0.8%,
98.2,4,0.8%,
95.4,4,0.8%,
97.9,4,0.8%,
87.9,4,0.8%,
98.8,4,0.8%,
94.1,3,0.6%,
88.0,3,0.6%,
21.4,3,0.6%,

Value,Count,Frequency (%),Unnamed: 3
2.9,1,0.2%,
6.0,1,0.2%,
6.2,1,0.2%,
6.5,1,0.2%,
6.6,2,0.4%,

Value,Count,Frequency (%),Unnamed: 3
98.8,4,0.8%,
98.9,3,0.6%,
99.1,1,0.2%,
99.3,1,0.2%,
100.0,43,8.5%,

0,1
Distinct count,357
Unique (%),70.6%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,356.67
Minimum,0.32
Maximum,396.9
Zeros (%),0.0%

0,1
Minimum,0.32
5-th percentile,84.59
Q1,375.38
Median,391.44
Q3,396.23
95-th percentile,396.9
Maximum,396.9
Range,396.58
Interquartile range,20.848

0,1
Standard deviation,91.295
Coef of variation,0.25596
Kurtosis,7.2268
Mean,356.67
MAD,54.629
Skewness,-2.8904
Sum,180480
Variance,8334.8
Memory size,4.0 KiB

Value,Count,Frequency (%),Unnamed: 3
396.9,121,23.9%,
395.24,3,0.6%,
393.74,3,0.6%,
393.23,2,0.4%,
394.72,2,0.4%,
396.21,2,0.4%,
395.69,2,0.4%,
396.06,2,0.4%,
395.63,2,0.4%,
395.6,2,0.4%,

Value,Count,Frequency (%),Unnamed: 3
0.32,1,0.2%,
2.52,1,0.2%,
2.6,1,0.2%,
3.5,1,0.2%,
3.65,1,0.2%,

Value,Count,Frequency (%),Unnamed: 3
396.28,1,0.2%,
396.3,1,0.2%,
396.33,1,0.2%,
396.42,1,0.2%,
396.9,121,23.9%,

0,1
Distinct count,2
Unique (%),0.4%
Missing (%),0.0%
Missing (n),0

0,1
Mean,0.06917

0,1
0.0,471
1.0,35

Value,Count,Frequency (%),Unnamed: 3
0.0,471,93.1%,
1.0,35,6.9%,

0,1
Distinct count,504
Unique (%),99.6%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,3.6135
Minimum,0.00632
Maximum,88.976
Zeros (%),0.0%

0,1
Minimum,0.00632
5-th percentile,0.02791
Q1,0.082045
Median,0.25651
Q3,3.6771
95-th percentile,15.789
Maximum,88.976
Range,88.97
Interquartile range,3.595

0,1
Standard deviation,8.6015
Coef of variation,2.3804
Kurtosis,37.131
Mean,3.6135
MAD,4.7841
Skewness,5.2231
Sum,1828.4
Variance,73.987
Memory size,4.0 KiB

Value,Count,Frequency (%),Unnamed: 3
14.3337,2,0.4%,
0.01501,2,0.4%,
0.08265,1,0.2%,
0.537,1,0.2%,
1.35472,1,0.2%,
0.14103,1,0.2%,
0.03502,1,0.2%,
0.03615,1,0.2%,
0.66351,1,0.2%,
0.1265,1,0.2%,

Value,Count,Frequency (%),Unnamed: 3
0.00632,1,0.2%,
0.00906,1,0.2%,
0.01096,1,0.2%,
0.01301,1,0.2%,
0.01311,1,0.2%,

Value,Count,Frequency (%),Unnamed: 3
45.7461,1,0.2%,
51.1358,1,0.2%,
67.9208,1,0.2%,
73.5341,1,0.2%,
88.9762,1,0.2%,

0,1
Distinct count,412
Unique (%),81.4%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,3.795
Minimum,1.1296
Maximum,12.127
Zeros (%),0.0%

0,1
Minimum,1.1296
5-th percentile,1.462
Q1,2.1002
Median,3.2074
Q3,5.1884
95-th percentile,7.8278
Maximum,12.127
Range,10.997
Interquartile range,3.0883

0,1
Standard deviation,2.1057
Coef of variation,0.55486
Kurtosis,0.48794
Mean,3.795
MAD,1.7194
Skewness,1.0118
Sum,1920.3
Variance,4.434
Memory size,4.0 KiB

Value,Count,Frequency (%),Unnamed: 3
3.4952,5,1.0%,
5.2873,4,0.8%,
5.4007,4,0.8%,
5.7209,4,0.8%,
6.8147,4,0.8%,
3.6519,3,0.6%,
7.3172,3,0.6%,
5.4917,3,0.6%,
7.8278,3,0.6%,
5.4159,3,0.6%,

Value,Count,Frequency (%),Unnamed: 3
1.1296,1,0.2%,
1.137,1,0.2%,
1.1691,1,0.2%,
1.1742,1,0.2%,
1.1781,1,0.2%,

Value,Count,Frequency (%),Unnamed: 3
9.2203,2,0.4%,
9.2229,1,0.2%,
10.5857,2,0.4%,
10.7103,2,0.4%,
12.1265,1,0.2%,

0,1
Distinct count,76
Unique (%),15.0%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,11.137
Minimum,0.46
Maximum,27.74
Zeros (%),0.0%

0,1
Minimum,0.46
5-th percentile,2.18
Q1,5.19
Median,9.69
Q3,18.1
95-th percentile,21.89
Maximum,27.74
Range,27.28
Interquartile range,12.91

0,1
Standard deviation,6.8604
Coef of variation,0.61601
Kurtosis,-1.2335
Mean,11.137
MAD,6.202
Skewness,0.29502
Sum,5635.2
Variance,47.064
Memory size,4.0 KiB

Value,Count,Frequency (%),Unnamed: 3
18.1,132,26.1%,
19.58,30,5.9%,
8.14,22,4.3%,
6.2,18,3.6%,
21.89,15,3.0%,
9.9,12,2.4%,
3.97,12,2.4%,
8.56,11,2.2%,
10.59,11,2.2%,
5.86,10,2.0%,

Value,Count,Frequency (%),Unnamed: 3
0.46,1,0.2%,
0.74,1,0.2%,
1.21,1,0.2%,
1.22,1,0.2%,
1.25,2,0.4%,

Value,Count,Frequency (%),Unnamed: 3
18.1,132,26.1%,
19.58,30,5.9%,
21.89,15,3.0%,
25.65,7,1.4%,
27.74,5,1.0%,

0,1
Distinct count,455
Unique (%),89.9%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,12.653
Minimum,1.73
Maximum,37.97
Zeros (%),0.0%

0,1
Minimum,1.73
5-th percentile,3.7075
Q1,6.95
Median,11.36
Q3,16.955
95-th percentile,26.808
Maximum,37.97
Range,36.24
Interquartile range,10.005

0,1
Standard deviation,7.1411
Coef of variation,0.56437
Kurtosis,0.49324
Mean,12.653
MAD,5.7153
Skewness,0.90646
Sum,6402.5
Variance,50.995
Memory size,4.0 KiB

Value,Count,Frequency (%),Unnamed: 3
14.1,3,0.6%,
6.36,3,0.6%,
18.13,3,0.6%,
8.05,3,0.6%,
7.79,3,0.6%,
9.5,2,0.4%,
4.59,2,0.4%,
3.76,2,0.4%,
17.27,2,0.4%,
10.11,2,0.4%,

Value,Count,Frequency (%),Unnamed: 3
1.73,1,0.2%,
1.92,1,0.2%,
1.98,1,0.2%,
2.47,1,0.2%,
2.87,1,0.2%,

Value,Count,Frequency (%),Unnamed: 3
34.37,1,0.2%,
34.41,1,0.2%,
34.77,1,0.2%,
36.98,1,0.2%,
37.97,1,0.2%,

0,1
Distinct count,81
Unique (%),16.0%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,0.5547
Minimum,0.385
Maximum,0.871
Zeros (%),0.0%

0,1
Minimum,0.385
5-th percentile,0.40925
Q1,0.449
Median,0.538
Q3,0.624
95-th percentile,0.74
Maximum,0.871
Range,0.486
Interquartile range,0.175

0,1
Standard deviation,0.11588
Coef of variation,0.2089
Kurtosis,-0.064667
Mean,0.5547
MAD,0.095695
Skewness,0.72931
Sum,280.68
Variance,0.013428
Memory size,4.0 KiB

Value,Count,Frequency (%),Unnamed: 3
0.538,23,4.5%,
0.713,18,3.6%,
0.437,17,3.4%,
0.871,16,3.2%,
0.489,15,3.0%,
0.624,15,3.0%,
0.693,14,2.8%,
0.605,14,2.8%,
0.74,13,2.6%,
0.544,12,2.4%,

Value,Count,Frequency (%),Unnamed: 3
0.385,1,0.2%,
0.389,1,0.2%,
0.392,2,0.4%,
0.394,1,0.2%,
0.398,2,0.4%,

Value,Count,Frequency (%),Unnamed: 3
0.713,18,3.6%,
0.718,6,1.2%,
0.74,13,2.6%,
0.77,8,1.6%,
0.871,16,3.2%,

0,1
Distinct count,229
Unique (%),45.3%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,22.533
Minimum,5
Maximum,50
Zeros (%),0.0%

0,1
Minimum,5.0
5-th percentile,10.2
Q1,17.025
Median,21.2
Q3,25.0
95-th percentile,43.4
Maximum,50.0
Range,45.0
Interquartile range,7.975

0,1
Standard deviation,9.1971
Coef of variation,0.40817
Kurtosis,1.4952
Mean,22.533
MAD,6.6472
Skewness,1.1081
Sum,11402
Variance,84.587
Memory size,4.0 KiB

Value,Count,Frequency (%),Unnamed: 3
50.0,16,3.2%,
25.0,8,1.6%,
23.1,7,1.4%,
21.7,7,1.4%,
22.0,7,1.4%,
20.6,6,1.2%,
19.4,6,1.2%,
20.1,5,1.0%,
19.6,5,1.0%,
19.3,5,1.0%,

Value,Count,Frequency (%),Unnamed: 3
5.0,2,0.4%,
5.6,1,0.2%,
6.3,1,0.2%,
7.0,2,0.4%,
7.2,3,0.6%,

Value,Count,Frequency (%),Unnamed: 3
46.7,1,0.2%,
48.3,1,0.2%,
48.5,1,0.2%,
48.8,1,0.2%,
50.0,16,3.2%,

0,1
Distinct count,46
Unique (%),9.1%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,18.456
Minimum,12.6
Maximum,22
Zeros (%),0.0%

0,1
Minimum,12.6
5-th percentile,14.7
Q1,17.4
Median,19.05
Q3,20.2
95-th percentile,21.0
Maximum,22.0
Range,9.4
Interquartile range,2.8

0,1
Standard deviation,2.1649
Coef of variation,0.11731
Kurtosis,-0.28509
Mean,18.456
MAD,1.7873
Skewness,-0.80232
Sum,9338.5
Variance,4.687
Memory size,4.0 KiB

Value,Count,Frequency (%),Unnamed: 3
20.2,140,27.7%,
14.7,34,6.7%,
21.0,27,5.3%,
17.8,23,4.5%,
19.2,19,3.8%,
17.4,18,3.6%,
18.6,17,3.4%,
19.1,17,3.4%,
16.6,16,3.2%,
18.4,16,3.2%,

Value,Count,Frequency (%),Unnamed: 3
12.6,3,0.6%,
13.0,12,2.4%,
13.6,1,0.2%,
14.4,1,0.2%,
14.7,34,6.7%,

Value,Count,Frequency (%),Unnamed: 3
20.9,11,2.2%,
21.0,27,5.3%,
21.1,1,0.2%,
21.2,15,3.0%,
22.0,2,0.4%,

0,1
Distinct count,9
Unique (%),1.8%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,9.5494
Minimum,1
Maximum,24
Zeros (%),0.0%

0,1
Minimum,1
5-th percentile,2
Q1,4
Median,5
Q3,24
95-th percentile,24
Maximum,24
Range,23
Interquartile range,20

0,1
Standard deviation,8.7073
Coef of variation,0.91181
Kurtosis,-0.86723
Mean,9.5494
MAD,7.5394
Skewness,1.0048
Sum,4832
Variance,75.816
Memory size,4.0 KiB

Value,Count,Frequency (%),Unnamed: 3
24.0,132,26.1%,
5.0,115,22.7%,
4.0,110,21.7%,
3.0,38,7.5%,
6.0,26,5.1%,
8.0,24,4.7%,
2.0,24,4.7%,
1.0,20,4.0%,
7.0,17,3.4%,

Value,Count,Frequency (%),Unnamed: 3
1.0,20,4.0%,
2.0,24,4.7%,
3.0,38,7.5%,
4.0,110,21.7%,
5.0,115,22.7%,

Value,Count,Frequency (%),Unnamed: 3
5.0,115,22.7%,
6.0,26,5.1%,
7.0,17,3.4%,
8.0,24,4.7%,
24.0,132,26.1%,

0,1
Distinct count,446
Unique (%),88.1%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,6.2846
Minimum,3.561
Maximum,8.78
Zeros (%),0.0%

0,1
Minimum,3.561
5-th percentile,5.314
Q1,5.8855
Median,6.2085
Q3,6.6235
95-th percentile,7.5875
Maximum,8.78
Range,5.219
Interquartile range,0.738

0,1
Standard deviation,0.70262
Coef of variation,0.1118
Kurtosis,1.8915
Mean,6.2846
MAD,0.51329
Skewness,0.40361
Sum,3180
Variance,0.49367
Memory size,4.0 KiB

Value,Count,Frequency (%),Unnamed: 3
6.167,3,0.6%,
6.229,3,0.6%,
6.127,3,0.6%,
5.713,3,0.6%,
6.417,3,0.6%,
6.405,3,0.6%,
6.38,2,0.4%,
5.304,2,0.4%,
5.983,2,0.4%,
7.185,2,0.4%,

Value,Count,Frequency (%),Unnamed: 3
3.561,1,0.2%,
3.863,1,0.2%,
4.138,2,0.4%,
4.368,1,0.2%,
4.519,1,0.2%,

Value,Count,Frequency (%),Unnamed: 3
8.375,1,0.2%,
8.398,1,0.2%,
8.704,1,0.2%,
8.725,1,0.2%,
8.78,1,0.2%,

0,1
Correlation,0.91023

0,1
Distinct count,26
Unique (%),5.1%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,11.364
Minimum,0
Maximum,100
Zeros (%),73.5%

0,1
Minimum,0.0
5-th percentile,0.0
Q1,0.0
Median,0.0
Q3,12.5
95-th percentile,80.0
Maximum,100.0
Range,100.0
Interquartile range,12.5

0,1
Standard deviation,23.322
Coef of variation,2.0524
Kurtosis,4.0315
Mean,11.364
MAD,16.709
Skewness,2.2257
Sum,5750
Variance,543.94
Memory size,4.0 KiB

Value,Count,Frequency (%),Unnamed: 3
0.0,372,73.5%,
20.0,21,4.2%,
80.0,15,3.0%,
12.5,10,2.0%,
22.0,10,2.0%,
25.0,10,2.0%,
40.0,7,1.4%,
45.0,6,1.2%,
30.0,6,1.2%,
90.0,5,1.0%,

Value,Count,Frequency (%),Unnamed: 3
0.0,372,73.5%,
12.5,10,2.0%,
17.5,1,0.2%,
18.0,1,0.2%,
20.0,21,4.2%,

Value,Count,Frequency (%),Unnamed: 3
82.5,2,0.4%,
85.0,2,0.4%,
90.0,5,1.0%,
95.0,4,0.8%,
100.0,1,0.2%,

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,PRICE
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,36.2
