# Get Summary Statistics Using Pandas describe() Method

In [76]:
import pandas as pd
import numpy as np
import os 

### Load the Dataset 

In [77]:
# Locate the census CSV in the "data" folder two levels above the working directory,
# then read it with the first line of the file used as the column header.
data_dir = os.path.join(os.getcwd(), "..", "..", "data")
filename = os.path.join(data_dir, "censusData.csv")
df = pd.read_csv(filename, header=0)

### Glance at the Dataset


In [78]:
# Preview the first few rows to sanity-check that the file loaded as expected.
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex_selfID,capital-gain,capital-loss,hours-per-week,native-country,income
0,36,State-gov,112074,Doctorate,16,Never-married,Prof-specialty,Not-in-family,White,Non-Female,0,0,45,United-States,<=50K
1,35,Private,32528,HS-grad,9,Married-civ-spouse,Handlers-cleaners,Husband,White,Non-Female,0,0,45,United-States,<=50K
2,21,Private,270043,Some-college,10,Never-married,Other-service,Own-child,White,Female,0,0,16,United-States,<=50K
3,45,Private,168837,Some-college,10,Married-civ-spouse,Adm-clerical,Wife,White,Female,0,0,24,Canada,>50K
4,39,Private,297449,Bachelors,13,Married-civ-spouse,Prof-specialty,Husband,White,Non-Female,0,0,40,United-States,>50K


### Get the Dimensions of the Dataset

In [79]:
# (number of rows, number of columns) of the loaded DataFrame.
df.shape

(7000, 15)

## Step 1: Compute Summary Statistics Using Pandas `describe()` Method

In [80]:
# Called without arguments on this frame, describe() reports count/mean/std/min/quartiles/max
# for the numeric columns only (see the output below).
df_summ = df.describe()
df_summ

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week
count,7000.0,7000.0,7000.0,7000.0,7000.0,7000.0
mean,38.596714,192433.5,10.049857,1079.000429,84.970286,40.107143
std,13.745594,106336.5,2.580982,7011.160679,400.142351,12.323946
min,17.0,18827.0,1.0,0.0,0.0,1.0
25%,28.0,120247.8,9.0,0.0,0.0,40.0
50%,37.0,182117.0,10.0,0.0,0.0,40.0
75%,47.0,240237.0,12.0,0.0,0.0,45.0
max,90.0,1268339.0,16.0,99999.0,4356.0,99.0


In [81]:
# include='all' also summarizes the non-numeric columns (unique/top/freq);
# statistics that do not apply to a column's type show up as NaN.
df_summ_all = df.describe(include = 'all')
df_summ_all

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex_selfID,capital-gain,capital-loss,hours-per-week,native-country,income
count,7000.0,6625,7000.0,7000,7000.0,7000,6625,7000,7000,7000,7000.0,7000.0,7000.0,6862,7000
unique,,7,,16,,7,14,6,5,2,,,,40,2
top,,Private,,HS-grad,,Married-civ-spouse,Prof-specialty,Husband,White,Non-Female,,,,United-States,<=50K
freq,,4879,,2263,,3277,911,2878,5990,4731,,,,6233,5319
mean,38.596714,,192433.5,,10.049857,,,,,,1079.000429,84.970286,40.107143,,
std,13.745594,,106336.5,,2.580982,,,,,,7011.160679,400.142351,12.323946,,
min,17.0,,18827.0,,1.0,,,,,,0.0,0.0,1.0,,
25%,28.0,,120247.8,,9.0,,,,,,0.0,0.0,40.0,,
50%,37.0,,182117.0,,10.0,,,,,,0.0,0.0,40.0,,
75%,47.0,,240237.0,,12.0,,,,,,0.0,0.0,45.0,,


In [82]:
# Summarize only a hand-picked subset of the numeric features.
describe_vars = ['age', 'education-num', 'hours-per-week']
df_summ_selected = df.loc[:, describe_vars].describe()
df_summ_selected

Unnamed: 0,age,education-num,hours-per-week
count,7000.0,7000.0,7000.0
mean,38.596714,10.049857,40.107143
std,13.745594,2.580982,12.323946
min,17.0,1.0,1.0
25%,28.0,9.0,40.0
50%,37.0,10.0,40.0
75%,47.0,12.0,45.0
max,90.0,16.0,99.0


## Step 2: Data Analytics Using Summary Statistics

In [83]:
# Re-display the numeric summary table so the questions below can be checked against it.
df_summ

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week
count,7000.0,7000.0,7000.0,7000.0,7000.0,7000.0
mean,38.596714,192433.5,10.049857,1079.000429,84.970286,40.107143
std,13.745594,106336.5,2.580982,7011.160679,400.142351,12.323946
min,17.0,18827.0,1.0,0.0,0.0,1.0
25%,28.0,120247.8,9.0,0.0,0.0,40.0
50%,37.0,182117.0,10.0,0.0,0.0,40.0
75%,47.0,240237.0,12.0,0.0,0.0,45.0
max,90.0,1268339.0,16.0,99999.0,4356.0,99.0


### What is the 25th percentile of feature 'age'?

In [84]:
# Read the single cell at row '25%', column 'age' with one .loc lookup.
# This avoids chained indexing (.loc[row][col]), which first builds the whole
# row as an intermediate Series before selecting from it.
age_25p = df_summ.loc['25%', 'age']
print(f"The 25th percentile of the feature 'age' is {age_25p}")

The 25th percentile of the feature 'age' is 28.0


### Which feature has the most variation?

In [85]:
# Take the standard-deviation row of the summary, then report the column label
# at which that row reaches its maximum.
std_by_feature = df_summ.loc['std']
std_by_feature.idxmax()

'fnlwgt'

In [86]:
# Alternative route: for every statistic (row) find the column holding its maximum,
# then read off the entry for the 'std' row.
top_feature_per_stat = df_summ.idxmax(axis=1)
top_feature_per_stat['std']

'fnlwgt'

In [87]:
# Same idxmax() pattern applied to the 'mean' row: this yields the feature with the
# largest mean (note: it inspects 'mean', not 'std').
mean_by_feature = df_summ.loc['mean']
column_name = mean_by_feature.idxmax()
column_name

'fnlwgt'

### Do any features have negative values?

In [88]:
# A feature contains a negative value exactly when its minimum is below zero,
# so checking the 'min' row of the summary is sufficient.
feature_minimums = df_summ.loc['min']
np.any(feature_minimums < 0)

False

### Which feature has the largest range?

In [89]:
# Range of each feature = its 'max' entry minus its 'min' entry;
# idxmax() then names the feature with the widest range.
column_ranges = df_summ.loc['max'].sub(df_summ.loc['min'])
column_range_name = column_ranges.idxmax()
column_range_name

'fnlwgt'