# Exploratory Data Analysis

In [1]:
from os.path import basename, exists

def download(url):
    filename = basename(url)
    if not exists(filename):
        from urllib.request import urlretrieve

        local, _ = urlretrieve(url, filename)
        print(f"Successfully Downloaded {local}")

download("https://github.com/AllenDowney/ThinkStats/raw/v3/nb/thinkstats.py")

In [2]:
# An "empirical distribution" 
# describes the observed data's probability distribution
try:
    import empiricaldist
except ImportError:
    !pip install empiricaldist

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from IPython.display import HTML
from thinkstats import decorate

## 1. Evidence

In [4]:
download("https://github.com/AllenDowney/ThinkStats/raw/v3/data/2002FemPreg.dct")
download("https://github.com/AllenDowney/ThinkStats/raw/v3/data/2002FemPreg.dat.gz")

In [5]:
try:
    import statadict
except ImportError:
    !pip install statadict

In [6]:
dct_file = "2002FemPreg.dct"
dat_file = "2002FemPreg.dat.gz"

## 1.2 Reading the data

- Define a function that reads the from the above two files.
- The data format in these files is compatible with a statistical software package called stata

In [7]:
from statadict import parse_stata_dict

# read the data from the files ending with .dct extension
def read_stata(dct_file, dat_file):
    # read stata dictonary file
    stata_dict = parse_stata_dict(dct_file)

    # response
    resp = pd.read_fwf(
        dat_file,
        names=stata_dict.names,
        colspecs=stata_dict.colspecs,
        compression="gzip",
    )
    return resp

In [8]:
preg = read_stata(dct_file, dat_file)

- The above function returns a `DataFrame` which containes a row for each pregnancy reported by a respondant and a column for each variable

In [9]:
preg.head()

Unnamed: 0,caseid,pregordr,howpreg_n,howpreg_p,moscurrp,nowprgdk,pregend1,pregend2,nbrnaliv,multbrth,...,poverty_i,laborfor_i,religion_i,metro_i,basewgt,adj_mod_basewgt,finalwgt,secu_p,sest,cmintvw
0,1,1,,,,,6.0,,1.0,,...,0,0,0,0,3410.389399,3869.349602,6448.271112,2,9,1231
1,1,2,,,,,6.0,,1.0,,...,0,0,0,0,3410.389399,3869.349602,6448.271112,2,9,1231
2,2,1,,,,,5.0,,3.0,5.0,...,0,0,0,0,7226.30174,8567.54911,12999.542264,2,12,1231
3,2,2,,,,,6.0,,1.0,,...,0,0,0,0,7226.30174,8567.54911,12999.542264,2,12,1231
4,2,3,,,,,6.0,,1.0,,...,0,0,0,0,7226.30174,8567.54911,12999.542264,2,12,1231


In [10]:
preg.shape

(13593, 243)

- Variables in the dataframe as columns

In [11]:
preg.columns

Index(['caseid', 'pregordr', 'howpreg_n', 'howpreg_p', 'moscurrp', 'nowprgdk',
       'pregend1', 'pregend2', 'nbrnaliv', 'multbrth',
       ...
       'poverty_i', 'laborfor_i', 'religion_i', 'metro_i', 'basewgt',
       'adj_mod_basewgt', 'finalwgt', 'secu_p', 'sest', 'cmintvw'],
      dtype='object', length=243)

- Access a particular data column

In [12]:
pregordr = preg["pregordr"]
type(pregordr)

pandas.core.series.Series

In [13]:
pregordr.head()

0    1
1    2
2    1
3    2
4    3
Name: pregordr, dtype: int64

#### Some of the varaibles(columns) from the preganancy dataset and their uses
- `caseid` : int id of the respondant.
- `pregordr`: pregnancy serial number: the code for a respondant(participator in a survey) first pregnancy is 1, for the second is 2 and so on.
- `preglngth`: int duration of the pregnancy in weeks.
- `outcome`: int code for the outcome of the pregnancy: 1 for live birth.
- `birthwgt_lb` and `birthwgt_oz`: pound and ounces birth weight of baby
- `ageprg`: mothers age
- `finalwgt`: statistical weight associated with the respondant. (float-value) 

## 1.4 Validation
Compute statistics and validate data based on the published table in the book.

In [14]:
preg["outcome"].head()

0    1
1    1
2    1
3    1
4    1
Name: outcome, dtype: int64

- The `value_counts` method counts the number of times each value appears, and `sort_index` which sorts the results according to the values in the `Index`

-  Check the total outcomes of pregnancy.

In [15]:
preg["outcome"].value_counts().sort_index()

outcome
1    9148
2    1862
3     120
4    1921
5     190
6     352
Name: count, dtype: int64

- Compare brith weight with publish table.

In [16]:
# the dropna arg means, do not ignore values thar are NaN
counts = preg["birthwgt_lb"].value_counts(dropna=False).sort_index()
counts

birthwgt_lb
0.0        8
1.0       40
2.0       53
3.0       98
4.0      229
5.0      697
6.0     2223
7.0     3049
8.0     1889
9.0      623
10.0     132
11.0      26
12.0      10
13.0       3
14.0       3
15.0       1
51.0       1
97.0       1
98.0       1
99.0      57
NaN     4449
Name: count, dtype: int64

- Check the counts for the weight range from 0 to 5

In [17]:
counts.loc[0:5]

birthwgt_lb
0.0      8
1.0     40
2.0     53
3.0     98
4.0    229
5.0    697
Name: count, dtype: int64

In [18]:
counts.loc[0:5].sum()

np.int64(1125)

#### DATA CLEANING
- **Handle Missing Data**
The values 97, 98, 99 represent cases where the birth weight is unknown so replace these values with NaN. and also replace the 51 pounds as it is wrong. 

In [19]:
preg["birthwgt_lb"] = preg["birthwgt_lb"].replace([51, 97, 98, 99], np.nan)

## 1.5 Transformation
As a part of data cleaning, convert data into different formats, and perform other calculations.

- `agepreg` contains the mother's age at the end of the pregnancy. It is integer number for centiyears (hundredths fo a year). we can use `mean` to compute its' average.

In [20]:
preg["agepreg"].head()

0    3316.0
1    3925.0
2    1433.0
3    1783.0
4    1833.0
Name: agepreg, dtype: float64

In [21]:
preg["agepreg"].mean()

np.float64(2468.8151197039497)

- Convert the centi years to years by dividing them by 100

In [22]:
preg["agepreg"] /= 100.0
preg["agepreg"].mean()

np.float64(24.6881511970395)

- combine `birthwgt_lb` and `birthwgt_oz` into single column.

In [24]:
preg["birthwgt_oz"].value_counts(dropna=False).sort_index()

birthwgt_oz
0.0     1037
1.0      408
2.0      603
3.0      533
4.0      525
5.0      535
6.0      709
7.0      501
8.0      756
9.0      505
10.0     475
11.0     557
12.0     555
13.0     487
14.0     475
15.0     378
97.0       1
98.0       1
99.0      46
NaN     4506
Name: count, dtype: int64

- Replace empty data with NaN values

In [25]:
preg["birthwgt_oz"] = preg["birthwgt_oz"].replace([97, 98, 99], np.nan)

- Combine pounds and ounces in one column

In [28]:
preg["totalwgt_lb"] = preg["birthwgt_lb"] + preg["birthwgt_oz"] / 16.0
preg["totalwgt_lb"].mean()

np.float64(7.265628457623368)

## 1.6 Summary Statistics

A statistic is a number derived from a dataset, usually intended to quantify some aspect of the data. Examples include the count, mean, variance, and standard deviation.

- a series `count` method returns the number of values that are not NaN

In [30]:
weights = preg["totalwgt_lb"]
n = weights.count()
n

np.int64(9038)

- `sum` method that returns the sum of the values

In [37]:
weights.sum()

np.float64(65666.75)

In [39]:
mean = weights.mean()
mean

np.float64(7.265628457623368)

- Variance: How much population varies around it's mean

In [42]:
squared_deviations = (weights - mean) ** 2

In [43]:
variance = squared_deviation.sum() / n
variance

np.float64(1.983070989750022)

- Series provides `var` method to calculate variance.

In [45]:
weights.var() # by default it divides by n - 1

np.float64(1.9832904288326545)

In [46]:
weights.var(ddof=0)

np.float64(1.983070989750022)

- Standard Deviation

In [48]:
std = np.sqrt(variance)
std

np.float64(1.40821553384062)

In [49]:
weights.std(ddof=0)

np.float64(1.40821553384062)

## 1.7 Interpretation
The `query` method takes a string that can contain column names, comparison operators and numbers etc. and is used for selecting a row from dataframe based on certain condition.

In [51]:
subset = preg.query("caseid == 10229")
subset.shape

(7, 244)

In [54]:
subset

Unnamed: 0,caseid,pregordr,howpreg_n,howpreg_p,moscurrp,nowprgdk,pregend1,pregend2,nbrnaliv,multbrth,...,laborfor_i,religion_i,metro_i,basewgt,adj_mod_basewgt,finalwgt,secu_p,sest,cmintvw,totalwgt_lb
11093,10229,1,,,,,1.0,,,,...,0,0,0,1914.323805,2021.999794,3369.662656,2,65,1232,
11094,10229,2,,,,,1.0,,,,...,0,0,0,1914.323805,2021.999794,3369.662656,2,65,1232,
11095,10229,3,,,,,1.0,,,,...,0,0,0,1914.323805,2021.999794,3369.662656,2,65,1232,
11096,10229,4,,,,,1.0,,,,...,0,0,0,1914.323805,2021.999794,3369.662656,2,65,1232,
11097,10229,5,,,,,1.0,,,,...,0,0,0,1914.323805,2021.999794,3369.662656,2,65,1232,
11098,10229,6,,,,,1.0,,,,...,0,0,0,1914.323805,2021.999794,3369.662656,2,65,1232,
11099,10229,7,,,,,6.0,,1.0,,...,0,0,0,1914.323805,2021.999794,3369.662656,2,65,1232,7.6875


In [52]:
subset["outcome"].values

array([4, 4, 4, 4, 4, 4, 1])

In [55]:
weights.describe()

count    9038.000000
mean        7.265628
std         1.408293
min         0.125000
25%         6.500000
50%         7.375000
75%         8.125000
max        15.437500
Name: totalwgt_lb, dtype: float64

# 1.9 EXERCISES

## Exercise 1.1
Select the birthord column from preg, print the value counts, and compare to results published in the codebook at https://ftp.cdc.gov/pub/Health_Statistics/NCHS/Dataset_Documentation/NSFG/Cycle6Codebook-Pregnancy.pdf.

In [57]:
preg["birthord"].head()

0    1.0
1    2.0
2    1.0
3    2.0
4    3.0
Name: birthord, dtype: float64

In [64]:
birthord_count = preg["birthord"].value_counts(dropna=False).sort_index()
birthord_count

birthord
1.0     4413
2.0     2874
3.0     1234
4.0      421
5.0      126
6.0       50
7.0       20
8.0        7
9.0        2
10.0       1
NaN     4445
Name: count, dtype: int64

## Exercise 1.2
Create a new column named totalwgt_kg that contains birth weight in kilograms (there are approximately 2.2 pounds per kilogram). Compute the mean and standard deviation of the new column.

In [67]:
preg["birthwgt_lb"].head()

0    8.0
1    7.0
2    9.0
3    7.0
4    6.0
Name: birthwgt_lb, dtype: float64

In [68]:
preg["totalwgt_kg"] = preg["birthwgt_lb"] / 2.2
preg["totalwgt_kg"].head()

0    3.636364
1    3.181818
2    4.090909
3    3.181818
4    2.727273
Name: totalwgt_kg, dtype: float64

In [69]:
total_weight = preg["totalwgt_kg"]
total_weight.count()

np.int64(9084)

In [70]:
total_weight.mean()

np.float64(3.1055101877426843)

In [71]:
total_weight.std(ddof=0)

np.float64(0.6415313591386952)

## Exercise 1.3
What are the pregnancy lengths for the respondent with caseid 2298?

What was the birth weight of the first baby born to the respondent with caseid 5013? Hint: You can use and to check more than one condition in a query.

In [84]:
subset2 = preg.query("caseid == 2298")
subset3 = preg.query("caseid == 5013")

In [85]:
subset2

Unnamed: 0,caseid,pregordr,howpreg_n,howpreg_p,moscurrp,nowprgdk,pregend1,pregend2,nbrnaliv,multbrth,...,religion_i,metro_i,basewgt,adj_mod_basewgt,finalwgt,secu_p,sest,cmintvw,totalwgt_lb,totalwgt_kg
2610,2298,1,,,,,6.0,,1.0,,...,0,0,3247.916977,5123.759559,5556.717241,2,18,1234,6.875,2.727273
2611,2298,2,,,,,6.0,,1.0,,...,0,0,3247.916977,5123.759559,5556.717241,2,18,1234,5.5,2.272727
2612,2298,3,,,,,6.0,,1.0,,...,0,0,3247.916977,5123.759559,5556.717241,2,18,1234,4.1875,1.818182
2613,2298,4,,,,,6.0,,1.0,,...,0,0,3247.916977,5123.759559,5556.717241,2,18,1234,6.875,2.727273


In [86]:
subset3

Unnamed: 0,caseid,pregordr,howpreg_n,howpreg_p,moscurrp,nowprgdk,pregend1,pregend2,nbrnaliv,multbrth,...,religion_i,metro_i,basewgt,adj_mod_basewgt,finalwgt,secu_p,sest,cmintvw,totalwgt_lb,totalwgt_kg
5516,5013,1,,,,,5.0,,1.0,,...,0,0,3643.044395,4548.148695,6132.268885,1,25,1231,7.375,3.181818
5517,5013,2,,,,,3.0,,,,...,0,0,3643.044395,4548.148695,6132.268885,1,25,1231,,
5518,5013,3,,,,,5.0,,1.0,,...,0,0,3643.044395,4548.148695,6132.268885,1,25,1231,8.3125,3.636364
5519,5013,4,,,,,5.0,,1.0,,...,0,0,3643.044395,4548.148695,6132.268885,1,25,1231,8.125,3.636364


In [79]:
subset2["prglngth"]

2610    40
2611    36
2612    30
2613    40
Name: prglngth, dtype: int64

In [89]:
subset3["totalwgt_kg"].iloc[0]

np.float64(3.1818181818181817)