# American Time Use Survey

[Data and documentation](https://www.bls.gov/tus/data.htm)

In [None]:
import pandas as pd
import numpy as np
import requests
import io
import matplotlib.pyplot as plt
from dateutil.parser import parse
import datetime as dt
from zipfile import ZipFile
import os

In [None]:
# Load the ATUS multi-year files named in `files` into the `ATUS` dict,
# one pandas DataFrame per file key (e.g. ATUS['sum']).
#
# Each archive is read without writing anything to the working directory:
# the .dat member is opened straight from the zip, so there is nothing to
# extract and nothing to delete afterwards.
from zipfile import BadZipFile  # raised when the response body is not a zip

files = ['sum', 'resp']  # ["resp", "rost", "sum", "act", "cps", "who"]

ATUS = dict()

# multiyear data
# Zip files have URLs like https://www.bls.gov/tus/datafiles/atusrost-0322.zip
# Above pattern can change year to year. Update accordingly.

base_url = 'https://www.bls.gov/tus/datafiles/atus'
ending = f'-03{(dt.date.today() - dt.timedelta(days = 399)).strftime("%y")}.zip'

# Suffix of the data file stored inside each archive: "-0322.zip" -> "_0322.dat"
dat = ending.replace(".zip", '.dat').replace("-", '_')

for file in files:
    url = f'{base_url}{file}{ending}'
    member = f'atus{file}{dat}'

    # Try to get data directly from web
    try:
        # this doesn't work Aug '23 because BLS blocks bots
        r = requests.get(url, timeout=60)
        r.raise_for_status()
        with ZipFile(io.BytesIO(r.content)) as z:
            with z.open(member) as fh:
                ATUS[file] = pd.read_csv(fh)

    # Fall back to a manual browser download when the request is refused,
    # the body is not a zip archive, or the expected member is missing.
    except (requests.RequestException, BadZipFile, KeyError):
        print(url, 'click to download')
        input('Confirm Download')

        # Read the archive where the browser saved it; a missing file raises
        # FileNotFoundError so the problem is visible immediately.
        local_zip = os.path.join(os.path.expanduser('~'), 'Downloads',
                                 f'atus{file}{ending}')
        with ZipFile(local_zip, 'r') as z:
            with z.open(member) as fh:
                ATUS[file] = pd.read_csv(fh)

In [None]:
# Shorthand for the 'sum' table loaded into ATUS; the per-year samples are drawn from it.
dfs = ATUS['sum']


# Sample from each year

In [None]:
# Build df_sample: a random draw of up to 200 rows from every survey year
# present in dfs. The year is taken from the first four characters of
# TUCASEID, computed once for the whole frame. Years come from the data
# itself, so a newer multi-year file needs no edit here.
SAMPLE_PER_YEAR = 200

case_year = dfs.TUCASEID.astype(str).str[0:4]

# One sample per year, in ascending year order. min() keeps a year with fewer
# than SAMPLE_PER_YEAR rows from raising inside DataFrame.sample.
yearly_samples = [
    rows.sample(n=min(SAMPLE_PER_YEAR, len(rows)))
    for _, rows in dfs.groupby(case_year, sort=True)
]

# Concatenate once instead of growing the frame inside the loop.
df_sample = pd.concat(yearly_samples) if yearly_samples else pd.DataFrame()

In [None]:
# IPython magic: change the working directory, so relative paths used by cells
# run after this one (e.g. the saved histogram PDFs) resolve inside this folder.
%cd ~/Stats1101e/Histograms

# Make a lot of histograms

In [None]:
# Draw n histograms of the same sleep data, each with randomly placed,
# unequal-width bins, and save every one as hist<i>.pdf.
# t010101 is divided by 60 and plotted as "Hours of Sleep"
# (presumably the source column is in minutes -- confirm against the ATUS codebook).
sleep = df_sample.t010101 / 60
longest = sleep.max()

nums = np.linspace(0, longest, 100)

# Candidate interior bin edges. The grid end points (0 and the maximum) are
# excluded because they are always added explicitly below; drawing them
# again would create duplicate edges, i.e. zero-width bins, which break the
# density normalisation.
interior = nums[1:-1]

n = 50
for i in range(n):

    # First third of the plots: 10 random edges; middle third: 25; rest: 50.
    if i < n / 3:
        size = 10
    elif i < n * .66:
        size = 25
    else:
        size = 50

    cuts = np.sort(np.random.choice(interior, size=size, replace=False))

    intervals = [0] + list(cuts) + [longest]

    plt.hist(sleep,
             bins=intervals,
             ec='black',
             density=True)
    plt.yticks([])

    plt.gcf().set_size_inches(4, 3)
    plt.xlabel("Hours of Sleep")
    plt.tight_layout()
    plt.savefig(f"hist{i}.pdf")

    plt.show()
    # Release the figure so the next histogram starts on clean axes even on
    # backends where show() does not close it.
    plt.close()

In [None]:
# Save the sampled rows as CSV under a relative path (current working directory);
# to_csv is called with its defaults, so the frame's index is written as well.
df_sample.to_csv("ATUS_summary_sample.csv")

In [None]:
# Fine-grained histogram of the full 'sum' table, restricted to values
# strictly between 7 and 9 after dividing t010101 by 60.
hours_asleep = ATUS['sum'].t010101 / 60
in_window = (hours_asleep > 7) & (hours_asleep < 9)
sleeping = hours_asleep[in_window]

# Series.hist draws on the current axes and returns them.
ax = sleeping.hist(bins=121)
ax.set_yticks([])
ax.set_xlabel("Hours of Sleep")

plt.tight_layout()
plt.savefig("hist_sleep_small_bins.pdf")