# Final analysis

This notebook contains the final form of our analysis as well as the final form of the data cleaning process.

In [None]:
import glob
import os
import pandas as pd
# NOTE(review): absolute, machine-specific location of the per-wave Stata files;
# change this to point at your local copy of the data.
path = r'/home/daniel/code/costofcare/data/bhp_ukhls_data_stata/UKDA-6614-stata/stata/ukhls'


def find_wave_files(directory, pattern='*indresp.dta'):
    """Return the files in `directory` matching `pattern`, sorted by path.

    `glob.glob` returns matches in arbitrary order. The wave number is later
    assigned from each file's position in this list (first file -> wave 1),
    so the list is sorted to make that numbering deterministic.
    """
    return sorted(glob.glob(os.path.join(directory, pattern)))


def wave_prefix(filename):
    """Return the first two characters of the file's base name (e.g. 'a_').

    Uses `os.path.basename` instead of splitting on '/' so the result does not
    depend on the platform's path separator.
    """
    return os.path.basename(filename)[0:2]


all_files = find_wave_files(path)

# Un-prefixed names of the variables to keep. In each file they are requested
# as <prefix><name>, where the prefix is taken from the file name.
raw_colnames = ['hidp',
                'sex',
                'jbnssec_dv',
                'jbstat',
                'jbhrs',
                'ethn_dv',
                'dvage',
                'mastat_dv',
                'country',
                'aidhh',
                'aidhrs',
                'aidxhh',
                'paygu_dv',
                'qfhigh',
                'qfhigh_dv',
                'fimnlabgrs_dv',
                'istrtdatm',
                'istrtdaty',
                'month']

indresp = []   # one DataFrame per file, in the same (sorted) order as all_files
prefixes = []  # for later
for filename in all_files:
    prefix = wave_prefix(filename)
    prefixes.append(prefix)
    colnames = [f'{prefix}{x}' for x in raw_colnames]
    # 'pidp' is requested without a prefix; every other column carries the file's prefix.
    temp_df = pd.read_stata(filename,
                            columns=['pidp'] + colnames)
    indresp.append(temp_df)

In [None]:
# Give every per-wave frame the same, un-prefixed column names and a `wave` column.
# The names are assigned by position, so this relies on each frame's columns
# being in exactly this order (with `wave` appended last, just before renaming).
harmonised_names = (
    'pidp hidp sex jbnssec_dv jbstat jbhrs ethn_dv dvage mastat_dv country '
    'aidhh aidhrs aidxhh paygu_dv qfhigh qfhigh_dv fimnlabgrs_dv '
    'istrtdatm istrtdaty month wave'
).split()

for i, df in enumerate(indresp):
    # Wave number = 1-based position of the frame in `indresp`.
    df['wave'] = i + 1
    df.columns = list(harmonised_names)

In [None]:
# Stack the per-wave frames into a single long table (one row per person per wave).
data = pd.concat(indresp)

# For every row, the number of non-missing `wave` entries its person (pidp) has
# in the pooled data, i.e. how many rows that person contributes.
rows_per_person = data.groupby('pidp')['wave'].transform('count')
data['max_waves'] = rows_per_person

# Number of distinct individuals in the pooled data.
data['pidp'].nunique(dropna=False)

In [None]:
# Narrow view used to sanity-check the per-person row counts.
check = data.loc[:, ['wave', 'pidp', 'max_waves']]

In [None]:
# How many distinct people have more than five rows (i.e. at least six)?
has_more_than_five = check['max_waves'] > 5
check.loc[has_more_than_five, 'pidp'].nunique(dropna=False)

## Decision
We will include only individuals who have at least six data points (waves), whether or not those waves are consecutive and whether or not they start from wave 1.

In [None]:
# Minimum number of rows (waves) a person must have to be kept.
MIN_WAVES = 6

# Keep only people with at least MIN_WAVES rows (equivalent to max_waves > 5).
# The explicit .copy() makes data_6 an independent frame: the following cells
# add columns to it, and assigning to a filtered slice of `data` would
# otherwise raise pandas' SettingWithCopyWarning.
data_6 = data[data.max_waves >= MIN_WAVES].copy()

# Number of distinct individuals remaining after the filter.
len(data_6.pidp.unique())

## Decision
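Recode the treatment variables (`aidhh`, `aidxhh` and `aidhrs`): harmonise the differently spelled answer labels and treat non-substantive answers (e.g. refusal, proxy, inapplicable) as missing.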

In [None]:
import numpy as np
# Recode `aidhh` (one of the variables used to define `carer`) to 'yes' / 'no' / NaN.
# Labels are matched exactly, including the space-padded spellings listed here;
# any label not listed is left unchanged.
# NOTE(review): a label padded differently from the two variants below would
# pass through unrecoded - confirm against the observed values.
aidhh_yes_no_labels = {'Yes': 'yes',
                       'No': 'no',
                       'no     ': 'no',
                       'yes    ': 'yes'}
aidhh_missing_labels = ['Not available for IEMB',
                        "don't know",
                        'proxy',
                        'missing',
                        'refusal',
                        'inapplicable']
aidhh_recode_map = {**aidhh_yes_no_labels,
                    **dict.fromkeys(aidhh_missing_labels, np.nan)}

data_6['aidhh_recoded'] = data_6['aidhh'].replace(aidhh_recode_map)

In [None]:
# Recode `aidxhh` (one of the variables used to define `carer`) to 'yes' / 'no' / NaN.
# Labels are matched exactly, including the space-padded spellings listed here;
# any label not listed is left unchanged.
aidxhh_yes_no_labels = {'Yes': 'yes',
                        'No': 'no',
                        'no     ': 'no',
                        'yes    ': 'yes'}
aidxhh_missing_labels = ['Not available for IEMB',
                         "don't know",
                         'proxy',
                         'missing',
                         'refusal',
                         'inapplicable']
aidxhh_recode_map = {**aidxhh_yes_no_labels,
                     **dict.fromkeys(aidxhh_missing_labels, np.nan)}

data_6['aidxhh_recoded'] = data_6['aidxhh'].replace(aidxhh_recode_map)

In [None]:
# Collapse the detailed weekly-hours labels of `aidhrs` into three bands
# ('0-19', '20-49', '50+'); non-substantive answers become NaN.
aidhrs_band_labels = {
    "0-19": ["0 - 4 hours per week",
             "5 - 9 hours per week",
             "10 - 19 hours per week",
             "10-19 hours per week",
             "varies under 20 hours",
             "Varies under 20 hours"],
    "20-49": ["20 - 34 hours per week",
              "20-34 hours per week",
              "35 - 49 hours per week",
              "35-49 hours per week",
              "varies 20 hours or more",
              "Varies 20 hours or more"],
    "50+": ["50 - 99 hours per week",
            "50-99 hours per week",
            "100 or more hours per week/continuous care"],
}
aidhrs_missing_labels = ["inapplicable",
                         "proxy",
                         "Not available for IEMB",
                         "Other",
                         "other",
                         "don't know",
                         "refusal"]

# Flatten {band: [labels]} into the {label: band} lookup that replace() expects.
aidhrs_recode_map = {label: band
                     for band, labels in aidhrs_band_labels.items()
                     for label in labels}
aidhrs_recode_map.update(dict.fromkeys(aidhrs_missing_labels, np.nan))

# Labels are whitespace-stripped before the lookup, so the keys carry no padding.
# Any label not listed above is left unchanged (and therefore stays non-null).
data_6['aidhrs_recoded_3'] = data_6['aidhrs'].str.strip().replace(aidhrs_recode_map)

In [None]:
# A row counts as a carer when at least one of the following holds:
#   - aidhh_recoded is 'yes'
#   - aidxhh_recoded is 'yes'
#   - aidhrs_recoded_3 is not missing
# Rows where none of these hold (including rows where all three are missing) are False.
says_yes_aidhh = data_6['aidhh_recoded'].eq('yes')
says_yes_aidxhh = data_6['aidxhh_recoded'].eq('yes')
has_hours_value = data_6['aidhrs_recoded_3'].notna()

data_6['carer'] = says_yes_aidhh | says_yes_aidxhh | has_hours_value

## Decision
We will use a "generalised" moment of treatment with at least 3 measures before, and at least 1 after.