## 

Note: This example follows the CRISP-DM (Cross-Industry Standard Process for Data Mining) framework.

In [17]:
import os
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
%matplotlib inline
import sys

In [4]:
def retrieve_password(row,column=1, print=False):
    '''
    Retrieve a secret stored locally in the ``~/.secret`` master record.

    The file is read as a header-less table whose fields are separated by "=".

    Parameters
    ----------
    row : int
        Zero-based row position of the entry in the master record.
    column : int, default 1
        Zero-based column position to read from that row.
    print : bool, default False
        [Not recommended] When True (the legacy string 'True' is also accepted),
        the secret is echoed to stdout so it can be verified.

    Returns
    -------
    The value found at position ``[row, column]`` of the parsed file.
    '''
    # The `print` parameter shadows the built-in inside this function, so the
    # real print function has to be reached through the builtins module.
    import builtins

    file=os.path.join(os.path.expanduser("~"), '.secret')
    secret=pd.read_csv(file, sep="=", header=None).iloc[row,column]
    # Accept the boolean flag as documented; keep the string form for old callers.
    if print is True or print=='True':
        builtins.print('\n \n password obtained: ', secret)
    return secret

In [5]:
#retrieve ENVIRONMENT keys
def retrieve_environ_key(key_name, row):
    '''
    Make sure an environment variable is available in the current session.

    Parameters
    ----------
    key_name : str
        Name of the variable as stored in ``os.environ``.
    row : int
        Row of the entry in the password master record, used only when the
        variable is missing from the session.

    If the variable is already set, nothing is changed. Otherwise the value is
    loaded with ``retrieve_password(row, 1)`` and stored in ``os.environ``.
    A status line is printed in both cases; the value itself is never printed
    or returned. Errors raised while reading the master record propagate to
    the caller.
    '''
    if key_name in os.environ:
        print(key_name, 'found' )
        return

    print("Environment variable does not exist in current session, loading variables")
    passwd=retrieve_password(row,1)
    # os.environ only accepts str values; a value parsed as a number from the
    # master record would otherwise raise TypeError here.
    os.environ[key_name]=str(passwd)
    print(key_name,'found')

In [91]:
#import personal library my_tollkit
retrieve_environ_key('MY_TOOLS_LOC',6)
my_tools_loc=os.environ.get('MY_TOOLS_LOC')
cwd=os.getcwd()

if not os.path.exists(os.path.join(os.getcwd(),'my_toolkit')):
    print('Error: library not found. creating symlink in cwd')
    !ln -s $my_tools_loc $cwd
    
else:
    #local packages
    import my_toolkit as my
    import importlib
    importlib.reload(my)
    print('library reloaded')

MY_TOOLS_LOC found
library reloaded


In [50]:
## Variables
# Root folder that holds one sub-folder per downloaded dataset.
# Set the MENTAL_HEALTH_DATA_DIR environment variable to point the notebook at
# another location; the original absolute path remains the default.
datasets=os.environ.get('MENTAL_HEALTH_DATA_DIR','/Volumes/Datasets/mental_health/data')

# 1- BUSINESS UNDERSTANDING: 
Identifying the problem and how to solve it

Mental health is an important aspect of human health. Understanding data sources available and general guidelines can help uncover the problems. 

# 2- DATA UNDERSTANDING
Analyze the available datasets to decide whether or not we need to collect additional data.

|Id|Source|Description from CDC website|
|--|--|--|
|df_cdc_1_brfss|[CDC - Behavioral Risk Factor Surveillance System](https://www.cdc.gov/brfss/index.html)|BRFSS collects information on health risk behaviors, preventative practices, and healthcare access. Questions include recent mentally unhealthy days, anxiety and depressive disorders, mental illness and stigma, and psychological distress.|
|df_cdc_2_hps_mh|[CDC - Household Pulse Survey-Mental Health](https://www.cdc.gov/nchs/covid19/health-care-access-and-mental-health.htm)|monitor trends in mental health, health insurance coverage, and problems accessing care|
|df_cdc_2_hps_ad|[CDC - Household Pulse Survey - Anxiety and Depression](https://data.cdc.gov/NCHS/Indicators-of-Anxiety-or-Depression-Based-on-Repor/8pt5-q6wp)|NHIS collects data on both adult and children’s mental health and mental disorders. For adults, this includes serious psychological distress and feelings of depression and anxiety. The NHIS also examines mental health service use and whether individuals have unmet mental health needs. Questions about recent anxiety or frequent stress have been included in previous years.|
|df_cdc_3_nhis|[The National Health Interview Survey (NHIS)](https://www.cdc.gov/nchs/nhis/2022nhis.htm)|NHIS is the principal source of information on the health of the civilian noninstitutionalized population of the United States and is one of the major data collection programs of the National Center for Health Statistics (NCHS) which is part of the Centers for Disease Control and Prevention (CDC).|
|df_cdc_4|[National Post-acute and Long-Term Care Study](https://www.cdc.gov/nchs/npals/index.htm)|NPALS monitors trends in the supply, provision, and use of the major sectors of paid, regulated long-term care services. Data cover mental illness, depression, and service use.|
|df_cdc_5|[National Violent Death Reporting System](https://www.cdc.gov/violenceprevention/datasources/nvdrs/index.html)|NVDRS can also provide details on the circumstances that may have led to violent deaths, including mental illness and mental disorders. **NOT AVAILABLE. Data only available by formal requests.** |
|df_cdc_6|[Pregnancy Risk Assessment Monitoring System](https://www.cdc.gov/prams/index.htm)|PRAMS collects data on maternal attitudes and experiences before, during, and after pregnancy. Surveillance research includes the prevalence of self-reported postpartum depression and anxiety symptoms. **NOT AVAILABLE. Data only available by formal requests.** |
|df_cdc_7|[WISQARS](https://www.cdc.gov/injury/wisqars/index.html)|CDC’s WISQARS (Web-based Injury Statistics Query and Reporting System) is an interactive database system that provides customized reports of injury-related data, such as intentional self-harm including suicide. **Manually entered data from [link](https://wisqars.cdc.gov/reports/?)**|
|df_cdc_8|[US Chronic Disease Indicators: Mental Health]()|Summary of CDC's Division of Population Health provides cross-cutting set of 124 indicators that were developed by consensus and that allows states and territories and large metropolitan areas to uniformly define, collect, and report chronic disease data that are important to public health practice |

## Downloading data

In [207]:
def delete_existing_file(d,datasets, folder):
    '''
    Write ``d`` to ``<datasets>/<folder>/raw.csv``, replacing any earlier copy.

    Parameters
    ----------
    d : pandas.DataFrame
        Frame to persist.
    datasets : str
        Root data directory.
    folder : str
        Sub-folder of ``datasets``; it must already exist.

    Any existing ``raw.csv`` is removed first, the new file is made
    world-readable/writable (mode 777, as before) and the first rows of ``d``
    are printed. Pure-Python file operations are used instead of shell
    commands so that paths containing spaces or shell metacharacters are safe.
    '''
    filename=os.path.join(datasets,folder,'raw.csv')
    if os.path.exists(filename):
        os.remove(filename)
    d.to_csv(filename)
    # NOTE(review): 0o777 mirrors the original `chmod 777`; tighten if the
    # files do not need to be writable by every user.
    os.chmod(filename, 0o777)
    print(d.head())

def get_cdc_data(folder, url):
    '''
    Download one data source into ``<datasets>/<folder>`` and return it.

    Parameters
    ----------
    folder : str
        Sub-folder of the module-level ``datasets`` directory to store files in.
        It is created (mode 777) when missing.
    url : str
        Location of the source. The last three characters select the handling:

        * ``zip`` - the archive is read into memory; every member ending in
          ``csv`` is loaded and written to ``raw.csv`` via
          ``delete_existing_file`` (a later CSV overwrites an earlier one),
          every other member is extracted into the folder.
        * ``txt`` - the file is saved under its own name.
        * anything else - the URL is read directly with ``pd.read_csv`` and
          written to ``raw.csv``.

    Returns
    -------
    pandas.DataFrame
        The last CSV that was read, or an empty frame when no CSV was read
        (``txt`` sources, or archives without a CSV member).
    '''
    import io
    import urllib.request as req
    import zipfile

    #building variables
    location= os.path.join(datasets,folder)

    #confirm folder exists
    if not os.path.exists(location):
        print('Error: path not found. creating folder')
        os.makedirs(location, exist_ok=True)
        os.chmod(location, 0o777)
        print('folder created')

    # Default result, so the return below is always bound even when the
    # source contains nothing tabular.
    d=pd.DataFrame()

    #get data
    suffix=url[-3:].lower()
    if suffix=='zip':
        # close the HTTP response as soon as the archive is in memory
        with req.urlopen(url) as fin:
            payload=io.BytesIO(fin.read())
        with zipfile.ZipFile(payload) as z:
            print(z.namelist())
            for f in z.namelist():
                if f[-3:]=='csv':
                    print('writting csv file')
                    with z.open(f) as member:
                        d=pd.read_csv(member)
                    delete_existing_file(d,datasets, folder)
                else:
                    print('extracting ', f, '\n')
                    z.extract(f,path=location)
    elif suffix=='txt':
        filename=os.path.join(location,url.split('/')[-1])
        # start from a clean slate so a stale copy is never left behind
        if os.path.exists(filename):
            os.remove(filename)
        #writting txt file
        req.urlretrieve(url,filename)
        os.chmod(filename, 0o777)
    else:
        d = pd.read_csv(url)
        delete_existing_file(d,datasets, folder)

    return d
    


In [206]:
# Download sources, keyed by the folder name each dataset is stored under.
# get_cdc_data chooses its handling from the last three characters of the URL:
# 'zip' -> archive, 'txt' -> plain file download, anything else -> pd.read_csv.
dict_data={
    
    # query URL selecting an explicit column list, ordered by year / location / display order
    'df_cdc_1_brfss':"https://data.cdc.gov/resource/dttw-5yxu.csv?$query=SELECT%0A%20%20%60year%60%2C%0A%20%20%60locationabbr%60%2C%0A%20%20%60locationdesc%60%2C%0A%20%20%60class%60%2C%0A%20%20%60topic%60%2C%0A%20%20%60question%60%2C%0A%20%20%60response%60%2C%0A%20%20%60break_out%60%2C%0A%20%20%60break_out_category%60%2C%0A%20%20%60sample_size%60%2C%0A%20%20%60data_value%60%2C%0A%20%20%60confidence_limit_low%60%2C%0A%20%20%60confidence_limit_high%60%2C%0A%20%20%60display_order%60%2C%0A%20%20%60data_value_unit%60%2C%0A%20%20%60data_value_type%60%2C%0A%20%20%60data_value_footnote_symbol%60%2C%0A%20%20%60data_value_footnote%60%2C%0A%20%20%60datasource%60%2C%0A%20%20%60classid%60%2C%0A%20%20%60topicid%60%2C%0A%20%20%60locationid%60%2C%0A%20%20%60breakoutid%60%2C%0A%20%20%60breakoutcategoryid%60%2C%0A%20%20%60questionid%60%2C%0A%20%20%60responseid%60%2C%0A%20%20%60geolocation%60%0AORDER%20BY%0A%20%20%60year%60%20DESC%20NULL%20FIRST%2C%0A%20%20%60locationabbr%60%20ASC%20NULL%20LAST%2C%0A%20%20%60display_order%60%20ASC%20NULL%20LAST"
    ,'df_cdc_2_hps':'https://data.cdc.gov/api/views/8pt5-q6wp/rows.csv?accessType=DOWNLOAD'
    ,'df_cdc_3_nhis':'https://ftp.cdc.gov/pub/Health_Statistics/NCHS/Datasets/NHIS/2022/adult22csv.zip'
    ,'df_cdc_4_npals':'https://data.cdc.gov/api/views/wibz-pb5q/rows.csv?accessType=DOWNLOAD'
    # NOTE(review): keyed as NVDRS, but the URL sits under /injury/wisqars/ - confirm the intended source
    ,'df_cdc_5_nvdrs':'https://www.cdc.gov/injury/wisqars/data/Top-Ten-Leading-Causes-of-Death-in-the-U.S.-for-Ages-1-44.txt'
    ,'df_cdc_8_cdimh':"https://data.cdc.gov/api/views/ixrt-gnsg/rows.csv?accessType=DOWNLOAD"
    
}


In [208]:
### getting data
# Fetch every configured source; each non-empty result is passed through
# my.clean.explore_df and the first returned frame is saved next to the raw file.
for folder, url in dict_data.items():
    df=get_cdc_data(folder, url)

    if not df.empty:
        #cleaning data: explore_df hands back a frame plus five column groupings
        (df_tmp,
         numerical_cols,
         categorical_cols,
         multi_valued_cols,
         binary_cols,
         single_valued_cols)=my.clean.explore_df(df)

        #write to disk as <datasets>/<folder>/<folder>.csv
        target_csv=os.path.join(datasets,folder,folder+'.csv')
        df_tmp.to_csv(target_csv)
    else:
        print ('empty dataframe')



Error: path found. creating folder
folder created
   year locationabbr locationdesc                      class       topic  \
0  2022           AK       Alaska  Chronic Health Indicators  Depression   
1  2022           AK       Alaska  Chronic Health Indicators  Depression   
2  2022           AK       Alaska  Chronic Health Indicators  Depression   
3  2022           AK       Alaska  Chronic Health Indicators  Depression   
4  2022           AK       Alaska  Chronic Health Indicators  Depression   

                                            question response break_out  \
0  Ever told you that you have a form of depression?      Yes   Overall   
1  Ever told you that you have a form of depression?       No   Overall   
2  Ever told you that you have a form of depression?      Yes      Male   
3  Ever told you that you have a form of depression?       No      Male   
4  Ever told you that you have a form of depression?      Yes    Female   

  break_out_category  sample_size  ...  da

unique values:  559
[23.5 32.7 25.7 24.8 23.2 18.4 13.6 14.4 20.8 26.1]
----------
Error: path found. creating folder
folder created
['adult22.csv', 'readme.txt']
writting csv file
   URBRRL  RATCAT_A  INCTCFLG_A  IMPINCFLG_A  SHOTTYPE1_A  CEVOTELC_A  \
0       2         7           0            0          NaN         NaN   
1       4        14           0            0          NaN         NaN   
2       4        14           0            0          NaN         NaN   
3       4        11           0            0          NaN         NaN   
4       1         2           0            1          NaN         NaN   

   CEMMETNG_A  CEVOLUN2_A  CEVOLUN1_A  HITTEST_A  ...  PROXYREL_A  PROXY_A  \
0         NaN         NaN         NaN        NaN  ...         NaN      NaN   
1         NaN         NaN         NaN        NaN  ...         NaN      NaN   
2         NaN         NaN         NaN        NaN  ...         NaN      NaN   
3         NaN         NaN         NaN        NaN  ...         NaN   

[nan  1.  3.  2.  9.]
----------
coloncan_a 

unique values:  4
[nan  2.  1.  9.  7.]
----------
colrccan_a 

unique values:  4
[nan  2.  1.  9.  7.]
----------
combat_a 

unique values:  4
[ 1. nan  2.  9.  7.]
----------
comdiff_a 

unique values:  6
[1 2 3 4 7 9]
----------
copdev_a 

unique values:  4
[2 1 7 9]
----------
cover65_a 

unique values:  7
[ 5. nan  3.  1.  4.  2.  7.  6.]
----------
cover_a 

unique values:  5
[nan  1.  3.  5.  2.  4.]
----------
cquita1_a 

unique values:  3
[nan  2.  1.  9.]
----------
cquita2_a 

unique values:  2
[nan  2.  1.]
----------
cquita3_a 

unique values:  2
[nan  2.  1.]
----------
cquita4_a 

unique values:  3
[nan  2.  1.  9.]
----------
cquita5_a 

unique values:  4
[nan  2.  1.  8.  9.]
----------
cquitb1_a 

unique values:  4
[nan  2.  1.  8.  9.]
----------
cquitb2_a 

unique values:  3
[nan  2.  1.  8.]
----------
cquitb3_a 

unique values:  3
[nan  2.  1.  8.]
----------
cvddiag_a 

unique values:  5
[2 1 8 7 9]
----------
cvdsev_

[nan  2.  1.  8.  9.  7.]
----------
hyp12m_a 

unique values:  3
[nan  2.  1.  9.]
----------
hypdif_a 

unique values:  4
[ 2. nan  1.  9.  7.]
----------
hypev_a 

unique values:  4
[1 2 7 9]
----------
hypmed_a 

unique values:  4
[ 1. nan  2.  9.  7.]
----------
ihs_a 

unique values:  4
[2 7 1 9]
----------
impincflg_a 

unique values:  3
[0 1 2]
----------
impnum_a 

unique values:  1
[1]
----------
incinter_a 

unique values:  5
[2 1 7 8 9]
----------
incothr_a 

unique values:  5
[ 2.  1.  7. nan  9.  8.]
----------
incretire_a 

unique values:  5
[ 1.  7. nan  2.  9.  8.]
----------
incssissdi_a 

unique values:  5
[ 2.  7. nan  1.  9.  8.]
----------
incssrr_a 

unique values:  5
[ 1.  7. nan  2.  9.  8.]
----------
inctcflg_a 

unique values:  2
[0 1]
----------
incwelf_a 

unique values:  5
[ 2.  7. nan  1.  9.  8.]
----------
incwrko_a 

unique values:  5
[2 1 8 7 9]
----------
intv_mon 

unique values:  12
[ 1  2  3  4  5  6  7  8  9 10]
----------
intv_qrt 

unique valu

[nan  2.  1.  9.  7.]
----------
tomsauno_a 

unique values:  31
[  0   4   1   2 998   3   6   5  15  10]
----------
tomsautp_a 

unique values:  7
[0 3 2 8 1 9 7]
----------
transpor_a 

unique values:  5
[2 8 1 7 9]
----------
uppobjct_a 

unique values:  6
[1 2 3 4 7 9]
----------
uppraise_a 

unique values:  6
[1 2 3 4 9 7]
----------
uppslfcr_a 

unique values:  6
[1 2 3 4 9 7]
----------
urbrrl 

unique values:  4
[2 4 1 3]
----------
urgnt12mtc_a 

unique values:  9
[0 1 7 2 5 4 3 8 9]
----------
usplkind_a 

unique values:  9
[ 1. nan  2.  7.  4.  3.  5.  6.  8.  9.]
----------
usualpl_a 

unique values:  6
[1 2 7 3 8 9]
----------
uterucan_a 

unique values:  4
[nan  2.  1.  9.  7.]
----------
vacareev_a 

unique values:  4
[ 2. nan  1.  9.  7.]
----------
vadisb_a 

unique values:  4
[ 2. nan  1.  9.  7.]
----------
vahosp_a 

unique values:  4
[ 2. nan  1.  9.  7.]
----------
vigfreqw_a 

unique values:  26
[94  1  3 98  2  0  6 96 99  7]
----------
viglnr_a 

unique values

binary:  

datasource
datavalueunit


-------------
single_valued:  

topic
topicid


-------------
-------------
Describing columns (showing max 10 unique values):
datasource 

unique values:  2
['PRAMS' 'BRFSS']
----------
datavalue 

unique values:  310
[ nan 12.   8.2  9.4 12.6 18.  12.2 11.3  7.2  9.9]
----------
datavaluealt 

unique values:  310
[ nan 12.   8.2  9.4 12.6 18.  12.2 11.3  7.2  9.9]
----------
datavaluefootnote 

unique values:  4
['No data available' nan
 'Sample size of denominator and/or age group for age-standardization is less than 50 or relative standard error is more than 30%'
 'US estimate/number is based on fewer than 50 states and the District of Columbia'
 '50 States + DC: US Median']
----------
datavaluefootnotesymbol 

unique values:  4
['-' nan '****' '**' '*']
----------
datavaluetype 

unique values:  3
['Crude Prevalence' 'Mean' 'Age-adjusted Mean']
----------
datavaluetypeid 

unique values:  3
['CRDPREV' 'MEAN' 'AGEADJMEAN']
----------
datavalueu

In [None]:
## only for individual datasets
# Works on whatever `df` and `folder` currently hold (after the download loop
# these are the values from its last iteration).
df_tmp, numerical_cols, categorical_cols, multi_valued_cols, binary_cols, single_valued_cols=my.clean.explore_df(df)
#write to disk
# `location` exists only inside get_cdc_data, so the target path is rebuilt
# here from `datasets` and `folder` instead of raising NameError.
df_tmp.to_csv(os.path.join(datasets,folder,folder+'.csv'))
df_tmp

## Exploring data

In [None]:
# Placeholder: no exploration step yet. A bare `na` is an undefined name and
# raises NameError, so print the marker the other placeholder cells use.
print('na')

## Encode

# 3- DATA PREPARATION
Transform the data into tabular form, then separate it into train, validation and test sets.

In [None]:
ml=False #mark as True if ML processing will be done on data

## Splitting data

In [None]:
# Placeholder for the splitting step. The original cell had an empty `if` body
# and an `else` without a colon, which is a SyntaxError.
if ml:
    pass  # TODO: add the data-splitting code
else:
    print('na')

# 4- MODELING
train the model

In [None]:
# Size the train / validation / test partitions and shuffle the rows of `df`.
# Runs only when the `ml` flag is set; otherwise prints the 'na' placeholder.
if ml:
    # NOTE(review): the original first line copied an undefined `df_clean` into
    # an unused name. Everything below works on `df`; if a cleaned frame should
    # be used instead, assign it to `df` before this cell - confirm intent.

    ### var def
    percentage_val=0.2
    percentage_test=0.2

    ### sizing
    n=df.shape[0]
    n_val=int(n*percentage_val)
    n_test=int(n*percentage_test)
    n_train=n-(n_val+n_test)  # the remainder, so the three sizes always sum to n

    ### shuffling
    np.random.seed(2)  # fixed seed keeps the shuffle reproducible
    idx=np.arange(n)
    np.random.shuffle(idx)
    df_shuffled=df.iloc[idx]

    ### verifying
    print('df \n:',df.iloc[:5,:5])
    print('\n')
    print('shuffled \n:',df_shuffled.iloc[:5,:5])
else:
    print('na')

In [None]:
# Cut the shuffled frame into train / validation / test partitions and pop the
# target column out of each. Runs only when the `ml` flag is set.
if ml:
    # NOTE(review): 'msrp' looks like a leftover placeholder - set this to the
    # target column of the dataset actually being modelled.
    y='msrp' #insert name of column to pop as y

    # Partition order matches the sizes computed earlier:
    # n_train rows, then n_val validation rows, then the n_test remainder.
    # (The validation and test names were previously attached to each other's slice.)
    df_train=df_shuffled.iloc[:n_train].copy()
    df_valid=df_shuffled.iloc[n_train:n_train+n_val].copy()
    df_test=df_shuffled.iloc[n_train+n_val:].copy()

    #extract y
    # NOTE(review): no log transform is applied to the target here, although an
    # earlier comment mentioned one - add it explicitly if it is wanted.
    y_train=df_train.pop(y)
    y_test=df_test.pop(y)
    y_valid=df_valid.pop(y)
    print(df_train)

else:
    print('na')

# 5- EVALUATION
evaluate the accuracy of the model

In [None]:
# Placeholder for the evaluation step. The original cell had an empty `if` body
# and an `else` without a colon, which is a SyntaxError.
if ml:
    pass  # TODO: add the model-evaluation code
else:
    print('na')

# 6- DEPLOYMENT

In [None]:
# Placeholder for the deployment step. The original cell had an empty `if` body
# and an `else` without a colon, which is a SyntaxError.
if ml:
    pass  # TODO: add the deployment code
else:
    print('na')

## REFERENCES

<sup>1</sup> [MLBookcamp Book](https://mlbookcamp.com)<br>


In [16]:
## ACRONYMS USED
# Glossary of the abbreviations that appear in this notebook, printed as a
# numbered list (one "<n> .  <acronym> = <meaning>" line per entry).

dic_acr=dict([
    ('CRISP-DM', 'Cross-Industry Standard Process for Data Mining'),
])

print('LIST OF ACRONYMS USED \n')
counter=1
for k in dic_acr:
    v=dic_acr[k]
    print(counter, '. ',k,'=', v)
    counter+=1

LIST OF ACRONYMS USED 

1 .  CRISP-DM = Cross-Industry Standard Process for Data Mining
