In [None]:
import numpy as np
import pandas as pd

### Relevant Variables

Link to Study: <a href='https://www.icpsr.umich.edu/web/ICPSR/series/253/studies'>https://www.icpsr.umich.edu/web/ICPSR/series/253/studies</a>
<table>
    <thead>
        <tr>
            <th scope='col'>Original Variable</th>
            <th scope='col'>Renamed Variable</th>
            <th scope='col'>Description</th>
    </thead>
    <tbody>
        <tr>
            <td>SWANID</td>
            <td>SWANID</td>
            <td>Respondent ID for the study</td>
        <tr>
            <td>AGE[n]</td>
            <td>age</td>
            <td>
                Age at current visit (Integer)<br><br>
                <i>
                    n denotes which dataset the variable belongs to,<br>
                    with 0 being the baseline and 1-10 being each visit
                </i>
            </td>
        <tr>
            <td>RACE</td>
            <td>race</td>
            <td>Race/ethnicity</td>
        <tr>
            <td>STATUS[n]</td>
            <td>meno_status</td>
            <td>
                Menopausal Status<br><br>
                <i>
                    n denotes which dataset the variable belongs to,<br>
                    with 0 being the baseline and 1-10 being each visit
                </i>
            </td>
    </tobdy>
</table>

In [76]:
#Dictionaries for translating variables
race_dict = {1:'Black/African American', 
             2:'Chinese/Chinese American', 
             3:'Japanese/Japnese America', 
             4:'Caucasian/White Non-Hispanic', 
             5:'Hispanic'
             }

meno_status_dict = {'1':'Post by Bilateral Salpingo Oophorectomy', 
                    '2':'Natural post',
                    '3':'Late perimenopause',
                    '4':'Early perimenopause',
                    '5':'Pre-menopausal',
                    '6':'Pregnant/breastfeeding',
                    '7':'Unknown due to hormone therapy use',
                    '8':'Unknown due to hysterectomy',
                    ' ':np.nan
                    }

In [93]:

def basic_cleanup (df):

    '''Translate common data points present in all the studies, namely age, ethnicity, and menopausal status
    
    Parameters:
        df (pandas.DataFrame): Dataframe to be cleaned up, Before running this function, filter the dataframe to only 
        include your relevant columns, and rename the appropriate cloumns to 'age', 'race', and 'meno_status' respectively
    '''

    #Fills empty cells in age with NaN
    df['age'] = df['age'].replace({' ':np.nan})

    #Translate race values
    df['race'] = df['race'].replace(race_dict)

    #Translate menopause status values
    df['meno_status'] = df['meno_status'].replace(meno_status_dict)

    return df

In [117]:
visit_00 = pd.read_csv(r'..\ICPSR\ICPSR_28762 (Baseline Visit 00)\DS0001\Visit_00.tsv', sep='\t')

#Reduce dataframe to only the relevant columns
visit_00 = visit_00.filter(['SWANID', 'AGE0', 'RACE', 'STATUS0'], axis=1)

visit_00 = visit_00.rename(columns={'AGE0':'age', 'RACE':'race', 'STATUS0':'meno_status'})

basic_cleanup(visit_00)

visit_00.meno_status.value_counts()

  visit_00 = pd.read_csv(r'..\ICPSR\ICPSR_28762 (Baseline Visit 00)\DS0001\Visit_00.tsv', sep='\t')


meno_status
Pre-menopausal                        1761
Early perimenopause                   1493
Unknown due to hormone therapy use       6
Name: count, dtype: int64

In [115]:
visit_01 = pd.read_csv(r'..\ICPSR\ICPSR_29221-V3 (Visit 01)\ICPSR_29221\DS0001\Visit_01.tsv', sep='\t')

#Reduce dataframe to only the relevant columns
visit_01 = visit_01.filter(['SWANID', 'AGE1', 'RACE', 'STATUS1'], axis=1)

visit_01 = visit_01.rename(columns={'AGE1':'age', 'RACE':'race', 'STATUS1':'meno_status'})

#Rename columns
basic_cleanup (visit_01)

#Drops every row where respondent doesn't list age or menopause status
visit_01 = visit_01.dropna(how='all', subset=['age', 'meno_status'])

visit_01

Unnamed: 0,SWANID,age,race,meno_status
0,10046,53,Chinese/Chinese American,Unknown due to hormone therapy use
1,10056,52,Caucasian/White Non-Hispanic,Early perimenopause
2,10092,46,Caucasian/White Non-Hispanic,Early perimenopause
3,10126,50,Black/African American,Early perimenopause
4,10153,52,Japanese/Japnese America,Early perimenopause
...,...,...,...,...
2876,99879,45,Black/African American,Early perimenopause
2877,99888,49,Japanese/Japnese America,Unknown due to hormone therapy use
2878,99898,46,Caucasian/White Non-Hispanic,Early perimenopause
2879,99962,48,Chinese/Chinese American,Early perimenopause


In [104]:
visit_02 = pd.read_csv(r'..\ICPSR\ICPSR_29401 (Visit 02)\DS0001\Visit_02.tsv', sep='\t')

#Reduce dataframe to only the relevant columns
visit_02 = visit_02.filter(['SWANID', 'AGE2', 'RACE', 'STATUS2'], axis=1)

#Rename columns
visit_02 = visit_02.rename(columns={'AGE2':'age', 'RACE':'race', 'STATUS2':'meno_status'})

basic_cleanup(visit_02)

#Drops every row where respondent doesn't list age or menopause status
visit_02 = visit_02.dropna(how='all', subset=['age', 'meno_status'])

visit_02

Unnamed: 0,SWANID,age,race,meno_status
0,10046,54,Chinese/Chinese American,Unknown due to hormone therapy use
1,10056,53,Caucasian/White Non-Hispanic,Early perimenopause
2,10126,50,Black/African American,Early perimenopause
3,10153,53,Japanese/Japnese America,Late perimenopause
4,10196,48,Chinese/Chinese American,Early perimenopause
...,...,...,...,...
2743,99809,45,Caucasian/White Non-Hispanic,Unknown due to hormone therapy use
2744,99888,50,Japanese/Japnese America,Early perimenopause
2745,99898,47,Caucasian/White Non-Hispanic,Early perimenopause
2746,99962,49,Chinese/Chinese American,Early perimenopause


In [125]:
visit_03 = pd.read_csv(r'..\ICPSR\ICPSR_29701 (Visit 03)\DS0001\Visit_03.tsv', sep='\t')

#Reduce dataframe to only the relevant columns
visit_03 = visit_03.filter(['SWANID', 'AGE3', 'RACE', 'STATUS3'], axis=1)

#Rename columns
visit_03 = visit_03.rename(columns={'AGE3':'age', 'RACE':'race', 'STATUS3':'meno_status'})

basic_cleanup(visit_03)

#Drops every row where respondent doesn't list age or menopause status
visit_03 = visit_03.dropna(how='all', subset=['age', 'meno_status'])

visit_03

Unnamed: 0,SWANID,age,race,meno_status
0,10046,55,Chinese/Chinese American,Unknown due to hormone therapy use
1,10056,54,Caucasian/White Non-Hispanic,Natural post
2,10126,51,Black/African American,Early perimenopause
3,10153,54,Japanese/Japnese America,Natural post
4,10196,49,Chinese/Chinese American,Early perimenopause
...,...,...,...,...
2704,99809,46,Caucasian/White Non-Hispanic,Unknown due to hormone therapy use
2705,99888,52,Japanese/Japnese America,Early perimenopause
2706,99898,48,Caucasian/White Non-Hispanic,Early perimenopause
2707,99962,50,Chinese/Chinese American,Early perimenopause
