In [358]:
import pandas as pd
from pathlib import Path

pd.options.mode.copy_on_write = True
pd.options.future.infer_string = True
pd.options.plotting.backend = "plotly"

In [359]:
this_file_dir = Path(".").resolve()
bld = this_file_dir / "bld"

In [360]:
raw = pd.read_stata(bld / "BEHAVIOR_PROBLEMS_INDEX.dta")
info = pd.read_csv(bld / "bpi_variable_info.csv")

In [361]:
raw.head()

Unnamed: 0,C0000100,C0000200,C0005800,C0564000,C0564100,C0564200,C0564300,C0564400,C0564500,C0564600,...,Y3249000,Y3249001,Y3249100,Y3249101,Y3249200,Y3249201,Y3249300,Y3249301,Y3249400,Y3249401
0,201.0,2.0,1,-7.0,-7.0,-7.0,-7.0,-7.0,-7.0,-7.0,...,-7.0,-7.0,-7.0,-7.0,-7.0,-7.0,-7.0,-7.0,-7.0,-7.0
1,202.0,2.0,2,-7.0,-7.0,-7.0,-7.0,-7.0,-7.0,-7.0,...,-7.0,-7.0,-7.0,-7.0,-7.0,-7.0,-7.0,-7.0,-7.0,-7.0
2,301.0,3.0,1,NOT TRUE,NOT TRUE,NOT TRUE,NOT TRUE,SOMETIMES TRUE,NOT TRUE,NOT TRUE,...,-7.0,-7.0,-7.0,-7.0,-7.0,-7.0,-7.0,-7.0,-7.0,-7.0
3,302.0,3.0,2,-7.0,-7.0,-7.0,-7.0,-7.0,-7.0,-7.0,...,-7.0,-7.0,-7.0,-7.0,-7.0,-7.0,-7.0,-7.0,-7.0,-7.0
4,303.0,3.0,3,-7.0,-7.0,-7.0,-7.0,-7.0,-7.0,-7.0,...,Not True,Not True,Not True,Not True,-7.0,-7.0,-7.0,-7.0,-7.0,-7.0


In [362]:
info.head()

Unnamed: 0,nlsy_name,readable_name,survey_year,label
0,C0000100,childid,invariant,id code of child
1,C0000200,momid,invariant,id code of mother of child
2,C0005800,birth_order,invariant,birth order of child
3,C0564000,anxiety_mood,1986,ch has sud chgs in mood/feelng
4,C0564100,anxiety_complain,1986,ch cmplns no one loves him/her


In [363]:
info_T = info.T
info_T.columns = info_T.iloc[0]
info_T = info_T.drop(index=["nlsy_name"])
info_T = info_T.rename_axis(None, axis=1)
info_T


Unnamed: 0,C0000100,C0000200,C0005800,C0564000,C0564100,C0564200,C0564300,C0564400,C0564500,C0564600,...,C5191900,C5192000,C5192100,C5192200,C5192300,C5192400,C5192500,C5192600,C5192800,C5192900
readable_name,childid,momid,birth_order,anxiety_mood,anxiety_complain,headstrong_tense,antisocial_cheat,anxiety_fearful,headstrong_argues,hyperactive_concentration,...,dependent_clings,dependent_cries,dependent_attention,dependent_dependent,additional_gethim,additional_hangout,additional_secretive,additional_worries,antisocial_disob_school,antisocial_teachers
survey_year,invariant,invariant,invariant,1986,1986,1986,1986,1986,1986,1986,...,2010,2010,2010,2010,2010,2010,2010,2010,2010,2010
label,id code of child,id code of mother of child,birth order of child,ch has sud chgs in mood/feelng,ch cmplns no one loves him/her,"ch is high strung, tense, nerv",child cheats or tells lies,ch is too fearful or anxious,child argues too much,ch has diff concentrating,...,child clings to adults,child cries too much,ch demands a lot of attention,child too dependent on others,feels othrs out to get him/her,hang arnd kids who get in trbl,child is secretive,child worries too much,ch is disobedient at school,trouble getting along w/tchrs


In [364]:
def _clean_bpi_variables(raw_df, info_df):
     raw_df = raw_df[info_df["nlsy_name"]]  # choosing variables that are available in info 
     clean_variables = dict(zip(info_df.nlsy_name, info_df.readable_name +  ' ' + info_df.survey_year)) # creating a dictionary to rename columns in raw data
     return raw_df.rename(columns=clean_variables)

In [365]:
def _clean_bpi_cat(sr):
    sr = sr.replace([-7.0, -3.0, -2.0, -1.0], pd.NA)
    sr = sr.replace({'Never Attended School': pd.NA, 'Multiple selection': pd.NA })
    categories = ["not true", "sometimes true", "often true"]
    sr = sr.astype(pd.StringDtype()).str.lower().astype(pd.CategoricalDtype(categories=categories, ordered=True))
    return sr

In [366]:
def clean_year_data(raw, info):
    df = pd.DataFrame(index=raw.index)
    df = _clean_bpi_variables(raw,info)
    for i in df.columns[:2]:
        df[i]= df[i].astype(pd.Int32Dtype()).astype(pd.CategoricalDtype())
    for i in df.columns[3:]:
        df[i] = _clean_bpi_cat(df[i])
    return df


In [367]:
clean_year_data(raw, info)

Unnamed: 0,childid invariant,momid invariant,birth_order invariant,anxiety_mood 1986,anxiety_complain 1986,headstrong_tense 1986,antisocial_cheat 1986,anxiety_fearful 1986,headstrong_argues 1986,hyperactive_concentration 1986,...,dependent_clings 2010,dependent_cries 2010,dependent_attention 2010,dependent_dependent 2010,additional_gethim 2010,additional_hangout 2010,additional_secretive 2010,additional_worries 2010,antisocial_disob_school 2010,antisocial_teachers 2010
0,201,2,1,,,,,,,,...,,,,,,,,,,
1,202,2,2,,,,,,,,...,,,,,,,,,,
2,301,3,1,not true,not true,not true,not true,sometimes true,not true,not true,...,,,,,,,,,,
3,302,3,2,,,,,,,,...,,,,,,,,,,
4,303,3,3,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11516,1267201,12672,1,,,,,,,,...,,,,,,,,,,
11517,1267202,12672,2,,,,,,,,...,,,,,,,,,,
11518,1267301,12673,1,,,,,,,,...,,,,,,,,,,
11519,1267302,12673,2,,,,,,,,...,,,,,,,,,,


In [368]:
df = clean_year_data(raw, info)
df

Unnamed: 0,childid invariant,momid invariant,birth_order invariant,anxiety_mood 1986,anxiety_complain 1986,headstrong_tense 1986,antisocial_cheat 1986,anxiety_fearful 1986,headstrong_argues 1986,hyperactive_concentration 1986,...,dependent_clings 2010,dependent_cries 2010,dependent_attention 2010,dependent_dependent 2010,additional_gethim 2010,additional_hangout 2010,additional_secretive 2010,additional_worries 2010,antisocial_disob_school 2010,antisocial_teachers 2010
0,201,2,1,,,,,,,,...,,,,,,,,,,
1,202,2,2,,,,,,,,...,,,,,,,,,,
2,301,3,1,not true,not true,not true,not true,sometimes true,not true,not true,...,,,,,,,,,,
3,302,3,2,,,,,,,,...,,,,,,,,,,
4,303,3,3,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11516,1267201,12672,1,,,,,,,,...,,,,,,,,,,
11517,1267202,12672,2,,,,,,,,...,,,,,,,,,,
11518,1267301,12673,1,,,,,,,,...,,,,,,,,,,
11519,1267302,12673,2,,,,,,,,...,,,,,,,,,,


In [369]:
info.readable_name.unique()

<ArrowStringArrayNumpySemantics>
[                  'childid',                     'momid',
               'birth_order',              'anxiety_mood',
          'anxiety_complain',          'headstrong_tense',
          'antisocial_cheat',           'anxiety_fearful',
         'headstrong_argues', 'hyperactive_concentration',
      'hyperactive_confused',          'antisocial_bully',
     'headstrong_disob_home',       'antisocial_notsorry',
             'peer_getalong',     'hyperactive_impulsive',
         'anxiety_worthless',             'peer_notliked',
      'hyperactive_thoughts',      'hyperactive_restless',
       'headstrong_stubborn',         'headstrong_temper',
               'anxiety_sad',            'peer_withdrawn',
          'antisocial_break',          'dependent_clings',
           'dependent_cries',       'dependent_attention',
       'dependent_dependent',   'antisocial_disob_school',
       'antisocial_teachers',         'additional_gethim',
        'additional_han

In [370]:
new_df = pd.wide_to_long(df, stubnames= info.readable_name.unique(),i= 'childid invariant', j='year', sep= " ")
new_df = new_df.drop(columns=['childid','momid','birth_order'])

In [371]:
new_df.index.names = ['childid', 'year']
new_df = new_df.rename(columns= {'momid invariant': 'momid', 'birth_order invariant': 'birth_order'})

In [372]:
new_df = new_df.sort_index()
new_df

Unnamed: 0_level_0,Unnamed: 1_level_0,momid,birth_order,anxiety_mood,anxiety_complain,headstrong_tense,antisocial_cheat,anxiety_fearful,headstrong_argues,hyperactive_concentration,hyperactive_confused,...,dependent_clings,dependent_cries,dependent_attention,dependent_dependent,antisocial_disob_school,antisocial_teachers,additional_gethim,additional_hangout,additional_secretive,additional_worries
childid,year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
201,1986,2,1,,,,,,,,,...,,,,,,,,,,
201,1988,2,1,,,,,,,,,...,,,,,,,,,,
201,1990,2,1,,,,,,,,,...,,,,,,,,,,
201,1992,2,1,,,,,,,,,...,,,,,,,,,,
201,1994,2,1,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1267501,2002,12675,1,,,,,,,,,...,,,,,,,,,,
1267501,2004,12675,1,,,,,,,,,...,,,,,,,,,,
1267501,2006,12675,1,,,,,,,,,...,,,,,,,,,,
1267501,2008,12675,1,,,,,,,,,...,,,,,,,,,,


In [373]:
new_df.loc[201]

Unnamed: 0_level_0,momid,birth_order,anxiety_mood,anxiety_complain,headstrong_tense,antisocial_cheat,anxiety_fearful,headstrong_argues,hyperactive_concentration,hyperactive_confused,...,dependent_clings,dependent_cries,dependent_attention,dependent_dependent,antisocial_disob_school,antisocial_teachers,additional_gethim,additional_hangout,additional_secretive,additional_worries
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1986,2,1,,,,,,,,,...,,,,,,,,,,
1988,2,1,,,,,,,,,...,,,,,,,,,,
1990,2,1,,,,,,,,,...,,,,,,,,,,
1992,2,1,,,,,,,,,...,,,,,,,,,,
1994,2,1,,,,,,,,,...,,,,,,,,,,
1996,2,1,,,,,,,,,...,,,,,,,,,,
1998,2,1,not true,not true,not true,not true,sometimes true,not true,not true,not true,...,not true,not true,sometimes true,not true,not true,not true,not true,not true,not true,sometimes true
2000,2,1,not true,sometimes true,not true,not true,sometimes true,not true,sometimes true,not true,...,not true,sometimes true,sometimes true,sometimes true,,,not true,not true,not true,not true
2002,2,1,not true,not true,not true,sometimes true,not true,not true,not true,not true,...,not true,not true,not true,not true,not true,not true,not true,not true,not true,not true
2004,2,1,not true,not true,not true,not true,not true,not true,not true,not true,...,not true,not true,not true,not true,not true,not true,not true,not true,not true,not true


In [374]:
mapping_dict = {
    'not true': 0,
    'sometimes true': 1,
    'often true': 1
}
for i in new_df.columns[2:]:
    new_df[i] = new_df[i].map(mapping_dict)

In [375]:
new_df.loc[201]

Unnamed: 0_level_0,momid,birth_order,anxiety_mood,anxiety_complain,headstrong_tense,antisocial_cheat,anxiety_fearful,headstrong_argues,hyperactive_concentration,hyperactive_confused,...,dependent_clings,dependent_cries,dependent_attention,dependent_dependent,antisocial_disob_school,antisocial_teachers,additional_gethim,additional_hangout,additional_secretive,additional_worries
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1986,2,1,,,,,,,,,...,,,,,,,,,,
1988,2,1,,,,,,,,,...,,,,,,,,,,
1990,2,1,,,,,,,,,...,,,,,,,,,,
1992,2,1,,,,,,,,,...,,,,,,,,,,
1994,2,1,,,,,,,,,...,,,,,,,,,,
1996,2,1,,,,,,,,,...,,,,,,,,,,
1998,2,1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2000,2,1,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,...,0.0,1.0,1.0,1.0,,,0.0,0.0,0.0,0.0
2002,2,1,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2004,2,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [376]:
categories = ["antisocial", "anxiety", "headstrong", "hyperactive", "dependence","peer"]

for i in categories:
    new_df[i] = new_df[[col for col in new_df.columns if col.startswith(i)]].mean(axis=1)

In [377]:
new_df.loc[201]

Unnamed: 0_level_0,momid,birth_order,anxiety_mood,anxiety_complain,headstrong_tense,antisocial_cheat,anxiety_fearful,headstrong_argues,hyperactive_concentration,hyperactive_confused,...,additional_gethim,additional_hangout,additional_secretive,additional_worries,antisocial,anxiety,headstrong,hyperactive,dependence,peer
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1986,2,1,,,,,,,,,...,,,,,,,,,,
1988,2,1,,,,,,,,,...,,,,,,,,,,
1990,2,1,,,,,,,,,...,,,,,,,,,,
1992,2,1,,,,,,,,,...,,,,,,,,,,
1994,2,1,,,,,,,,,...,,,,,,,,,,
1996,2,1,,,,,,,,,...,,,,,,,,,,
1998,2,1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.2,0.2,0.4,,0.0
2000,2,1,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.4,0.0,0.4,,0.333333
2002,2,1,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.333333,0.0,0.0,0.0,,0.0
2004,2,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0


In [379]:
new_df.loc[201,'anxiety_mood']

year
1986    NaN
1988    NaN
1990    NaN
1992    NaN
1994    NaN
1996    NaN
1998    0.0
2000    0.0
2002    0.0
2004    0.0
2006    0.0
2008    NaN
2010    NaN
Name: anxiety_mood, dtype: float64