In [49]:
import pandas as pd
from pathlib import Path

pd.options.mode.copy_on_write = True
pd.options.future.infer_string = True
pd.options.plotting.backend = "plotly"

In [50]:
this_file_dir = Path(".").resolve()
bld = this_file_dir / "bld"

In [51]:
raw = pd.read_stata(bld / "BEHAVIOR_PROBLEMS_INDEX.dta")
info = pd.read_csv(bld / "bpi_variable_info.csv")

In [52]:
raw.head()

Unnamed: 0,C0000100,C0000200,C0005800,C0564000,C0564100,C0564200,C0564300,C0564400,C0564500,C0564600,...,Y3249000,Y3249001,Y3249100,Y3249101,Y3249200,Y3249201,Y3249300,Y3249301,Y3249400,Y3249401
0,201.0,2.0,1,-7.0,-7.0,-7.0,-7.0,-7.0,-7.0,-7.0,...,-7.0,-7.0,-7.0,-7.0,-7.0,-7.0,-7.0,-7.0,-7.0,-7.0
1,202.0,2.0,2,-7.0,-7.0,-7.0,-7.0,-7.0,-7.0,-7.0,...,-7.0,-7.0,-7.0,-7.0,-7.0,-7.0,-7.0,-7.0,-7.0,-7.0
2,301.0,3.0,1,NOT TRUE,NOT TRUE,NOT TRUE,NOT TRUE,SOMETIMES TRUE,NOT TRUE,NOT TRUE,...,-7.0,-7.0,-7.0,-7.0,-7.0,-7.0,-7.0,-7.0,-7.0,-7.0
3,302.0,3.0,2,-7.0,-7.0,-7.0,-7.0,-7.0,-7.0,-7.0,...,-7.0,-7.0,-7.0,-7.0,-7.0,-7.0,-7.0,-7.0,-7.0,-7.0
4,303.0,3.0,3,-7.0,-7.0,-7.0,-7.0,-7.0,-7.0,-7.0,...,Not True,Not True,Not True,Not True,-7.0,-7.0,-7.0,-7.0,-7.0,-7.0


In [53]:
info.head()

Unnamed: 0,nlsy_name,readable_name,survey_year,label
0,C0000100,childid,invariant,id code of child
1,C0000200,momid,invariant,id code of mother of child
2,C0005800,birth_order,invariant,birth order of child
3,C0564000,anxiety_mood,1986,ch has sud chgs in mood/feelng
4,C0564100,anxiety_complain,1986,ch cmplns no one loves him/her


In [64]:
def _clean_bpi_variables(raw_df, info_df):
     raw_df = raw_df[info_df["nlsy_name"]]  # choosing variables that are available in info 
     clean_variables = dict(zip(info_df.nlsy_name, info_df.readable_name +  ' ' + info_df.survey_year)) # creating a dictionary to rename columns in raw data
     return raw_df.rename(columns=clean_variables)

In [65]:
def _clean_bpi_cat(sr):
    sr = sr.replace([-7.0, -3.0, -2.0, -1.0], pd.NA)
    sr = sr.replace({'Never Attended School': pd.NA, 'Multiple selection': pd.NA })
    categories = ["not true", "sometimes true", "often true"]
    sr = sr.astype(pd.StringDtype()).str.lower().astype(pd.CategoricalDtype(categories=categories, ordered=True))
    return sr

In [66]:
def clean_year_data(raw, info):
    df = pd.DataFrame(index=raw.index)
    df = _clean_bpi_variables(raw,info)
    for i in df.columns[:2]:
        df[i]= df[i].astype(pd.Int32Dtype()).astype(pd.CategoricalDtype())
    for i in df.columns[3:]:
        df[i] = _clean_bpi_cat(df[i])
    return df


In [67]:
clean_year_data(raw, info)

Unnamed: 0,childid invariant,momid invariant,birth_order invariant,anxiety_mood 1986,anxiety_complain 1986,headstrong_tense 1986,antisocial_cheat 1986,anxiety_fearful 1986,headstrong_argues 1986,hyperactive_concentration 1986,...,dependent_clings 2010,dependent_cries 2010,dependent_attention 2010,dependent_dependent 2010,additional_gethim 2010,additional_hangout 2010,additional_secretive 2010,additional_worries 2010,antisocial_disob_school 2010,antisocial_teachers 2010
0,201,2,1,,,,,,,,...,,,,,,,,,,
1,202,2,2,,,,,,,,...,,,,,,,,,,
2,301,3,1,not true,not true,not true,not true,sometimes true,not true,not true,...,,,,,,,,,,
3,302,3,2,,,,,,,,...,,,,,,,,,,
4,303,3,3,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11516,1267201,12672,1,,,,,,,,...,,,,,,,,,,
11517,1267202,12672,2,,,,,,,,...,,,,,,,,,,
11518,1267301,12673,1,,,,,,,,...,,,,,,,,,,
11519,1267302,12673,2,,,,,,,,...,,,,,,,,,,


In [68]:
df = clean_year_data(raw, info)
df

Unnamed: 0,childid invariant,momid invariant,birth_order invariant,anxiety_mood 1986,anxiety_complain 1986,headstrong_tense 1986,antisocial_cheat 1986,anxiety_fearful 1986,headstrong_argues 1986,hyperactive_concentration 1986,...,dependent_clings 2010,dependent_cries 2010,dependent_attention 2010,dependent_dependent 2010,additional_gethim 2010,additional_hangout 2010,additional_secretive 2010,additional_worries 2010,antisocial_disob_school 2010,antisocial_teachers 2010
0,201,2,1,,,,,,,,...,,,,,,,,,,
1,202,2,2,,,,,,,,...,,,,,,,,,,
2,301,3,1,not true,not true,not true,not true,sometimes true,not true,not true,...,,,,,,,,,,
3,302,3,2,,,,,,,,...,,,,,,,,,,
4,303,3,3,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11516,1267201,12672,1,,,,,,,,...,,,,,,,,,,
11517,1267202,12672,2,,,,,,,,...,,,,,,,,,,
11518,1267301,12673,1,,,,,,,,...,,,,,,,,,,
11519,1267302,12673,2,,,,,,,,...,,,,,,,,,,


In [70]:
info.readable_name.unique()

<ArrowStringArrayNumpySemantics>
[                  'childid',                     'momid',
               'birth_order',              'anxiety_mood',
          'anxiety_complain',          'headstrong_tense',
          'antisocial_cheat',           'anxiety_fearful',
         'headstrong_argues', 'hyperactive_concentration',
      'hyperactive_confused',          'antisocial_bully',
     'headstrong_disob_home',       'antisocial_notsorry',
             'peer_getalong',     'hyperactive_impulsive',
         'anxiety_worthless',             'peer_notliked',
      'hyperactive_thoughts',      'hyperactive_restless',
       'headstrong_stubborn',         'headstrong_temper',
               'anxiety_sad',            'peer_withdrawn',
          'antisocial_break',          'dependent_clings',
           'dependent_cries',       'dependent_attention',
       'dependent_dependent',   'antisocial_disob_school',
       'antisocial_teachers',         'additional_gethim',
        'additional_han

In [80]:
new_df = pd.wide_to_long(df, stubnames= info.readable_name.unique(),i= 'childid invariant', j='year', sep= " ")
new_df = new_df.drop(columns=['childid','momid'])
new_df

Unnamed: 0_level_0,Unnamed: 1_level_0,momid invariant,birth_order invariant,birth_order,anxiety_mood,anxiety_complain,headstrong_tense,antisocial_cheat,anxiety_fearful,headstrong_argues,hyperactive_concentration,...,dependent_clings,dependent_cries,dependent_attention,dependent_dependent,antisocial_disob_school,antisocial_teachers,additional_gethim,additional_hangout,additional_secretive,additional_worries
childid invariant,year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
201,1986,2,1,,,,,,,,,...,,,,,,,,,,
202,1986,2,2,,,,,,,,,...,,,,,,,,,,
301,1986,3,1,,not true,not true,not true,not true,sometimes true,not true,not true,...,sometimes true,not true,sometimes true,sometimes true,not true,not true,,,,
302,1986,3,2,,,,,,,,,...,,,,,,,,,,
303,1986,3,3,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1267201,2010,12672,1,,,,,,,,,...,,,,,,,,,,
1267202,2010,12672,2,,,,,,,,,...,,,,,,,,,,
1267301,2010,12673,1,,,,,,,,,...,,,,,,,,,,
1267302,2010,12673,2,,,,,,,,,...,,,,,,,,,,


In [86]:
new_df = new_df.sort_index()
new_df

Unnamed: 0_level_0,Unnamed: 1_level_0,momid invariant,birth_order invariant,birth_order,anxiety_mood,anxiety_complain,headstrong_tense,antisocial_cheat,anxiety_fearful,headstrong_argues,hyperactive_concentration,...,dependent_clings,dependent_cries,dependent_attention,dependent_dependent,antisocial_disob_school,antisocial_teachers,additional_gethim,additional_hangout,additional_secretive,additional_worries
childid invariant,year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
201,1986,2,1,,,,,,,,,...,,,,,,,,,,
201,1988,2,1,,,,,,,,,...,,,,,,,,,,
201,1990,2,1,,,,,,,,,...,,,,,,,,,,
201,1992,2,1,,,,,,,,,...,,,,,,,,,,
201,1994,2,1,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1267501,2002,12675,1,,,,,,,,,...,,,,,,,,,,
1267501,2004,12675,1,,,,,,,,,...,,,,,,,,,,
1267501,2006,12675,1,,,,,,,,,...,,,,,,,,,,
1267501,2008,12675,1,,,,,,,,,...,,,,,,,,,,


In [87]:
new_df.loc[201]

Unnamed: 0_level_0,momid invariant,birth_order invariant,birth_order,anxiety_mood,anxiety_complain,headstrong_tense,antisocial_cheat,anxiety_fearful,headstrong_argues,hyperactive_concentration,...,dependent_clings,dependent_cries,dependent_attention,dependent_dependent,antisocial_disob_school,antisocial_teachers,additional_gethim,additional_hangout,additional_secretive,additional_worries
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1986,2,1,,,,,,,,,...,,,,,,,,,,
1988,2,1,,,,,,,,,...,,,,,,,,,,
1990,2,1,,,,,,,,,...,,,,,,,,,,
1992,2,1,,,,,,,,,...,,,,,,,,,,
1994,2,1,,,,,,,,,...,,,,,,,,,,
1996,2,1,,,,,,,,,...,,,,,,,,,,
1998,2,1,,not true,not true,not true,not true,sometimes true,not true,not true,...,not true,not true,sometimes true,not true,not true,not true,not true,not true,not true,sometimes true
2000,2,1,,not true,sometimes true,not true,not true,sometimes true,not true,sometimes true,...,not true,sometimes true,sometimes true,sometimes true,,,not true,not true,not true,not true
2002,2,1,,not true,not true,not true,sometimes true,not true,not true,not true,...,not true,not true,not true,not true,not true,not true,not true,not true,not true,not true
2004,2,1,,not true,not true,not true,not true,not true,not true,not true,...,not true,not true,not true,not true,not true,not true,not true,not true,not true,not true
