In [2]:
import pandas as pd
from pathlib import Path

# Opt in to pandas' copy-on-write behaviour.
pd.options.mode.copy_on_write = True
# Opt in to the future string-dtype inference for text data.
pd.options.future.infer_string = True
# Use plotly as the backend for the pandas .plot accessor.
pd.options.plotting.backend = "plotly"

In [10]:
# Resolve the current working directory; the input files are read from its
# "bld" subfolder.
this_file_dir = Path(".").resolve()
bld = this_file_dir / "bld"

# Raw Stata extract and the variable-info table that the cleaning functions
# take as their `raw` and `info` arguments.
raw = pd.read_stata(bld / "BEHAVIOR_PROBLEMS_INDEX.dta")
info = pd.read_csv(bld / "bpi_variable_info.csv")



In [29]:
def clean_and_reshape_nlsy_data(raw, info, years=None):
    """Clean every requested survey wave and stack the results row-wise.

    Args:
        raw: Wide raw data frame; passed unchanged to ``clean_year_data``.
        info: Variable-info table; passed unchanged to ``clean_year_data``.
        years: Optional iterable of survey years to process. When omitted,
            the biennial waves 1986, 1988, ..., 2010 are used.

    Returns:
        The frames returned by ``clean_year_data`` for each year,
        concatenated in the order of ``years``.
    """
    if years is None:
        # Default waves: every second year from 1986 through 2010.
        years = range(1986, 2011, 2)
    yearly_frames = [clean_year_data(raw, year, info) for year in years]
    return pd.concat(yearly_frames)

def clean_year_data(raw, year, info):
    """Build the cleaned frame for a single survey year.

    The first three columns of ``raw`` plus the item columns that
    ``_filter_by_year`` reports for ``year`` are copied, renamed through
    ``_clean_bpi_variables``, typed, indexed by ``(childid, year)`` and
    extended with the subscale scores from ``_add_subscale_scores``.

    Returns:
        The combined item/subscale frame, sorted by its index.
    """
    id_columns = list(raw.columns[:3])
    item_columns = _filter_by_year(raw, year, info)

    wave = raw[id_columns + item_columns].copy()
    wave["year"] = year
    wave = _clean_bpi_variables(wave, info)

    for id_name in ("childid", "momid"):
        wave[id_name] = _change_data_types(wave[id_name])
    # NOTE(review): .cat.codes stores each row's positional category code, not
    # the category label itself -- confirm the codes line up with the intended
    # birth-order values.
    wave["birth_order"] = wave["birth_order"].cat.codes

    wave = wave.set_index(["childid", "year"])
    # The first two remaining columns are skipped (momid and birth_order in
    # the recorded dtypes output); everything after them is cleaned as an item.
    for item in wave.columns[2:]:
        wave[item] = _clean_bpi_cat(wave[item])

    with_scores = pd.concat([wave, _add_subscale_scores(wave)], axis=1)
    return with_scores.sort_index()

def _change_data_types(sr):
    return sr.astype(pd.UInt64Dtype())

def _filter_by_year(raw_df,year,info):
    info_by_year = info.loc[info.survey_year == str(year)]
    nlsy_by_year = info_by_year.loc[:,"nlsy_name"].to_list()
    return nlsy_by_year

def _clean_bpi_variables(df, info_df):
     clean_variables = dict(zip(info_df.nlsy_name, info_df.readable_name)) # creating a dictionary to rename columns in raw data
     return df.rename(columns=clean_variables)

def _clean_bpi_cat(sr):
    sr = sr.replace([-7.0, -3.0, -2.0, -1.0], pd.NA)
    sr = sr.replace({'Never Attended School': pd.NA, 'Multiple selection': pd.NA })
    categories = ["not true", "sometimes true", "often true"]
    sr = sr.astype(pd.StringDtype()).str.lower().astype(pd.CategoricalDtype(categories=categories, ordered=True))
    return sr

def _add_subscale_scores(df):
    mapping_dict = {
    'not true': 0,
    'sometimes true': 1,
    'often true': 1
    }
    subscale = df.copy()
    for i in subscale.columns[2:]:
        subscale[i] = subscale[i].map(mapping_dict)
    
    categories = ["antisocial", "anxiety", "headstrong", "hyperactive", "dependence","peer"]
    for i in categories:
        subscale[i] = subscale[[col for col in subscale.columns if col.startswith(i)]].mean(axis=1)
    subscale = subscale[categories]
    return subscale

In [30]:
# Check the column dtypes of the raw Stata extract.
raw.dtypes

C0000100     float32
C0000200     float32
C0005800    category
C0564000    category
C0564100    category
              ...   
Y3249201    category
Y3249300    category
Y3249301    category
Y3249400    category
Y3249401    category
Length: 1582, dtype: object

In [31]:
# Clean a single wave (1998) to inspect the result.
df = clean_year_data(raw, 1998, info)

In [32]:
# Check the column dtypes of the cleaned frame.
df.dtypes

momid                          UInt64
birth_order                      int8
anxiety_mood                 category
anxiety_complain             category
headstrong_tense             category
antisocial_cheat             category
anxiety_fearful              category
headstrong_argues            category
hyperactive_concentration    category
hyperactive_confused         category
antisocial_bully             category
headstrong_disob_home        category
antisocial_notsorry          category
peer_getalong                category
hyperactive_impulsive        category
anxiety_worthless            category
peer_notliked                category
hyperactive_thoughts         category
hyperactive_restless         category
headstrong_stubborn          category
headstrong_temper            category
anxiety_sad                  category
peer_withdrawn               category
antisocial_break             category
dependent_clings             category
dependent_cries              category
dependent_at

In [21]:
# Display the raw frame (it has 1582 columns per the dtypes output above).
raw

Unnamed: 0,C0000100,C0000200,C0005800,C0564000,C0564100,C0564200,C0564300,C0564400,C0564500,C0564600,...,Y3249000,Y3249001,Y3249100,Y3249101,Y3249200,Y3249201,Y3249300,Y3249301,Y3249400,Y3249401
0,201.0,2.0,1,-7.0,-7.0,-7.0,-7.0,-7.0,-7.0,-7.0,...,-7.0,-7.0,-7.0,-7.0,-7.0,-7.0,-7.0,-7.0,-7.0,-7.0
1,202.0,2.0,2,-7.0,-7.0,-7.0,-7.0,-7.0,-7.0,-7.0,...,-7.0,-7.0,-7.0,-7.0,-7.0,-7.0,-7.0,-7.0,-7.0,-7.0
2,301.0,3.0,1,NOT TRUE,NOT TRUE,NOT TRUE,NOT TRUE,SOMETIMES TRUE,NOT TRUE,NOT TRUE,...,-7.0,-7.0,-7.0,-7.0,-7.0,-7.0,-7.0,-7.0,-7.0,-7.0
3,302.0,3.0,2,-7.0,-7.0,-7.0,-7.0,-7.0,-7.0,-7.0,...,-7.0,-7.0,-7.0,-7.0,-7.0,-7.0,-7.0,-7.0,-7.0,-7.0
4,303.0,3.0,3,-7.0,-7.0,-7.0,-7.0,-7.0,-7.0,-7.0,...,Not True,Not True,Not True,Not True,-7.0,-7.0,-7.0,-7.0,-7.0,-7.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11516,1267201.0,12672.0,1,-7.0,-7.0,-7.0,-7.0,-7.0,-7.0,-7.0,...,-7.0,-7.0,-7.0,-7.0,-7.0,-7.0,-7.0,-7.0,-7.0,-7.0
11517,1267202.0,12672.0,2,-7.0,-7.0,-7.0,-7.0,-7.0,-7.0,-7.0,...,-7.0,-7.0,-7.0,-7.0,-7.0,-7.0,-7.0,-7.0,-7.0,-7.0
11518,1267301.0,12673.0,1,-7.0,-7.0,-7.0,-7.0,-7.0,-7.0,-7.0,...,-7.0,-7.0,-7.0,-7.0,-7.0,-7.0,-7.0,-7.0,-7.0,-7.0
11519,1267302.0,12673.0,2,-7.0,-7.0,-7.0,-7.0,-7.0,-7.0,-7.0,...,-7.0,-7.0,-7.0,-7.0,-7.0,-7.0,-7.0,-7.0,-7.0,-7.0


In [20]:
# Run the cleaning over all waves. The recorded run of this cell ended in a
# TypeError raised by an unsigned-integer cast.
df = clean_and_reshape_nlsy_data(raw,info)

TypeError: cannot safely cast non-equivalent object to uint32

In [18]:
# Check the column dtypes of the cleaned frame.
df.dtypes

momid                          UInt32
birth_order                  category
anxiety_mood                 category
anxiety_complain             category
headstrong_tense             category
antisocial_cheat             category
anxiety_fearful              category
headstrong_argues            category
hyperactive_concentration    category
hyperactive_confused         category
antisocial_bully             category
headstrong_disob_home        category
antisocial_notsorry          category
peer_getalong                category
hyperactive_impulsive        category
anxiety_worthless            category
peer_notliked                category
hyperactive_thoughts         category
hyperactive_restless         category
headstrong_stubborn          category
headstrong_temper            category
anxiety_sad                  category
peer_withdrawn               category
antisocial_break             category
dependent_clings             category
dependent_cries              category
dependent_at

In [64]:
# List the nlsy_name entries whose survey_year is '1986'.
info[info["survey_year"]=='1986']["nlsy_name"]

3     C0564000
4     C0564100
5     C0564200
6     C0564300
7     C0564400
8     C0564500
9     C0564600
10    C0564700
11    C0564800
12    C0564900
13    C0565000
14    C0565100
15    C0565200
16    C0565300
17    C0565400
18    C0565500
19    C0565600
20    C0565700
21    C0565800
22    C0565900
23    C0566000
24    C0566100
25    C0566200
26    C0566300
27    C0566400
28    C0566500
29    C0566600
30    C0566700
Name: nlsy_name, dtype: string

In [None]:
# Scratch version of the per-year column selection.
# NOTE(review): this compares survey_year with the literal string 'year', not
# with a year value -- presumably a placeholder; confirm before reusing.
df = pd.DataFrame(index=raw.index)
wanted_info = info.loc[info.survey_year == 'year']
wanted_list = wanted_info.loc[:,"nlsy_name"].to_list()

In [105]:
# Select the info rows for survey year '2010' and list their nlsy_name values.
wanted_info = info.loc[info.survey_year == '2010']
wanted_info.loc[:,"nlsy_name"].to_list()

['C5189700',
 'C5189800',
 'C5189900',
 'C5190000',
 'C5190100',
 'C5190200',
 'C5190300',
 'C5190400',
 'C5190500',
 'C5190600',
 'C5190700',
 'C5190800',
 'C5190900',
 'C5191000',
 'C5191100',
 'C5191200',
 'C5191300',
 'C5191400',
 'C5191500',
 'C5191600',
 'C5191700',
 'C5191800',
 'C5191900',
 'C5192000',
 'C5192100',
 'C5192200',
 'C5192300',
 'C5192400',
 'C5192500',
 'C5192600',
 'C5192800',
 'C5192900']

In [106]:
# Display the first ten of the 2010 columns listed above from the raw frame.
raw[['C5189700',
 'C5189800',
 'C5189900',
 'C5190000',
 'C5190100',
 'C5190200',
 'C5190300',
 'C5190400',
 'C5190500',
 'C5190600']]

Unnamed: 0,C5189700,C5189800,C5189900,C5190000,C5190100,C5190200,C5190300,C5190400,C5190500,C5190600
0,-7.0,-7.0,-7.0,-7.0,-7.0,-7.0,-7.0,-7.0,-7.0,-7.0
1,-7.0,-7.0,-7.0,-7.0,-7.0,-7.0,-7.0,-7.0,-7.0,-7.0
2,-7.0,-7.0,-7.0,-7.0,-7.0,-7.0,-7.0,-7.0,-7.0,-7.0
3,-7.0,-7.0,-7.0,-7.0,-7.0,-7.0,-7.0,-7.0,-7.0,-7.0
4,-7.0,-7.0,-7.0,-7.0,-7.0,-7.0,-7.0,-7.0,-7.0,-7.0
...,...,...,...,...,...,...,...,...,...,...
11516,-7.0,-7.0,-7.0,-7.0,-7.0,-7.0,-7.0,-7.0,-7.0,-7.0
11517,-7.0,-7.0,-7.0,-7.0,-7.0,-7.0,-7.0,-7.0,-7.0,-7.0
11518,-7.0,-7.0,-7.0,-7.0,-7.0,-7.0,-7.0,-7.0,-7.0,-7.0
11519,-7.0,-7.0,-7.0,-7.0,-7.0,-7.0,-7.0,-7.0,-7.0,-7.0


In [95]:
# Show the nlsy_name column of the currently selected info rows.
wanted_info.loc[:,"nlsy_name"]

383    C5189700
384    C5189800
385    C5189900
386    C5190000
387    C5190100
388    C5190200
389    C5190300
390    C5190400
391    C5190500
392    C5190600
393    C5190700
394    C5190800
395    C5190900
396    C5191000
397    C5191100
398    C5191200
399    C5191300
400    C5191400
401    C5191500
402    C5191600
403    C5191700
404    C5191800
405    C5191900
406    C5192000
407    C5192100
408    C5192200
409    C5192300
410    C5192400
411    C5192500
412    C5192600
413    C5192800
414    C5192900
Name: nlsy_name, dtype: string

In [160]:
def _clean_bpi_variables(df, info_df):
     clean_variables = dict(zip(info_df.nlsy_name, info_df.readable_name)) # creating a dictionary to rename columns in raw data
     return df.rename(columns=clean_variables)

In [137]:
def clean_year(raw,year,info):
    """Select, rename and clean the item columns of one survey year.

    Args:
        raw: Wide raw frame; its first three columns are always kept.
        year: Survey year, as an int or a string.
        info: Variable-info table with ``survey_year``, ``nlsy_name`` and
            ``readable_name`` columns.

    Returns:
        Frame indexed by ``(childid, year)``. Every column after the first
        two has been passed through ``_clean_bpi_cat``.
    """
    id_columns = list(raw.columns[:3])
    # survey_year is compared as text elsewhere in this notebook ('1986',
    # '2010', str(year)); converting here lets integer years match as well
    # instead of silently selecting no item columns.
    wave_info = info.loc[info.survey_year == str(year)]
    item_columns = wave_info.loc[:, "nlsy_name"].to_list()

    df = raw[id_columns + item_columns].copy()
    df["year"] = year
    df = _clean_bpi_variables(df, info)
    df = df.set_index(["childid", "year"])
    for col in df.columns[2:]:
        df[col] = _clean_bpi_cat(df[col])
    return df

In [161]:
def clean_year_data(raw, year, info):
    """Build the cleaned frame for a single survey year.

    Copies the first three columns of ``raw`` and the item columns that
    ``_filter_by_year`` reports for ``year``, renames them, casts the two id
    columns, indexes by ``(childid, year)``, cleans the item columns and
    appends the subscale scores.

    NOTE(review): this cell redefines ``clean_year_data``. Unlike the
    definition near the top of the notebook it neither recodes
    ``birth_order`` nor sorts the index; whichever cell runs last wins.
    """
    id_columns = list(raw.columns[:3])
    item_columns = _filter_by_year(raw, year, info)

    wave = raw[id_columns + item_columns].copy()
    wave["year"] = year
    wave = _clean_bpi_variables(wave, info)

    for id_name in ("childid", "momid"):
        wave[id_name] = _change_data_types(wave[id_name])

    wave = wave.set_index(["childid", "year"])
    # Everything after the first two remaining columns is cleaned as an item.
    for item in wave.columns[2:]:
        wave[item] = _clean_bpi_cat(wave[item])

    return pd.concat([wave, _add_subscale_scores(wave)], axis=1)

In [162]:
def _change_data_types(sr):
    return sr.astype(pd.UInt32Dtype())

def _filter_by_year(raw_df,year,info):
    info_by_year = info.loc[info.survey_year == str(year)]
    nlsy_by_year = info_by_year.loc[:,"nlsy_name"].to_list()
    return nlsy_by_year

In [159]:
# List the raw column names recorded for the 2004 wave.
_filter_by_year(raw,2004,info)

['C3009700',
 'C3009800',
 'C3009900',
 'C3010000',
 'C3010100',
 'C3010200',
 'C3010300',
 'C3010400',
 'C3010500',
 'C3010600',
 'C3010700',
 'C3010800',
 'C3010900',
 'C3011000',
 'C3011100',
 'C3011200',
 'C3011300',
 'C3011400',
 'C3011500',
 'C3011600',
 'C3011700',
 'C3011800',
 'C3011900',
 'C3012000',
 'C3012100',
 'C3012200',
 'C3012300',
 'C3012400',
 'C3012500',
 'C3012600',
 'C3012800',
 'C3012900']

In [163]:
def _clean_bpi_cat(sr):
    sr = sr.replace([-7.0, -3.0, -2.0, -1.0], pd.NA)
    sr = sr.replace({'Never Attended School': pd.NA, 'Multiple selection': pd.NA })
    categories = ["not true", "sometimes true", "often true"]
    sr = sr.astype(pd.StringDtype()).str.lower().astype(pd.CategoricalDtype(categories=categories, ordered=True))
    return sr

In [164]:
def _add_subscale_scores(df):
    mapping_dict = {
    'not true': 0,
    'sometimes true': 1,
    'often true': 1
    }
    subscale = df.copy()
    for i in subscale.columns[2:]:
        subscale[i] = subscale[i].map(mapping_dict)
    
    categories = ["antisocial", "anxiety", "headstrong", "hyperactive", "dependence","peer"]
    for i in categories:
        subscale[i] = subscale[[col for col in subscale.columns if col.startswith(i)]].mean(axis=1)
    subscale = subscale[categories]
    return subscale

In [166]:
# Clean the 2004 wave and display the result.
clean_year_data(raw,2004,info)

Unnamed: 0_level_0,Unnamed: 1_level_0,momid,birth_order,anxiety_mood,anxiety_complain,headstrong_tense,antisocial_cheat,anxiety_fearful,headstrong_argues,hyperactive_concentration,hyperactive_confused,...,additional_secretive,additional_worries,antisocial_disob_school,antisocial_teachers,antisocial,anxiety,headstrong,hyperactive,dependence,peer
childid,year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
201,2004,2,1,not true,not true,not true,not true,not true,not true,not true,not true,...,not true,not true,not true,not true,0.000000,0.0,0.0,0.0,,0.0
202,2004,2,2,not true,not true,not true,not true,not true,not true,not true,not true,...,not true,not true,not true,not true,0.166667,0.0,0.0,0.0,,0.0
301,2004,3,1,,,,,,,,,...,,,,,,,,,,
302,2004,3,2,,,,,,,,,...,,,,,,,,,,
303,2004,3,3,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1267201,2004,12672,1,,,,,,,,,...,,,,,,,,,,
1267202,2004,12672,2,,,,,,,,,...,,,,,,,,,,
1267301,2004,12673,1,,,,,,,,,...,,,,,,,,,,
1267302,2004,12673,2,,,,,,,,,...,,,,,,,,,,


In [156]:
# Clean the 2004 wave and display the result.
clean_year_data(raw,2004,info)

Unnamed: 0_level_0,Unnamed: 1_level_0,momid,birth_order,antisocial,anxiety,headstrong,hyperactive,dependence,peer
childid,year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
201,2004,2,1,,,,,,
202,2004,2,2,,,,,,
301,2004,3,1,,,,,,
302,2004,3,2,,,,,,
303,2004,3,3,,,,,,
...,...,...,...,...,...,...,...,...,...
1267201,2004,12672,1,,,,,,
1267202,2004,12672,2,,,,,,
1267301,2004,12673,1,,,,,,
1267302,2004,12673,2,,,,,,


In [1]:
# Display df. The recorded output is a NameError: df had not been assigned
# in the kernel session that ran this cell.
df

NameError: name 'df' is not defined