In [6]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm
from statsmodels.formula.api import ols
import re
%matplotlib inline
plt.style.use('ggplot')
def read_csvs():
    # read csv
    df_i = pd.read_csv('National Longitudinal Study of Adolescent to Adult Health, Wave I Public Use Contextual Database.csv')
    df_ii = pd.read_csv('National Longitudinal Study of Adolescent to Adult Health, Wave II Public Use Contextual Database.csv')
    df_lipids = pd.read_csv('Add Health, Public Use Lipids Data, Wave IV.csv')
    df_glucose = pd.read_csv('Add Health, Public Use Glucose Homeostasis Data, Wave IV.csv')
    df_i.columns = [col.lower() for col in df_i.columns]
    df_ii.columns = [col.lower() for col in df_ii.columns]
    df_lipids.columns = [col.lower() for col in df_lipids.columns]
    df_glucose.columns = [col.lower() for col in df_glucose.columns]
    # merge on ID
    df_m = pd.merge(df_i, df_ii, how='left', left_on=['aid'], right_on=['aid'])
    df_l = pd.merge(df_m, df_lipids, how='left', left_on=['aid'], right_on=['aid'])
    df_final = pd.merge(df_l, df_glucose, how='left', left_on=['aid'], right_on=['aid'])
    df_final = df_final.dropna()
    df_final.head()
    return df_final

def create_predictors(source_df, predictors):
    """
        Each predictor is of the form: ('unemployed', ''bst90p23_y')
    """
    df = pd.DataFrame([])
    for pred in predictors:
        df[pred[0]] = source_df[pred[1]]
    df = pd.concat([df, source_df['aid']], axis=1)
    return df

def add_dependent_var(source_df, pred_df, dependent_tuple):
    pred_df[dependent_tuple[0]] = source_df[dependent_tuple[1]]
#     if dummy_tup[0] in ['income', 'age', 'female_work', 'education']:
#         dummy = dummy.drop(columns=['Unstable estimates'])
    return pred_df

def create_dummies(source_df, pred_df, dummy_tuples):
    for dummy_tup in dummy_tuples:
        print(f"dummy is: {dummy_tup}")
        dummy = pd.get_dummies(source_df[dummy_tup[1]].astype('category'), drop_first=True)
        if 'Unstable estimates' in dummy.columns:
            dummy = dummy.drop(columns=['Unstable estimates'])
        print(dummy)
        print("Got here")
#         if dummy_tup[0] in ['unemployed', 'occupation', 'female_work', 'education']:
#             dummy = dummy.drop(columns=['Unstable estimates'])
        pred_df = pd.concat([pred_df, dummy], axis=1)
    return pred_df

def drop_na_and_clean_up(pred_df):
    pred_df = pred_df.dropna()
    pred_df.columns = [col.replace(' ','_') for col in pred_df.columns]
    pred_df.columns = [col.replace('-','_') for col in pred_df.columns]
    pred_df.columns = [col.replace(',','_') for col in pred_df.columns]
    pred_df['income'] = pred_df['income'].apply(lambda x: np.nan if (x == 'Geocode missing') or (x == 'Unstable estimates') else x)
    pred_df['age'] = pred_df['age'].apply(lambda x: np.nan if (x == 'Geocode missing') or (x == 'Unstable estimates') else x)

    pred_df = pred_df.dropna()
    pred_df['income'] = pred_df['income'].astype(float)
    pred_df['income'] = np.log(pred_df['income'])
    pred_df['age'] = pred_df['age'].astype(float)
    return pred_df

source_df = read_csvs()
# source_df = source_df.applymap(lambda x: np.nan if (x == 'Geocode missing') or (x == 'Unstable estimates') else x)

dummies = [('unemployed', 'bst90p23_y'),
           ('household', 'bst90p13_y'),
           ('education', 'bst90p20_y'),
#            ('race','bst90p02_y'),
#            ('children', 'bst90p10_y'),
           ('female_work', 'bst90p22_y'),
           ('occupation', 'bst90p24_y')]
predictors = dummies + [('age', 'bst90p06_y')]
pred_df = create_predictors(source_df, predictors)
print(f'create_pred {pred_df.columns}')

# Remove all Unstable Estimates rows with applyamp
# pred_df = pred_df.applymap(lambda x: np.nan if (x == 'Geocode missing') or (x == 'Unstable estimates') else x)

pred_df['age'] = pred_df['age'].apply(lambda x: np.nan if (x == 'Geocode missing') | (x == 'Unstable estimates') else x)
print(f'lambda {pred_df.columns}')
pred_df = add_dependent_var(source_df, pred_df, ('income', 'bst90p15_y'))    
print(f'add_dep {pred_df.columns}')
pred_df = create_dummies(source_df, pred_df, dummies)
print(f'create_dummies {pred_df.columns}')
pred_df = drop_na_and_clean_up(pred_df)
print(f'drop_na {pred_df.columns}')
outcome = 'income'
print(f"outcome: {outcome} predictors: {predictors}")
print(f"pred_df: {pred_df.columns}")
predictors = pred_df.drop(['aid','income'], axis=1)
for dummy in dummies:
    predictors = predictors.drop(dummy[0], axis=1)
print(f"predictors: {predictors.columns}")
pred_sum = '+'.join(predictors.columns)
formula = outcome + '~' + pred_sum
print(f"formula: {formula}")
model = ols(formula=formula, data=pred_df).fit()
model.summary()
print(f"len pred_df: {len(pred_df)}")

create_pred Index(['unemployed', 'household', 'education', 'female_work', 'occupation',
       'age', 'aid'],
      dtype='object')
lambda Index(['unemployed', 'household', 'education', 'female_work', 'occupation',
       'age', 'aid'],
      dtype='object')
add_dep Index(['unemployed', 'household', 'education', 'female_work', 'occupation',
       'age', 'aid', 'income'],
      dtype='object')
dummy is: ('unemployed', 'bst90p23_y')
bst90p23_y  High  Low  Medium
1              1    0       0
3              0    0       1
7              0    1       0
9              0    1       0
11             1    0       0
...          ...  ...     ...
6497           0    0       1
6498           1    0       0
6499           0    1       0
6501           1    0       0
6502           1    0       0

[3583 rows x 3 columns]
Got here
dummy is: ('household', 'bst90p13_y')
bst90p13_y  Married couple family household  Non-family household  \
1                                         1                    

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
