In [1]:
# ISA 530: Final Project
# Quantifying the Predictive Power of Biological vs. Environmental Features in Health Outcomes 
# Data Preprocessing
# Carly Carroll

### Import Libraries

In [2]:
import os
import pandas as pd
import pyreadstat
from typing import List
from IPython.display import display
import numpy as np

### Configure File Lists

In [3]:
# display all columns 
pd.set_option('display.max_columns', None)

# define base path
base_path = "/Users/carlycarroll/Desktop/MIDUS/Data"


# MIDUS 1 data
M1_files = [
    f"{base_path}/ICPSR_02760_M1/DS0001/02760-0001-Data.sav",
    f"{base_path}/ICPSR_02760_M1/DS0002/02760-0002-Data.sav",
]

# MIDUS 2 data
M2_files = [
    f"{base_path}/ICPSR_04652_M2/DS0001/04652-0001-Data.sav",
    f"{base_path}/ICPSR_04652_M2/DS0004/04652-0004-Data.sav",
]

# MIDUS 3 data 
M3_files = [
    f"{base_path}/ICPSR_36346_M3/DS0001/36346-0001-Data.sav",
]

# wave mapping 
wave_mapping = {
  "M1": {
    "M2ID": "M2ID",
    "M2FAMNUM": "M2FAMNUM",
    "SAMPLMAJ": "SAMPLMAJ",
    "ZYGCAT": "ZYGCAT",
    "BMI": "A1SBMI",
    "DEPRESS": "A1PA57",
    "SRH": "A1SA1",
    "HYPERTEN": "A1PA34",
    "DIABETES": "A1SA9X",
    "ASTHMA": "A1SA9A",
    "EDUC": "A1PB1",
    "HHINCOME": "A1SJ11",
    "SMOKING": "A1PA40",
    "ALCOHOL": "A1PA52",
    "EXERCISE": "A1PA20",
    "MARITAL": "A1SP6",
    "AGE": "QL2",
    "SEX": "WHO"
  },
  "M2": {
    "M2ID": "M2ID",
    "M2FAMNUM": "M2FAMNUM",
    "SAMPLMAJ": "SAMPLMAJ",
    "BMI": "B1SBMI",
    "DEPRESS": "B1PA60",
    "SRH": "B1SA1",
    "HYPERTEN": "B1PA24",
    "DIABETES": "B1SA11X",
    "ASTHMA": "B1SA11A",
    "EDUC": "B1PB1",
    "HHINCOME": "B1SG12",
    "SMOKING": "B1PA39",
    "ALCOHOL": "B1PA49",
    "EXERCISE": "B1SA34E",
    "MARITAL": "B1SL6",
    "AGE": "B1PA3",
    "SEX": "B1PD3"
  },
  "M3": {
    "M2ID": "M2ID",
    "M2FAMNUM": "M2FAMNUM",
    "SAMPLMAJ": "SAMPLMAJ",
    "BMI": "C1SBMI",
    "DEPRESS": "C1PA60",
    "SRH": "C1SA1",
    "HYPERTEN": "C1PA24",
    "DIABETES": "C1SA11X",
    "ASTHMA": "C1SA11A",
    "EDUC": "C1PB1",
    "HHINCOME": "C1PB16",
    "SMOKING": "C1PA39",
    "ALCOHOL": "C1PA49",
    "EXERCISE": "C1SA30E",
    "MARITAL": "C1SL7",
    "AGE": "C1PA3",
    "SEX": "C1PD3"
  }
}

### Define Column Names and Features

In [4]:
# define primary id columns across all three waves 
id_cols = ["M2ID", "M2FAMNUM", "SAMPLMAJ", "ZYGCAT"]

# define features for biological and environmental data 
features = [
    # outcomes
    "SRH", # self rated health 
    
    # biological
    "HYPERTEN",
    "DIABETES",
    "ASTHMA",
    "DEPRESS",
    "BMI",  
    
    # environmental / demographic
    "EDUC",
    "HHINCOME",
    "SMOKING",
    "ALCOHOL",
    "EXERCISE",
    "MARITAL",
    
    # other
    "AGE",
    "SEX",
]

# define columns to keep 
cols = id_cols + features

### Helper Functions to Create New Combined Dataset 

In [5]:
# define numeric diffs columns 
diff_numeric = [
    "BMI", "DEPRESS", "HYPERTEN", "DIABETES", "ASTHMA",
    "EDUC", "HHINCOME", "SMOKING", "ALCOHOL", "EXERCISE", "AGE"
]
# define categorical diffs columns
diff_categorical = ["MARITAL", "SEX"]

# read sav file, keep/rename only the needed columns for this wave 
def read_trim_and_rename(path, wave_map):
    
    # build the exact list we want to read 
    actual = {k: v for k, v in wave_map.items() if v}
    want = [actual[k] for k in cols if k in actual]

    if not want:
        return pd.DataFrame(columns = cols)

    # read only needed columns
    df, _ = pyreadstat.read_sav(path, usecols = want, apply_value_formats = False)

    # rename to defined names
    rename = {actual[k]: k for k in actual if actual[k] in df.columns}
    return df.rename(columns = rename)

# combine files within a wave, collapse duplicates, twins only
def build_wave(files, wave_tag, wave_map):
    parts = [read_trim_and_rename(p, wave_map) for p in files]
    parts = [p for p in parts if not p.empty]
    if not parts:
        return pd.DataFrame(columns = cols)

    big = pd.concat(parts, ignore_index = True)

    # collapse to one row per person
    big = (
        big.sort_values("M2ID")
           .groupby("M2ID", as_index = False)
           .agg(lambda s: s.dropna().iloc[0] if s.notna().any() else pd.NA)
    )

    big["WAVE"] = wave_tag
    
    # twins filter
    smj = pd.to_numeric(big["SAMPLMAJ"], errors = "coerce")
    big = big[smj == 3]

    # optional: drop rows with missing srh if srh exists in this wave
    srh_candidates = ["SELFHEALTH", "SELF_RATED_HEALTH", "SRH", "HEALTH"]
    srh_col = next((c for c in srh_candidates if c in big.columns), None)
    if srh_col is not None:
        big[srh_col] = pd.to_numeric(big[srh_col], errors = "coerce")
        big = big.dropna(subset = [srh_col])

    # keep only families with exactly two unique persons in this wave
    big = (
        big.groupby("M2FAMNUM", group_keys = False)
           .filter(lambda g: g["M2ID"].nunique() == 2)
    )

    return big

# append M1, M2, M3 to all columns except M2ID
def suffix_wave(df, wave):
    return df.rename(columns={c: f"{c}_{wave}" for c in df.columns if c != "M2ID"})

# enforce twin pairs in wide (exactly two per family per wave, and no solo presence)
def enforce_pairs_wide(wide_df, id_col = "M2ID", waves = ("M1","M2","M3")):
    w = wide_df.copy()

    # for each wave, if a family appears, it must appear exactly twice
    keep_masks = []
    for wtag in waves:
        suf = f"_{wtag}"
        fam_col = f"M2FAMNUM{suf}"
        if fam_col not in w.columns:
            continue

        # detect srh column for this wave (if present), e.g., SELFHEALTH_M1
        srh_candidates = [f"{base}{suf}" for base in ["SELFHEALTH", "SELF_RATED_HEALTH", "SRH", "HEALTH"]]
        srh_col = next((c for c in srh_candidates if c in w.columns), None)

        # columns that define "presence" for this wave
        present_cols = [c for c in w.columns if c.endswith(suf) and c != fam_col]
        # if srh exists, require srh non-missing as part of presence
        if srh_col is not None and srh_col not in present_cols:
            present_cols.append(srh_col)

        # row has any data for this wave (excluding id), and srh if available
        present = w[present_cols].notna().any(axis = 1) if present_cols else w[fam_col].notna()

        # subset to present rows to compute per-family person counts
        tmp = w.loc[present, [id_col, fam_col]].copy()
        if not tmp.empty:
            counts = tmp.groupby(fam_col, dropna = False)[id_col].transform("nunique")
            ok = pd.Series(False, index = w.index)
            ok.loc[present] = counts.eq(2)
        else:
            ok = pd.Series(True, index = w.index)  # nothing present ⇒ pass

        # families with zero presence are fine; rows without presence pass
        keep_masks.append(ok | (~present))

    # if no wave-level family columns exist, return as-is
    if not keep_masks:
        return w

    # keep only rows that satisfy all wave-level twin constraints
    keep_all = keep_masks[0]
    for m in keep_masks[1:]:
        keep_all = keep_all & m

    w = w.loc[keep_all].copy()
    return w

# convert M1, M2, M3 wide table into long format
def wide_to_long(wide_df, id_col = "M2ID", waves = ("M1","M2","M3")):
    frames = []
    for w in waves:
        suf = f"_{w}"
        wcols = [c for c in wide_df.columns if c.endswith(suf)]
        if not wcols:
            continue

        tmp = wide_df[[id_col] + wcols].copy()
        
        # strip suffixes 
        tmp = tmp.rename(columns = {c: c[:-len(suf)] for c in wcols})
        tmp["wave"] = w

        # ensure pair identifiers are kept in long
        keep = [id_col, "wave"]
        if "M2FAMNUM" in tmp.columns:
            keep.append("M2FAMNUM")
        if "SAMPLMAJ" in tmp.columns:
            keep.append("SAMPLMAJ")
        if "ZYGCAT" in tmp.columns: 
            keep.append("ZYGCAT")

        # keep only features that actually exist in this wave
        keep += [c for c in features if c in tmp.columns]

        tmp = tmp[keep]

        # drop missing srh if srh exists in this wave (after suffix removal)
        srh_candidates = ["SELFHEALTH", "SELF_RATED_HEALTH", "SRH", "HEALTH"]
        srh_col = next((c for c in srh_candidates if c in tmp.columns), None)
        if srh_col is not None:
            tmp[srh_col] = pd.to_numeric(tmp[srh_col], errors = "coerce")
            tmp = tmp.dropna(subset = [srh_col])

        # keep only twin sample and exactly two per family per wave
        if "SAMPLMAJ" in tmp.columns:
            smj = pd.to_numeric(tmp["SAMPLMAJ"], errors = "coerce")
            tmp = tmp[smj == 3]

        if "M2FAMNUM" in tmp.columns:
            tmp = (
                tmp.groupby(["M2FAMNUM", "wave"], group_keys = False)
                   .filter(lambda g: g[id_col].nunique() == 2)
            )

        frames.append(tmp)

    if not frames:
        return pd.DataFrame(columns = [id_col, "wave", "M2FAMNUM", "SAMPLMAJ", "ZYGCAT"] + features)

    long_df = pd.concat(frames, ignore_index=True)
    return long_df

# compute twinB - twinA for numeric vars
def within_pair_diffs(long_df, pair_col="M2FAMNUM", id_col="M2ID",
                      numeric_list=None, categorical_list=None):
    if numeric_list is None:
        numeric_list = diff_numeric
    if categorical_list is None:
        categorical_list = diff_categorical

    df = long_df.copy()

    # fail early if keys missing
    for req in (pair_col, "wave", id_col):
        if req not in df.columns:
            raise KeyError(f"Column '{req}' not found in long panel")

    # keep only rows with pair label and wave
    df = df.dropna(subset=[pair_col, "wave"])

    # ensure we only work with available columns
    num_present = [c for c in numeric_list if c in df.columns]
    cat_present = [c for c in categorical_list if c in df.columns]

    # cast numeric to numeric
    for c in num_present:
        df[c] = pd.to_numeric(df[c], errors="coerce")

    # keep only exact twin pairs (2 per family per wave)
    df = df.groupby([pair_col, "wave"], group_keys=False).filter(lambda g: g[id_col].nunique() == 2)

    # deterministic ordering before subtraction/comparison
    df = df.sort_values([pair_col, "wave", id_col]).reset_index(drop=True)

    # try to find self-rated health column
    srh_candidates = ["SELFHEALTH", "SELF_RATED_HEALTH", "SRH", "HEALTH"]
    srh_col = next((c for c in srh_candidates if c in df.columns), None)
    if srh_col is not None:
        df[srh_col] = pd.to_numeric(df[srh_col], errors="coerce")

    # compute diffs
    rows = []
    for (fam, w), g in df.groupby([pair_col, "wave"]):
        a, b = g.iloc[0], g.iloc[1]
        row = {pair_col: fam, "wave": w}

        # numeric diffs: twinB - twinA
        for c in num_present:
            av, bv = a[c], b[c]
            row[f"d_{c}"] = (bv - av) if pd.notna(av) and pd.notna(bv) else np.nan

        # categorical mismatches: 1 if different, 0 if same, NaN if missing
        for c in cat_present:
            av, bv = a[c], b[c]
            if pd.isna(av) or pd.isna(bv):
                row[f"d_{c}"] = np.nan
            else:
                row[f"d_{c}"] = 0 if av == bv else 1

        # numeric SRH diff (only if SRH exists)
        if srh_col is not None:
            av, bv = a[srh_col], b[srh_col]
            d_srh = (bv - av) if pd.notna(av) and pd.notna(bv) else np.nan
            row["d_SRH"] = d_srh

        rows.append(row)

    out = pd.DataFrame(rows)

    # keep zygosity if present
    if "ZYGCAT" in df.columns:
        zygo = (
            df.groupby([pair_col, "wave"], as_index=False)["ZYGCAT"]
              .agg(lambda s: s.dropna().iloc[0] if s.notna().any() else pd.NA)
        )
        out = out.merge(zygo, on=[pair_col, "wave"], how="left")

    # remove rows where SRH is NaN
    if "d_SELFHEALTH" in out.columns:
        out = out.dropna(subset=["d_SRH"])

    return out

### Build Datasets

In [6]:
M1 = build_wave(M1_files, "M1", wave_mapping["M1"])
M2 = build_wave(M2_files, "M2", wave_mapping["M2"])
M3 = build_wave(M3_files, "M3", wave_mapping["M3"])

# rename column headers for each dataset 

# wide
wide = suffix_wave(M1,"M1").merge(suffix_wave(M2,"M2"), on = "M2ID", how = "outer") \
                           .merge(suffix_wave(M3,"M3"), on = "M2ID", how = "outer")

# long
long = wide_to_long(wide, id_col = "M2ID", waves = ("M1","M2","M3"))
long = long.groupby(["wave", "M2FAMNUM"]).filter(lambda g: len(g) == 2)

# differences
diffs = within_pair_diffs(long, pair_col = "M2FAMNUM", id_col = "M2ID",
                          numeric_list = diff_numeric, categorical_list = diff_categorical)

print("Shapes — WIDE:", wide.shape, "LONG:", long.shape, "DIFFS:", diffs.shape)

Shapes — WIDE: (1584, 53) LONG: (3026, 19) DIFFS: (1513, 17)


In [7]:
# how many twin pairs are in m1, m2, m3, and how many individual records

waves = ("M1","M2","M3")
req = ["M2FAMNUM", "M2ID", "wave"]
for c in req:
    if c not in long.columns:
        raise KeyError(f"long_df must have column: {c}")

rows = []
for w in waves:
    sub = long[long["wave"] == w].copy()
    # count families with exactly two unique ids
    twin_pairs = (
        sub.groupby("M2FAMNUM")["M2ID"]
           .nunique()
           .eq(2)
           .sum()
    )
    # unique individuals and total rows for this wave
    uniq_individuals = sub["M2ID"].nunique()
    total_rows = len(sub)
    rows.append({"wave": w,
                 "twin_pairs": int(twin_pairs),
                 "unique_individuals": int(uniq_individuals),
                 "total_rows": int(total_rows)})

summary_counts = pd.DataFrame(rows, columns=["wave","twin_pairs","unique_individuals","total_rows"])
print(summary_counts)

  wave  twin_pairs  unique_individuals  total_rows
0   M1         758                1516        1516
1   M2         480                 960         960
2   M3         275                 550         550


### Twin Pairs and Unique Individuals Counts

### Convert Datasets to CSV

In [8]:
display(wide.head(10))
display(wide.shape)
display(wide.columns)

display(long.head(10))
display(long.shape)
display(long.columns)

display(diffs.head(10))
display(diffs.shape)
display(diffs.columns)

Unnamed: 0,M2ID,M2FAMNUM_M1,SAMPLMAJ_M1,AGE_M1,SEX_M1,ZYGCAT_M1,EXERCISE_M1,HYPERTEN_M1,SMOKING_M1,ALCOHOL_M1,DEPRESS_M1,EDUC_M1,SRH_M1,ASTHMA_M1,DIABETES_M1,BMI_M1,HHINCOME_M1,MARITAL_M1,WAVE_M1,M2FAMNUM_M2,SAMPLMAJ_M2,AGE_M2,HYPERTEN_M2,SMOKING_M2,ALCOHOL_M2,DEPRESS_M2,EDUC_M2,SEX_M2,SRH_M2,ASTHMA_M2,DIABETES_M2,EXERCISE_M2,BMI_M2,HHINCOME_M2,MARITAL_M2,WAVE_M2,M2FAMNUM_M3,SAMPLMAJ_M3,AGE_M3,HYPERTEN_M3,SMOKING_M3,ALCOHOL_M3,DEPRESS_M3,EDUC_M3,HHINCOME_M3,SEX_M3,SRH_M3,ASTHMA_M3,DIABETES_M3,EXERCISE_M3,BMI_M3,MARITAL_M3,WAVE_M3
0,10005.0,120803.0,3.0,,,3.0,1.0,1.0,2.0,32.0,2.0,5.0,9.0,2.0,2.0,26.507816,2.0,,M1,120803.0,3.0,2.0,1.0,,35.0,2.0,3.0,,8.0,2.0,2.0,2.0,,0.0,,M2,,,,,,,,,,,,,,,,,
1,10006.0,120772.0,3.0,,,2.0,2.0,2.0,2.0,21.0,1.0,6.0,8.0,2.0,2.0,26.891987,2.0,,M1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,10010.0,120378.0,3.0,,,3.0,2.0,2.0,1.0,18.0,1.0,3.0,6.0,2.0,2.0,23.056296,2.0,3.0,M1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,10015.0,120805.0,3.0,,,1.0,2.0,2.0,1.0,19.0,1.0,5.0,7.0,2.0,2.0,31.601253,,3.0,M1,120805.0,3.0,2.0,1.0,2.0,20.0,1.0,5.0,,7.0,2.0,2.0,2.0,32.121429,0.0,3.0,M2,120805.0,3.0,3.0,1.0,2.0,20.0,1.0,5.0,,,6.0,2.0,2.0,2.0,35.900421,3.0,M3
4,10030.0,120243.0,3.0,,,1.0,2.0,2.0,2.0,24.0,2.0,8.0,8.0,2.0,2.0,25.584351,2.0,1.0,M1,120243.0,3.0,1.0,1.0,,24.0,1.0,6.0,,7.0,2.0,2.0,1.0,27.342055,0.0,1.0,M2,120243.0,3.0,2.0,1.0,,25.0,2.0,8.0,25000.0,,9.0,2.0,2.0,6.0,28.318557,1.0,M3
5,10036.0,120944.0,3.0,,,1.0,2.0,2.0,,22.0,2.0,12.0,7.0,2.0,2.0,25.843152,2.0,3.0,M1,120944.0,3.0,1.0,2.0,,24.0,2.0,12.0,,10.0,2.0,2.0,1.0,31.229474,0.0,1.0,M2,120944.0,3.0,1.0,2.0,,24.0,2.0,12.0,300000.0,,8.0,2.0,2.0,2.0,29.535031,3.0,M3
6,10046.0,120728.0,3.0,,,2.0,2.0,1.0,1.0,16.0,1.0,4.0,8.0,2.0,2.0,32.449911,2.0,1.0,M1,120728.0,3.0,4.0,1.0,,16.0,2.0,6.0,,5.0,2.0,1.0,2.0,32.281056,0.0,2.0,M2,120728.0,3.0,2.0,1.0,,17.0,2.0,5.0,8000.0,,8.0,2.0,1.0,1.0,33.480067,1.0,M3
7,10063.0,120288.0,3.0,,,2.0,2.0,2.0,2.0,16.0,2.0,11.0,10.0,2.0,2.0,20.902899,2.0,,M1,120288.0,3.0,2.0,2.0,,15.0,2.0,11.0,,10.0,2.0,2.0,4.0,21.257185,0.0,3.0,M2,120288.0,3.0,1.0,2.0,,14.0,2.0,11.0,175000.0,,9.0,2.0,2.0,1.0,22.142901,3.0,M3
8,10067.0,120085.0,3.0,,,1.0,2.0,1.0,,,2.0,5.0,8.0,2.0,2.0,27.923114,2.0,2.0,M1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
9,10087.0,120731.0,3.0,,,1.0,2.0,2.0,,16.0,2.0,9.0,8.0,2.0,2.0,20.279976,2.0,1.0,M1,120731.0,3.0,1.0,2.0,,21.0,2.0,9.0,,8.0,2.0,2.0,7.0,21.124975,0.0,2.0,M2,120731.0,3.0,1.0,2.0,,21.0,2.0,9.0,30000.0,,8.0,2.0,2.0,2.0,22.657897,2.0,M3


(1584, 53)

Index(['M2ID', 'M2FAMNUM_M1', 'SAMPLMAJ_M1', 'AGE_M1', 'SEX_M1', 'ZYGCAT_M1',
       'EXERCISE_M1', 'HYPERTEN_M1', 'SMOKING_M1', 'ALCOHOL_M1', 'DEPRESS_M1',
       'EDUC_M1', 'SRH_M1', 'ASTHMA_M1', 'DIABETES_M1', 'BMI_M1',
       'HHINCOME_M1', 'MARITAL_M1', 'WAVE_M1', 'M2FAMNUM_M2', 'SAMPLMAJ_M2',
       'AGE_M2', 'HYPERTEN_M2', 'SMOKING_M2', 'ALCOHOL_M2', 'DEPRESS_M2',
       'EDUC_M2', 'SEX_M2', 'SRH_M2', 'ASTHMA_M2', 'DIABETES_M2',
       'EXERCISE_M2', 'BMI_M2', 'HHINCOME_M2', 'MARITAL_M2', 'WAVE_M2',
       'M2FAMNUM_M3', 'SAMPLMAJ_M3', 'AGE_M3', 'HYPERTEN_M3', 'SMOKING_M3',
       'ALCOHOL_M3', 'DEPRESS_M3', 'EDUC_M3', 'HHINCOME_M3', 'SEX_M3',
       'SRH_M3', 'ASTHMA_M3', 'DIABETES_M3', 'EXERCISE_M3', 'BMI_M3',
       'MARITAL_M3', 'WAVE_M3'],
      dtype='object')

Unnamed: 0,M2ID,wave,M2FAMNUM,SAMPLMAJ,ZYGCAT,SRH,HYPERTEN,DIABETES,ASTHMA,DEPRESS,BMI,EDUC,HHINCOME,SMOKING,ALCOHOL,EXERCISE,MARITAL,AGE,SEX
0,10005.0,M1,120803.0,3.0,3.0,9.0,1.0,2.0,2.0,2.0,26.507816,5.0,2.0,2.0,32.0,1.0,,,
1,10006.0,M1,120772.0,3.0,2.0,8.0,2.0,2.0,2.0,1.0,26.891987,6.0,2.0,2.0,21.0,2.0,,,
2,10010.0,M1,120378.0,3.0,3.0,6.0,2.0,2.0,2.0,1.0,23.056296,3.0,2.0,1.0,18.0,2.0,3.0,,
3,10015.0,M1,120805.0,3.0,1.0,7.0,2.0,2.0,2.0,1.0,31.601253,5.0,,1.0,19.0,2.0,3.0,,
4,10030.0,M1,120243.0,3.0,1.0,8.0,2.0,2.0,2.0,2.0,25.584351,8.0,2.0,2.0,24.0,2.0,1.0,,
5,10036.0,M1,120944.0,3.0,1.0,7.0,2.0,2.0,2.0,2.0,25.843152,12.0,2.0,,22.0,2.0,3.0,,
6,10046.0,M1,120728.0,3.0,2.0,8.0,1.0,2.0,2.0,1.0,32.449911,4.0,2.0,1.0,16.0,2.0,1.0,,
7,10063.0,M1,120288.0,3.0,2.0,10.0,2.0,2.0,2.0,2.0,20.902899,11.0,2.0,2.0,16.0,2.0,,,
8,10067.0,M1,120085.0,3.0,1.0,8.0,1.0,2.0,2.0,2.0,27.923114,5.0,2.0,,,2.0,2.0,,
9,10087.0,M1,120731.0,3.0,1.0,8.0,2.0,2.0,2.0,2.0,20.279976,9.0,2.0,,16.0,2.0,1.0,,


(3026, 19)

Index(['M2ID', 'wave', 'M2FAMNUM', 'SAMPLMAJ', 'ZYGCAT', 'SRH', 'HYPERTEN',
       'DIABETES', 'ASTHMA', 'DEPRESS', 'BMI', 'EDUC', 'HHINCOME', 'SMOKING',
       'ALCOHOL', 'EXERCISE', 'MARITAL', 'AGE', 'SEX'],
      dtype='object')

Unnamed: 0,M2FAMNUM,wave,d_BMI,d_DEPRESS,d_HYPERTEN,d_DIABETES,d_ASTHMA,d_EDUC,d_HHINCOME,d_SMOKING,d_ALCOHOL,d_EXERCISE,d_AGE,d_MARITAL,d_SEX,d_SRH,ZYGCAT
0,120002.0,M1,-6.799214,1.0,0.0,0.0,0.0,3.0,0.0,,-18.0,0.0,,1.0,,3.0,3.0
1,120002.0,M2,-4.734714,1.0,1.0,1.0,1.0,3.0,0.0,,-33.0,0.0,-1.0,1.0,,4.0,
2,120003.0,M1,7.966034,-1.0,1.0,0.0,0.0,-2.0,0.0,,0.0,0.0,,0.0,,1.0,3.0
3,120003.0,M2,,-1.0,0.0,0.0,0.0,-2.0,0.0,,0.0,-3.0,2.0,1.0,,-1.0,
4,120003.0,M3,12.781299,0.0,0.0,0.0,0.0,-2.0,20000.0,,0.0,1.0,1.0,1.0,,1.0,
5,120005.0,M1,-4.59942,0.0,0.0,0.0,0.0,-3.0,0.0,0.0,-1.0,0.0,,1.0,,2.0,2.0
6,120006.0,M1,-4.304019,1.0,0.0,0.0,0.0,0.0,0.0,,4.0,0.0,,1.0,,-2.0,1.0
7,120007.0,M1,-4.035132,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,3.0,1.0
8,120007.0,M2,-7.126642,1.0,0.0,1.0,0.0,0.0,,,8.0,1.0,0.0,,,0.0,
9,120008.0,M1,3.979481,1.0,0.0,0.0,0.0,-3.0,8.0,1.0,-2.0,0.0,,0.0,,-2.0,1.0


(1513, 17)

Index(['M2FAMNUM', 'wave', 'd_BMI', 'd_DEPRESS', 'd_HYPERTEN', 'd_DIABETES',
       'd_ASTHMA', 'd_EDUC', 'd_HHINCOME', 'd_SMOKING', 'd_ALCOHOL',
       'd_EXERCISE', 'd_AGE', 'd_MARITAL', 'd_SEX', 'd_SRH', 'ZYGCAT'],
      dtype='object')

In [9]:
wide.to_csv(f"{base_path}/MIDUS_twins_wide.csv", index = False)
long.to_csv(f"{base_path}/MIDUS_twins_long.csv", index = False)
diffs.to_csv(f"{base_path}/MIDUS_twins_diffs.csv", index = False)