**Author:** Revekka Gersgovich

**Purpose:** Clean and merge the GSS Data

**Date:** Nov 29, 2025

In [None]:
import os
import os.path as path
import pandas as pd
import numpy as np
import glob
import narwhals
import pyreadstat

In [None]:
# Project root. Change this path to run the notebook on another machine.
parent_dir = os.path.abspath("/Users/revekkagershovich/Documents/Filling_system/Academic/Taste-Based_Discrimination") # Change this directory to run from your computer
# Raise explicitly instead of using `assert`: assert statements are skipped
# when Python runs with -O, which would let a bad path slip through.
if not os.path.isdir(parent_dir):
    raise FileNotFoundError(f"parent_dir does not exist: {parent_dir}")
os.chdir(parent_dir)

# Folder the raw GSS .dta files are read from.
raw_data_dir = os.path.join(parent_dir, "1_data", "1_raw")
if not os.path.isdir(raw_data_dir):
    raise FileNotFoundError(f"raw_data_dir does not exist: {raw_data_dir}")

# Folder the merged data set and codebook are written to.
intermediate_data_dir = os.path.join(parent_dir, "1_data", "2_intermediate")
if not os.path.isdir(intermediate_data_dir):
    raise FileNotFoundError(
        f"intermediate_data_dir does not exist: {intermediate_data_dir}"
    )

# Loading Datasets & Saving Metadata

In [None]:
# Read the four GSS waves with identical options. Each entry below is the
# file's location relative to raw_data_dir (2024 sits in its own subfolder).
# apply_value_formats=False keeps numeric codes instead of value labels.
(
    (df_1996, meta_1996),
    (df_2006, meta_2006),
    (df_2018, meta_2018),
    (df_2024, meta_2024),
) = [
    pyreadstat.read_dta(
        os.path.join(raw_data_dir, *relative_parts),
        apply_value_formats=False,
    )
    for relative_parts in (
        ("GSS1996.dta",),
        ("GSS2006.dta",),
        ("GSS2018.dta",),
        ("2024", "GSS2024.dta"),
    )
]

In [None]:
def save_metadata_to_csv(meta, year, out_dir=None):
    """Write a variable-name / variable-label table for one survey year to CSV.

    Parameters
    ----------
    meta
        Object exposing ``column_names`` and ``column_labels`` sequences of
        equal length (here, the metadata returned by ``pyreadstat.read_dta``).
    year
        Interpolated into the output file name
        ``GSS_{year}_variable_labels.csv``.
    out_dir
        Directory to write the CSV into. When omitted, the module-level
        ``raw_data_dir`` is used, so existing two-argument calls behave
        exactly as before.
    """
    if out_dir is None:
        out_dir = raw_data_dir

    labels_df = pd.DataFrame({
        "variable": meta.column_names,
        "label": meta.column_labels,
    })
    # index=False: the row index carries no information, so keep it out of the file.
    labels_df.to_csv(
        os.path.join(out_dir, f"GSS_{year}_variable_labels.csv"),
        index=False,
    )

# Export one variable-label CSV per survey wave, in chronological order.
for survey_year, survey_meta in (
    (1996, meta_1996),
    (2006, meta_2006),
    (2018, meta_2018),
    (2024, meta_2024),
):
    save_metadata_to_csv(survey_meta, survey_year)

In [None]:
# Display the (rows, columns) shape of each yearly data set, oldest first.
df_1996.shape, df_2006.shape, df_2018.shape, df_2024.shape

# Merging Datasets

## Check variable consistency across years

In [None]:
# Column names of each wave as a set, keyed by survey year (as a string).
datasets = {
    "1996": set(df_1996.columns),
    "2006": set(df_2006.columns),
    "2018": set(df_2018.columns),
    "2024": set(df_2024.columns),
}

# Per-year aliases pointing at the same set objects stored in `datasets`.
set_1996, set_2006, set_2018, set_2024 = (
    datasets[wave] for wave in ("1996", "2006", "2018", "2024")
)

In [None]:
# Variables present in ALL datasets
common_vars = set.intersection(*datasets.values())

# Report the size of each data set's variable list.
print("🔍 Total variables per dataset:")
for year, var_set in datasets.items():
    print(f"  • {year}: {len(var_set):,}")

print(f"\n📊 Variables present in ALL datasets: {len(common_vars):,}")

# For each data set, how many of the common variables it contains.
print("\n🧪 Presence of common variables in each dataset:")
for year, var_set in datasets.items():
    print(f"  • {year}: {len(common_vars & var_set):,} / {len(common_vars):,}")

# Sanity check: common_vars is an intersection, so it is a subset of every
# set by construction. The loop only guards against later changes to how
# common_vars is built.
for year, var_set in datasets.items():
    missing = common_vars - var_set
    assert len(missing) == 0, f"❌ Dataset {year} is missing {len(missing)} common variables!"
    print(f"✅ {year}: all common variables present")

## Subsetting datasets to common variables and concatenating

In [None]:
# Keep the shared columns in the order they appear in the 1996 file.
# Iterating the set directly (list(common_vars)) gives an order that can
# change from one Python session to the next because string hashes are
# randomized, which made the column order of the merged frame and of the
# saved CSV differ between runs.
common_cols = [col for col in df_1996.columns if col in common_vars]

# .copy() so each subset is an independent frame rather than a view of the
# original.
df_1996 = df_1996[common_cols].copy()
df_2006 = df_2006[common_cols].copy()
df_2018 = df_2018[common_cols].copy()
df_2024 = df_2024[common_cols].copy()

In [None]:
# Stack the four waves row-wise, oldest first; ignore_index=True discards
# the per-file row labels and gives the result a fresh 0..n-1 index.
df = pd.concat(
    objs=(df_1996, df_2006, df_2018, df_2024),
    ignore_index=True,
)

# Validate

In [None]:
# Display the (rows, columns) shape of the merged frame.
df.shape

In [None]:
# Print a summary of the merged frame (column dtypes, non-null counts, memory use).
df.info()

In [None]:
# Count rows per distinct value of the `year` column.
# NOTE(review): presumably this should show exactly the four survey waves — confirm.
df['year'].value_counts()

In [None]:
# Display 5 randomly chosen rows; the fixed random_state makes the draw repeatable.
df.sample(5, random_state=42)

In [None]:
# Display the column labels retained in the merged frame.
df.columns

# Saving Datasets and Metadata

In [None]:
# Variable labels are taken from the 1996 file's metadata; point meta_ref at
# another year's metadata object to use that year's labels instead.
meta_ref = meta_1996

# Full codebook: one row per variable in the reference metadata.
codebook_full = pd.DataFrame(
    dict(variable=meta_ref.column_names, label=meta_ref.column_labels)
)

# Keep only the variables retained in the merged frame, ordered by variable
# name with a fresh 0..n-1 index.
codebook = (
    codebook_full.loc[codebook_full["variable"].isin(df.columns)]
    .copy()
    .sort_values(by="variable")
    .reset_index(drop=True)
)

# Write the codebook to the intermediate data directory.
codebook.to_csv(path.join(intermediate_data_dir, "codebook.csv"), index=False)

In [None]:
# Write the merged 1996/2006/2018/2024 frame to the intermediate data
# directory, without the row index.
df.to_csv(
    path_or_buf=path.join(intermediate_data_dir, "df_96_06_18_24.csv"),
    index=False,
)