**Author:** Revekka Gershovich

**Purpose:** Explore the data for respondents who received the children's or the adult mental health stigma module

**Date:** Nov 29, 2025

In [None]:
import os
import os.path as path
import pandas as pd
import numpy as np
import glob
import narwhals
import pyreadstat
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [None]:
# Project root. Change this directory to run from your computer.
parent_dir = os.path.abspath(
    "/Users/revekkagershovich/Documents/Filling_system/Academic/Taste-Based_Discrimination"
)
assert path.exists(parent_dir), "parent_dir does not exist"
# Work from the project root for the rest of the notebook.
os.chdir(parent_dir)

# Raw and intermediate data folders live under <parent_dir>/1_data.
raw_data_dir = path.join(parent_dir, "1_data", "1_raw")
intermediate_data_dir = path.join(parent_dir, "1_data", "2_intermediate")
assert path.exists(raw_data_dir), "raw_data_dir does not exist"
assert path.exists(intermediate_data_dir), "intermediate_data_dir does not exist"

# Loading Data

In [None]:
# Read the cleaned dataset from the intermediate data folder.
cleaned_csv = path.join(intermediate_data_dir, "gss_cleaned_96_02_06_18_24.csv")
df = pd.read_csv(cleaned_csv)

In [None]:
# Show the (rows, columns) dimensions of the loaded data.
df.shape

# Filtering Data

In [None]:
# Frequency table of the `upsdowns` column.
df['upsdowns'].value_counts()

In [None]:
# Flag respondents with no value for `vigversn` (1 = missing, 0 = present).
# This cell only creates the indicator; no rows are dropped here.
vigversn_missing = df["vigversn"].isna()
df["vignette_missing"] = vigversn_missing.astype(int)

print(df["vignette_missing"].value_counts())

In [None]:
# Flag respondents who are missing BOTH vignette variables (`vigversn` and
# `chldvig`): 1 = neither is recorded, 0 = at least one is recorded.
# This cell only creates the indicator; rows are removed further down.
has_any_vignette = df["vigversn"].notna() | df["chldvig"].notna()
df["vignette_missing"] = (~has_any_vignette).astype(int)

print(df["vignette_missing"].value_counts())

In [None]:
# Share of respondents flagged as vignette_missing within each year
# (the flag is 0/1, so its mean is the share).
miss = df.groupby("year")["vignette_missing"].mean()

# Bar chart of that share, with the y-axis fixed to the full 0-1 range.
miss.plot.bar()
plt.ylabel("Share missing")
plt.title("Share of respondents with vignette_missing by year")
plt.ylim(0, 1)
plt.show()

In [None]:

# Keep only the rows whose vignette_missing flag is 0, then discard the
# helper flag, which is constant after filtering.
df = df.loc[df["vignette_missing"].eq(0)].copy()
df = df.drop(columns="vignette_missing")
print(df.shape)

In [None]:
# Number of remaining respondents in each year.
df['year'].value_counts()

In [None]:
# Raw respondent mental-health items; used below to flag respondents for whom
# every one of these items is missing.
mh_vars = [
    "evbrkdwn",   # ever felt like having a nervous breakdown
    "relmhsp1",   # patient was self (mental health help-seeking)
    "evmhp",      # ever had a mental health problem
    "mntlhlth",   # days of poor mental health, past 30 days
    "depress",    # ever told by a doctor you had depression
    "diagnosd",   # ever diagnosed with mental health problem
    "mhtreatd",   # ever treated for mental health problem
    "emoprobs"    # emotional problems interfering with life
]

In [None]:
# 1 when every variable in mh_vars is missing for the respondent, else 0.
has_any_mh_measure = df[mh_vars].notna().any(axis=1)
df["mh_all_missing"] = (~has_any_mh_measure).astype(int)

print(df["mh_all_missing"].value_counts())

# Creating a unified measure of respondents' mental health

## Converting continuous variables to dummies

In [None]:
# Alternative cut-off at 14 or more days, kept for reference (not used):
# df["mntlhlth_dummy_cdc"] = (df["mntlhlth"] >= 14).astype(int)
# df["mntlhlth_dummy_cdc"] = df["mntlhlth_dummy_cdc"].replace({0: 2})

# Recode days of poor mental health (`mntlhlth`) into a 1 = "yes" / 2 = "no"
# dummy: 1 if 4 or more days, 2 if fewer.
# A comparison against NaN evaluates to False, so without the final `.where`
# every respondent with a missing `mntlhlth` would silently be coded 2 ("no").
# Those respondents are kept missing (NaN) instead, which makes the column
# float-valued.
poor_days = df["mntlhlth"]
df["mntlhlth_dummy"] = (
    poor_days.ge(4)
    .map({True: 1, False: 2})
    .where(poor_days.notna())
)

# Preview the recode for respondents who have a recorded `mntlhlth`.
df[~df['mntlhlth'].isna()][['mntlhlth', 'mntlhlth_dummy']].head()

# Part of CDC healthy days framework



In [None]:
# Recode `emoprobs` into a 1 = "yes" / 2 = "no" dummy: 1 when the response is
# 4 or 5, 2 for any other recorded response.
# Assumes responses are coded 1-5 with 4 = "Often" and 5 = "Always" -- TODO
# confirm against the codebook.
# `isin` returns False for NaN, so without the final `.where` every respondent
# with a missing `emoprobs` would silently be coded 2 ("no"). Those respondents
# are kept missing (NaN) instead, which makes the column float-valued.
emoprobs_raw = df["emoprobs"]
df["emoprobs_dummy"] = (
    emoprobs_raw.isin([4, 5])
    .map({True: 1, False: 2})
    .where(emoprobs_raw.notna())
)

# Preview the recode for respondents who have a recorded `emoprobs`.
df[~df['emoprobs'].isna()][['emoprobs', 'emoprobs_dummy']].head()

## Exploring which of the respondent's own mental health measures are available in which years

In [None]:
# Mental-health measures on a common 1 = "yes" / 2 = "no" coding; the two
# recoded dummies replace their raw counterparts from the earlier list.
mh_vars = [
    "evbrkdwn",   # ever felt like having a nervous breakdown - 1 "yes", 2 "no"
    "relmhsp1",   # patient was self (mental health help-seeking) - 1 "self", 2 "no"
    "evmhp",      # ever had a mental health problem - 1 "yes", 2 "no"
    "mntlhlth_dummy",   # 4 or more days of poor mental health, past 30 days - 1 "yes", 2 "no"
    "depress",    # ever told by a doctor you had depression - 1 "yes", 2 "no"
    "diagnosd",   # ever diagnosed with mental health problem - 1 "yes", 2 "no"
    "mhtreatd",   # ever treated for mental health problem - 1 "yes", 2 "no"
    "emoprobs_dummy"    # emotional problems interfering with life - 1 "yes", 2 "no"
]

In [None]:
years = sorted(df["year"].unique())

# One row per measure, one 0/1 column per year. A cell is 1 when at least one
# respondent in that year has the value 1 on the measure.
# NOTE(review): this tests for a recorded 1, not merely for a non-missing value.
rows = [
    {
        "variable": var,
        **{
            str(yr): int(df.loc[df["year"] == yr, var].eq(1).any())
            for yr in years
        },
    }
    for var in mh_vars
]

mh_presence = pd.DataFrame(rows)
print(mh_presence)

In [None]:
# Measures summarised for 2018.
mh_vars_2018 = ["mntlhlth_dummy", "diagnosd", "mhtreatd"]

# Restrict to rows from 2018.
df18 = df.loc[df["year"].eq(2018)]

# Percent of 2018 rows coded 1 on each measure, smallest first. Rows with a
# missing value count as "not 1", so they stay in the denominator.
is_yes_2018 = df18[mh_vars_2018] == 1
mh_stats = (is_yes_2018.mean() * 100).sort_values()

print(mh_stats)

# Horizontal bar chart of those percentages.
plt.figure(figsize=(8, 4))
plt.barh(mh_stats.index, mh_stats.values)
plt.xlabel("Percent of respondents with indicator = 1")
plt.title("Prevalence of Mental-Health Measures in 2018")
plt.tight_layout()
plt.show()

## Plotting a simple graph of all mental health measures over the years

In [None]:
# Coerce every measure in mh_vars to numeric; entries that cannot be parsed
# become NaN.
df[mh_vars] = df[mh_vars].apply(pd.to_numeric, errors="coerce")

# mh_any_problem: 1 if at least one measure equals 1, otherwise 2
# (this includes respondents for whom every measure is missing).
reports_any_problem = df[mh_vars].eq(1).any(axis=1)
df["mh_any_problem"] = reports_any_problem.map({True: 1, False: 2})

In [None]:
# Percent of respondents per year with mh_any_problem == 1.
mh_by_year = (
    df["mh_any_problem"]
    .eq(1)
    .groupby(df["year"])
    .mean()
    .mul(100)
    .reset_index(name="pct_mh_any_problem")
)

print(mh_by_year)

# Bar chart with one bar per year (years shown as text labels).
plt.figure()
plt.bar(mh_by_year["year"].astype(str), mh_by_year["pct_mh_any_problem"])
plt.xlabel("Year")
plt.ylabel("Percent with any mental health problem (mh_any_problem == 1)")
plt.title("Share of respondents with any mental health problem, by year")
plt.tight_layout()
plt.show()

# Filtering to two datasets

## Data for 2002, 2006, and 2018

In [None]:
# Subset to the years 2002, 2006 and 2018 (independent copy of the rows).
in_02_06_18 = df["year"].isin([2002, 2006, 2018])
df02_06_18 = df.loc[in_02_06_18].copy()
df02_06_18.shape

In [None]:
# Remove the mental-health columns listed here from this subset;
# `mntlhlth` and `mntlhlth_dummy` are not in the list and are kept.
df02_06_18 = df02_06_18.drop(
    columns=[
        'evbrkdwn', 'relmhsp1', 'evmhp', 'depress',
        'diagnosd', 'mhtreatd', 'emoprobs', 'emoprobs_dummy',
    ]
)

In [None]:
# Keep only respondents with a non-missing `mntlhlth_dummy`.
# The filtered frame used to be built and then discarded (never assigned), so
# the statement had no effect and the shape printed below was the unfiltered
# one. Assign it back; `.copy()` lets later column assignments act on an
# independent frame.
df02_06_18 = df02_06_18[df02_06_18['mntlhlth_dummy'].notna()].copy()
print(df02_06_18.shape)

## Data for 2018 and 2024

In [None]:
# Subset to the years 2018 and 2024 (independent copy of the rows).
in_18_24 = df["year"].isin([2018, 2024])
df18_24 = df.loc[in_18_24].copy()
df18_24.shape

In [None]:
# Remove the mental-health columns listed here from this subset;
# `diagnosd` is not in the list and is kept.
df18_24 = df18_24.drop(
    columns=[
        'evbrkdwn', 'relmhsp1', 'evmhp', 'depress',
        'mntlhlth', 'mntlhlth_dummy', 'mhtreatd', 'emoprobs_dummy',
    ]
)

In [None]:
# Keep only respondents with a non-missing `diagnosd`.
# The filtered frame used to be built and then discarded (never assigned), so
# the statement had no effect and the shape printed below was the unfiltered
# one. Assign it back; `.copy()` lets later column assignments act on an
# independent frame.
df18_24 = df18_24[df18_24['diagnosd'].notna()].copy()
print(df18_24.shape)

# Graph of Prevalence of Mental Health Disorders by Region and Year

In [None]:
# Switch mntlhlth_dummy from 1/2 coding to 1/0 (2 -> 0, 1 stays 1) and store
# it as float.
df02_06_18["mntlhlth_dummy"] = (
    df02_06_18["mntlhlth_dummy"].replace({2: 0, 1: 1}).astype(float)
)

In [None]:
# Mean of the dummy within each region-year cell, used as the prevalence.
prev = df02_06_18.groupby(["region", "year"], as_index=False)["mntlhlth_dummy"].mean()

# Wide layout: one row per region, one column per year.
table = prev.pivot(index="region", columns="year", values="mntlhlth_dummy")

# Fix the column order explicitly.
table = table.loc[:, [2002, 2006, 2018]]

In [None]:
# Grouped bar chart: prevalence (%) by year, one bar per region within each year.

fig, ax = plt.subplots(figsize=(8, 5))

years = table.columns          # years (table columns)
regions = table.index          # region codes (table rows)
x = np.arange(len(years))      # position of the first bar in each year group
width = 0.20

# Minimal theme: show only the left and bottom axis lines.
for side, visible in {"top": False, "right": False, "left": True, "bottom": True}.items():
    ax.spines[side].set_visible(visible)

# Legend label for each region code; a code outside 1-4 raises KeyError.
region_labels = {1: 'Northeast', 2: 'Midwest', 3: 'South', 4: 'West'}

# One set of bars per region, each set shifted right by one bar width.
for i, reg in enumerate(regions):
    ax.bar(
        x + i * width,
        table.loc[reg] * 100,   # share -> percent
        width,
        label=region_labels[reg],
        alpha=0.9,
    )

# Axis labels and title
ax.set_xlabel('Year', fontsize=11)
ax.set_ylabel('Prevalence (%)', fontsize=11)
ax.set_title('Mental-Health Problem Prevalence by Year and Region', fontsize=13)

# Put each tick 1.5 bar widths right of the group's first bar
# (the middle of a group of four bars).
ax.set_xticks(x + width * 1.5)
ax.set_xticklabels(years)

# No gridlines; legend without a frame
ax.grid(False)
ax.legend(title='Region', frameon=False)

plt.tight_layout()
plt.show()

In [None]:
# Recode diagnosd from 1/2 to 1/0 (2 -> 0; other values are left unchanged).
df18_24["diagnosd"] = df18_24["diagnosd"].replace({2: 0})

# Mean of the recoded variable within each region-year cell, used as the prevalence.
prev = df18_24.groupby(["region", "year"], as_index=False)["diagnosd"].mean()

# Wide layout: one row per region, one column per year.
table = prev.pivot(index="region", columns="year", values="diagnosd")

# Fix the column order explicitly.
table = table.loc[:, [2018, 2024]]

In [None]:
# Grouped bar chart: prevalence (%) by year, one bar per region within each year.

fig, ax = plt.subplots(figsize=(8, 5))

years = table.columns          # years (table columns; 2018 and 2024 in the table built above)
regions = table.index          # region codes (table rows)
x = np.arange(len(years))      # position of the first bar in each year group
width = 0.20

# Minimal theme: show only the left and bottom axis lines.
for side, visible in {"top": False, "right": False, "left": True, "bottom": True}.items():
    ax.spines[side].set_visible(visible)

# Legend label for each region code; a code outside 1-4 raises KeyError.
region_labels = {1: 'Northeast', 2: 'Midwest', 3: 'South', 4: 'West'}

# One set of bars per region, each set shifted right by one bar width.
for i, reg in enumerate(regions):
    ax.bar(
        x + i * width,
        table.loc[reg] * 100,   # share -> percent
        width,
        label=region_labels[reg],
        alpha=0.9,
    )

# Axis labels and title
ax.set_xlabel('Year', fontsize=11)
ax.set_ylabel('Prevalence (%)', fontsize=11)
ax.set_title('Mental-Health Problem Prevalence by Year and Region', fontsize=13)

# Put each tick 1.5 bar widths right of the group's first bar
# (the middle of a group of four bars).
ax.set_xticks(x + width * 1.5)
ax.set_xticklabels(years)

# No gridlines
ax.grid(False)

# Legend without a frame, placed in the upper-left corner of the axes.
ax.legend(title='Region', frameon=False, loc='upper left')

plt.tight_layout()
plt.show()