# Demographic Inference Analysis

Analysis of LLM-inferred demographics from YouTube commenter profiles.

This notebook examines:
- Distribution of inferred age ranges
- Distribution of inferred gender
- Distribution of inferred race/ethnicity
- Confidence level distributions
- Inference quality ("unclear" rates)
- Reasoning patterns

In [None]:
# Data handling
import pandas as pd
import numpy as np
# Plotting (plotly figures and subplot grids)
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
# File discovery for the input CSV
import glob
import os

# Simple marker so it is obvious in the notebook output that this cell ran.
print("Libraries loaded successfully")

## Load Data

In [None]:
# Find every demographics export under data/ and load whichever one was
# modified most recently (by filesystem mtime).
csv_files = glob.glob("data/user_demographics_*.csv")
if csv_files:
	demographics_file = max(csv_files, key=os.path.getmtime)
else:
	raise FileNotFoundError("No user demographics CSV files found in data/ directory")
print(f"Loading: {demographics_file}")

df = pd.read_csv(demographics_file)
loaded_rows = len(df)
loaded_columns = list(df.columns)
print(f"\nLoaded {loaded_rows} user profiles with demographic inferences")
print(f"\nColumns: {loaded_columns}")

In [None]:
# Preview the top of the table (five rows, the pandas default for head()).
df.head(n=5)

In [None]:
# High-level overview: row count, per-column null counts, and the summary
# statistics of the confidence column.
total_rows = len(df)
missing_per_column = df.isna().sum()
confidence_summary = df['confidence_level'].describe()

print("Dataset Overview:")
print(f"Total users analyzed: {total_rows}")
print("\nMissing values:")
print(missing_per_column)
print("\nConfidence level statistics:")
print(confidence_summary)

## Age Distribution

In [None]:
# Tally each inferred age bucket, ordered by bucket label rather than by count.
age_counts = df['inferred_age_range'].value_counts().sort_index()
# Percentages are relative to all rows in df, not only rows with a non-null age.
age_percentages = age_counts.div(len(df)).mul(100).round(2)

print("Age Range Distribution:", age_counts, sep="\n")
print("\nPercentages:", age_percentages, sep="\n")

In [None]:
# Bar chart of inferred age buckets in a fixed, human-readable order.
age_order = ['under_18', '18-24', '25-34', '35-44', '45-54', '55-64', '65_plus', 'unclear']
age_column = df['inferred_age_range']
# Reindexing pins the bar order, shows a zero bar for buckets absent from the
# data, and drops any label that is not listed in age_order.
age_data = age_column.value_counts().reindex(age_order, fill_value=0)

age_bars = go.Bar(
	x=age_data.index,
	y=age_data.values,
	text=age_data.values,
	textposition='outside',
	marker_color='steelblue',
)
fig = go.Figure(data=[age_bars])
fig.update_layout(
	title='Distribution of Inferred Age Ranges',
	xaxis_title='Age Range',
	yaxis_title='Count',
	height=500,
)
fig.show()

# Share of all profiles whose age was labelled 'unclear'.
age_unclear_total = (age_column == 'unclear').sum()
unclear_rate = age_unclear_total / len(df) * 100
print(f"\nAge 'unclear' rate: {unclear_rate:.2f}%")

## Gender Distribution

In [None]:
# Tally each inferred gender label, most frequent first.
gender_counts = df['inferred_gender'].value_counts()
# Percentages are relative to all rows in df.
gender_percentages = gender_counts.div(len(df)).mul(100).round(2)

print("Gender Distribution:", gender_counts, sep="\n")
print("\nPercentages:", gender_percentages, sep="\n")

In [None]:
# Two-panel gender figure: counts for every label (left) and the share of
# each label once 'unclear' rows are removed (right).
gender_order = ['male', 'female', 'non_binary', 'unclear']
gender_column = df['inferred_gender']
gender_data = gender_column.value_counts().reindex(gender_order, fill_value=0)

fig = make_subplots(
	rows=1,
	cols=2,
	subplot_titles=('Distribution of Inferred Gender', 'Gender Distribution (Excluding "Unclear")'),
	specs=[[{'type': 'bar'}, {'type': 'pie'}]],
)

# Left panel: bar per label in gender_order.
gender_bars = go.Bar(
	x=gender_data.index,
	y=gender_data.values,
	text=gender_data.values,
	textposition='outside',
	marker_color='coral',
	name='Count',
)
fig.add_trace(gender_bars, row=1, col=1)

# Right panel: pie of the non-'unclear' labels; skipped when there are none.
gender_clear = df.loc[gender_column != 'unclear', 'inferred_gender'].value_counts()
if not gender_clear.empty:
	gender_pie = go.Pie(
		labels=gender_clear.index,
		values=gender_clear.values,
		name='Gender',
	)
	fig.add_trace(gender_pie, row=1, col=2)

fig.update_xaxes(title_text='Gender', row=1, col=1)
fig.update_yaxes(title_text='Count', row=1, col=1)
fig.update_layout(height=500, showlegend=False)
fig.show()

# Share of all profiles whose gender was labelled 'unclear'.
gender_unclear_total = (gender_column == 'unclear').sum()
unclear_rate = gender_unclear_total / len(df) * 100
print(f"\nGender 'unclear' rate: {unclear_rate:.2f}%")

## Race/Ethnicity Distribution

In [None]:
# Tally each inferred race/ethnicity label, most frequent first.
race_counts = df['inferred_race_ethnicity'].value_counts()
# Percentages are relative to all rows in df.
race_percentages = race_counts.div(len(df)).mul(100).round(2)

print("Race/Ethnicity Distribution:", race_counts, sep="\n")
print("\nPercentages:", race_percentages, sep="\n")

In [None]:
# Horizontal bar chart of race/ethnicity labels. Counts are sorted ascending
# so the largest category ends up at the top of the horizontal axis stack.
race_column = df['inferred_race_ethnicity']
race_data = race_column.value_counts().sort_values(ascending=True)

race_bars = go.Bar(
	y=race_data.index,
	x=race_data.values,
	text=race_data.values,
	textposition='outside',
	orientation='h',
	marker_color='seagreen',
)
fig = go.Figure(data=[race_bars])
fig.update_layout(
	title='Distribution of Inferred Race/Ethnicity',
	xaxis_title='Count',
	yaxis_title='Race/Ethnicity',
	height=600,
)
fig.show()

# Share of all profiles whose race/ethnicity was labelled 'unclear'.
race_unclear_total = (race_column == 'unclear').sum()
unclear_rate = race_unclear_total / len(df) * 100
print(f"\nRace/ethnicity 'unclear' rate: {unclear_rate:.2f}%")

## Confidence Level Analysis

In [None]:
# Summary statistics for confidence, then the share of profiles in each
# confidence band. Bands are right-inclusive: a value exactly on a boundary
# (e.g. 0.4) falls in the lower band; the top band is strictly above 0.8.
confidence = df['confidence_level']
profile_total = len(df)

confidence_bands = [
	("Very low (0.0-0.2)", confidence.le(0.2)),
	("Low (0.2-0.4)", confidence.gt(0.2) & confidence.le(0.4)),
	("Medium (0.4-0.6)", confidence.gt(0.4) & confidence.le(0.6)),
	("Medium-High (0.6-0.8)", confidence.gt(0.6) & confidence.le(0.8)),
	("High (0.8-1.0)", confidence.gt(0.8)),
]

print("Confidence Level Statistics:")
print(confidence.describe())
print("\nConfidence ranges:")
for band_label, in_band in confidence_bands:
	band_share = in_band.sum() / profile_total * 100
	print(f"{band_label}: {band_share:.2f}%")

In [None]:
# Plot confidence distribution
# Two side-by-side panels built from df['confidence_level']: a 20-bin
# histogram with mean/median marker lines (left) and a box plot (right).
fig = make_subplots(
	rows=1, cols=2,
	subplot_titles=('Distribution of Confidence Levels', 'Confidence Level Box Plot')
)

# Histogram
fig.add_trace(
	go.Histogram(
		x=df['confidence_level'],
		nbinsx=20,
		marker_color='mediumpurple',
		name='Count'
	),
	row=1, col=1
)

# Add mean and median lines
mean_conf = df['confidence_level'].mean()
median_conf = df['confidence_level'].median()

# NOTE(review): these lines target the histogram subplot (row=1, col=1) and are
# added after the histogram trace; plotly presumably skips shapes on subplots
# that have no data yet, so keep this ordering -- confirm before reordering.
fig.add_vline(x=mean_conf, line_dash="dash", line_color="red", 
			  annotation_text=f"Mean: {mean_conf:.2f}",
			  annotation_position="top right", row=1, col=1)
fig.add_vline(x=median_conf, line_dash="dash", line_color="orange",
			  annotation_text=f"Median: {median_conf:.2f}",
			  annotation_position="top left", row=1, col=1)

# Box plot
fig.add_trace(
	go.Box(
		y=df['confidence_level'],
		marker_color='mediumpurple',
		name='All Inferences'
	),
	row=1, col=2
)

# Axis titles per panel; the legend is hidden for the whole figure.
fig.update_xaxes(title_text='Confidence Level', row=1, col=1)
fig.update_yaxes(title_text='Count', row=1, col=1)
fig.update_yaxes(title_text='Confidence Level', row=1, col=2)
fig.update_layout(height=500, showlegend=False)

fig.show()

## Confidence by Demographic Category

In [None]:
# Mean, median and row count of confidence per inferred age bucket,
# highest mean first.
age_confidence = (
	df.groupby('inferred_age_range')['confidence_level']
	.agg(['mean', 'median', 'count'])
	.sort_values('mean', ascending=False)
)
print("Confidence by Age Range:", age_confidence, sep="\n")

In [None]:
# Mean, median and row count of confidence per inferred gender label,
# highest mean first.
gender_confidence = (
	df.groupby('inferred_gender')['confidence_level']
	.agg(['mean', 'median', 'count'])
	.sort_values('mean', ascending=False)
)
print("Confidence by Gender:", gender_confidence, sep="\n")

In [None]:
# Mean, median and row count of confidence per inferred race/ethnicity label,
# highest mean first.
race_confidence = (
	df.groupby('inferred_race_ethnicity')['confidence_level']
	.agg(['mean', 'median', 'count'])
	.sort_values('mean', ascending=False)
)
print("Confidence by Race/Ethnicity:", race_confidence, sep="\n")

In [None]:
# Three stacked panels of mean confidence: by age bucket, by gender, and by
# race/ethnicity (the last one drawn horizontally).
age_order = ['under_18', '18-24', '25-34', '35-44', '45-54', '55-64', '65_plus', 'unclear']
gender_order = ['male', 'female', 'non_binary', 'unclear']

confidence_by_age = df.groupby('inferred_age_range')['confidence_level'].mean()
confidence_by_gender = df.groupby('inferred_gender')['confidence_level'].mean()
confidence_by_race = df.groupby('inferred_race_ethnicity')['confidence_level'].mean()

# Categories missing from the data are drawn with a mean of 0 (fill_value).
age_conf_plot = confidence_by_age.reindex(age_order, fill_value=0)
gender_conf_plot = confidence_by_gender.reindex(gender_order, fill_value=0)
race_conf_plot = confidence_by_race.sort_values(ascending=False)

panel_titles = (
	'Mean Confidence Level by Age Range',
	'Mean Confidence Level by Gender',
	'Mean Confidence Level by Race/Ethnicity',
)
fig = make_subplots(rows=3, cols=1, subplot_titles=panel_titles, vertical_spacing=0.12)

# Rows 1 and 2: vertical bars with confidence on the y axis.
vertical_panels = [(age_conf_plot, 'steelblue'), (gender_conf_plot, 'coral')]
for panel_row, (panel_series, panel_color) in enumerate(vertical_panels, start=1):
	fig.add_trace(
		go.Bar(
			x=panel_series.index,
			y=panel_series.values,
			marker_color=panel_color,
			showlegend=False,
		),
		row=panel_row, col=1,
	)

# Row 3: horizontal bars, so confidence is on the x axis instead.
fig.add_trace(
	go.Bar(
		y=race_conf_plot.index,
		x=race_conf_plot.values,
		orientation='h',
		marker_color='seagreen',
		showlegend=False,
	),
	row=3, col=1,
)

# Pin every confidence axis to the full 0-1 range so panels are comparable.
fig.update_yaxes(title_text='Mean Confidence', range=[0, 1], row=1, col=1)
fig.update_yaxes(title_text='Mean Confidence', range=[0, 1], row=2, col=1)
fig.update_xaxes(title_text='Mean Confidence', range=[0, 1], row=3, col=1)

fig.update_layout(height=1000)
fig.show()

## Overall "Unclear" Rate Analysis

In [None]:
# Per-field 'unclear' flags, computed once and reused for every tally below.
age_is_unclear = df['inferred_age_range'].eq('unclear')
gender_is_unclear = df['inferred_gender'].eq('unclear')
race_is_unclear = df['inferred_race_ethnicity'].eq('unclear')

age_unclear = age_is_unclear.sum()
gender_unclear = gender_is_unclear.sum()
race_unclear = race_is_unclear.sum()

# Profiles where all three fields are 'unclear'.
all_unclear = (age_is_unclear & gender_is_unclear & race_is_unclear).sum()

# Profiles where at least one field is something other than 'unclear'
# (the complement of the all-unclear set).
at_least_one_clear = (~age_is_unclear | ~gender_is_unclear | ~race_is_unclear).sum()

print("Unclear Rate Summary:")
unclear_rows = [
	("Age unclear", age_unclear),
	("Gender unclear", gender_unclear),
	("Race/ethnicity unclear", race_unclear),
	("\nAll three unclear", all_unclear),
	("At least one clear inference", at_least_one_clear),
]
for row_label, row_count in unclear_rows:
	print(f"{row_label}: {row_count} ({row_count/len(df)*100:.2f}%)")

In [None]:
# Plot the clarity tallies computed in the previous cell (age_unclear,
# gender_unclear, race_unclear, all_unclear, at_least_one_clear).
#
# The first four bars count 'unclear' inferences while the last bar counts
# profiles with at least one *clear* inference, so every label spells out
# which of the two it measures and the title covers both.
categories = [
	'Age Unclear',
	'Gender Unclear',
	'Race/Ethnicity Unclear',
	'All Three Unclear',
	'At Least One Clear',
]
# Name kept for compatibility with later cells; note the final entry is a
# "clear" count, not an "unclear" one.
unclear_counts = [
	age_unclear,
	gender_unclear,
	race_unclear,
	all_unclear,
	at_least_one_clear
]
colors = ['steelblue', 'coral', 'seagreen', 'gray', 'gold']

# Create text labels with counts and percentages (percent of all profiles)
total_profiles = len(df)
text_labels = [f'{int(count)}<br>({count/total_profiles*100:.1f}%)' for count in unclear_counts]

fig = go.Figure(data=[
	go.Bar(
		x=categories,
		y=unclear_counts,
		text=text_labels,
		textposition='outside',
		marker_color=colors
	)
])

fig.update_layout(
	title='Inference Clarity Counts by Category',
	xaxis_title='',
	yaxis_title='Count',
	height=500
)

fig.show()

## Demographic Combinations

In [None]:
# Most common (gender, age, race/ethnicity) combinations among profiles that
# have at least one field other than 'unclear'. Rows are dropped only when
# all three fields are 'unclear', so combinations may still contain 'unclear'.
every_field_unclear = (
	df['inferred_age_range'].eq('unclear')
	& df['inferred_gender'].eq('unclear')
	& df['inferred_race_ethnicity'].eq('unclear')
)
df_clear = df[~every_field_unclear]

combo_sizes = df_clear.groupby(['inferred_gender', 'inferred_age_range', 'inferred_race_ethnicity']).size()
combo_counts = combo_sizes.sort_values(ascending=False).head(20)
print("Top 20 Demographic Combinations (with at least one clear inference):")
print(combo_counts)

In [None]:
# Contingency table: gender labels as rows, age buckets as columns.
gender_age = pd.crosstab(index=df['inferred_gender'], columns=df['inferred_age_range'])
print("\nGender x Age Crosstab:", gender_age, sep="\n")

In [None]:
# Heatmap of the gender x age contingency table with a fixed row and column
# order; combinations absent from the data are shown as 0.
age_order = ['under_18', '18-24', '25-34', '35-44', '45-54', '55-64', '65_plus', 'unclear']
gender_order = ['male', 'female', 'non_binary', 'unclear']

gender_age_ordered = pd.crosstab(
	index=df['inferred_gender'],
	columns=df['inferred_age_range'],
).reindex(index=gender_order, columns=age_order, fill_value=0)

cell_counts = gender_age_ordered.values
count_heatmap = go.Heatmap(
	z=cell_counts,
	x=gender_age_ordered.columns,
	y=gender_age_ordered.index,
	colorscale='YlOrRd',
	# Print each cell's count on top of its colour.
	text=cell_counts,
	texttemplate='%{text}',
	textfont={"size": 12},
	colorbar=dict(title="Count"),
)
fig = go.Figure(data=count_heatmap)
fig.update_layout(
	title='Gender x Age Range Heatmap',
	xaxis_title='Age Range',
	yaxis_title='Gender',
	height=500,
)
fig.show()

## Reasoning Analysis

In [None]:
# Sample reasoning text
# Prints up to three randomly chosen profiles from each of three confidence
# bands (high, medium, low) together with the model's stated reasoning.


def _sample_up_to(frame, n=3):
	"""Return n randomly chosen rows of frame, or every row (in random order)
	when frame has fewer than n rows. An empty frame yields an empty sample."""
	return frame.sample(min(n, len(frame)))


def _print_reasoning_examples(heading, examples):
	"""Print heading, then one formatted record per row of examples.

	Each row must provide channel_title, the three inferred_* fields,
	confidence_level and reasoning.
	"""
	print(heading)
	for _, row in examples.iterrows():
		print(f"User: {row['channel_title']}")
		print(f"Age: {row['inferred_age_range']}, Gender: {row['inferred_gender']}, Race: {row['inferred_race_ethnicity']}")
		print(f"Confidence: {row['confidence_level']:.2f}")
		print(f"Reasoning: {row['reasoning']}")
		print("\n" + "-"*80 + "\n")


print("Sample Reasoning Texts:")
print("\n" + "="*80 + "\n")

# Show examples with different confidence levels.
# NOTE(review): these bands (>=0.8, [0.4, 0.6), <=0.2) use different boundary
# inclusivity than the "Confidence ranges" cell (e.g. exactly 0.8 is "HIGH"
# here but "Medium-High" there) -- confirm which convention is intended.
confidence = df['confidence_level']
high_conf = _sample_up_to(df[confidence >= 0.8])
med_conf = _sample_up_to(df[(confidence >= 0.4) & (confidence < 0.6)])
low_conf = _sample_up_to(df[confidence <= 0.2])

_print_reasoning_examples("HIGH CONFIDENCE (≥0.8) Examples:\n", high_conf)
_print_reasoning_examples("MEDIUM CONFIDENCE (0.4-0.6) Examples:\n", med_conf)
_print_reasoning_examples("LOW CONFIDENCE (≤0.2) Examples:\n", low_conf)

In [None]:
# Most frequent words across all reasoning texts.
from collections import Counter
import re

# Concatenate every non-null reasoning string and tokenize the lowercased
# text into runs of word characters.
word_pattern = re.compile(r'\b\w+\b')
reasoning_texts = df['reasoning'].dropna().astype(str)
all_reasoning = ' '.join(reasoning_texts)
words = word_pattern.findall(all_reasoning.lower())

# Keep only tokens longer than three characters that are not stop words.
stop_words = {'the', 'a', 'an', 'is', 'are', 'to', 'of', 'in', 'for', 'and', 'or', 'not', 'with', 'as', 'at', 'by', 'from', 'on', 'this', 'that', 'be', 'has', 'have', 'had', 'no'}
filtered_words = []
for token in words:
	if len(token) <= 3 or token in stop_words:
		continue
	filtered_words.append(token)

word_counts = Counter(filtered_words).most_common(30)
print("\nTop 30 Most Common Words in Reasoning:")
for term, frequency in word_counts:
	print(f"{term}: {frequency}")

## Summary Statistics

In [None]:
# Final roll-up of the headline numbers: dataset size, confidence averages,
# the most common label and 'unclear' rate per demographic field, and the
# overall clarity tallies.
#
# The clarity tallies (at_least_one_clear, all_unclear) are recomputed here
# instead of being read from the earlier "unclear rate" cell, so this cell
# depends only on `df` and can be run on its own.
total_users = len(df)
summary_fields = [
	('Age', 'inferred_age_range'),
	('Gender', 'inferred_gender'),
	('Race/Ethnicity', 'inferred_race_ethnicity'),
]
# One boolean 'unclear' flag series per field, computed once and reused.
unclear_flags = {column: df[column] == 'unclear' for _, column in summary_fields}

print("="*80)
print("DEMOGRAPHIC INFERENCE SUMMARY")
print("="*80)
print(f"\nTotal users analyzed: {total_users}")
print(f"\nMean confidence: {df['confidence_level'].mean():.3f}")
print(f"Median confidence: {df['confidence_level'].median():.3f}")

for field_label, column in summary_fields:
	label_tally = df[column].value_counts()
	print(f"\n--- {field_label} ---")
	if label_tally.empty:
		# No non-null labels at all: mode()[0] would raise, so report n/a.
		print("Most common: n/a (0 users)")
	else:
		print(f"Most common: {df[column].mode()[0]} ({label_tally.iloc[0]} users)")
	print(f"Unclear rate: {unclear_flags[column].sum() / total_users * 100:.2f}%")

age_flag = unclear_flags['inferred_age_range']
gender_flag = unclear_flags['inferred_gender']
race_flag = unclear_flags['inferred_race_ethnicity']
# A field counts as "clear" whenever it is not the literal 'unclear' label.
all_clear = (~age_flag & ~gender_flag & ~race_flag).sum()
at_least_one_clear = (~age_flag | ~gender_flag | ~race_flag).sum()
all_unclear = (age_flag & gender_flag & race_flag).sum()

print(f"\n--- Overall Clarity ---")
print(f"Profiles with all demographics clear: {all_clear} ({all_clear / total_users * 100:.2f}%)")
print(f"Profiles with at least one clear inference: {at_least_one_clear} ({at_least_one_clear / total_users * 100:.2f}%)")
print(f"Profiles with all unclear: {all_unclear} ({all_unclear / total_users * 100:.2f}%)")
print("\n" + "="*80)