# NYC Congestion Pricing Sentiment Analysis

Analysis of YouTube comments on NYC congestion pricing, including sentiment, stance, and tone classification.

In [1]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
import glob
import os
from datetime import datetime

## 1. Data Loading & Exploration

In [2]:
# Load the most recently modified labeled-comments CSV.
# Fail fast when nothing matches: every later cell reads `df`, so merely
# printing a message here would resurface as a NameError in the next cell.
csv_files = glob.glob("data/labeled_comments_*.csv")
if not csv_files:
	raise FileNotFoundError(
		"No labeled comments CSV files found (expected data/labeled_comments_*.csv)"
	)

# "Latest" is decided by file modification time, not by the timestamp in the name.
latest_file = max(csv_files, key=os.path.getmtime)
print(f"Loading: {latest_file}")
df = pd.read_csv(latest_file)
print(f"\nLoaded {len(df):,} comments")

Loading: data/labeled_comments_20251008_2353.csv

Loaded 8,673 comments


In [3]:
# Display basic info: column names, non-null counts, and dtypes of the loaded frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8673 entries, 0 to 8672
Data columns (total 22 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   video_id                           8673 non-null   object 
 1   relevance_rank                     8673 non-null   int64  
 2   video_title                        8673 non-null   object 
 3   video_channel                      8673 non-null   object 
 4   video_published_at                 8673 non-null   object 
 5   video_view_count                   8673 non-null   int64  
 6   video_like_count                   8673 non-null   int64  
 7   video_comment_count                8673 non-null   int64  
 8   video_duration                     8673 non-null   int64  
 9   video_description                  7971 non-null   object 
 10  author                             8665 non-null   object 
 11  comment_text                       8673 non-null   objec

In [4]:
# Count nulls in the label columns that the later sections read
key_columns = [
	'sentiment',
	'stance_congestion_pricing_comment',
	'stance_confidence_comment',
	'tone',
]
missing_per_column = df[key_columns].isnull().sum()

print("Missing values in key columns:")
print(missing_per_column)

Missing values in key columns:
sentiment                            0
stance_congestion_pricing_comment    0
stance_confidence_comment            0
tone                                 0
dtype: int64


In [5]:
# Sample rows: show the first 10 rows of the loaded frame
df.head(10)

Unnamed: 0,video_id,relevance_rank,video_title,video_channel,video_published_at,video_view_count,video_like_count,video_comment_count,video_duration,video_description,...,comment_like_count,comment_published_at,summary_text,stance_congestion_pricing,stance_confidence,row_index,sentiment,stance_congestion_pricing_comment,stance_confidence_comment,tone
0,_XxgFbdgu1c,1,NYC Congestion Pricing’s Unexpected Outcomes,The New York Times,2025-05-13T13:30:34Z,102017,5127,365,138,After New York City's congestion pricing took ...,...,0,2025-09-08T12:41:53Z,The video explores the initial impacts of NYC ...,supportive,0.8,0,negative,skeptical,0.8,frustrated
1,_XxgFbdgu1c,1,NYC Congestion Pricing’s Unexpected Outcomes,The New York Times,2025-05-13T13:30:34Z,102017,5127,365,138,After New York City's congestion pricing took ...,...,1,2025-09-08T01:29:20Z,The video explores the initial impacts of NYC ...,supportive,0.8,1,neutral,neutral_or_unclear,0.5,informative
2,_XxgFbdgu1c,1,NYC Congestion Pricing’s Unexpected Outcomes,The New York Times,2025-05-13T13:30:34Z,102017,5127,365,138,After New York City's congestion pricing took ...,...,0,2025-09-08T01:28:31Z,The video explores the initial impacts of NYC ...,supportive,0.8,2,very_negative,strongly_oppose,1.0,angry
3,_XxgFbdgu1c,1,NYC Congestion Pricing’s Unexpected Outcomes,The New York Times,2025-05-13T13:30:34Z,102017,5127,365,138,After New York City's congestion pricing took ...,...,0,2025-09-06T13:02:55Z,The video explores the initial impacts of NYC ...,supportive,0.8,3,very_negative,strongly_oppose,0.9,angry
4,_XxgFbdgu1c,1,NYC Congestion Pricing’s Unexpected Outcomes,The New York Times,2025-05-13T13:30:34Z,102017,5127,365,138,After New York City's congestion pricing took ...,...,0,2025-09-04T09:24:29Z,The video explores the initial impacts of NYC ...,supportive,0.8,4,neutral,neutral_or_unclear,0.7,informative
5,_XxgFbdgu1c,1,NYC Congestion Pricing’s Unexpected Outcomes,The New York Times,2025-05-13T13:30:34Z,102017,5127,365,138,After New York City's congestion pricing took ...,...,0,2025-09-02T04:41:13Z,The video explores the initial impacts of NYC ...,supportive,0.8,5,very_negative,strongly_oppose,0.9,angry
6,_XxgFbdgu1c,1,NYC Congestion Pricing’s Unexpected Outcomes,The New York Times,2025-05-13T13:30:34Z,102017,5127,365,138,After New York City's congestion pricing took ...,...,0,2025-08-26T03:22:09Z,The video explores the initial impacts of NYC ...,supportive,0.8,6,very_negative,strongly_oppose,1.0,sarcastic
7,_XxgFbdgu1c,1,NYC Congestion Pricing’s Unexpected Outcomes,The New York Times,2025-05-13T13:30:34Z,102017,5127,365,138,After New York City's congestion pricing took ...,...,1,2025-08-16T11:50:50Z,The video explores the initial impacts of NYC ...,supportive,0.8,7,very_negative,strongly_oppose,0.9,angry
8,_XxgFbdgu1c,1,NYC Congestion Pricing’s Unexpected Outcomes,The New York Times,2025-05-13T13:30:34Z,102017,5127,365,138,After New York City's congestion pricing took ...,...,0,2025-07-25T14:18:33Z,The video explores the initial impacts of NYC ...,supportive,0.8,8,neutral,neutral_or_unclear,0.5,neutral
9,_XxgFbdgu1c,1,NYC Congestion Pricing’s Unexpected Outcomes,The New York Times,2025-05-13T13:30:34Z,102017,5127,365,138,After New York City's congestion pricing took ...,...,0,2025-06-09T14:17:30Z,The video explores the initial impacts of NYC ...,supportive,0.8,9,very_negative,strongly_oppose,0.9,frustrated


## 2. Sentiment Distribution

In [6]:
# Sentiment distribution: count comments per label in a fixed negative -> positive order
sentiment_order = ['very_negative', 'negative', 'neutral', 'positive', 'very_positive']
colors_sentiment = ['#d62728', '#ff7f0e', '#7f7f7f', '#2ca02c', '#1f77b4']
sentiment_counts = df['sentiment'].value_counts().reindex(sentiment_order, fill_value=0)

# Build both traces up front, then place them side by side
sentiment_bar = go.Bar(
	x=sentiment_order,
	y=sentiment_counts.values,
	marker_color=colors_sentiment,
	text=sentiment_counts.values,
	textposition='outside',
	showlegend=False,
)
sentiment_pie = go.Pie(
	labels=sentiment_order,
	values=sentiment_counts.values,
	marker_colors=colors_sentiment,
	textinfo='label+percent',
)

fig = make_subplots(
	rows=1,
	cols=2,
	subplot_titles=('Sentiment Distribution', 'Sentiment Percentage Breakdown'),
	specs=[[{'type': 'bar'}, {'type': 'pie'}]],
)
fig.add_trace(sentiment_bar, row=1, col=1)
fig.add_trace(sentiment_pie, row=1, col=2)

# Axis titles are set on the bar panel only
fig.update_xaxes(title_text='Sentiment', row=1, col=1)
fig.update_yaxes(title_text='Count', row=1, col=1)
fig.update_layout(height=500, showlegend=False, title_text='Sentiment Analysis')
fig.show()

# Plain-text version of the same counts
print("\nSentiment Summary:")
print(sentiment_counts)
print(f"\nTotal: {sentiment_counts.sum():,}")


Sentiment Summary:
sentiment
very_negative    2264
negative         2044
neutral          3461
positive          764
very_positive     140
Name: count, dtype: int64

Total: 8,673


## 3. Stance Analysis

In [7]:
# Comment stance distribution, ordered from strongest opposition to strongest support
stance_order = ['strongly_oppose', 'skeptical', 'neutral_or_unclear', 'supportive', 'strongly_supportive']
stance_counts = df['stance_congestion_pricing_comment'].value_counts().reindex(stance_order, fill_value=0)

# The five stance levels reuse the five sentiment colours, position by position
stance_bar = go.Bar(
	x=stance_order,
	y=stance_counts.values,
	marker_color=colors_sentiment,
	text=stance_counts.values,
	textposition='outside',
)

fig = go.Figure(stance_bar)
fig.update_layout(
	title='Comment Stance Distribution on Congestion Pricing',
	xaxis_title='Stance',
	yaxis_title='Count',
	height=500,
)
fig.show()

# Plain-text version of the same counts
print("\nComment Stance Summary:")
print(stance_counts)
print(f"\nTotal: {stance_counts.sum():,}")


Comment Stance Summary:
stance_congestion_pricing_comment
strongly_oppose        2900
skeptical              1233
neutral_or_unclear     3728
supportive              439
strongly_supportive     373
Name: count, dtype: int64

Total: 8,673


### Video Stance vs Comment Stance

In [8]:
# Row-normalised crosstab: for each video stance, the % split of comment stances
video_stance_order = ['strongly_oppose', 'skeptical', 'neutral_or_mixed', 'supportive', 'strongly_supportive', 'unclear']
comment_stance_order = ['strongly_oppose', 'skeptical', 'neutral_or_unclear', 'supportive', 'strongly_supportive']

stance_shares = pd.crosstab(
	df['stance_congestion_pricing'],
	df['stance_congestion_pricing_comment'],
	normalize='index',
)
crosstab = stance_shares * 100

# Put rows and columns in a fixed order; combinations absent from the data become 0
crosstab = crosstab.reindex(index=video_stance_order, columns=comment_stance_order, fill_value=0)

stance_heatmap = go.Heatmap(
	z=crosstab.values,
	x=comment_stance_order,
	y=video_stance_order,
	colorscale='RdYlGn',
	zmid=20,
	text=crosstab.values.round(1),
	texttemplate='%{text:.1f}',
	textfont={'size': 10},
	colorbar=dict(title='Percentage (%)'),
)

fig = go.Figure(stance_heatmap)
fig.update_layout(
	title='Video Stance vs Comment Stance (% within each video stance)',
	xaxis_title='Comment Stance',
	yaxis_title='Video Stance',
	height=600,
)
fig.show()

print("\nInterpretation: Each row shows the distribution of comment stances for videos with that stance.")
print("Example: If a video is 'supportive', what % of its comments are 'strongly_oppose' vs 'supportive'?")


Interpretation: Each row shows the distribution of comment stances for videos with that stance.
Example: If a video is 'supportive', what % of its comments are 'strongly_oppose' vs 'supportive'?


In [9]:
# Calculate stance agreement rate
# Simplify to oppose/neutral/support for both video and comment
def simplify_stance(stance, is_video=False):
	"""Collapse a detailed stance label into 'oppose', 'support', or 'neutral'.

	A label containing 'oppose' maps to 'oppose'; otherwise a label containing
	'supportive' maps to 'support'. Everything else, including missing values
	and 'skeptical', maps to 'neutral'.

	Args:
		stance: Detailed stance label, or a missing value (None/NaN).
		is_video: Accepted for call-site readability; not used by the mapping.

	Returns:
		One of 'oppose', 'support', 'neutral'.
	"""
	if pd.isna(stance):
		return 'neutral'
	# Checked in order, so 'oppose' wins if a label ever contained both markers
	for marker, simplified in (('oppose', 'oppose'), ('supportive', 'support')):
		if marker in stance:
			return simplified
	return 'neutral'

# Put video and comment stances on the same three-way scale
df['video_stance_simple'] = df['stance_congestion_pricing'].apply(simplify_stance, is_video=True)
df['comment_stance_simple'] = df['stance_congestion_pricing_comment'].apply(simplify_stance)

# A comment "agrees" when its simplified stance equals its video's simplified stance
stances_match = df['video_stance_simple'] == df['comment_stance_simple']
agreement = stances_match.sum()
total = len(df)
agreement_rate = (agreement / total) * 100

print(f"\nStance Agreement Rate: {agreement_rate:.1f}%")
print(f"({agreement:,} out of {total:,} comments align with video stance)")

# Break the non-matching comments down by (video stance, comment stance)
disagreement = df[~stances_match]
disagreement_table = pd.crosstab(disagreement['video_stance_simple'], disagreement['comment_stance_simple'])
print(f"\nDisagreement patterns ({len(disagreement):,} comments):")
print(disagreement_table)


Stance Agreement Rate: 41.6%
(3,606 out of 8,673 comments align with video stance)

Disagreement patterns (5,067 comments):
comment_stance_simple  neutral  oppose  support
video_stance_simple                            
neutral                      0    1990      425
oppose                     145       0       14
support                   1754     739        0


## 4. Tone Analysis

In [10]:
# Tone distribution, most frequent tone first
tone_counts = df['tone'].value_counts().sort_values(ascending=False)

tone_bar = go.Bar(
	x=tone_counts.index,
	y=tone_counts.values,
	marker_color='steelblue',
	text=tone_counts.values,
	textposition='outside',
)

fig = go.Figure(tone_bar)
fig.update_layout(
	title='Comment Tone Distribution',
	xaxis_title='Tone',
	yaxis_title='Count',
	height=500,
)
fig.show()

# Plain-text version of the same counts
print("\nTone Summary:")
print(tone_counts)


Tone Summary:
tone
frustrated     2155
angry          1876
neutral        1620
informative    1048
humorous        781
supportive      683
sarcastic       303
mixed           207
Name: count, dtype: int64


In [11]:
# Tone by sentiment: for each tone, the % split of its comments across sentiments
tone_sentiment_crosstab = pd.crosstab(df['tone'], df['sentiment'], normalize='index') * 100
# reindex (rather than plain column selection) so a sentiment label that never
# occurs in the data becomes a zero column instead of raising KeyError; this
# matches how the other cross-tabulations in this notebook are ordered.
tone_sentiment_crosstab = tone_sentiment_crosstab.reindex(columns=sentiment_order, fill_value=0)

fig = go.Figure(
	go.Heatmap(
		z=tone_sentiment_crosstab.values,
		x=sentiment_order,
		y=tone_sentiment_crosstab.index,
		colorscale='RdBu_r',
		# Colour midpoint at 20%: the share each of the five sentiments
		# would have under an even split.
		zmid=20,
		text=tone_sentiment_crosstab.values.round(1),
		texttemplate='%{text:.1f}',
		textfont={'size': 10},
		colorbar=dict(title='Percentage (%)')
	)
)

fig.update_layout(
	title='Tone vs Sentiment (% within each tone)',
	xaxis_title='Sentiment',
	yaxis_title='Tone',
	height=600
)
fig.show()

## 5. Confidence Analysis

In [12]:
# Stance confidence: overall histogram (left) and per-stance box plots (right)
confidence = df['stance_confidence_comment']
mean_confidence = confidence.mean()

fig = make_subplots(
	rows=1,
	cols=2,
	subplot_titles=('Stance Confidence Distribution', 'Confidence by Comment Stance'),
	specs=[[{'type': 'histogram'}, {'type': 'box'}]],
)

# Left panel: histogram of all non-null confidence scores
confidence_hist = go.Histogram(
	x=confidence.dropna(),
	nbinsx=20,
	marker_color='steelblue',
	name='Confidence',
)
fig.add_trace(confidence_hist, row=1, col=1)

# Dashed vertical marker at the mean, drawn on the histogram panel only
fig.add_vline(
	x=mean_confidence,
	line_dash='dash',
	line_color='red',
	annotation_text=f'Mean: {mean_confidence:.3f}',
	row=1,
	col=1,
)

# Right panel: one box per comment stance, in the fixed stance order
for stance in stance_order:
	in_stance = df['stance_congestion_pricing_comment'] == stance
	stance_confidence = df.loc[in_stance, 'stance_confidence_comment']
	fig.add_trace(go.Box(y=stance_confidence, name=stance, showlegend=False), row=1, col=2)

fig.update_xaxes(title_text='Confidence Score (0-1)', row=1, col=1)
fig.update_yaxes(title_text='Count', row=1, col=1)
fig.update_xaxes(title_text='Comment Stance', row=1, col=2)
fig.update_yaxes(title_text='Confidence Score', row=1, col=2)
fig.update_layout(height=500, showlegend=False)
fig.show()

print("\nConfidence Statistics:")
print(df['stance_confidence_comment'].describe())


Confidence Statistics:
count    8673.000000
mean        0.720287
std         0.203865
min         0.000000
25%         0.500000
50%         0.800000
75%         0.900000
max         1.000000
Name: stance_confidence_comment, dtype: float64


In [13]:
# Mean stance confidence per sentiment label, in the fixed sentiment order
mean_by_sentiment = df.groupby('sentiment')['stance_confidence_comment'].mean()
confidence_by_sentiment = mean_by_sentiment.reindex(sentiment_order)

# Sentiments with no comments come back as NaN; give those bars an empty label
bar_labels = []
for value in confidence_by_sentiment.values:
	bar_labels.append('' if pd.isna(value) else f'{value:.3f}')

confidence_bar = go.Bar(
	x=sentiment_order,
	y=confidence_by_sentiment.values,
	marker_color='coral',
	text=bar_labels,
	textposition='outside',
)

fig = go.Figure(confidence_bar)
fig.update_layout(
	title='Average Stance Confidence by Sentiment',
	xaxis_title='Sentiment',
	yaxis_title='Average Confidence',
	yaxis_range=[0, 1],
	height=500,
)
fig.show()

## 6. Video-Level Insights

In [14]:
# Group by video: one row per (video, title, channel, video stance).
# dropna=False keeps videos whose stance label is missing; the default would
# silently drop all of their comments from every per-video statistic below.
# Named aggregation fixes the output column names at the point of definition
# instead of relying on a positional `columns = [...]` overwrite afterwards.
video_stats = (
	df.groupby(
		['video_id', 'video_title', 'video_channel', 'stance_congestion_pricing'],
		dropna=False,
	)
	.agg(
		comment_count=('comment_text', 'count'),
		# View/like counts are video-level fields, so the first row's value is used
		view_count=('video_view_count', 'first'),
		like_count=('video_like_count', 'first'),
		positive_comments=('sentiment', lambda s: s.isin(['positive', 'very_positive']).sum()),
		# "Opposition" here means strongly_oppose plus skeptical comments
		opposition_comments=(
			'stance_congestion_pricing_comment',
			lambda s: s.isin(['strongly_oppose', 'skeptical']).sum(),
		),
	)
	.reset_index()
	.rename(columns={'stance_congestion_pricing': 'video_stance'})
)

# Calculate percentages of each video's comments
video_stats['positive_pct'] = (video_stats['positive_comments'] / video_stats['comment_count'] * 100).round(1)
video_stats['opposition_pct'] = (video_stats['opposition_comments'] / video_stats['comment_count'] * 100).round(1)

# Top videos by comment count
print("Top 10 Videos by Comment Count:")
print(video_stats.nlargest(10, 'comment_count')[['video_title', 'video_channel', 'comment_count', 'positive_pct', 'opposition_pct']])

Top 10 Videos by Comment Count:
                                          video_title  \
9              The Battle Over NYC Congestion Pricing   
10  New York Declares War On Traffic (A Congestion...   
33  How Do New Yorkers Really Feel About Congestio...   
21  How Some Drivers Are Avoiding NYC&#39;s Conges...   
7   😱 NYC&#39;s NEW $9 TAX CHANGES EVERYTHING (Con...   
45  What happened during the first weekday of CONG...   
29  Queens residents complain congestion pricing t...   
34  New York City’s New Congestion Pricing, Explained   
23  Congestion pricing toll now in effect for driv...   
28       NYC Congestion Pricing’s Unexpected Outcomes   

              video_channel  comment_count  positive_pct  opposition_pct  
9      Wendover Productions           2093           9.2            47.5  
10             Climate Town           1579          21.6            22.7  
33           The Daily Show            943           9.5            30.6  
21           Inside Edition            6

In [15]:
# Videos with the highest share of opposing comments (opposition_pct).
# Note: this ranks by opposition share only, not by polarization, and applies no
# minimum comment count, so videos with very few comments can top the list.
print("\nMost Opposition-Heavy Videos:")
print(video_stats.nlargest(10, 'opposition_pct')[['video_title', 'video_channel', 'comment_count', 'opposition_pct', 'video_stance']])


Most Opposition-Heavy Videos:
                                          video_title  \
42  NYC Congestion Pricing Gains Support Among Res...   
46  Transportation Sec. Sean Duffy calls NYC conge...   
25            NYC congestion pricing toll plan debate   
12  New York officials tout success of congestion ...   
27  NYC congestion pricing: MTA heads to court to ...   
34  New York City’s New Congestion Pricing, Explained   
19  Congestion pricing tolls apply for some Queens...   
3   Will Trump kill NYC&#39;s congestion pricing t...   
11  Trump administration&#39;s deadline to end NYC...   
23  Congestion pricing toll now in effect for driv...   

              video_channel  comment_count  opposition_pct      video_stance  
42           Bloomberg News              6           100.0        supportive  
46         Associated Press              6            83.3   strongly_oppose  
25           FOX 5 New York            103            75.7        supportive  
12             CBS New Yo

In [16]:
# Videos with the highest share of positive comments (positive_pct).
# No minimum comment count is applied, so low-volume videos can rank highly.
print("\nMost Positive Videos:")
print(video_stats.nlargest(10, 'positive_pct')[['video_title', 'video_channel', 'comment_count', 'positive_pct', 'video_stance']])


Most Positive Videos:
                                          video_title       video_channel  \
35  NYC Transit CFO Says Cities Are Asking about C...  Bloomberg Podcasts   
10  New York Declares War On Traffic (A Congestion...        Climate Town   
6      Why was Congestion Pricing implemented in NYC?            Urbanist   
44           The Benefits of NYC’s Congestion Pricing            Urbanist   
20  N.Y. granted legal victory in congestion prici...        CBS New York   
4   Straphangers question MTA fare increase amid c...          PIX11 News   
40  Why the Trump administration is fighting New Y...            CBS News   
46  Transportation Sec. Sean Duffy calls NYC conge...    Associated Press   
0   Is NYC congestion pricing working? What data s...      FOX 5 New York   
22  DOJ memo throws cold water of NYC congestion p...        CBS New York   

    comment_count  positive_pct         video_stance  
35             22          22.7           supportive  
10           1579  

## 7. Temporal Analysis

In [17]:
# Convert date columns to datetime
df['comment_published_at'] = pd.to_datetime(df['comment_published_at'])
df['video_published_at'] = pd.to_datetime(df['video_published_at'])

# Extract month (as a 'YYYY-MM' string) for grouping.
# Periods cannot carry a timezone, so calling to_period on tz-aware timestamps
# emits "Converting to PeriodArray/Index representation will drop timezone
# information". Stripping the timezone explicitly first keeps the same
# wall-clock values and yields the same months without the warning; it is also
# a no-op on tz-naive data.
df['comment_month'] = df['comment_published_at'].dt.tz_localize(None).dt.to_period('M').astype(str)
df['video_month'] = df['video_published_at'].dt.tz_localize(None).dt.to_period('M').astype(str)


Converting to PeriodArray/Index representation will drop timezone information.


Converting to PeriodArray/Index representation will drop timezone information.



In [18]:
# Sentiment over time (by comment date): comment counts per month and sentiment
sentiment_over_time = df.groupby(['comment_month', 'sentiment']).size().unstack(fill_value=0)
# reindex (rather than plain column selection) so a sentiment label that never
# occurs in the data becomes a zero column instead of raising KeyError.
sentiment_over_time = sentiment_over_time.reindex(columns=sentiment_order, fill_value=0)

fig = go.Figure()

# One stacked bar series per sentiment, coloured consistently with earlier charts
for sentiment, color in zip(sentiment_order, colors_sentiment):
	fig.add_trace(
		go.Bar(
			x=sentiment_over_time.index,
			y=sentiment_over_time[sentiment],
			name=sentiment,
			marker_color=color
		)
	)

fig.update_layout(
	barmode='stack',
	title='Sentiment Distribution Over Time (by Comment Date)',
	xaxis_title='Month',
	yaxis_title='Number of Comments',
	height=500,
	legend_title='Sentiment'
)
fig.show()

In [19]:
# Map each sentiment label to a score from -2 (very_negative) to +2 (very_positive),
# then average the score per comment month
sentiment_scores = dict(
	very_negative=-2,
	negative=-1,
	neutral=0,
	positive=1,
	very_positive=2,
)

df['sentiment_score'] = df['sentiment'].map(sentiment_scores)
avg_sentiment_over_time = df.groupby('comment_month')['sentiment_score'].mean()

avg_sentiment_line = go.Scatter(
	x=avg_sentiment_over_time.index,
	y=avg_sentiment_over_time.values,
	mode='lines+markers',
	marker=dict(size=8, color='steelblue'),
	line=dict(width=2, color='steelblue'),
	name='Avg Sentiment',
)

fig = go.Figure()
fig.add_trace(avg_sentiment_line)

# Reference line at score 0, the value assigned to 'neutral'
fig.add_hline(
	y=0,
	line_dash='dash',
	line_color='red',
	annotation_text='Neutral',
)

fig.update_layout(
	title='Average Sentiment Score Over Time',
	xaxis_title='Month',
	yaxis_title='Average Sentiment Score',
	height=500,
)
fig.show()

## 8. Engagement Analysis

In [20]:
# Video views vs average sentiment.
# Reads df['sentiment_score'], which is created by the average-sentiment cell.
# View/like counts are taken from the first row of each video.
video_engagement = df.groupby('video_id').agg({
	'video_view_count': 'first',
	'video_like_count': 'first',
	'sentiment_score': 'mean',
	'comment_like_count': 'sum'
}).reset_index()

# Create 2x2 subplot
fig = make_subplots(
	rows=2, cols=2,
	subplot_titles=(
		'Video Views vs Average Comment Sentiment',
		'Video Likes vs Average Comment Sentiment',
		'Comment Engagement Distribution (Top 20 like counts)',
		'Average Sentiment by Comment Engagement'
	),
	specs=[[{'type': 'scatter'}, {'type': 'scatter'}],
		   [{'type': 'bar'}, {'type': 'bar'}]]
)

# View count vs avg sentiment (x axis is switched to log scale below)
fig.add_trace(
	go.Scatter(
		x=video_engagement['video_view_count'],
		y=video_engagement['sentiment_score'],
		mode='markers',
		marker=dict(size=10, opacity=0.6),
		showlegend=False
	),
	row=1, col=1
)
fig.add_hline(y=0, line_dash='dash', line_color='red', opacity=0.5, row=1, col=1)

# Like count vs avg sentiment (x axis is switched to log scale below)
fig.add_trace(
	go.Scatter(
		x=video_engagement['video_like_count'],
		y=video_engagement['sentiment_score'],
		mode='markers',
		marker=dict(size=10, color='coral', opacity=0.6),
		showlegend=False
	),
	row=1, col=2
)
fig.add_hline(y=0, line_dash='dash', line_color='red', opacity=0.5, row=1, col=2)

# Comment engagement distribution: number of comments per like count.
# After sort_index the first 20 entries are the 20 smallest distinct like counts.
comment_likes = df['comment_like_count'].value_counts().sort_index()
fig.add_trace(
	go.Bar(
		x=comment_likes.index[:20],
		y=comment_likes.values[:20],
		marker_color='steelblue',
		showlegend=False
	),
	row=2, col=1
)

# Sentiment by comment like count (binned).
# The last bin is open-ended (np.inf) so comments with more than 10,000 likes
# are not turned into NaN and dropped from the averages. Its label is '101+'
# because a count of exactly 100 falls in the '11-100' bin.
df['like_bin'] = pd.cut(
	df['comment_like_count'],
	bins=[-1, 0, 1, 5, 10, 100, np.inf],
	labels=['0', '1', '2-5', '6-10', '11-100', '101+']
)
# observed=False keeps empty bins on the axis and states the categorical
# group-by behaviour explicitly instead of relying on the pandas default.
like_sentiment = df.groupby('like_bin', observed=False)['sentiment_score'].mean()
fig.add_trace(
	go.Bar(
		x=like_sentiment.index.astype(str),
		y=like_sentiment.values,
		marker_color='green',
		marker_opacity=0.7,
		showlegend=False
	),
	row=2, col=2
)
fig.add_hline(y=0, line_dash='dash', line_color='red', opacity=0.5, row=2, col=2)

# Update axes
fig.update_xaxes(title_text='Video View Count', type='log', row=1, col=1)
fig.update_yaxes(title_text='Average Sentiment Score', row=1, col=1)
fig.update_xaxes(title_text='Video Like Count', type='log', row=1, col=2)
fig.update_yaxes(title_text='Average Sentiment Score', row=1, col=2)
fig.update_xaxes(title_text='Comment Like Count', row=2, col=1)
fig.update_yaxes(title_text='Number of Comments', row=2, col=1)
fig.update_xaxes(title_text='Comment Like Count (binned)', row=2, col=2)
fig.update_yaxes(title_text='Average Sentiment Score', row=2, col=2)

fig.update_layout(height=900, showlegend=False)
fig.show()

# Correlation analysis (computed across videos, not across comments)
print("\nCorrelations with sentiment score:")
print(f"Video views: {video_engagement['video_view_count'].corr(video_engagement['sentiment_score']):.3f}")
print(f"Video likes: {video_engagement['video_like_count'].corr(video_engagement['sentiment_score']):.3f}")





Correlations with sentiment score:
Video views: 0.257
Video likes: 0.339


## 9. Cross-Tabulations

In [21]:
# Sentiment × Stance matrix: raw comment counts for every label combination
sentiment_stance = (
	pd.crosstab(df['sentiment'], df['stance_congestion_pricing_comment'])
	.reindex(index=sentiment_order, columns=stance_order, fill_value=0)
)

sentiment_stance_heatmap = go.Heatmap(
	z=sentiment_stance.values,
	x=stance_order,
	y=sentiment_order,
	colorscale='YlOrRd',
	text=sentiment_stance.values,
	texttemplate='%{text}',
	textfont={'size': 12},
	colorbar=dict(title='Count'),
)

fig = go.Figure(sentiment_stance_heatmap)
fig.update_layout(
	title='Sentiment × Stance Cross-Tabulation',
	xaxis_title='Stance on Congestion Pricing',
	yaxis_title='Sentiment',
	height=600,
)
fig.show()

In [22]:
# Tone × Sentiment matrix: raw comment counts for every label combination
tone_sentiment = pd.crosstab(df['tone'], df['sentiment'])
# reindex (rather than plain column selection) so a sentiment label that never
# occurs in the data becomes a zero column instead of raising KeyError; this
# matches the Sentiment × Stance matrix.
tone_sentiment = tone_sentiment.reindex(columns=sentiment_order, fill_value=0)

fig = go.Figure(
	go.Heatmap(
		z=tone_sentiment.values,
		x=sentiment_order,
		y=tone_sentiment.index,
		colorscale='Blues',
		text=tone_sentiment.values,
		texttemplate='%{text}',
		textfont={'size': 12},
		colorbar=dict(title='Count')
	)
)

fig.update_layout(
	title='Tone × Sentiment Cross-Tabulation',
	xaxis_title='Sentiment',
	yaxis_title='Tone',
	height=600
)
fig.show()

## 10. Key Statistics Summary

In [23]:
# Text recap of the headline numbers computed in the sections above
divider = "=" * 60
n_comments = len(df)
confidence = df['stance_confidence_comment']

print(divider)
print("NYC CONGESTION PRICING SENTIMENT ANALYSIS - KEY FINDINGS")
print(divider)

first_comment_date = df['comment_published_at'].min().date()
last_comment_date = df['comment_published_at'].max().date()
print("\n📊 Dataset Overview:")
print(f"   Total Comments: {n_comments:,}")
print(f"   Total Videos: {df['video_id'].nunique():,}")
print(f"   Date Range: {first_comment_date} to {last_comment_date}")

# Sentiment and stance breakdowns share one layout: label, count, share of all comments
breakdowns = (
	("\n😊 Sentiment Breakdown:", sentiment_order, sentiment_counts),
	("\n📈 Stance Breakdown:", stance_order, stance_counts),
)
for heading, labels, counts in breakdowns:
	print(heading)
	for label in labels:
		count = counts.get(label, 0)
		pct = (count / n_comments * 100) if n_comments > 0 else 0
		print(f"   {label:20s}: {count:5,} ({pct:5.1f}%)")

# tone_counts is sorted descending, so position 0 is the most common tone
top_tone = tone_counts.index[0]
top_tone_count = tone_counts.iloc[0]
top_tone_pct = (top_tone_count / n_comments * 100)
print("\n🎭 Most Common Tone:")
print(f"   {top_tone}: {top_tone_count:,} comments ({top_tone_pct:.1f}%)")

print("\n🎯 Stance Agreement:")
print(f"   Video-Comment Alignment: {agreement_rate:.1f}%")
print(f"   Comments aligning with video stance: {agreement:,} / {total:,}")

high_confidence_count = (confidence > 0.8).sum()
print("\n✅ Confidence Statistics:")
print(f"   Mean Confidence: {confidence.mean():.3f}")
print(f"   Median Confidence: {confidence.median():.3f}")
print(f"   High Confidence (>0.8): {high_confidence_count:,} comments")

most_commented = video_stats.nlargest(1, 'comment_count')['comment_count'].iloc[0]
print("\n📺 Video Engagement:")
print(f"   Average views per video: {video_stats['view_count'].mean():,.0f}")
print(f"   Average comments per video: {video_stats['comment_count'].mean():.1f}")
print(f"   Most commented video: {most_commented:,} comments")

print("\n" + divider)

NYC CONGESTION PRICING SENTIMENT ANALYSIS - KEY FINDINGS

📊 Dataset Overview:
   Total Comments: 8,673
   Total Videos: 49
   Date Range: 2022-08-12 to 2025-09-30

😊 Sentiment Breakdown:
   very_negative       : 2,264 ( 26.1%)
   negative            : 2,044 ( 23.6%)
   neutral             : 3,461 ( 39.9%)
   positive            :   764 (  8.8%)
   very_positive       :   140 (  1.6%)

📈 Stance Breakdown:
   strongly_oppose     : 2,900 ( 33.4%)
   skeptical           : 1,233 ( 14.2%)
   neutral_or_unclear  : 3,728 ( 43.0%)
   supportive          :   439 (  5.1%)
   strongly_supportive :   373 (  4.3%)

🎭 Most Common Tone:
   frustrated: 2,155 comments (24.8%)

🎯 Stance Agreement:
   Video-Comment Alignment: 41.6%
   Comments aligning with video stance: 3,606 / 8,673

✅ Confidence Statistics:
   Mean Confidence: 0.720
   Median Confidence: 0.800
   High Confidence (>0.8): 3,457 comments

📺 Video Engagement:
   Average views per video: 83,952
   Average comments per video: 177.0
   Most c

## 11. Interactive Data Explorer

Use the filters below to explore specific comments and their context:

In [24]:
import ipywidgets as widgets
from IPython.display import display, HTML, clear_output

# Clear any previous widget states.
# Only NameError is expected here (first run, names not yet defined); a bare
# `except:` would also hide unrelated failures such as KeyboardInterrupt.
try:
	del sentiment_filter, stance_filter, confidence_slider, tone_filter, sort_by, top_n, output_area
except NameError:
	pass

# Create filter widgets. In every multi-select, 'all' is a sentinel option:
# while it is selected, filter_data() applies no filter for that field.
sentiment_filter = widgets.SelectMultiple(
	options=['all'] + sentiment_order,
	value=['all'],
	description='Sentiment:',
	disabled=False,
	style={'description_width': '120px'}
)

stance_filter = widgets.SelectMultiple(
	options=['all'] + stance_order,
	value=['all'],
	description='Stance:',
	disabled=False,
	style={'description_width': '120px'}
)

# Minimum stance confidence a comment must have to be listed
confidence_slider = widgets.FloatSlider(
	value=0.0,
	min=0.0,
	max=1.0,
	step=0.1,
	description='Min Confidence:',
	disabled=False,
	continuous_update=False,
	orientation='horizontal',
	readout=True,
	readout_format='.1f',
	style={'description_width': '120px'}
)

# Tone options come from the tones actually present in the data
tone_filter = widgets.SelectMultiple(
	options=['all'] + list(tone_counts.index),
	value=['all'],
	description='Tone:',
	disabled=False,
	style={'description_width': '120px'}
)

# Each option is (label shown to the user, key that filter_data() switches on)
sort_by = widgets.Dropdown(
	options=[
		('Highest Confidence', 'confidence_high'),
		('Lowest Confidence', 'confidence_low'),
		('Most Comment Likes', 'comment_likes_high'),
		('Least Comment Likes', 'comment_likes_low'),
		('Most Viewed Videos', 'video_views_high'),
		('Least Viewed Videos', 'video_views_low'),
		('Most Recent Comments', 'comment_date_recent'),
		('Oldest Comments', 'comment_date_old')
	],
	value='confidence_high',
	description='Sort By:',
	disabled=False,
	style={'description_width': '120px'}
)

# Maximum number of rows listed after filtering and sorting
top_n = widgets.IntSlider(
	value=50,
	min=10,
	max=500,
	step=10,
	description='Show Top N:',
	disabled=False,
	continuous_update=False,
	orientation='horizontal',
	readout=True,
	style={'description_width': '120px'}
)

# Single output area
output_area = widgets.Output()

# Function to filter dataframe based on widget values
def filter_data():
	"""Return the rows of `df` selected by the current widget values.

	Applies the sentiment, stance, minimum-confidence, and tone filters,
	sorts by the column chosen in `sort_by`, and keeps the first
	`top_n.value` rows. A multi-select containing 'all' applies no filter.
	"""
	# Sort key -> (column to sort on, ascending?)
	sort_specs = {
		'confidence_high': ('stance_confidence_comment', False),
		'confidence_low': ('stance_confidence_comment', True),
		'comment_likes_high': ('comment_like_count', False),
		'comment_likes_low': ('comment_like_count', True),
		'video_views_high': ('video_view_count', False),
		'video_views_low': ('video_view_count', True),
		'comment_date_recent': ('comment_published_at', False),
		'comment_date_old': ('comment_published_at', True),
	}

	def keep_selected(frame, column, chosen):
		# 'all' anywhere in the selection disables this filter
		if 'all' in chosen:
			return frame
		return frame[frame[column].isin(chosen)]

	result = df.copy()
	result = keep_selected(result, 'sentiment', sentiment_filter.value)
	result = keep_selected(result, 'stance_congestion_pricing_comment', stance_filter.value)
	result = result[result['stance_confidence_comment'] >= confidence_slider.value]
	result = keep_selected(result, 'tone', tone_filter.value)

	# An unrecognised sort key leaves the rows in their current order
	sort_spec = sort_specs.get(sort_by.value)
	if sort_spec is not None:
		sort_column, ascending = sort_spec
		result = result.sort_values(sort_column, ascending=ascending)

	return result.head(top_n.value)

# Function to truncate text for non-comment columns
def truncate_text(text, max_length=50):
	"""Return `str(text)`, cut to `max_length` characters plus '...' when longer.

	Values whose string form is at most `max_length` characters are returned
	unchanged (as a string).
	"""
	as_string = str(text)
	if len(as_string) <= max_length:
		return as_string
	return as_string[:max_length] + '...'

# Function to update display
def update_display(*args):
	"""Re-render the filtered comments table inside ``output_area``.

	Registered as the ``observe`` callback of the filter/sort widgets, so it
	accepts (and ignores) whatever change payload arrives in ``*args``.

	Every text cell (video title, channel, comment text and the
	sentiment/stance/tone labels) is HTML-escaped before being placed in the
	table: the text comes from user-written YouTube content, and unescaped
	``<``, ``&`` or markup would corrupt the table or inject tags into the
	notebook output.

	Returns:
		None. Output is written to ``output_area``: either a "no match"
		message, or a count line followed by the HTML table.
	"""
	# Function-scope import keeps this cell self-contained and avoids binding
	# a notebook-level name that could collide with other cells.
	from html import escape

	with output_area:
		clear_output(wait=True)

		filtered_df = filter_data()

		if len(filtered_df) == 0:
			print("No comments match the current filters")
			return

		# NOTE: this is the length of whatever filter_data() returned, i.e. the
		# number of rows rendered below.
		print(f"Found {len(filtered_df):,} comments matching filters\n")

		# Static opening markup: scrollable container plus sticky header row
		table_open = """
		<div style="max-height: 600px; overflow-y: auto; border: 1px solid #ddd; margin-top: 10px;">
			<table style="width: 100%; border-collapse: collapse; font-size: 12px;">
				<thead style="position: sticky; top: 0; background-color: #f0f0f0; z-index: 1;">
					<tr style="border-bottom: 2px solid #666;">
						<th style="padding: 8px; text-align: left; border: 1px solid #ddd;">Video Title</th>
						<th style="padding: 8px; text-align: left; border: 1px solid #ddd;">Channel</th>
						<th style="padding: 8px; text-align: left; border: 1px solid #ddd; min-width: 300px;">Comment Text</th>
						<th style="padding: 8px; text-align: center; border: 1px solid #ddd;">Sentiment</th>
						<th style="padding: 8px; text-align: center; border: 1px solid #ddd;">Stance</th>
						<th style="padding: 8px; text-align: center; border: 1px solid #ddd;">Conf.</th>
						<th style="padding: 8px; text-align: center; border: 1px solid #ddd;">Tone</th>
						<th style="padding: 8px; text-align: center; border: 1px solid #ddd;">C.Likes</th>
						<th style="padding: 8px; text-align: center; border: 1px solid #ddd;">V.Views</th>
					</tr>
				</thead>
				<tbody>
		"""

		table_close = """
				</tbody>
			</table>
		</div>
		"""

		# Collect row fragments and join once instead of growing one string
		row_fragments = []
		for i, (idx, row) in enumerate(filtered_df.iterrows()):
			# Alternate row colors
			bg_color = '#f9f9f9' if i % 2 == 0 else '#ffffff'

			# Escape AFTER truncating so an entity such as "&amp;" is never cut in half
			title = escape(truncate_text(row['video_title'], 40))
			channel = escape(truncate_text(row['video_channel'], 20))
			comment = escape(str(row['comment_text']))
			sentiment = escape(str(row['sentiment']))
			stance = escape(str(row['stance_congestion_pricing_comment']))
			tone = escape(str(row['tone']))

			row_fragments.append(f"""
				<tr style="background-color: {bg_color}; border-bottom: 1px solid #ddd;">
					<td style="padding: 8px; border: 1px solid #ddd; max-width: 200px;">{title}</td>
					<td style="padding: 8px; border: 1px solid #ddd; max-width: 120px;">{channel}</td>
					<td style="padding: 8px; border: 1px solid #ddd; min-width: 300px; max-width: 500px; word-wrap: break-word; white-space: normal;">{comment}</td>
					<td style="padding: 8px; text-align: center; border: 1px solid #ddd;"><span style="background-color: #FFCDD2; padding: 2px 4px; border-radius: 3px; font-size: 10px;">{sentiment}</span></td>
					<td style="padding: 8px; text-align: center; border: 1px solid #ddd;"><span style="background-color: #C5CAE9; padding: 2px 4px; border-radius: 3px; font-size: 10px;">{stance}</span></td>
					<td style="padding: 8px; text-align: center; border: 1px solid #ddd;">{row['stance_confidence_comment']:.2f}</td>
					<td style="padding: 8px; text-align: center; border: 1px solid #ddd;"><span style="background-color: #D1C4E9; padding: 2px 4px; border-radius: 3px; font-size: 10px;">{tone}</span></td>
					<td style="padding: 8px; text-align: center; border: 1px solid #ddd;">{row['comment_like_count']:,}</td>
					<td style="padding: 8px; text-align: center; border: 1px solid #ddd;">{row['video_view_count']:,}</td>
				</tr>
			""")

		table_html = table_open + ''.join(row_fragments) + table_close

		display(HTML(table_html))

# Re-render the table whenever any filter, sort, or display control changes
for _explorer_control in (
	sentiment_filter,
	stance_filter,
	confidence_slider,
	tone_filter,
	sort_by,
	top_n,
):
	_explorer_control.observe(update_display, names='value')

# Heading printed above the controls
print("Interactive Data Explorer")
print("=" * 80)

# Arrange the controls: a two-column filter grid on top, sorting/limit row below
filter_box = widgets.VBox(children=[
	widgets.HTML("<h4>Filters</h4>"),
	widgets.HBox(children=[
		widgets.VBox(children=[sentiment_filter, stance_filter]),
		widgets.VBox(children=[tone_filter, confidence_slider]),
	]),
	widgets.HTML("<h4>Sorting & Display</h4>"),
	widgets.HBox(children=[sort_by, top_n]),
])

display(filter_box)
display(output_area)

# Render the table once so it is populated before any control is touched
with output_area:
	update_display()

Interactive Data Explorer


VBox(children=(HTML(value='<h4>Filters</h4>'), HBox(children=(VBox(children=(SelectMultiple(description='Senti…

Output()