# Weighted Sentiment Analysis

Analysis of NYC congestion pricing sentiment weighted by comment engagement (likes).

This notebook explores how sentiment analysis changes when we weight comments by their engagement (likes). Some comments receive more visibility and influence than others, so weighting by likes provides insight into the sentiment of the most visible discourse.

In [1]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
import glob
import os
from datetime import datetime
import ipywidgets as widgets
from IPython.display import display, HTML, clear_output

## 1. Data Loading

In [2]:
# Load the latest labeled comments CSV
csv_files = glob.glob("data/labeled_comments_*.csv")
if not csv_files:
	print("Error: No labeled comments CSV files found")
else:
	latest_file = max(csv_files, key=os.path.getmtime)
	print(f"Loading: {latest_file}")
	df = pd.read_csv(latest_file)
	print(f"\nLoaded {len(df):,} comments")
	print(f"Total videos: {df['video_id'].nunique():,}")

Loading: data/labeled_comments_20251008_2353.csv

Loaded 8,673 comments
Total videos: 49


In [3]:
# Prepare data: convert dates and create sentiment scores
df['comment_published_at'] = pd.to_datetime(df['comment_published_at'])
df['video_published_at'] = pd.to_datetime(df['video_published_at'])
df['comment_month'] = df['comment_published_at'].dt.to_period('M').astype(str)

# Sentiment scoring
sentiment_scores = {
	'very_negative': -2,
	'negative': -1,
	'neutral': 0,
	'positive': 1,
	'very_positive': 2
}
df['sentiment_score'] = df['sentiment'].map(sentiment_scores)

# Define orderings
sentiment_order = ['very_negative', 'negative', 'neutral', 'positive', 'very_positive']
stance_order = ['strongly_oppose', 'skeptical', 'neutral_or_unclear', 'supportive', 'strongly_supportive']
colors_sentiment = ['#d62728', '#ff7f0e', '#7f7f7f', '#2ca02c', '#1f77b4']

# Simplified stance for agreement calculation
def simplify_stance(stance, is_video=False):
	if pd.isna(stance):
		return 'neutral'
	if 'oppose' in stance:
		return 'oppose'
	elif 'supportive' in stance:
		return 'support'
	else:
		return 'neutral'

df['video_stance_simple'] = df['stance_congestion_pricing'].apply(lambda x: simplify_stance(x, is_video=True))
df['comment_stance_simple'] = df['stance_congestion_pricing_comment'].apply(simplify_stance)

# Like bins for analysis
df['like_bin'] = pd.cut(df['comment_like_count'], bins=[-1, 0, 1, 5, 10, 100, 10000], 
						labels=['0', '1', '2-5', '6-10', '11-100', '100+'])

print("Data prepared successfully")

Data prepared successfully


  df['comment_month'] = df['comment_published_at'].dt.to_period('M').astype(str)


## 2. Weighted Sentiment Setup

Some comments receive more engagement (likes) than others. To understand the sentiment of the most visible/influential comments, we can weight sentiment scores by comment likes using a logarithmic function to prevent viral comments from completely dominating the analysis.

**Weighting Formula:** `weight = 1 + α × log(1 + likes)`

where α controls the influence of likes:
- α = 0: no weighting (all comments equal)
- α = 0.5: moderate influence (default)
- α = 1.0: full logarithmic influence
- α > 1.0: increased influence of popular comments

In [None]:
# Create interactive alpha parameter control
alpha_slider = widgets.FloatSlider(
	value=0.5,
	min=0.0,
	max=10.0,
	step=0.1,
	description='Alpha (α):',
	disabled=False,
	continuous_update=False,
	orientation='horizontal',
	readout=True,
	readout_format='.1f',
	style={'description_width': '120px'}
)

# Output widget for displaying weight statistics
weight_output = widgets.Output()

# Function to calculate weights based on alpha
def calculate_weights(likes, alpha):
	return 1 + alpha * np.log1p(likes)

# Function to update weights and display statistics
def update_weights(change):
	with weight_output:
		clear_output(wait=True)
		
		alpha = alpha_slider.value
		df['weight'] = calculate_weights(df['comment_like_count'], alpha)
		
		print(f"\nWeight Statistics (α = {alpha}):")
		print(f"  Mean weight: {df['weight'].mean():.3f}")
		print(f"  Median weight: {df['weight'].median():.3f}")
		print(f"  Max weight: {df['weight'].max():.3f}")
		print(f"  Std weight: {df['weight'].std():.3f}")
		
		# Calculate weighted vs unweighted sentiment
		weighted_sentiment = (df['sentiment_score'] * df['weight']).sum() / df['weight'].sum()
		unweighted_sentiment = df['sentiment_score'].mean()
		
		print(f"\n  Unweighted avg sentiment: {unweighted_sentiment:.3f}")
		print(f"  Weighted avg sentiment: {weighted_sentiment:.3f}")
		print(f"  Difference: {weighted_sentiment - unweighted_sentiment:+.3f}")
		
		# Show weight distribution
		fig = make_subplots(
			rows=1, cols=2,
			subplot_titles=('Weight Distribution', 'Weight vs Comment Likes'),
			specs=[[{'type': 'histogram'}, {'type': 'scatter'}]]
		)
		
		# Histogram of weights
		fig.add_trace(
			go.Histogram(
				x=df['weight'],
				nbinsx=50,
				marker_color='steelblue',
				name='Weight'
			),
			row=1, col=1
		)
		
		# Scatter: likes vs weight
		fig.add_trace(
			go.Scatter(
				x=df['comment_like_count'],
				y=df['weight'],
				mode='markers',
				marker=dict(size=4, opacity=0.5, color='coral'),
				name='Weight vs Likes'
			),
			row=1, col=2
		)
		
		fig.update_xaxes(title_text='Weight', row=1, col=1)
		fig.update_yaxes(title_text='Count', row=1, col=1)
		fig.update_xaxes(title_text='Comment Likes', type='log', row=1, col=2)
		fig.update_yaxes(title_text='Weight', row=1, col=2)
		fig.update_layout(height=400, showlegend=False)
		fig.show()

# Connect slider to update function
alpha_slider.observe(update_weights, names='value')

# Display controls
print("Adjust alpha to control the influence of comment likes on sentiment weighting:")
display(alpha_slider)
display(weight_output)

# Trigger initial calculation
update_weights(None)

Adjust alpha to control the influence of comment likes on sentiment weighting:


FloatSlider(value=0.5, continuous_update=False, description='Alpha (α):', max=10.0, readout_format='.1f', styl…

Output()

## 3. Weighted Sentiment Distribution

How does weighting by likes affect the overall sentiment distribution?

In [5]:
# Output for weighted sentiment distribution
weighted_sentiment_output = widgets.Output()

def update_weighted_sentiment(change):
	with weighted_sentiment_output:
		clear_output(wait=True)
		
		# Calculate weighted sentiment distribution
		weighted_sentiment_counts = {}
		for sentiment in sentiment_order:
			sentiment_df = df[df['sentiment'] == sentiment]
			weighted_count = sentiment_df['weight'].sum()
			weighted_sentiment_counts[sentiment] = weighted_count
		
		# Unweighted counts
		unweighted_counts = df['sentiment'].value_counts().reindex(sentiment_order, fill_value=0)
		
		# Convert to percentages
		total_weighted = sum(weighted_sentiment_counts.values())
		weighted_pct = {k: (v / total_weighted * 100) for k, v in weighted_sentiment_counts.items()}
		unweighted_pct = (unweighted_counts / unweighted_counts.sum() * 100)
		
		# Create visualizations
		fig = make_subplots(
			rows=2, cols=2,
			subplot_titles=(
				'Weighted vs Unweighted Sentiment Distribution',
				'Sentiment Distribution Shift',
				'Average Comment Likes by Sentiment',
				'Weighted Sentiment by Like Bins'
			),
			specs=[[{'type': 'bar'}, {'type': 'bar'}],
				   [{'type': 'bar'}, {'type': 'bar'}]]
		)
		
		# Plot 1: Comparison
		fig.add_trace(
			go.Bar(
				x=sentiment_order,
				y=[unweighted_pct[s] for s in sentiment_order],
				name='Unweighted',
				marker_color='lightblue',
				text=[f'{unweighted_pct[s]:.1f}%' for s in sentiment_order],
				textposition='outside'
			),
			row=1, col=1
		)
		
		fig.add_trace(
			go.Bar(
				x=sentiment_order,
				y=[weighted_pct[s] for s in sentiment_order],
				name='Weighted',
				marker_color='steelblue',
				text=[f'{weighted_pct[s]:.1f}%' for s in sentiment_order],
				textposition='outside'
			),
			row=1, col=1
		)
		
		# Plot 2: Shift
		shift = {s: weighted_pct[s] - unweighted_pct[s] for s in sentiment_order}
		colors_shift = ['green' if v > 0 else 'red' for v in shift.values()]
		
		fig.add_trace(
			go.Bar(
				x=sentiment_order,
				y=list(shift.values()),
				marker_color=colors_shift,
				showlegend=False,
				text=[f'{v:+.1f}%' for v in shift.values()],
				textposition='outside'
			),
			row=1, col=2
		)
		fig.add_hline(y=0, line_dash='dash', line_color='black', row=1, col=2)
		
		# Plot 3: Average likes by sentiment
		avg_likes_by_sentiment = df.groupby('sentiment')['comment_like_count'].mean().reindex(sentiment_order)
		
		fig.add_trace(
			go.Bar(
				x=sentiment_order,
				y=avg_likes_by_sentiment.values,
				marker_color=colors_sentiment,
				showlegend=False,
				text=[f'{v:.1f}' for v in avg_likes_by_sentiment.values],
				textposition='outside'
			),
			row=2, col=1
		)
		
		# Plot 4: Weighted sentiment by like bins
		weighted_sentiment_by_bin = {}
		for bin_name in df['like_bin'].cat.categories:
			bin_df = df[df['like_bin'] == bin_name]
			if len(bin_df) > 0:
				weighted_sentiment_by_bin[bin_name] = (bin_df['sentiment_score'] * bin_df['weight']).sum() / bin_df['weight'].sum()
			else:
				weighted_sentiment_by_bin[bin_name] = 0
		
		fig.add_trace(
			go.Bar(
				x=list(weighted_sentiment_by_bin.keys()),
				y=list(weighted_sentiment_by_bin.values()),
				marker_color='coral',
				showlegend=False,
				text=[f'{v:.3f}' for v in weighted_sentiment_by_bin.values()],
				textposition='outside'
			),
			row=2, col=2
		)
		fig.add_hline(y=0, line_dash='dash', line_color='red', row=2, col=2)
		
		# Update axes
		fig.update_xaxes(title_text='Sentiment', row=1, col=1)
		fig.update_yaxes(title_text='Percentage (%)', row=1, col=1)
		fig.update_xaxes(title_text='Sentiment', row=1, col=2)
		fig.update_yaxes(title_text='Shift (pp)', row=1, col=2)
		fig.update_xaxes(title_text='Sentiment', row=2, col=1)
		fig.update_yaxes(title_text='Avg Likes', row=2, col=1)
		fig.update_xaxes(title_text='Like Bin', row=2, col=2)
		fig.update_yaxes(title_text='Weighted Avg Sentiment', row=2, col=2)
		
		fig.update_layout(
			height=900,
			title_text=f'Weighted Sentiment Analysis (α = {alpha_slider.value})',
			showlegend=True
		)
		fig.show()
		
		# Print summary
		print(f"\nWeighted Sentiment Summary:")
		print(f"  Overall weighted sentiment: {(df['sentiment_score'] * df['weight']).sum() / df['weight'].sum():.3f}")
		print(f"  Overall unweighted sentiment: {df['sentiment_score'].mean():.3f}")
		print(f"\n  Sentiment categories gaining weight:")
		for sentiment, shift_val in sorted(shift.items(), key=lambda x: x[1], reverse=True):
			if shift_val > 0:
				print(f"    {sentiment}: +{shift_val:.2f} pp")
		print(f"\n  Sentiment categories losing weight:")
		for sentiment, shift_val in sorted(shift.items(), key=lambda x: x[1]):
			if shift_val < 0:
				print(f"    {sentiment}: {shift_val:.2f} pp")

# Connect to alpha slider
alpha_slider.observe(update_weighted_sentiment, names='value')

display(weighted_sentiment_output)
update_weighted_sentiment(None)

Output()

## 4. Weighted Stance Distribution

How does weighting by comment likes affect the stance distribution?

In [6]:
# Calculate unweighted agreement rate (for comparison)
agreement = (df['video_stance_simple'] == df['comment_stance_simple']).sum()
total = len(df)
agreement_rate = (agreement / total) * 100

# Output for weighted stance analysis
weighted_stance_output = widgets.Output()

def update_weighted_stance(change):
	with weighted_stance_output:
		clear_output(wait=True)
		
		# Calculate weighted stance distribution
		weighted_stance_counts = {}
		for stance in stance_order:
			stance_df = df[df['stance_congestion_pricing_comment'] == stance]
			weighted_count = stance_df['weight'].sum()
			weighted_stance_counts[stance] = weighted_count
		
		# Unweighted counts (for comparison)
		unweighted_counts = df['stance_congestion_pricing_comment'].value_counts().reindex(stance_order, fill_value=0)
		
		# Convert to percentages
		total_weighted = sum(weighted_stance_counts.values())
		weighted_pct = {k: (v / total_weighted * 100) for k, v in weighted_stance_counts.items()}
		unweighted_pct = (unweighted_counts / unweighted_counts.sum() * 100)
		
		# Create comparison visualization
		fig = go.Figure()
		
		fig.add_trace(
			go.Bar(
				x=stance_order,
				y=[unweighted_pct[s] for s in stance_order],
				name='Unweighted',
				marker_color='lightblue',
				text=[f'{unweighted_pct[s]:.1f}%' for s in stance_order],
				textposition='outside'
			)
		)
		
		fig.add_trace(
			go.Bar(
				x=stance_order,
				y=[weighted_pct[s] for s in stance_order],
				name='Weighted',
				marker_color='steelblue',
				text=[f'{weighted_pct[s]:.1f}%' for s in stance_order],
				textposition='outside'
			)
		)
		
		fig.update_layout(
			title=f'Weighted vs Unweighted Stance Distribution (α = {alpha_slider.value})',
			xaxis_title='Stance',
			yaxis_title='Percentage (%)',
			barmode='group',
			height=500
		)
		fig.show()
		
		# Calculate weighted stance agreement rate
		weighted_agreement = 0
		total_weight = df['weight'].sum()
		
		for idx, row in df.iterrows():
			if row['video_stance_simple'] == row['comment_stance_simple']:
				weighted_agreement += row['weight']
		
		weighted_agreement_rate = (weighted_agreement / total_weight) * 100
		
		print(f"\nStance Agreement Rates:")
		print(f"  Unweighted: {agreement_rate:.1f}%")
		print(f"  Weighted: {weighted_agreement_rate:.1f}%")
		print(f"  Difference: {weighted_agreement_rate - agreement_rate:.1f} percentage points")

# Connect to alpha slider
alpha_slider.observe(update_weighted_stance, names='value')

display(weighted_stance_output)
update_weighted_stance(None)

Output()

## 5. Weighted Tone Analysis

Which tones receive more engagement and how does weighting affect tone distribution?

In [7]:
# Output for weighted tone analysis
weighted_tone_output = widgets.Output()

def update_weighted_tone(change):
	with weighted_tone_output:
		clear_output(wait=True)
		
		# Calculate weighted tone distribution
		weighted_tone_counts = df.groupby('tone')['weight'].sum().sort_values(ascending=False)
		unweighted_tone_counts = df['tone'].value_counts()
		
		# Convert to percentages
		weighted_tone_pct = (weighted_tone_counts / weighted_tone_counts.sum() * 100)
		unweighted_tone_pct = (unweighted_tone_counts / unweighted_tone_counts.sum() * 100)
		
		# Calculate engagement factor (avg likes per tone)
		engagement_factor = df.groupby('tone')['comment_like_count'].mean().sort_values(ascending=False)
		
		# Create subplots
		fig = make_subplots(
			rows=2, cols=2,
			subplot_titles=(
				'Weighted vs Unweighted Tone Distribution',
				'Average Comment Likes by Tone',
				'Tone Distribution Shift',
				'Weighted Tone × Sentiment'
			),
			specs=[[{'type': 'bar'}, {'type': 'bar'}],
				   [{'type': 'bar'}, {'type': 'heatmap'}]]
		)
		
		# Plot 1: Weighted vs Unweighted percentages
		tones_ordered = weighted_tone_counts.index
		fig.add_trace(
			go.Bar(
				x=tones_ordered,
				y=[unweighted_tone_pct.get(t, 0) for t in tones_ordered],
				name='Unweighted',
				marker_color='lightcoral'
			),
			row=1, col=1
		)
		fig.add_trace(
			go.Bar(
				x=tones_ordered,
				y=[weighted_tone_pct[t] for t in tones_ordered],
				name='Weighted',
				marker_color='coral'
			),
			row=1, col=1
		)
		
		# Plot 2: Engagement factor
		fig.add_trace(
			go.Bar(
				x=engagement_factor.index,
				y=engagement_factor.values,
				marker_color='steelblue',
				showlegend=False
			),
			row=1, col=2
		)
		
		# Plot 3: Percentage point shift
		shift = {}
		for tone in tones_ordered:
			shift[tone] = weighted_tone_pct[tone] - unweighted_tone_pct.get(tone, 0)
		
		colors_shift = ['green' if v > 0 else 'red' for v in shift.values()]
		fig.add_trace(
			go.Bar(
				x=list(shift.keys()),
				y=list(shift.values()),
				marker_color=colors_shift,
				showlegend=False,
				text=[f'{v:+.1f}%' for v in shift.values()],
				textposition='outside'
			),
			row=2, col=1
		)
		fig.add_hline(y=0, line_dash='dash', line_color='black', row=2, col=1)
		
		# Plot 4: Weighted tone × sentiment heatmap
		weighted_tone_sentiment = []
		tones_list = list(df['tone'].unique())
		for tone in tones_list:
			row_data = []
			for sentiment in sentiment_order:
				mask = (df['tone'] == tone) & (df['sentiment'] == sentiment)
				row_data.append(df[mask]['weight'].sum())
			weighted_tone_sentiment.append(row_data)
		
		# Normalize by row
		weighted_tone_sentiment_pct = []
		for row in weighted_tone_sentiment:
			row_sum = sum(row)
			if row_sum > 0:
				weighted_tone_sentiment_pct.append([x / row_sum * 100 for x in row])
			else:
				weighted_tone_sentiment_pct.append([0] * len(sentiment_order))
		
		fig.add_trace(
			go.Heatmap(
				z=weighted_tone_sentiment_pct,
				x=sentiment_order,
				y=tones_list,
				colorscale='RdBu_r',
				zmid=20,
				text=[[f'{v:.1f}' for v in row] for row in weighted_tone_sentiment_pct],
				texttemplate='%{text}',
				textfont={'size': 8},
				showscale=False
			),
			row=2, col=2
		)
		
		# Update axes
		fig.update_xaxes(title_text='Tone', row=1, col=1)
		fig.update_yaxes(title_text='Percentage (%)', row=1, col=1)
		fig.update_xaxes(title_text='Tone', row=1, col=2)
		fig.update_yaxes(title_text='Avg Likes', row=1, col=2)
		fig.update_xaxes(title_text='Tone', row=2, col=1)
		fig.update_yaxes(title_text='Shift (pp)', row=2, col=1)
		fig.update_xaxes(title_text='Sentiment', row=2, col=2)
		fig.update_yaxes(title_text='Tone', row=2, col=2)
		
		fig.update_layout(
			height=900,
			title_text=f'Weighted Tone Analysis (α = {alpha_slider.value})',
			showlegend=True
		)
		fig.show()
		
		# Print summary
		print(f"\nTone Engagement Summary:")
		print(f"  Most engaging tone: {engagement_factor.index[0]} ({engagement_factor.iloc[0]:.1f} avg likes)")
		print(f"  Least engaging tone: {engagement_factor.index[-1]} ({engagement_factor.iloc[-1]:.1f} avg likes)")
		print(f"\n  Biggest gainers from weighting:")
		for tone, shift_val in sorted(shift.items(), key=lambda x: x[1], reverse=True)[:3]:
			print(f"    {tone}: +{shift_val:.2f} pp")
		print(f"\n  Biggest losers from weighting:")
		for tone, shift_val in sorted(shift.items(), key=lambda x: x[1])[:3]:
			print(f"    {tone}: {shift_val:.2f} pp")

# Connect to alpha slider
alpha_slider.observe(update_weighted_tone, names='value')

display(weighted_tone_output)
update_weighted_tone(None)

Output()

## 6. Weighted Sentiment Over Time

How does the temporal trend change when we weight sentiment by comment engagement?

In [8]:
# Output for weighted temporal analysis
weighted_temporal_output = widgets.Output()

def update_weighted_temporal(change):
	with weighted_temporal_output:
		clear_output(wait=True)
		
		# Calculate weighted average sentiment per month
		def weighted_avg(group):
			return (group['sentiment_score'] * group['weight']).sum() / group['weight'].sum()
		
		weighted_sentiment_time = df.groupby('comment_month').apply(weighted_avg)
		unweighted_sentiment_time = df.groupby('comment_month')['sentiment_score'].mean()
		
		# Calculate monthly influence factor (difference)
		influence_factor = weighted_sentiment_time - unweighted_sentiment_time
		
		# Calculate monthly comment counts and total likes
		monthly_stats = df.groupby('comment_month').agg({
			'comment_text': 'count',
			'comment_like_count': 'sum',
			'weight': 'sum'
		})
		monthly_stats.columns = ['comment_count', 'total_likes', 'total_weight']
		
		# Create visualizations
		fig = make_subplots(
			rows=3, cols=1,
			subplot_titles=(
				'Weighted vs Unweighted Sentiment Over Time',
				'Monthly Influence Factor (Weighted - Unweighted)',
				'Monthly Comment Volume and Total Likes'
			),
			specs=[[{'secondary_y': False}],
				   [{'secondary_y': False}],
				   [{'secondary_y': True}]]
		)
		
		# Plot 1: Weighted vs Unweighted trends
		fig.add_trace(
			go.Scatter(
				x=unweighted_sentiment_time.index,
				y=unweighted_sentiment_time.values,
				mode='lines+markers',
				name='Unweighted',
				line=dict(color='lightblue', width=2),
				marker=dict(size=6)
			),
			row=1, col=1
		)
		
		fig.add_trace(
			go.Scatter(
				x=weighted_sentiment_time.index,
				y=weighted_sentiment_time.values,
				mode='lines+markers',
				name='Weighted',
				line=dict(color='steelblue', width=2),
				marker=dict(size=6)
			),
			row=1, col=1
		)
		
		fig.add_hline(y=0, line_dash='dash', line_color='red', opacity=0.5, row=1, col=1)
		
		# Plot 2: Influence factor
		colors_influence = ['green' if v > 0 else 'red' for v in influence_factor.values]
		fig.add_trace(
			go.Bar(
				x=influence_factor.index,
				y=influence_factor.values,
				marker_color=colors_influence,
				showlegend=False,
				text=[f'{v:+.3f}' for v in influence_factor.values],
				textposition='outside',
				textfont=dict(size=8)
			),
			row=2, col=1
		)
		fig.add_hline(y=0, line_dash='dash', line_color='black', row=2, col=1)
		
		# Plot 3: Monthly volume and likes (dual axis)
		fig.add_trace(
			go.Bar(
				x=monthly_stats.index,
				y=monthly_stats['comment_count'],
				name='Comment Count',
				marker_color='lightgreen',
				opacity=0.6
			),
			row=3, col=1,
			secondary_y=False
		)
		
		fig.add_trace(
			go.Scatter(
				x=monthly_stats.index,
				y=monthly_stats['total_likes'],
				mode='lines+markers',
				name='Total Likes',
				line=dict(color='coral', width=2),
				marker=dict(size=6)
			),
			row=3, col=1,
			secondary_y=True
		)
		
		# Update axes
		fig.update_xaxes(title_text='Month', row=1, col=1)
		fig.update_yaxes(title_text='Avg Sentiment Score', row=1, col=1)
		fig.update_xaxes(title_text='Month', row=2, col=1)
		fig.update_yaxes(title_text='Influence Factor', row=2, col=1)
		fig.update_xaxes(title_text='Month', row=3, col=1)
		fig.update_yaxes(title_text='Comment Count', row=3, col=1, secondary_y=False)
		fig.update_yaxes(title_text='Total Likes', row=3, col=1, secondary_y=True)
		
		fig.update_layout(
			height=1200,
			title_text=f'Weighted Sentiment Temporal Analysis (α = {alpha_slider.value})',
			showlegend=True
		)
		fig.show()
		
		# Print summary statistics
		print(f"\nTemporal Weighting Summary:")
		print(f"  Average influence factor: {influence_factor.mean():.3f}")
		print(f"  Max positive influence: {influence_factor.max():.3f} ({influence_factor.idxmax()})")
		print(f"  Max negative influence: {influence_factor.min():.3f} ({influence_factor.idxmin()})")
		
		# Find months where weighting made biggest difference
		top_divergence = influence_factor.abs().nlargest(3)
		print(f"\n  Months with biggest weighted/unweighted divergence:")
		for month, div in top_divergence.items():
			actual_diff = influence_factor[month]
			direction = "more positive" if actual_diff > 0 else "more negative"
			print(f"    {month}: {actual_diff:+.3f} ({direction} when weighted)")

# Connect to alpha slider
alpha_slider.observe(update_weighted_temporal, names='value')

display(weighted_temporal_output)
update_weighted_temporal(None)

Output()

## 7. Weighted Analysis Summary

Comprehensive summary of key findings when weighting sentiment by comment likes

In [9]:
# Output for weighted summary
weighted_summary_output = widgets.Output()

def update_weighted_summary(change):
	with weighted_summary_output:
		clear_output(wait=True)
		
		alpha = alpha_slider.value
		
		print("=" * 70)
		print(f"WEIGHTED SENTIMENT ANALYSIS SUMMARY (α = {alpha})")
		print("=" * 70)
		
		# Overall sentiment comparison
		weighted_sentiment = (df['sentiment_score'] * df['weight']).sum() / df['weight'].sum()
		unweighted_sentiment = df['sentiment_score'].mean()
		
		print(f"\nOverall Sentiment:")
		print(f"   Unweighted average: {unweighted_sentiment:.3f}")
		print(f"   Weighted average: {weighted_sentiment:.3f}")
		print(f"   Shift: {weighted_sentiment - unweighted_sentiment:+.3f} ({((weighted_sentiment - unweighted_sentiment) / abs(unweighted_sentiment) * 100):+.1f}%)")
		
		# Top influential comments (highest weight × |sentiment_score|)
		df['influence_score'] = df['weight'] * df['sentiment_score'].abs()
		top_influential = df.nlargest(10, 'influence_score')[['comment_text', 'sentiment', 'comment_like_count', 'weight', 'influence_score']]
		
		print(f"\nTop 10 Most Influential Comments (by weight × |sentiment|):")
		for i, (idx, row) in enumerate(top_influential.iterrows(), 1):
			print(f"\n   {i}. Sentiment: {row['sentiment']}, Likes: {row['comment_like_count']}, Weight: {row['weight']:.2f}")
			comment_preview = row['comment_text'][:100] + "..." if len(row['comment_text']) > 100 else row['comment_text']
			print(f"      \"{comment_preview}\"")
		
		# Video-level weighted sentiment shifts
		video_weighted = df.groupby('video_id').apply(
			lambda x: (x['sentiment_score'] * x['weight']).sum() / x['weight'].sum()
		)
		video_unweighted = df.groupby('video_id')['sentiment_score'].mean()
		video_shift = video_weighted - video_unweighted
		
		# Merge with video titles
		video_info = df.groupby('video_id').agg({
			'video_title': 'first',
			'video_channel': 'first',
			'comment_text': 'count'
		})
		video_info['weighted_sentiment'] = video_weighted
		video_info['unweighted_sentiment'] = video_unweighted
		video_info['shift'] = video_shift
		
		print(f"\nVideos with Biggest Weighted Sentiment Shifts:")
		print(f"\n   Most Positive Shifts (popular comments more positive than average):")
		for idx, row in video_info.nlargest(3, 'shift').iterrows():
			title_short = row['video_title'][:50] + "..." if len(row['video_title']) > 50 else row['video_title']
			print(f"      {title_short}")
			print(f"         Shift: {row['shift']:+.3f} (unweighted: {row['unweighted_sentiment']:.3f} → weighted: {row['weighted_sentiment']:.3f})")
		
		print(f"\n   Most Negative Shifts (popular comments more negative than average):")
		for idx, row in video_info.nsmallest(3, 'shift').iterrows():
			title_short = row['video_title'][:50] + "..." if len(row['video_title']) > 50 else row['video_title']
			print(f"      {title_short}")
			print(f"         Shift: {row['shift']:+.3f} (unweighted: {row['unweighted_sentiment']:.3f} → weighted: {row['weighted_sentiment']:.3f})")
		
		# Stance distribution changes
		weighted_stance_dist = {}
		unweighted_stance_dist = df['stance_congestion_pricing_comment'].value_counts(normalize=True) * 100
		
		for stance in stance_order:
			stance_df = df[df['stance_congestion_pricing_comment'] == stance]
			weighted_stance_dist[stance] = stance_df['weight'].sum() / df['weight'].sum() * 100
		
		print(f"\nStance Distribution Changes:")
		for stance in stance_order:
			shift = weighted_stance_dist[stance] - unweighted_stance_dist.get(stance, 0)
			print(f"   {stance:20s}: {unweighted_stance_dist.get(stance, 0):5.1f}% → {weighted_stance_dist[stance]:5.1f}% ({shift:+.1f} pp)")
		
		# Tone distribution changes
		weighted_tone_dist = (df.groupby('tone')['weight'].sum() / df['weight'].sum() * 100).to_dict()
		unweighted_tone_dist = (df['tone'].value_counts(normalize=True) * 100).to_dict()
		
		print(f"\nTone Distribution Changes (Top 5 shifts):")
		tone_shifts = {tone: weighted_tone_dist[tone] - unweighted_tone_dist.get(tone, 0) 
					   for tone in weighted_tone_dist.keys()}
		sorted_tone_shifts = sorted(tone_shifts.items(), key=lambda x: abs(x[1]), reverse=True)[:5]
		
		for tone, shift in sorted_tone_shifts:
			direction = "↑" if shift > 0 else "↓"
			print(f"   {tone:15s}: {shift:+.2f} pp {direction}")
		
		# Engagement statistics
		total_comments = len(df)
		total_likes = df['comment_like_count'].sum()
		comments_with_likes = (df['comment_like_count'] > 0).sum()
		
		print(f"\nEngagement Statistics:")
		print(f"   Total comments: {total_comments:,}")
		print(f"   Comments with likes: {comments_with_likes:,} ({comments_with_likes/total_comments*100:.1f}%)")
		print(f"   Total likes: {total_likes:,}")
		print(f"   Average likes per comment: {total_likes/total_comments:.2f}")
		print(f"   Average weight: {df['weight'].mean():.3f}")
		print(f"   Weight range: {df['weight'].min():.3f} to {df['weight'].max():.3f}")
		
		print("\n" + "=" * 70)

# Connect to alpha slider
alpha_slider.observe(update_weighted_summary, names='value')

display(weighted_summary_output)
update_weighted_summary(None)

Output()