In [1]:
import jsonlines
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import numpy as np
import json
from tqdm import tqdm
import plotly.express as px
import plotly.graph_objects as go
from prettytable import PrettyTable

import statsmodels.formula.api as smf

#### Loading data 

In [2]:
# Locations of the input files, given as paths relative to the working directory.
climate_videos_path = "data/climate_videos_v3.jsonl"
all_sample_videos_path = "data/sampled_dataset_1percent.jsonl"
#feather_path = "data/yt_metadata_helper.feather"
nb_videos_by_cat_path = "data/nb_videos_by_cat.jsonl"


In [3]:
#video_dataset_feather = pd.read_feather(feather_path)

In [None]:
# For statistics on the entire dataset we use the sampled dataset: the results are not exact,
# but they are statistically relevant.
all_videos_df = pd.read_json(all_sample_videos_path, lines=True)
#all_videos_df = pd.read_feather(feather_path)

In [None]:
#all_videos_df['upload_date'] = all_videos_df['upload_date'].dt.strftime('%Y-%m-%d')

In [None]:
# Load the climate videos from a JSON Lines file (one record per line).
climate_videos_df = pd.read_json(climate_videos_path, lines=True)

#### How do the categories of climate-related videos compare with those of YouTube overall?

In [None]:
# Videos whose category is an empty string are labelled 'not specified'.
climate_videos_df['categories'] = climate_videos_df['categories'].replace('', 'not specified')

In [None]:
# Count the videos in each category, for the climate set and for all videos.
category_counts_climate = climate_videos_df['categories'].value_counts()

# The per-category totals for all videos come from their own file.
nb_videos_by_cat = pd.read_json(nb_videos_by_cat_path, lines=True)
nb_videos_by_cat['categories'] = nb_videos_by_cat['categories'].replace('', 'not specified')
category_counts_all = nb_videos_by_cat.set_index('categories').squeeze()

# Categories in the order produced by value_counts() on the climate videos.
categories = category_counts_climate.index


def _category_pie(counts, title):
    """Return a pie chart of `counts`, with slices ordered by the reversed `categories` index."""
    pie = px.pie(
        counts.reset_index(),
        values='count',
        names='categories',
        title=title,
        width=800,
        height=800,
        category_orders={"categories": categories.to_list()[::-1]},
    )
    pie.update_traces(textinfo='percent+label', textposition='inside')
    pie.update_layout(showlegend=False)
    return pie


fig = _category_pie(category_counts_climate, 'Categories Distribution - Climate Change related Videos')
fig.show()

fig = _category_pie(category_counts_all, 'Categories Distribution - All Videos')
fig.show()


In [1]:
# Bucket every climate video into the calendar quarter of its upload date.
climate_videos_df['quarter'] = pd.to_datetime(climate_videos_df['upload_date']).dt.to_period('Q')

# Number of videos per (category, quarter). The unstack/stack round trip adds an explicit
# 0 row for every (category, quarter) pair without any video, so that the stacked areas
# drawn below stay continuous.
evolution_category_counts_climate = (
    climate_videos_df[['categories', 'quarter', 'display_id']]
    .groupby(['categories', 'quarter'])
    .aggregate('count')
    .unstack(fill_value=0)
    .stack()
    .reset_index()
    .rename(columns={'display_id': 'count'})
)
# Periods are converted to strings such as '2016Q3' before plotting.
evolution_category_counts_climate['quarter'] = evolution_category_counts_climate['quarter'].astype(str)

category_order = {"categories": categories.to_list()[::-1]}

fig = px.area(evolution_category_counts_climate, x="quarter", y="count",
              color="categories", category_orders=category_order)
fig.show()

# Number of videos uploaded per quarter, all categories together.
total_counts = evolution_category_counts_climate.groupby('quarter')['count'].sum()
# Share of each category within its quarter, computed column-wise instead of with a
# row-by-row apply.
evolution_category_counts_climate['proportion'] = (
    evolution_category_counts_climate['count']
    / evolution_category_counts_climate['quarter'].map(total_counts)
)

fig = px.area(evolution_category_counts_climate, x="quarter", y="proportion",
              color="categories", category_orders=category_order)
fig.show()

NameError: name 'pd' is not defined

Based on this category analysis, we keep only the relevant categories (those with more than 1000 videos related to climate change).

In [1]:
# Keep the names of the categories that hold more than 1000 climate videos;
# the counts themselves are dropped, leaving a one-column frame of category names.
has_enough_videos = category_counts_climate > 1000
relevent_cat = category_counts_climate[has_enough_videos].reset_index().drop(columns=['count'])

NameError: name 'category_counts_climate' is not defined

Flag each video of the sampled all-videos dataset with a binary `is_climate` indicator telling whether it is climate change related.

In [2]:
columns = ['categories', 'month', 'view_count', 'display_id', 'like_count', 'dislike_count']


def _with_valid_counts(videos_df):
    """Return a copy of `videos_df` limited to rows with usable view/like/dislike counts.

    A row is kept when the three counts are present, the video has at least one view and
    neither the likes nor the dislikes exceed the views. The upload month is added as a
    `month` period column.
    """
    counted = videos_df.dropna(subset=['view_count', 'like_count', 'dislike_count'])
    plausible = (
        (counted['view_count'] > 0)
        & (counted['like_count'] <= counted['view_count'])
        & (counted['dislike_count'] <= counted['view_count'])
    )
    # Copy so that the column added below is not written into a slice of the input frame.
    valid = counted[plausible].copy()
    valid['month'] = pd.to_datetime(valid['upload_date']).dt.to_period('M')
    return valid


obs_study_climate_videos_df = _with_valid_counts(climate_videos_df)
obs_study_climate_videos_df['is_climate'] = 1

all_videos_df['categories'] = all_videos_df['categories'].replace('', 'not specified')

obs_study_all_videos_df = _with_valid_counts(all_videos_df)
# A sampled video is climate related when its display_id appears in the climate set.
# Matching on the identifier alone does not require every other column to agree.
obs_study_all_videos_df['is_climate'] = (
    obs_study_all_videos_df['display_id'].isin(climate_videos_df['display_id']).astype(int)
)

obs_study_non_climate_videos_df = obs_study_all_videos_df[obs_study_all_videos_df['is_climate'] == 0]

columns = columns + ['is_climate']

obs_study_climate_videos_df = obs_study_climate_videos_df[columns]
# Select from the filtered frame: selecting from obs_study_all_videos_df here would put
# the climate videos back into the non-climate group.
obs_study_non_climate_videos_df = obs_study_non_climate_videos_df[columns]


NameError: name 'climate_videos_df' is not defined

### Different view counts ?

In [None]:
# Pair every climate video with all the non-climate videos that share its category and
# upload month, then collapse each climate video's partners into a single row.
climate_keys = ['categories', 'display_id_climate', 'view_count_climate', 'is_climate_climate']


def _join_ids(ids):
    """Concatenate the partner ids into one comma-separated string."""
    return ",".join(ids)


def _always_zero(_values):
    """Constant 0: the aggregated partner row is marked as non-climate."""
    return 0


paired = pd.merge(obs_study_climate_videos_df,
                  obs_study_non_climate_videos_df,
                  on=['categories', 'month'],
                  suffixes=('_climate', '_non_climate'))

aggregated = paired.groupby(climate_keys).agg({
    'view_count_non_climate': 'mean',
    'display_id_non_climate': _join_ids,
    'is_climate_non_climate': _always_zero,
})

# First reset turns the group keys back into columns; the second one adds the
# positional index as an extra 'index' column.
matching_df = aggregated.reset_index().reset_index()


In [None]:
# Split the paired frame into its climate half and its non-climate half, give both the
# same column names, and stack them (climate rows first).
cols = ['categories','display_id', 'view_count', 'is_climate']

climate_matched = matching_df[['categories','display_id_climate', 'view_count_climate', 'is_climate_climate']].rename(
    columns={
        'display_id_climate': 'display_id',
        'view_count_climate': 'view_count',
        'is_climate_climate': 'is_climate',
    }
)

non_climate_matched = matching_df[['categories','display_id_non_climate', 'view_count_non_climate', 'is_climate_non_climate']].rename(
    columns={
        'display_id_non_climate': 'display_id',
        'view_count_non_climate': 'view_count',
        'is_climate_non_climate': 'is_climate',
    }
)

obs_study_df = pd.concat([climate_matched, non_climate_matched])

In [None]:
# Render the stacked frame: climate rows followed by their non-climate counterparts.
display(obs_study_df)

In [None]:
# Per-category OLS of view_count on the is_climate indicator.
# NOTE: this rebinds the notebook-level name `categories` to a plain list holding the
# categories that were actually fitted.
SIGNIFICANCE_LEVEL = 0.05
SIGNIFICANT_COLOR = 'tab:blue'
NOT_SIGNIFICANT_COLOR = 'grey'

categories = []
intercepts = []
coefficients = []
intercept_pvals = []
coefficient_pvals = []
coef_colors = []

for category in relevent_cat['categories']:
    category_df = obs_study_df[obs_study_df['categories'] == category]
    if category_df.empty:
        # Nothing was matched for this category, so there is no model to fit.
        print(f"{category}: no matched videos, skipped")
        continue

    # Fit the model for this category
    result = smf.ols(formula='view_count ~ is_climate', data=category_df).fit()
    print(result.summary())

    # Parameters and p-values come in the order (intercept, is_climate)
    intercept, coef = result.params
    intercept_p, coef_p = result.pvalues

    categories.append(category)
    intercepts.append(intercept)
    coefficients.append(coef)
    intercept_pvals.append(intercept_p)
    coefficient_pvals.append(coef_p)
    # The marker colour tells whether the is_climate coefficient is significant
    coef_colors.append(SIGNIFICANT_COLOR if coef_p < SIGNIFICANCE_LEVEL else NOT_SIGNIFICANT_COLOR)

print(coefficient_pvals)

sizes = [100] * len(categories)  # same size for every bubble

# Create the bubble plot
plt.scatter(categories, coefficients, s=sizes, c=coef_colors, alpha=0.5)

# Customize the plot
plt.title('Bubble Plot of is_climate Coefficient per Category')
plt.xlabel('Category')
plt.ylabel('Coefficient of is_climate')
plt.axhline(0, color='grey', lw=0.5)  # Add a line at y=0 for reference
plt.xticks(rotation=25)  # Rotate category names for better readability

# Legend built from stand-alone markers, one per significance class
legend_handles = [
    plt.Line2D([], [], marker='o', linestyle='none', color=SIGNIFICANT_COLOR,
               alpha=0.5, label=f'p < {SIGNIFICANCE_LEVEL}'),
    plt.Line2D([], [], marker='o', linestyle='none', color=NOT_SIGNIFICANT_COLOR,
               alpha=0.5, label=f'p >= {SIGNIFICANCE_LEVEL}'),
]
plt.legend(handles=legend_handles)

# Show the plot
plt.tight_layout()  # Adjust the layout for a better fit
plt.show()

Observations: 
- Different predominant categories, as expected
- Main categories for climate change videos: News & Politics, Education, Science & Technology

**We study like/dislikes comparison metrics**


$$SLDR(N_{like}, N_{dislike}) = (-1)^{\delta} \, \dfrac{\max(N_{like}, N_{dislike}) + 1}{\min(N_{like}, N_{dislike}) + 1}$$

where $\delta = 0$ if $N_{like} > N_{dislike}$ (or if both counts are zero) and $\delta = 1$ otherwise.

### Matching

In [None]:
# One-to-one matching: every climate video is paired with a non-climate video of the same
# category and upload month whose view count is the closest, provided the two view counts
# differ by less than MAX_RELATIVE_DIFF of the larger one.
MAX_RELATIVE_DIFF = 0.1

# view_count is the merge_asof key and is dropped at the end of the cell, so each side keeps
# its own copy of it under a side-specific name.
obs_study_climate_videos_df = obs_study_climate_videos_df.sort_values('view_count')
obs_study_climate_videos_df['view_count'] = obs_study_climate_videos_df['view_count'].astype(int)
obs_study_climate_videos_df['view_count_climate'] = obs_study_climate_videos_df['view_count']

obs_study_non_climate_videos_df = obs_study_non_climate_videos_df.sort_values('view_count')
obs_study_non_climate_videos_df['view_count'] = obs_study_non_climate_videos_df['view_count'].astype(int)
obs_study_non_climate_videos_df['view_count_non_climate'] = obs_study_non_climate_videos_df['view_count']

not_matched_climate_videos_df = obs_study_climate_videos_df  # climate videos still to be matched (all of them for now)
not_matched_non_climate_videos_df = obs_study_non_climate_videos_df  # non-climate videos not used yet (all of them for now)

# Pairs accepted in each round; they are concatenated once, after the loop.
matched_rounds = []

print(not_matched_climate_videos_df.shape[0])

while not_matched_climate_videos_df.shape[0] > 0:  # match as many climate videos as possible

    # merge_asof requires both sides to be sorted on the `on` key
    not_matched_climate_videos_df = not_matched_climate_videos_df.sort_values('view_count')
    not_matched_non_climate_videos_df = not_matched_non_climate_videos_df.sort_values('view_count')

    matching_2_df = pd.merge_asof(
        left=not_matched_climate_videos_df,
        right=not_matched_non_climate_videos_df,
        on='view_count',  # nearest match on view count
        by=['categories', 'month'],  # exact match on category and month
        suffixes=('_climate', '_non_climate'),
        direction='nearest')

    # Relative view-count difference within each candidate pair
    view_gap = (matching_2_df['view_count_climate'] - matching_2_df['view_count_non_climate']).abs()
    larger_view_count = matching_2_df[['view_count_non_climate', 'view_count_climate']].max(axis=1)
    matching_2_df['relative_diff'] = view_gap / larger_view_count

    # Keep only the pairs that are close enough, best pairs first. Climate videos without
    # any partner have a missing relative_diff and are removed by this comparison.
    matching_2_df = matching_2_df.sort_values('relative_diff')
    matching_2_df = matching_2_df[matching_2_df['relative_diff'] < MAX_RELATIVE_DIFF]

    # Several climate videos may share the same non-climate partner:
    # only the closest pair is kept for each partner.
    matching_2_df_filtered = matching_2_df.drop_duplicates(subset='display_id_non_climate', keep='first')

    # Climate videos going to the next round: they had an acceptable partner but lost it
    # to a closer climate video. Those without an acceptable partner are dropped for good.
    found_partner = not_matched_climate_videos_df['display_id'].isin(matching_2_df['display_id_climate'])
    kept_partner = not_matched_climate_videos_df['display_id'].isin(matching_2_df_filtered['display_id_climate'])
    not_matched_climate_videos_df = not_matched_climate_videos_df[(~kept_partner) & found_partner]

    # Non-climate videos that were not used in this round stay available
    already_used = not_matched_non_climate_videos_df['display_id'].isin(matching_2_df['display_id_non_climate'])
    not_matched_non_climate_videos_df = not_matched_non_climate_videos_df[~already_used]

    print(not_matched_climate_videos_df.shape[0])

    matched_rounds.append(matching_2_df_filtered)

# Single concatenation instead of growing the frame inside the loop
matching_df = pd.concat(matched_rounds) if matched_rounds else pd.DataFrame()

climate_ids = matching_df['display_id_climate']
non_climate_ids = matching_df['display_id_non_climate']
matching_df = matching_df.drop(columns=['view_count', 'relative_diff'])

In [None]:
# Render the matched pairs (one row per climate / non-climate pair).
display(matching_df)

In [None]:
def _strip_suffix(name, suffix):
    """Return `name` without its trailing `suffix`; names that lack the suffix are unchanged."""
    return name[:-len(suffix)] if name.endswith(suffix) else name


# Columns of the climate half: everything that is not a non-climate column.
climate_cols = [name for name in matching_df.columns if 'non_climate' not in name]
# Columns of the non-climate half: the shared keys plus the non-climate columns.
non_climate_cols = ['categories','month'] +  [name for name in matching_df.columns if 'non_climate'  in name]

# Each column is renamed by removing its own side suffix, so the mapping does not depend
# on the order or on the number of columns.
climate_matched = matching_df[climate_cols].rename(columns=lambda name: _strip_suffix(name, '_climate'))
non_climate_matched = matching_df[non_climate_cols].rename(columns=lambda name: _strip_suffix(name, '_non_climate'))

# Both halves must describe the same variables before they are stacked.
assert set(climate_matched.columns) == set(non_climate_matched.columns), "matched halves have different columns"

obs_study_df = pd.concat([climate_matched,non_climate_matched])

In [None]:

def sldr(like_count, dislike_count):
    """Signed like/dislike ratio, computed element-wise with NumPy.

    With L = like_count and D = dislike_count:
      * L == 0 and D == 0  ->  1
      * L > D              ->  (L + 1) / (D + 1)
      * otherwise          -> -(D + 1) / (L + 1)

    Ties other than (0, 0) fall into the last case and therefore give -1.
    The result comes from `np.where`, so it is a NumPy array even when the
    inputs are pandas Series.
    """
    likes_plus_one = like_count + 1
    dislikes_plus_one = dislike_count + 1

    # Positive when likes dominate, negative otherwise (ties included).
    signed_ratio = np.where(
        like_count > dislike_count,
        likes_plus_one / dislikes_plus_one,
        -(dislikes_plus_one / likes_plus_one),
    )

    # Videos without any reaction are given the value 1.
    no_reaction = (like_count == 0) & (dislike_count == 0)
    return np.where(no_reaction, 1, signed_ratio)
      

In [None]:
# Reaction metrics of the matched videos; the proportions are percentages.
views = obs_study_df['view_count']
likes = obs_study_df['like_count']
dislikes = obs_study_df['dislike_count']

obs_study_df['like_prop'] = likes / views * 100
obs_study_df['dislike_prop'] = dislikes / views * 100
obs_study_df['engagement_rate'] = obs_study_df['like_prop'] + obs_study_df['dislike_prop']
obs_study_df['sldr'] = sldr(likes, dislikes)

# Share of dislikes among all reactions; videos without any reaction get the 0.5 placeholder.
no_reaction = (likes == 0) & (dislikes == 0)
dislike_share = np.where(no_reaction, 0.5, dislikes / (likes + dislikes))
obs_study_df['dislike_to_like'] = dislike_share * 100

In [None]:
# Per-category OLS of the dislike share (dislike_to_like) on the is_climate indicator,
# followed by a scatter plot of the same category with its regression line.
for categorie in relevent_cat['categories']:
    category_df = obs_study_df[obs_study_df['categories'] == categorie]

    print("\n\n\n")
    print(categorie)
    print("\n")

    if category_df.empty:
        # Nothing was matched for this category, so there is no model to fit or plot.
        print("no matched videos, skipped")
        continue

    model = smf.ols(formula='dislike_to_like ~ is_climate', data=category_df)
    result = model.fit()

    print(result.summary())

    # The plot uses the rows of the current category only, like the model above.
    sns.lmplot(x='is_climate', y='dislike_to_like', data=category_df, ci=None)
    plt.title(f'Scatter Plot with Regression Line - {categorie}')
    plt.xlabel('Is Climate')
    plt.ylabel('Dislike share of reactions (%)')
    plt.show()

In [None]:
# Box plots of SLDR on a broken y axis: the upper panel shows [1, 70] and the lower
# panel shows [-40, -1]; the interval between -1 and 1 is cut out.
fig, (ax1, ax2) = plt.subplots(2, 1, sharex=True, figsize=(5, 6))
fig.subplots_adjust(hspace=0.1)  # adjust space between axes

# NOTE(review): assumes both frames already carry an 'sldr' column - confirm.
# The second sample is read from all_videos_df, like the other SLDR plots of this notebook.
sldr_samples = [climate_videos_df['sldr'].dropna(), all_videos_df['sldr'].dropna()]

# plot the same data on both axes
ax1.boxplot(sldr_samples, showfliers=False)
ax2.boxplot(sldr_samples, showfliers=False)

# limit each panel to its own portion of the data
ax1.set_title('SLDR Distribution')
ax1.set_ylim(1, 70)  # positive branch
y_ticks_high = list(range(1, 70, 10))  # starts at 1 already
ax1.set_yticks(y_ticks_high)  # Set y-axis ticks

ax2.set_ylim(-40, -1)  # negative branch
y_ticks_low  = list(range(-40, 0, 10))
y_ticks_low.append(-1)
ax2.set_yticks(y_ticks_low)

# hide the spines between ax1 and ax2
ax1.spines.bottom.set_visible(False)
ax2.spines.top.set_visible(False)
ax1.xaxis.tick_top()
ax1.tick_params(labeltop=False)  # don't put tick labels at the top
ax2.xaxis.tick_bottom()
#ax1.text(1, climate_mean, f"Mean: {climate_mean:.2f}", ha='center', va='bottom', color='red')
#ax1.text(2, all_mean, f"Mean: {all_mean:.2f}", ha='center', va='bottom', color='red')

# The boxes sit at x = 1 and x = 2; the labels go on the lower panel, the only one
# whose x tick labels are drawn.
ax2.set_xticks([1, 2])
ax2.set_xticklabels(['Climate Videos', 'All Videos'])

# Cut-out marks at the break.
# We create line objects in axes coordinates, in which (0,0), (0,1),
# (1,0), and (1,1) are the four corners of the axes.
# The marks themselves are markers at those locations, such that they
# keep their angle and position, independent of the axes size or scale.
# Finally, we need to disable clipping.

d = 0  # proportion of vertical to horizontal extent of the mark (0 gives a horizontal mark)
kwargs = dict(marker=[(-1, -d), (1, d)], markersize=12,
              linestyle="none", color='k', mec='k', mew=1, clip_on=False)

ax1.plot([0, 1], [0, 0], transform=ax1.transAxes, **kwargs)
ax2.plot([0, 1], [1, 1], transform=ax2.transAxes, **kwargs)






In [None]:
# Coerce both 'sldr' columns to a numeric dtype once the missing values are dropped.
climate_sldr_plot = pd.to_numeric(climate_videos_df['sldr'].dropna())
all_sldr_plot = pd.to_numeric(all_videos_df['sldr'].dropna())

# Side-by-side box plots, outliers hidden.
sldr_by_group = {
    'Climate Videos': climate_sldr_plot,
    'All Videos': all_sldr_plot,
}

plt.figure(figsize=(8, 6))
sns.boxplot(data=list(sldr_by_group.values()), palette="Set3", showfliers=False)
plt.title('Comparison of SLDR between Climate Videos and All Videos')
plt.ylabel('SLDR')
plt.xticks([0, 1], list(sldr_by_group.keys()))
plt.ylim(-40, 85)
plt.show()


We can see that the like ratio is lower for climate videos, which suggests that they are more debatable. It also reaches much lower values.

Let's now study by categories

In [None]:
# One SLDR box plot per category, all videos against climate videos.
plt.figure(figsize=(16, 20))  # Adjust the figure size as desired

num_categories = len(categories)
num_columns = 4  # Number of columns in the subplot grid
# Enough rows to hold every category (ceiling division), and at least one row
num_rows = max(1, -(-num_categories // num_columns))

for i, category in enumerate(categories):
    plt.subplot(num_rows, num_columns, i+1)

    # Boolean masks are used for the selection, so any category name is accepted
    ndlis_all = all_videos_df.loc[all_videos_df['categories'] == category, 'sldr'].dropna()
    ndlis_climate = climate_videos_df.loc[climate_videos_df['categories'] == category, 'sldr'].dropna()

    plt.boxplot([ndlis_all, ndlis_climate], showfliers=False)

    plt.xlabel('Videos')
    plt.ylabel('sldr')
    plt.title(f'Box Plot of SLDR for {category}')

    plt.grid(axis='y', which='major', linestyle='--', linewidth=0.5, alpha=0.7)

    # Add x labels
    plt.xticks([1, 2], ['All Videos', 'Climate Videos'])

plt.tight_layout()
plt.show()

In [None]:
# Yearly mean SLDR of the climate videos and of all videos, drawn on the same axes.
fig, ax = plt.subplots(figsize=(10, 6))

for df, label in [(climate_videos_df, 'Climate Videos'), (all_videos_df, 'All Videos')]:
    df['upload_date'] = pd.to_datetime(df['upload_date'])
    # The column keeps its original name 'year_month' although it holds yearly periods
    df['year_month'] = df['upload_date'].dt.to_period('Y')
    average_like_ratio = df.groupby('year_month')['sldr'].mean()
    # Each line carries its own label, so the legend does not depend on the plotting order
    average_like_ratio.plot(kind='line', ax=ax, label=label)

ax.set_xlabel('Year')
ax.set_ylabel('Average SLDR')
ax.set_title('Average SLDR Over Time')

# Add legend
ax.legend()

plt.show()


In [None]:
# Engagement rate of all videos against climate videos, outliers hidden.
engagement_by_group = {
    'All Videos': all_videos_df['engagement_rate'].dropna(),
    'Climate Videos': climate_videos_df['engagement_rate'].dropna(),
}

plt.figure(figsize=(5, 6))
plt.boxplot(list(engagement_by_group.values()), showfliers=False)
plt.title('Box Plot of Engagement Rate')
plt.ylabel('Engagement Rate')

# One x label per box, in plotting order
plt.xticks([1, 2], list(engagement_by_group.keys()))
plt.tight_layout()
plt.show()

In [None]:
# One engagement-rate box plot per category, all videos against climate videos.
plt.figure(figsize=(16, 20))  # Adjust the figure size as desired

num_categories = len(categories)
num_columns = 4  # Number of columns in the subplot grid
# Enough rows to hold every category (ceiling division), and at least one row
num_rows = max(1, -(-num_categories // num_columns))

for i, category in enumerate(categories):
    plt.subplot(num_rows, num_columns, i+1)

    # Boolean masks are used for the selection, so any category name is accepted
    engagement_rate_all = all_videos_df.loc[all_videos_df['categories'] == category, 'engagement_rate'].dropna()
    engagement_rate_climate = climate_videos_df.loc[climate_videos_df['categories'] == category, 'engagement_rate'].dropna()

    plt.boxplot([engagement_rate_all, engagement_rate_climate], showfliers=False)
    plt.ylabel('Engagement Rate')
    plt.title(category)

    plt.grid(axis='y', which='major', linestyle='--', linewidth=0.5, alpha=0.7)

    # Add x labels
    plt.xticks([1, 2], ['All Videos', 'Climate Videos'])

plt.tight_layout()
plt.show()

In [None]:
# Like percentage of all videos against climate videos, outliers hidden.
like_percentage_by_group = {
    'All Videos': all_videos_df['like_percentage'].dropna(),
    'Climate Videos': climate_videos_df['like_percentage'].dropna(),
}

plt.figure(figsize=(5, 6))
plt.boxplot(list(like_percentage_by_group.values()), showfliers=False)
plt.title('Box Plot of Like Percentage')
plt.ylabel('Like Percentage')

# One x label per box, in plotting order
plt.xticks([1, 2], list(like_percentage_by_group.keys()))
plt.tight_layout()
plt.show()

In [None]:
# One like-percentage box plot per category, all videos against climate videos.
plt.figure(figsize=(16, 20))  # Adjust the figure size as desired

num_categories = len(categories)
num_columns = 4  # Number of columns in the subplot grid
# Enough rows to hold every category (ceiling division), and at least one row
num_rows = max(1, -(-num_categories // num_columns))

for i, category in enumerate(categories):
    plt.subplot(num_rows, num_columns, i+1)

    # Boolean masks are used for the selection, so any category name is accepted
    like_percentage_all = all_videos_df.loc[all_videos_df['categories'] == category, 'like_percentage'].dropna()
    like_percentage_climate = climate_videos_df.loc[climate_videos_df['categories'] == category, 'like_percentage'].dropna()

    plt.boxplot([like_percentage_all, like_percentage_climate], showfliers=False)
    plt.ylabel('Like Percentage')
    plt.title(category)

    plt.grid(axis='y', which='major', linestyle='--', linewidth=0.5, alpha=0.7)

    # Add x labels
    plt.xticks([1, 2], ['All Videos', 'Climate Videos'])

plt.tight_layout()
plt.show()

Open question: the number of people concerned about climate is growing, but is the number of climate-skeptical videos growing as well?