In [None]:
import os
import random
import json
import numpy as np
import pandas as pd
import seaborn as sns
from copy import copy
from matplotlib import pyplot as plt
import matplotlib.ticker as mtick
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
import statsmodels.api as sm
from sklearn.metrics import roc_auc_score, roc_curve, accuracy_score, confusion_matrix, precision_score, recall_score, ConfusionMatrixDisplay, f1_score
from sklearn.model_selection import RandomizedSearchCV, train_test_split
import shap

In [None]:
# Make the Graphviz binaries discoverable. The location can be overridden through the
# GRAPHVIZ_BIN environment variable; the default is the path used originally.
GRAPHVIZ_BIN = os.environ.get("GRAPHVIZ_BIN", 'C:/Users/coimbravieira/Graphviz/bin')

# Extend PATH only when the directory exists and is not listed yet, so that
# re-running this cell does not keep appending the same entry.
if os.path.isdir(GRAPHVIZ_BIN) and GRAPHVIZ_BIN not in os.environ.get("PATH", "").split(os.pathsep):
    os.environ["PATH"] = os.environ.get("PATH", "") + os.pathsep + GRAPHVIZ_BIN

In [None]:
def set_dim_fig(sizes=(16,20,20)):
    """Set the global matplotlib font sizes used by the figures of this notebook.

    Parameters
    ----------
    sizes : sequence of numbers
        ``sizes[0]`` (small) is applied to the axis labels, the tick labels and the legend;
        ``sizes[1]`` (medium) to the default text, the axes titles and the figure title.
        A third element is accepted for backward compatibility but is not used.

    The values are written to the global rc settings through ``plt.rc``, so they
    stay in effect until the function is called again.
    """
    small_size, medium_size = sizes[0], sizes[1]

    plt.rc('font', size=medium_size)         # default text size
    plt.rc('axes', titlesize=medium_size)    # axes title
    plt.rc('axes', labelsize=small_size)     # x and y axis labels
    plt.rc('xtick', labelsize=small_size)    # x tick labels
    plt.rc('ytick', labelsize=small_size)    # y tick labels
    plt.rc('legend', fontsize=small_size)    # legend entries
    plt.rc('figure', titlesize=medium_size)  # figure title

In [None]:
# Apply the default font sizes, (16, 20, 20), to the figures drawn from here on.
set_dim_fig()

In [None]:
def plot_cdf(df_stats, cols=[], color=None, xlab="Number of videos watched", ylab="Proportion of users", note=""):
    """Plot the empirical CDF of the selected columns and save it as a PDF.

    Parameters
    ----------
    df_stats : pandas.DataFrame
        Table holding the columns to plot.
    cols : list of str
        Columns of ``df_stats`` to draw, one step curve each. The default list is
        only read, never mutated.
    color : optional
        Forwarded to the pandas plotting call.
    xlab, ylab : str
        Axis labels; ``xlab`` is also part of the output file name.
    note : str
        Suffix of the output file name.

    The figure is written to ``figs/cdf-{xlab}-{note}.pdf`` and then shown.
    """
    # A cumulative, normalized 100-bin step histogram approximates the CDF.
    ax = df_stats[cols].plot.hist(figsize=(7,5), cumulative=True, density=True, bins=100,
                                  histtype='step', lw=4, alpha=0.6, color=color)

    ax.legend(loc=(0.4,0.05))
    ax.set_xlabel(xlab)
    ax.set_ylabel(ylab)

    # Create the output directory so that saving works on a fresh checkout.
    os.makedirs("figs", exist_ok=True)
    plt.savefig(f"figs/cdf-{xlab}-{note}.pdf", bbox_inches='tight')
    plt.show()

In [None]:
def plot_bar(df_stats, cols=[], color=None, percentage=False, xlab="Number of videos watched", ylab="Proportion of users", k=None, bar="v", stacked=True, figsize=(10,20), note=""):
    """Draw a vertical or horizontal bar chart of the selected columns and save it as a PDF.

    Parameters
    ----------
    df_stats : pandas.DataFrame
        Table holding the columns to plot; one bar group per row.
    cols : list of str
        Columns to draw. The legend is hidden when a single column is given.
    color : optional
        Forwarded to the pandas plotting call (``None`` keeps the default colors).
    percentage : bool
        When true, the ticks of the value axis are rendered as percentages
        (values are expected on a 0-100 scale).
    xlab, ylab : str
        Axis labels; ``ylab`` (without line breaks) is also part of the file name.
    k : int, optional
        When set (and non-zero), only the first ``k`` rows are plotted.
    bar : str
        ``"h"`` for horizontal bars, anything else for vertical bars.
    stacked : bool
        Forwarded to the pandas plotting call.
    figsize : tuple
        Used as given for horizontal bars and swapped for vertical bars.
    note : str
        Suffix of the output file name.

    The figure is written to ``figs/bar-{ylab}-{note}.pdf`` and then shown.
    """
    df_stats = df_stats[cols]

    if k:
        df_stats = df_stats[:k]

    horizontal = bar == "h"
    show_legend = len(cols) != 1  # a single series needs no legend

    if horizontal:
        ax = df_stats.plot.barh(figsize=figsize, stacked=stacked, legend=show_legend, color=color)
    else:
        ax = df_stats.plot.bar(figsize=(figsize[1], figsize[0]), stacked=stacked, legend=show_legend, color=color)

    ax.set_xlabel(xlab)
    ax.set_ylabel(ylab)

    if percentage:
        # Format the value axis (x for horizontal bars, y otherwise). A formatter keeps
        # labels and ticks in sync, unlike fixed tick labels set on non-fixed ticks.
        value_axis = ax.xaxis if horizontal else ax.yaxis
        value_axis.set_major_formatter(mtick.PercentFormatter(xmax=100, decimals=0))

    ylab = ylab.replace("\n","")
    # Create the output directory so that saving works on a fresh checkout.
    os.makedirs("figs", exist_ok=True)
    plt.savefig(f"figs/bar-{ylab}-{note}.pdf", bbox_inches='tight')
    plt.show()

In [None]:
def plot_hist_count_perc(df_all, col='percentage_watched', title='Histogram of the Percentage of Video Watched', xlabel=None):
    """Plot a 100-bin histogram of ``df_all[col]`` with counts (left axis) and percentages (right axis).

    Parameters
    ----------
    df_all : pandas.DataFrame
        Table holding the column to plot.
    col : str
        Column whose distribution is drawn; also part of the output file name.
    title : str
        Title of the figure.
    xlabel : str, optional
        Label of the x axis. When omitted, the original label is kept for the
        default column and the column name is used for any other column.

    The figure is written to ``figs/hist_{col}.pdf`` and then shown.
    """
    if xlabel is None:
        xlabel = 'Percentage of the video watched' if col == 'percentage_watched' else col

    fig, ax1 = plt.subplots(figsize=(8, 4))

    # Missing values cannot be binned, so they are left out of the histogram.
    counts, bins, patches = ax1.hist(df_all[col].dropna(), bins=100, alpha=0.9, color='purple')
    ax1.set_xlabel(xlabel)
    ax1.set_ylabel('Count')

    # The secondary axis shows the same bars as a share of all observations. Its limits
    # are derived from the count axis so that both scales describe the same bar heights.
    ax2 = ax1.twinx()
    total = counts.sum()
    if total > 0:
        count_low, count_high = ax1.get_ylim()
        ax2.set_ylim(count_low / total * 100, count_high / total * 100)
    ax2.set_ylabel('Percentage (%)')

    ax1.set_title(title)
    fig.tight_layout()
    # Create the output directory so that saving works on a fresh checkout.
    os.makedirs("figs", exist_ok=True)
    fig.savefig(f"figs/hist_{col}.pdf", bbox_inches='tight')
    plt.show()

In [None]:
def random_model(df_all_rf):
    """Evaluate a random baseline that predicts classes with the training class frequencies.

    The ``watched_until_end`` column of ``df_all_rf`` is split 80/20 (seed 42). Each
    test sample is predicted as 0 with the training frequency of class 0 and as 1
    otherwise, using a private random generator seeded with 42. Accuracy and macro
    precision/recall/F1 are printed and the confusion matrix is saved to
    ``figs/classification/cm-random.pdf``.
    """
    target = df_all_rf["watched_until_end"]

    # The split depends only on the number of rows and the seed, so splitting the
    # target alone yields the same partition as splitting features and target together.
    y2_train, y2_test = train_test_split(target, test_size=0.2, random_state=42)

    # Share of class 0 in the training split; 0.0 when the class does not occur.
    p0 = y2_train.value_counts(normalize=True).get(0, 0.0)

    # Draw the baseline predictions from a local generator, leaving the global
    # ``random`` state untouched.
    rng = random.Random(42)
    y2_pred = [0 if rng.random() <= p0 else 1 for _ in range(len(y2_test))]

    accuracy2 = accuracy_score(y2_test, y2_pred)
    precision2 = precision_score(y2_test, y2_pred, average="macro")
    recall2 = recall_score(y2_test, y2_pred, average="macro")
    F12 = f1_score(y2_test, y2_pred, average="macro")

    print("Accuracy:", accuracy2)
    print("Precision:", precision2)
    print("Recall:", recall2)
    print("F1:", F12)

    # Confusion matrix of the baseline
    cm2 = confusion_matrix(y2_test, y2_pred)
    ConfusionMatrixDisplay(confusion_matrix=cm2).plot(cmap=plt.cm.Oranges)
    os.makedirs("figs/classification", exist_ok=True)
    plt.savefig("figs/classification/cm-random.pdf", bbox_inches='tight')
    plt.show()

In [None]:
def deduplicate_by_mode(df, target_col="watched_until_end"):
    """Collapse rows that share all non-target columns, keeping the most frequent target.

    Rows are grouped by every column except ``target_col``; each group is reduced
    to one row whose target is the mode of the group (the first value returned by
    ``Series.mode`` when several values tie). The result has the grouping columns
    followed by ``target_col``.
    """
    def _most_frequent(values):
        # first entry of the mode, i.e. the most frequent value
        return values.mode().iloc[0]

    key_cols = [name for name in df.columns if name != target_col]
    grouped = df.groupby(key_cols, as_index=False)
    return grouped[target_col].agg(_most_frequent)

In [None]:
def get_prefix(name):
    """Return the part of a feature name that precedes its first underscore.

    A name without an underscore is returned unchanged.
    """
    prefix, _separator, _remainder = name.partition('_')
    return prefix

def randomforest_tiktok(X2_train, X2_test, y2_train, y2_test, shap_set=None, note=None):
    """Tune, fit and evaluate a random-forest classifier, optionally explaining it with SHAP.

    Steps: standardize the features (scaler fitted on the training split only), run a
    randomized hyper-parameter search with 5-fold cross-validation, evaluate the refitted
    best model on the test split (confusion matrix, accuracy, macro precision/recall/F1,
    AUC-ROC, ROC curve, F1-maximizing probability threshold), plot the feature
    importances (per feature and grouped by the prefix returned by ``get_prefix``) and,
    on request, draw one SHAP summary plot per class.

    Parameters
    ----------
    X2_train, X2_test : pandas.DataFrame
        Feature matrices; the columns of ``X2_train`` provide the feature names.
    y2_train, y2_test : array-like
        Targets. Column 1 of ``predict_proba`` is used as the positive-class score and
        the SHAP plot titles describe class 0 as "< 100%" and class 1 as "100%".
    shap_set : {"train", "test", None}
        Split on which SHAP values are computed; any other value skips the SHAP step.
    note : str, optional
        Tag inserted in the names of the PDF files written to ``figs/classification``.

    Returns
    -------
    RandomizedSearchCV
        The fitted search object.
    """
    feature_names = X2_train.columns
    class_names = {0:"< 100%", 1:"100%"}

    os.makedirs("figs/classification", exist_ok=True)

    # --- Standardize the data and re-wrap it as DataFrames to keep the column names ---
    sc = StandardScaler()
    X2_train_scaled_df = pd.DataFrame(sc.fit_transform(X2_train), columns=feature_names)
    X2_test_scaled_df = pd.DataFrame(sc.transform(X2_test), columns=feature_names)

    print("Train size:", X2_train_scaled_df.shape)
    print("Test size:", X2_test_scaled_df.shape)

    # --- Randomized hyper-parameter search ---
    param_dist = {
        "max_depth": range(1, 100),
        "min_samples_leaf": range(2, 20),
        "n_estimators": range(10, 100, 10)
    }

    rf = RandomForestClassifier(class_weight = "balanced", random_state=42)

    rand_search = RandomizedSearchCV(
        rf,
        param_distributions=param_dist,
        n_iter=10,
        cv=5,
        random_state=42,
        refit=True
    )

    rand_search.fit(X2_train_scaled_df, y2_train)
    y2_pred = rand_search.predict(X2_test_scaled_df)

    # --- Evaluation on the test split ---
    cm = confusion_matrix(y2_test, y2_pred)
    ConfusionMatrixDisplay(confusion_matrix=cm).plot(cmap=plt.cm.Oranges)
    plt.savefig(f"figs/classification/cm-{note}.pdf", bbox_inches='tight')
    plt.show()

    accuracy = accuracy_score(y2_test, y2_pred)
    precision = precision_score(y2_test, y2_pred, average="macro")
    recall = recall_score(y2_test, y2_pred, average="macro")
    f1 = f1_score(y2_test, y2_pred, average="macro")

    print("Accuracy:", accuracy)
    print("Precision:", precision)
    print("Recall:", recall)
    print("F1:", f1)

    # AUC-ROC from the scores of the second class
    y2_scores = rand_search.predict_proba(X2_test_scaled_df)[:, 1]
    auc = roc_auc_score(y2_test, y2_scores)
    print(f"AUC-ROC: {auc:.3f}")

    fpr, tpr, _ = roc_curve(y2_test, y2_scores)

    plt.figure(figsize=(6,6))
    plt.plot(fpr, tpr, label=f'AUC = {auc:.3f}')
    plt.plot([0, 1], [0, 1], 'k--')  # diagonal = chance level
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curve')
    plt.legend(loc='lower right')
    plt.grid(True)
    plt.show()

    # Probability threshold that maximizes the (binary) F1 score.
    # NOTE(review): the threshold is selected on the test split, so the reported value
    # is optimistic; it is only printed and does not affect the returned model.
    best_f1 = 0
    best_thresh = 0.5
    for thresh in np.linspace(0, 1, 101):
        # separate name: the macro F1 computed above must not be overwritten
        f1_at_thresh = f1_score(y2_test, (y2_scores >= thresh).astype(int))
        if f1_at_thresh > best_f1:
            best_f1, best_thresh = f1_at_thresh, thresh

    print(f"Best F1: {best_f1:.3f} at threshold {best_thresh:.2f}")

    # --- Feature importances ---
    best_rf = rand_search.best_estimator_
    feature_importances = pd.Series(best_rf.feature_importances_, index=feature_names)

    grouped_importances_sum = feature_importances.groupby(get_prefix).sum()
    grouped_importances_max = feature_importances.groupby(get_prefix).max()

    if len(feature_names) > 10:
        # many features: vertical bars, 50 most important only
        feature_importances.sort_values(ascending=False).head(50).plot.bar(figsize=(15, 5))
        plt.savefig(f"figs/classification/fi-{note}.pdf", bbox_inches='tight')
        plt.show()
        grouped_importances_sum.sort_values(ascending=False).plot.bar(figsize=(15, 5), title="Grouped Feature Importances (sum)")
        plt.show()
        grouped_importances_max.sort_values(ascending=False).plot.bar(figsize=(15, 5), title="Grouped Feature Importances (max)")
        plt.show()
    else:
        # few features: horizontal bars
        feature_importances.sort_values(ascending=True).plot.barh(figsize=(5, 5))
        plt.savefig(f"figs/classification/fi-{note}.pdf", bbox_inches='tight')
        plt.show()
        grouped_importances_sum.sort_values(ascending=False).plot.barh(figsize=(5, 5), title="Grouped Feature Importances (sum)")
        plt.show()
        grouped_importances_max.sort_values(ascending=False).plot.barh(figsize=(5, 5), title="Grouped Feature Importances (max)")
        plt.show()

    # --- SHAP analysis (optional) ---
    shap_inputs = {"test": X2_test_scaled_df, "train": X2_train_scaled_df}
    if shap_set not in shap_inputs:
        return rand_search

    X2_train_or_test_scaled_df = shap_inputs[shap_set]
    # Built with an f-string so that the default ``note=None`` does not raise.
    shap_note = f"{shap_set}set-{note}"

    explainer = shap.TreeExplainer(best_rf)
    raw_shap_values = explainer.shap_values(X2_train_or_test_scaled_df)
    if isinstance(raw_shap_values, list):
        # Some SHAP versions return one (samples, features) array per class;
        # stack them to the (samples, features, classes) layout used below.
        shap_values = np.stack(raw_shap_values, axis=-1)
    else:
        shap_values = np.asarray(raw_shap_values)

    # One beeswarm plot per class
    for i in range(shap_values.shape[2]):
        print(f"Generating SHAP beeswarm plot for class {i}")
        plt.figure()
        shap.summary_plot(shap_values[:, :, i], X2_train_or_test_scaled_df, show=False)
        plt.title("Class " + str(i) + ":\nUsers who watched " + class_names[i] + " of the video duration")
        plt.savefig(f"figs/classification/shap-{shap_note}-class-{i}.pdf", bbox_inches='tight')
        plt.show()

    return rand_search

## Dataset

In [None]:
# Load the experiment table; the first CSV column is used as the index.
df_all = pd.read_csv("data/data_survey&trace&interests_dummy.csv", index_col=0)
df_all

# EXPLORATORY ANALYSIS

In [None]:
# Smaller fonts for the two histograms; the notebook defaults are restored afterwards.
set_dim_fig((12,14,14))
for hist_col, hist_title in [
    ('percentage_watched', 'Histogram of the Percentage of Video Watched'),
    ('watched_until_end', 'Histogram of the Video Watched until the end vs. not'),
]:
    plot_hist_count_perc(df_all, col=hist_col, title=hist_title)
set_dim_fig()

In [None]:
# Display the experiment table again (unchanged since it was loaded).
df_all

In [None]:
# Per participant (tt_account): number of videos watched until the end (sum of the
# flag) and number of videos watched at all (count of rows).
df_p_watch = (
    df_all.groupby("tt_account")["watched_until_end"]
          .agg(watched_until_end="sum", watched="count")
)
df_p_watch.describe()

In [None]:
# CDF over participants of the number of videos watched (at all / until the end).
plot_cdf(df_p_watch, cols=["watched_until_end", "watched"], color=None, 
         xlab="Number of videos watched", ylab="Proportion of users", 
         note="playlist-final")

In [None]:
# Per video: number of participants who watched it until the end (sum of the flag)
# and number of participants who watched it at all (count of rows).
df_v_watch = (
    df_all.groupby("video_id_playlist_final")["watched_until_end"]
          .agg(watched_until_end="sum", watched="count")
)
df_v_watch.describe()

In [None]:
# CDF over videos of the number of participants who watched them (at all / until the end).
plot_cdf(df_v_watch, cols=["watched_until_end", "watched"], color=None, 
         xlab="Number of users", ylab="Proportion of videos", 
         note="playlist-final")

In [None]:
# One row per distinct (video, duration) pair, indexed by the unnamed video id.
df_v = (
    df_all[["video_id_playlist_final", "video_duration"]]
    .drop_duplicates()
    .set_index("video_id_playlist_final")
    .rename_axis(None)
)
df_v

In [None]:
# CDF of the video durations.
plot_cdf(df_v, cols=["video_duration"], color="green",
         xlab="Video duration (seconds)", ylab="Proportion of videos", 
         note="playlist-final")

In [None]:
# Attach to each video the number of participants who watched it at all ("watched")
# and until the end ("watched_until_end"), then derive the completion rate in percent.
# Merging from the "video_duration" column only keeps this cell idempotent: a re-run
# no longer merges the counts a second time (which created *_x / *_y columns).
df_v = df_v[["video_duration"]].merge(df_v_watch, left_index=True, right_index=True)
df_v["%watched_until_end"] = df_v["watched_until_end"] / df_v["watched"] * 100
df_v

In [None]:
# Share of participants who watched each video until the end, against its duration.
ax = df_v.plot.scatter("video_duration", "%watched_until_end")
ax.set_xlabel("Video duration (seconds)")
ax.set_ylabel("Percentage of participants \n who watched the video \n until the end")
duration_completion_corr = df_v["video_duration"].corr(df_v["%watched_until_end"])
ax.text(300, 70, "Correlation:" + str(round(duration_completion_corr, 2)), size=14)
# Values are on a 0-100 scale. A formatter (instead of fixed tick labels) keeps the
# labels attached to the right ticks whatever tick positions matplotlib chooses.
ax.yaxis.set_major_formatter(mtick.PercentFormatter(xmax=100, decimals=0))
os.makedirs("figs", exist_ok=True)
plt.savefig("figs/scatter-video-duration-perc_watched.pdf", bbox_inches='tight')
plt.show()

In [None]:
# Percentage of each video watched (one point per row of df_all), against its duration.
ax = df_all.plot.scatter("video_duration", "percentage_watched")
ax.set_xlabel("Video duration (seconds)")
ax.set_ylabel("Percentage of the \n video watched")
duration_percentage_corr = df_all["video_duration"].corr(df_all["percentage_watched"])
ax.text(300, 90, "Correlation = " + str(round(duration_percentage_corr, 2)), size=14)
# Values are on a 0-100 scale. A formatter (instead of fixed tick labels) keeps the
# labels attached to the right ticks whatever tick positions matplotlib chooses.
ax.yaxis.set_major_formatter(mtick.PercentFormatter(xmax=100, decimals=0))
os.makedirs("figs", exist_ok=True)
plt.savefig("figs/scatter-video-duration-perc_watched-video.pdf", bbox_inches='tight')
plt.show()

In [None]:
# Duration of every video as vertical bars; plot_bar swaps figsize for vertical
# bars, so (5,20) yields a 20x5 figure. Default font sizes are restored afterwards.
set_dim_fig((14,20,20))
plot_bar(df_stats=df_v, cols=["video_duration"], bar="v",
         xlab="Video ID", ylab="Video duration (seconds)", note="order", figsize=(5,20))
set_dim_fig()

In [None]:
# Completion rate of every video as vertical bars with percentage tick labels.
# Default font sizes are restored afterwards.
set_dim_fig((14,20,20))
plot_bar(df_stats=df_v, cols=["%watched_until_end"], bar="v", percentage=True,
         xlab="Video ID", ylab="Percentage of participants who \n watched the video until the end", note="order", figsize=(5,20))
set_dim_fig()

In [None]:
# Distinct (SEQUENTIAL_ID, tt_account) pairs, ordered by SEQUENTIAL_ID.
df_all[["SEQUENTIAL_ID", "tt_account"]].drop_duplicates().sort_values(by="SEQUENTIAL_ID")

In [None]:
# Participant x video matrix of the percentage watched (mean when a pair occurs
# more than once); pairs without any observation are marked with -1.
df_heatmap_aux = df_all[["video_id_playlist_final", "tt_account", "percentage_watched"]]
df_heatmap = df_heatmap_aux.pivot_table(
    values="percentage_watched",
    index="tt_account",
    columns="video_id_playlist_final",
).fillna(-1)
df_heatmap

In [None]:
# Replace the tt_account index by consecutive row numbers starting at 1.
# NOTE(review): the numbers follow the row order of the pivot table; presumably that
# order matches SEQUENTIAL_ID - confirm before reading rows as participant ids.
df_heatmap.index = pd.RangeIndex(start=1, stop=len(df_heatmap) + 1)
df_heatmap

In [None]:
# Heatmap of the percentage of each video watched by each participant.
set_dim_fig((14,20,20))

# Diverging colormap; values below vmin (the -1 placeholders used for missing
# participant/video pairs) are drawn in gray.
my_cmap = copy(plt.cm.RdBu_r)
my_cmap.set_under("gray")

fig, ax = plt.subplots(figsize=(20, 10))

sns.heatmap(
    df_heatmap, 
    cmap=my_cmap, 
    vmin=0, 
    vmax=100, 
    ax=ax,
    cbar_kws=dict(
        label='Percentage of the video watched',
        use_gridspec=False,
        location="right",
        pad=0.008
    )
)
plt.xlabel("Video ID")
plt.yticks(rotation=0)
plt.ylabel("Participant ID")

# Format colorbar ticks as percentages
cbar = ax.collections[0].colorbar
cbar.ax.yaxis.set_major_formatter(mtick.FormatStrFormatter('%.0f%%'))

# Manually adjust colorbar width only
#pos = cbar.ax.get_position()
#cbar.ax.set_position([pos.x0, pos.y0, 0.01, pos.height])  # Width only

plt.savefig(f"figs/heatmap.pdf", bbox_inches="tight")
plt.show()
# Restore the default font sizes
set_dim_fig()

## Comparison with TikTok statistics

In [None]:
# Load the reference statistics; the "%Female" and "%Male" columns are used below.
df_tiktok_stats = pd.read_csv("data/Prolific_groups.csv")
df_tiktok_stats

In [None]:
# Number of distinct participants per prolific_group and their share (in percent)
# of all participants.
group_sizes = (
    df_all[["prolific_group", "tt_account"]]
    .drop_duplicates()
    .groupby(by="prolific_group")
    .count()
)
df_prolific_groups = group_sizes.assign(
    Participants=group_sizes["tt_account"] / group_sizes["tt_account"].sum() * 100
)
df_prolific_groups

In [None]:
# Reference shares: the first five "%Female" values followed by the first five "%Male"
# values, assigned positionally to the rows of df_prolific_groups.
# NOTE(review): assumes the groups are ordered female-first in the same age order as
# the CSV rows - confirm against the data.
first_five = slice(None, 5)
stats_tt = [*df_tiktok_stats["%Female"][first_five], *df_tiktok_stats["%Male"][first_five]]
df_prolific_groups["TikTok statistics 2023"] = stats_tt
df_prolific_groups

In [None]:
# Participants per group compared with the reference statistics, as side-by-side bars.
print("Total participants: ", df_prolific_groups["tt_account"].sum())
plot_bar(df_stats=df_prolific_groups, cols=["Participants","TikTok statistics 2023"], bar="v", stacked=False, 
         xlab="Sex and age group", ylab="Percentage of users", figsize=(5,10))

# CLASSIFICATION

# TikTok Experiment data

## Feature selection

In [None]:
# One-hot encoding of the participant id (SEQUENTIAL_ID), one "User ID_<id>" column each
one_hot_user_df = pd.get_dummies(df_all['SEQUENTIAL_ID'], prefix='User ID')

# Columns removed before modelling
columns_to_drop = [
    'Date', 'Link', 'video_id', 'in_playlist', 'in_playlist_final', 'video_id_playlist',
    'watching_time', 'percentage_watched_float', 'percentage_watched', 'prolific_group',
    'SEQUENTIAL_ID', 'tt_account', 'like', 'comment', 'common_interests', 'all_interests',
    'num_topics_video', 'jacc_original_interests', 'video_saves', 'watched_6s+',
]

# Human-readable names for the remaining features
feature_labels = {
    "jacc_interests": "Interest Similarity",
    'video_id_playlist_final': "Video ID playlist",
    'video_duration': "Video duration",
    'video_likes': "Video num. likes",
    'video_shares': "Video num. shares",
    'video_comments': "Video num. comments",
    'video_plays': "Video num. plays",
}

df_all_rf = df_all.drop(columns=columns_to_drop).rename(columns=feature_labels)

## Random model

In [None]:
# Baseline: random predictions drawn with the training class frequencies.
random_model(df_all_rf)

In [None]:
# Shape of the feature matrix before and after removing duplicated feature rows
features_only = df_all_rf.drop(columns=["watched_until_end"])
print(features_only.shape, features_only.drop_duplicates().shape)

## Model all features

In [None]:
# Every column except the target is used as a feature
columns_rf = [name for name in df_all_rf.columns if name != 'watched_until_end']

In [None]:
# Feature matrix (all features) and binary target
X2_all = df_all_rf[columns_rf]
y2_all = df_all_rf["watched_until_end"]

In [None]:
# Split the data into training and test sets (80/20, fixed seed 42)
X2_train_all, X2_test_all, y2_train_all, y2_test_all = train_test_split(X2_all, y2_all, test_size=0.2, random_state=42)

In [None]:
# Fit and evaluate the model on all features. The font sizes set here are not reset.
# The commented-out calls run the same model with SHAP plots on the train / test split.
set_dim_fig((12,14,20))
rand_search_all = randomforest_tiktok(X2_train_all, X2_test_all, y2_train_all, y2_test_all, note="model_all")
#rand_search_all = randomforest_tiktok(X2_train_all, X2_test_all, y2_train_all, y2_test_all, shap_set="train", note="model_all")
#rand_search_all = randomforest_tiktok(X2_train_all, X2_test_all, y2_train_all, y2_test_all, shap_set="test", note="model_all")

## Model video metadata

In [None]:
# Video metadata features shared by the experiment and the real-world models
columns_meta = [
    "Video duration",
    "Video ID playlist",
    "Video num. plays",
    "Video num. comments",
    "Video num. shares",
    "Video num. likes",
]

# Metadata features + target, extended (row by row, via the index) with the
# one-hot encoded user id
df_meta_rf = (
    df_all_rf[columns_meta + ["watched_until_end"]]
    .merge(one_hot_user_df, left_index=True, right_index=True)
)

In [None]:
# Display the metadata + user-id modelling table
df_meta_rf

In [None]:
# Feature matrix (video metadata + one-hot user id) and binary target
X2_meta = df_meta_rf.drop(columns=['watched_until_end'])
y2_meta = df_meta_rf["watched_until_end"]

In [None]:
# Split the data into training and test sets (80/20, fixed seed 42)
X2_train_meta, X2_test_meta, y2_train_meta, y2_test_meta = train_test_split(X2_meta, y2_meta, test_size=0.2, random_state=42)

In [None]:
# Fit and evaluate the metadata model. The font sizes set here are not reset.
# The commented-out calls run the same model with SHAP plots on the train / test split.
set_dim_fig((20,20,20))
rand_search_meta = randomforest_tiktok(X2_train_meta, X2_test_meta, y2_train_meta, y2_test_meta, note="model_meta")
#rand_search_meta = randomforest_tiktok(X2_train_meta, X2_test_meta, y2_train_meta, y2_test_meta, shap_set="train", note="model_meta")
#rand_search_meta = randomforest_tiktok(X2_train_meta, X2_test_meta, y2_train_meta, y2_test_meta, shap_set="test", note="model_meta")

# TikTok real world data

In [None]:
# Load the real-world table; the first CSV column is used as the index.
df_real_world = pd.read_csv("data/data_donation_for_classification_allvideosNorthCentral America.csv", index_col=0)
df_real_world

## Selecting only the first K videos watched by each user

In [None]:
# Keep the rows whose playlist position is at most K and renumber the index from 0.
# NOTE(review): "<= K" keeps K+1 positions if "video_id_playlist" starts at 0 - confirm.
K = 104
df_real_world = df_real_world[df_real_world["video_id_playlist"] <= K].reset_index(drop=True)

In [None]:
# One-hot encoding of the user identifier (email_md5), one "User ID_<hash>" column each
one_hot_user_real_df = pd.get_dummies(df_real_world['email_md5'], prefix='User ID')

# Drop the columns not used for modelling and align the feature names with the
# experiment data
real_world_labels = {
    'video_id_playlist': "Video ID playlist",
    'video_duration': "Video duration",
    'video_digg_count': "Video num. likes",
    'video_share_count': "Video num. shares",
    'video_comment_count': "Video num. comments",
    'video_play_count': "Video num. plays",
}
df_real_world = (
    df_real_world
    .drop(columns=['percentage_watched', 'liked', 'watched_6s+'])
    .rename(columns=real_world_labels)
)

In [None]:
# Metadata features + target, extended (row by row, via the index) with the
# one-hot encoded user id
df_real_world = (
    df_real_world[columns_meta + ["watched_until_end"]]
    .merge(one_hot_user_real_df, left_index=True, right_index=True)
)

In [None]:
# Feature matrix (video metadata + one-hot user id) and binary target
X2_real_world_meta = df_real_world.drop(columns=['watched_until_end'])
y2_real_world = df_real_world["watched_until_end"]

In [None]:
# Shape of the feature matrix before and after removing duplicated feature rows
print(X2_real_world_meta.shape, X2_real_world_meta.drop_duplicates().shape)

In [None]:
# Split the data into training and test sets (80/20, fixed seed 42)
X2_train_real_world_meta, X2_test_real_world_meta, y2_train_real_world, y2_test_real_world = train_test_split(X2_real_world_meta, y2_real_world, test_size=0.2, random_state=42)

In [None]:
# Fit and evaluate the real-world metadata model. The font sizes set here are not reset.
# The commented-out calls run the same model with SHAP plots on the train / test split.
set_dim_fig((20,20,20))
rand_search_real_world_meta = randomforest_tiktok(X2_train_real_world_meta, X2_test_real_world_meta, y2_train_real_world, y2_test_real_world, note="model_real_world_meta")
#rand_search_real_world_meta = randomforest_tiktok(X2_train_real_world_meta, X2_test_real_world_meta, y2_train_real_world, y2_test_real_world, shap_set="train", note="model_real_world_meta")
#rand_search_real_world_meta = randomforest_tiktok(X2_train_real_world_meta, X2_test_real_world_meta, y2_train_real_world, y2_test_real_world, shap_set="test", note="model_real_world_meta")

# Summary Data for Classification: Real World vs. Experiment

In [None]:
# Summary of the real-world classification data.
# NOTE(review): the user and video counts are hard-coded, not computed from the
# data loaded above - confirm that they still match after the K filter.
print("******** Real World Data ********")
print("Users: 108 \nVideos: 6091")
print("Classification matrix:", X2_real_world_meta.shape)
# Class counts followed by class shares of the target
print(y2_real_world.value_counts(), y2_real_world.value_counts(normalize=True))

In [None]:
# Summary of the experiment classification data. The user and video counts are
# computed from df_all so that they cannot drift from the data actually loaded.
print("******** Experiment data ********")
n_users_experiment = df_all["tt_account"].nunique()
n_videos_experiment = df_all["video_id_playlist_final"].nunique()
print(f"Users: {n_users_experiment} \nVideos: {n_videos_experiment}")
print("Classification matrix (all features):", X2_all.shape)
print("Classification matrix (video metadata):", X2_meta.shape)
# Class counts followed by class shares of the target
print(y2_meta.value_counts(), y2_meta.value_counts(normalize=True))

In [None]:
# Number of rows per video and target class (counted on the "Video duration" column)
df_all_rf[["Video ID playlist", "watched_until_end", "Video duration"]].groupby(by=["Video ID playlist", "watched_until_end"]).count()