# Classifying framing in videos

Our aim is to investigate whether we can determine the kind of framing, episodic or thematic, that is used in news videos. 

## Limitations

Only a small number of labeled samples are available, even fewer of which have been labeled by experts (as opposed to the crowd). This places an upper bound on the generalizability of our models, and makes it more challenging to train deep models. Therefore, this will serve as a proof-of-concept study.

In [3]:
## prerequisites
#%pip install pandas
#%pip install numpy

## libraries
from collections import Counter
from math import log
import os
import os.path
import random
import re
import pandas as pd
import numpy as np

## project structure
# Root directory holding the annotation CSVs and the preprocessed arrays.
DATA_DIR = "/data/projects/capturingBias/research/framing/data/"  # change to "./" for current directory
CROWD_RESULTS = os.path.join(DATA_DIR, "120CSexperimentCrowdResults.csv")
CROWD_FILTERS = os.path.join(DATA_DIR, "crowd_data_filtered_worker_ip_and_gender_and_type_and_title.csv")
EXPERT_RESULTS = os.path.join(DATA_DIR, "expert_annotations_aggregated.csv")
DATA_NPZ = os.path.join(DATA_DIR, "sequences_preprocessed.npz")
TARGETS_NPZ = os.path.join(DATA_DIR, "targets.npz")

## load files
# the crowd export is read with ';' as separator, the other two with the default ','
crowd_results = pd.read_csv(CROWD_RESULTS, delimiter=';')
expert_results = pd.read_csv(EXPERT_RESULTS)
crowd_filters = pd.read_csv(CROWD_FILTERS)

# NOTE(review): allow_pickle=True lets np.load unpickle object arrays, which can
# execute arbitrary code - only load archives from a trusted source.
data = np.load(DATA_NPZ, allow_pickle=True)
video_idx = data['video_idx']  # array of video IDs; its order defines the sample index used below

In [4]:
def set_seed(seed=-1):
    """Seed Python's ``random`` module and NumPy's global random generator.

    Parameters
    ----------
    seed : int, optional
        Seed to apply. A negative value (the default) draws a random seed.

    Returns
    -------
    int
        The seed that was applied, so that a randomly drawn seed can be
        logged and reused to reproduce a run.
    """
    if seed < 0:
        # Explicit 64-bit dtype: where NumPy's default integer is 32-bit, an
        # upper bound of 2**32-1 is out of range and randint raises.
        seed = int(np.random.randint(0, 2**32-1, dtype=np.int64))

    # NOTE: this only affects processes started afterwards; the hash seed of
    # the interpreter that is already running is fixed at start-up.
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    np.random.seed(seed)

    return seed
    
#set_seed(47)  # make reproducible

# Preprocess Data

In [5]:
## Filter crowd?
# Toggle: keep only the responses of workers that appear in the filter file.
FILTER_CROWD = True

print("Crowd responses: %i" % len(crowd_results))
if FILTER_CROWD:
    good_raters = np.unique(crowd_filters['_worker_id'].values)
    # .copy() gives an independent frame, so the in-place assignments made to
    # crowd_results further down do not trigger pandas' SettingWithCopyWarning.
    crowd_results = crowd_results[crowd_results['_worker_id'].isin(good_raters)].copy()
    print("Crowd responses remain after filtering: %i" % len(crowd_results))

Crowd responses: 1860
Crowd responses remain after filtering: 930


In [6]:
## fix missing video IDs
# Rows whose display_id holds the placeholder '#NAME?' get their ID recovered
# from the video link. str.lstrip() removes a *set of characters* rather than a
# prefix, so it could also strip leading characters of the ID itself; the link
# is therefore split on the query marker instead.
VIDEO_ID_MARKER = 'watch?v='
for index, row in crowd_results.loc[crowd_results['display_id'] == '#NAME?'].iterrows():
    _, marker, video_id = row['link'].partition(VIDEO_ID_MARKER)
    if marker:  # keep the placeholder when the link has no recognisable marker
        crowd_results.loc[index, 'display_id'] = video_id

# Inspect data

In [8]:
# show the second row of each annotation table as an example record
rule = '='*70
print("\nCrowd Results\n" + rule)
print(crowd_results.iloc[1])

print("\nExpert Results\n" + rule)
print(expert_results.iloc[1])

print("\nStatistics\n" + rule)
# number of annotation rows per video, and the distinct videos per source
crowd_labels_per_video = crowd_results['display_id'].value_counts().values
expert_labels_per_video = expert_results['display_id'].value_counts().values
crowd_videos_uniq = np.unique(crowd_results['display_id'].values)
expert_videos_uniq = np.unique(expert_results['display_id'].values)

n_expert_videos = expert_videos_uniq.shape[0]
n_crowd_videos = crowd_videos_uniq.shape[0]
mean_crowd_labels = crowd_labels_per_video.sum()/crowd_labels_per_video.shape[0]
# expert-labelled videos that also occur among the crowd-labelled ones
n_shared_videos = np.isin(expert_videos_uniq, crowd_videos_uniq).sum()

print(" - experts watched {} videos".format(n_expert_videos))
print(" - crowd watched {} videos ({} average labels per video)".format(n_crowd_videos,
                                                                        mean_crowd_labels))
print("   {} of which are also labeled by our experts".format(n_shared_videos))


Crowd Results
_trust                                                            0.5357
_worker_id                                                      45527958
_country                                                             EGY
_region                                                                5
_city                                                              Tanta
_ip                                                       197.246.228.58
gender                                                              male
VideoPortionWatched                               only_part_of_the_video
Trustworthiness                                                        4
Emotion                                                                5
VideoCategory                                                 world_news
Frame                                                                  4
PersonalStory                                                        yes
NoteworthyKeywords     NASA suspends

In [10]:
## return majority class per video or, if no majority, return mid point if exists,
## else return random selection from most common score
def create_targets(video_ids, annotations):
    """Reduce the individual ``Frame`` annotations to one target per video.

    For every video the most frequent label wins. When several labels share
    the highest count, their mean is used if it is a whole number; otherwise
    one of the tied labels is drawn with ``np.random.choice``.

    Parameters
    ----------
    video_ids : iterable
        The display IDs to produce targets for.
    annotations : pandas.DataFrame
        One row per annotation, with columns ``display_id`` and ``Frame``.

    Returns
    -------
    dict
        Maps display ID to its target label. Videos in ``video_ids`` without
        any annotation are left out.

    Raises
    ------
    KeyError
        If an annotation refers to a display ID that is not in ``video_ids``.
    """
    labels = {display_id: [] for display_id in video_ids}
    for _, row in annotations.iterrows():
        labels[row['display_id']].append(row['Frame'])

    targets = dict()
    for display_id, label_set in labels.items():
        if not label_set:
            # nothing to vote on; max() below would fail on an empty sequence
            continue

        ct = Counter(label_set)
        ct_max = max(ct.values())
        majority_vote = [label for label, count in ct.items() if count == ct_max]

        if len(majority_vote) == 1:
            targets[display_id] = majority_vote[0]
            continue

        # different labels with same number of votes
        mid_point = sum(majority_vote)/len(majority_vote)
        if mid_point.is_integer():  # whole number
            targets[display_id] = int(mid_point)
        else:  # random selection
            targets[display_id] = np.random.choice(majority_vote)

    return targets

def create_splits(n, split_fraction=0.8):
    """Shuffle the indices ``0..n-1`` and cut them into two parts.

    Parameters
    ----------
    n : int
        Number of samples.
    split_fraction : float, optional
        Share of the indices that goes into the first part (default 0.8).

    Returns
    -------
    tuple of numpy.ndarray
        ``(first, second)``: the first ``int(n*split_fraction)`` shuffled
        indices and the remaining ones.

    Raises
    ------
    ValueError
        If ``split_fraction`` lies outside the range [0, 1].
    """
    if not 0 <= split_fraction <= 1:
        raise ValueError("split_fraction must lie in [0, 1], got %r" % (split_fraction,))

    sample_idx = np.arange(n)
    np.random.shuffle(sample_idx)  # uses NumPy's global random state

    cut = int(n*split_fraction)
    return (sample_idx[:cut], sample_idx[cut:])

In [13]:
## create mappings
# display ID -> position of that video in video_idx
video_idx_map = dict(zip(video_idx, range(len(video_idx))))
# position -> display ID (inverse of the mapping above)
idx_video_map = dict(zip(video_idx_map.values(), video_idx_map.keys()))
# display IDs that occur in the crowd annotations, the expert annotations, or both
labeled_samples_ids = np.union1d(crowd_videos_uniq, expert_videos_uniq)
# positions of those videos, in the order in which they appear in video_idx_map
labeled_samples_idx = [video_idx_map[display_id] for display_id in video_idx_map
                            if display_id in labeled_samples_ids]

In [15]:
num_samples = video_idx.shape[0]

# Every label array has one slot per video in video_idx; -1 means "no label".

## generate labels - 7 point Likert scale
y_likert_crowd = np.full(num_samples, -1.0)
crowd_targets = create_targets(crowd_videos_uniq, crowd_results)
for video_id, label in crowd_targets.items():
    y_likert_crowd[video_idx_map[video_id]] = label - 1  # 0-based

## expert labels - Likert score and binary framing type, filled in one pass
y_likert_experts = np.full(num_samples, -1.0)
y_dominant_experts = np.full(num_samples, -1.0)
for _, row in expert_results.iterrows():
    position = video_idx_map[row.display_id]
    y_likert_experts[position] = row.framing_score - 1  # 0-based
    # "Thematic" -> 0, anything else -> 1 (episodic)
    y_dominant_experts[position] = 0 if row.framing_type == "Thematic" else 1

## alternate labels - binary classification of framing type
# Crowd scores below the scale's middle value (3 when 0-based) become class 0,
# scores above it class 1; the middle value itself and unlabelled videos stay -1.
# NOTE(review): this assumes low crowd scores correspond to the experts'
# "Thematic" class - confirm the direction of the scale.
y_dominant_crowd = np.full(num_samples, -1.0)
y_dominant_crowd[(y_likert_crowd >= 0) & (y_likert_crowd < 3)] = 0
y_dominant_crowd[y_likert_crowd > 3] = 1

## combined set - no distinction between experts and crowd
# expert labels are preferred; crowd labels only fill the slots experts left at -1
y_likert_combined = y_likert_experts.copy()
copy_idx = np.flatnonzero(y_likert_combined == -1)
y_likert_combined[copy_idx] = y_likert_crowd[copy_idx]

y_dominant_combined = y_dominant_experts.copy()
copy_idx = np.flatnonzero(y_dominant_combined == -1)
y_dominant_combined[copy_idx] = y_dominant_crowd[copy_idx]

# Save data

In [17]:
# Collect every array under the key it gets inside the archive, then write
# them all to TARGETS_NPZ in a single compressed file.
target_arrays = {
    'target_idx': labeled_samples_idx,
    'y_likert_crowd': y_likert_crowd,
    'y_likert_experts': y_likert_experts,
    'y_dominant_crowd': y_dominant_crowd,
    'y_dominant_experts': y_dominant_experts,
    'y_likert_combined': y_likert_combined,
    'y_dominant_combined': y_dominant_combined,
}
np.savez_compressed(TARGETS_NPZ, **target_arrays)