# Classifying framing in videos

Our aim is to investigate whether we can determine the kind of framing, episodic or thematic, that is used in news videos. 

## Limitations

Only a small number of labeled samples are available, even fewer of which have been labeled by experts (as opposed to the crowd). This places an upper bound on the generalizability of our models, and makes it more challenging to train deep models. Therefore, this will serve as a proof-of-concept study.

In [1]:
## prerequisites
#%pip install pandas
#%pip install numpy
#%pip install gensim
#%pip install nltk

## libraries
from collections import Counter
from math import log
from functools import reduce
import os
import os.path
import random
import re
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from gensim.models import Word2Vec
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from tqdm import tqdm

## project structure
# All paths are built by plain string concatenation, so DATA_DIR must end
# with a path separator.
DATA_DIR = "/data/projects/capturingBias/research/framing/data/"  # change to "./" for current directory
VIDEO_METADATA = DATA_DIR + "2014_metadata.csv"
VIDEO_TRANSCRIPTIONS = DATA_DIR + "2014_transcripts_months_1to4.csv"
CROWD_LABELS_FULL = DATA_DIR + "crowd_annotations_aggregated_all_videos.csv"  # old and new annotations
CROWD_LABELS_PILOT = DATA_DIR + "crowd_annotations_aggregated_pilot_videos.csv"  # subset of above for expert videos
EXPERT_LABELS = DATA_DIR + "dominant_frame_expert_labels_main_experiment_oana.csv"
# Output archive written by the np.savez_compressed() call in the "Save data" section.
DATA_NPZ = DATA_DIR + "data2021.npz"
# NOTE(review): not referenced anywhere else in the visible part of this
# notebook; presumably the location of saved word vectors -- confirm.
WORDVECTORS_KV = DATA_DIR + "wordvectors2021.kv"

## load files
# The metadata file is semicolon-delimited; the other CSV files are read
# with pandas' default delimiter.
video_metadata = pd.read_csv(VIDEO_METADATA, delimiter=';')
video_transcriptions = pd.read_csv(VIDEO_TRANSCRIPTIONS)
crowd_labels = pd.read_csv(CROWD_LABELS_FULL)
crowd_labels_pilot = pd.read_csv(CROWD_LABELS_PILOT)
expert_labels_pilot = pd.read_csv(EXPERT_LABELS)

## download wordnet vocabulary used in preprocessing the transcriptions
nltk.download('wordnet')
nltk.download('stopwords')
# English stop words as a set; prep_text() tests every token against it.
stops = set(stopwords.words("english"))

[nltk_data] Downloading package wordnet to /home/xander/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /home/xander/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
def set_seed(seed=-1):
    """Seed Python's and NumPy's global random generators.

    A negative ``seed`` (the default) means "pick one for me": a seed is
    drawn from NumPy's global generator first. The seed that was actually
    applied is returned so it can be logged and reused.

    NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up,
    so setting it here presumably only affects child processes -- confirm.
    """
    chosen = np.random.randint(0, 2**32-1) if seed < 0 else seed

    os.environ['PYTHONHASHSEED'] = str(chosen)
    for seeder in (random.seed, np.random.seed):
        seeder(chosen)

    return chosen


# no seed is passed, so a fresh one is drawn; printing it lets a later run
# reproduce this one by calling set_seed(<printed value>)
print(set_seed())

972917174


In [3]:
# Trim every table down to the columns the rest of the notebook reads.
_label_columns = ['display_id', 'framing_type']
_transcription_columns = ['display_id', 'clean_text']
_metadata_columns = ['display_id', 'fulltitle', 'description', 'tags']

# both crowd tables share the same two columns
crowd_labels = crowd_labels[_label_columns]
crowd_labels_pilot = crowd_labels_pilot[_label_columns]

video_transcriptions = video_transcriptions[_transcription_columns]
video_metadata = video_metadata[_metadata_columns]

# Verify Data

In [4]:
# show the first five rows of each label table:
# experts, crowd (pilot subset), crowd (all videos)
for _label_table in (expert_labels_pilot, crowd_labels_pilot, crowd_labels):
    print(_label_table.head(n=5))

    display_id framing_type
0  -2xBhpFi9JU     Thematic
1  -7xsam1-KSQ     Episodic
2  0h91y8pUuEc     Episodic
3  1eDeqiV3mnQ     Episodic
4  1r79RJzMGOQ     Episodic
    display_id framing_type
0  C9SdBZamyCE     thematic
1  5HqBX0TJV-U     thematic
2  I-b5s8ImMGE     thematic
3  yfX6U4je05g     episodic
4  e5Y_wjT-530     thematic
    display_id framing_type
0  C9SdBZamyCE     thematic
1  5HqBX0TJV-U     thematic
2  I-b5s8ImMGE     thematic
3  yfX6U4je05g     episodic
4  e5Y_wjT-530     thematic


# Preprocess Data

In [5]:
## preprocess sequences
# NOTE: despite its name, `stemmer` is a WordNet lemmatizer, not a stemmer;
# prep_text() below calls its lemmatize() method on every token it keeps.
stemmer = WordNetLemmatizer()
def prep_text(s):
    """Normalise raw text into a string of lowercase, lemmatized tokens.

    Processing steps:
      1. every non-word character (``\\W``) is replaced by a space;
      2. the text is lowercased and split on whitespace;
      3. single-character tokens are dropped;
      4. the remaining tokens are lemmatized with the module-level
         ``stemmer`` and tokens found in the module-level ``stops`` set
         are removed.

    Parameters
    ----------
    s : str
        Raw text. ``None`` and NaN (what pandas yields for an empty CSV
        cell) are treated as empty text; any other non-string value is
        converted with ``str()`` first.

    Returns
    -------
    tuple
        ``(document, doc_length)``: the tokens joined by single spaces and
        the number of tokens.
    """
    # A missing value cannot be fed to re.sub(); treat it as "no text"
    # instead of raising a TypeError in the middle of a long run.
    if s is None or (isinstance(s, float) and s != s):
        return ('', 0)
    if not isinstance(s, str):
        s = str(s)

    # Remove all the special characters, then convert to lowercase.
    # Lowercasing happens before tokenising so that token lengths are
    # measured on the final form of each token.
    document = re.sub(r'\W', ' ', s).lower()

    # The length filter below removes every single-character token, which
    # makes separate regex passes for stray single letters (including one
    # that could never match because its '^' anchor was escaped) and for
    # collapsing whitespace unnecessary: str.split() already handles runs
    # of whitespace.
    tokens = [stemmer.lemmatize(word) for word in document.split() if len(word) > 1]

    # Stop words are removed after lemmatization
    tokens = [word for word in tokens if word not in stops]

    return (' '.join(tokens), len(tokens))

## drop duplicates (the first row of every display_id is kept)
video_transcriptions = video_transcriptions.drop_duplicates(subset='display_id', ignore_index=True)
video_metadata = video_metadata.drop_duplicates(subset='display_id', ignore_index=True)

## remove rows with missing transcriptions
video_transcriptions = video_transcriptions[video_transcriptions['clean_text'].notna()]

## keep only the videos for which we have both metadata and a transcription
video_ids = np.intersect1d(video_metadata['display_id'].values, video_transcriptions['display_id'].values)
# .copy() detaches the filtered frames from the frames they were sliced
# from, so the column assignments below write to independent data instead
# of triggering pandas' chained-assignment (SettingWithCopy) handling.
video_metadata = video_metadata[video_metadata['display_id'].isin(video_ids)].copy()
video_transcriptions = video_transcriptions[video_transcriptions['display_id'].isin(video_ids)].copy()

# sanity check: both tables must now describe the same number of videos
assert len(video_metadata) == len(video_transcriptions)

## process text
# Each column is rebuilt in a single assignment; writing cell by cell with
# .loc inside iterrows() is far slower and gives the same result.
# prep_text() returns (text, n_tokens); only the text is kept.
video_transcriptions['clean_text'] = [
    prep_text(text)[0]
    for text in tqdm(video_transcriptions['clean_text'], total=len(video_transcriptions))
]

_titles, _descriptions, _tags = [], [], []
for _title, _description, _tag_string in tqdm(zip(video_metadata['fulltitle'],
                                                   video_metadata['description'],
                                                   video_metadata['tags']),
                                               total=len(video_metadata)):
    _titles.append(prep_text(_title)[0])
    _descriptions.append(prep_text(_description)[0])
    # tags are stored as one '+'-separated string; '+' is a non-word
    # character, so prep_text() already turns every separator into a space
    _tags.append(prep_text(_tag_string)[0])

video_metadata['fulltitle'] = _titles
video_metadata['description'] = _descriptions
video_metadata['tags'] = _tags

100%|██████████| 10493/10493 [00:45<00:00, 231.75it/s]
100%|██████████| 10493/10493 [00:14<00:00, 727.61it/s]


# Inspect data

In [6]:
_rule = '='*70

print("\nStatistics\n" + _rule)

# unique video ids per annotation source
crowd_videos_uniq = np.unique(crowd_labels['display_id'].values)
crowd_videos_pilot_uniq = np.unique(crowd_labels_pilot['display_id'].values)
expert_videos_uniq = np.unique(expert_labels_pilot['display_id'].values)

# how many annotated videos also occur in the transcription table / in the
# crowd pilot table
_experts_in_data = np.isin(expert_videos_uniq, video_transcriptions['display_id']).sum()
_crowd_in_data = np.isin(crowd_videos_uniq, video_transcriptions['display_id']).sum()
_experts_in_pilot = np.isin(expert_videos_uniq, crowd_videos_pilot_uniq).sum()

print(" - experts watched {} videos".format(expert_videos_uniq.shape[0]))
print("   {} of which are part of our 120 videos dataset".format(_experts_in_data))
print(" - crowd watched {} videos".format(crowd_videos_uniq.shape[0]))
print("   {} of which are part of our 120 videos dataset".format(_crowd_in_data))
print("   {} of which are also labeled by our experts\n\n".format(_experts_in_pilot))

# pick the first video id that is present in every table as a running example
_shared_ids = video_metadata['display_id'].values
for _other_ids in (video_transcriptions['display_id'].values,
                   expert_videos_uniq,
                   crowd_videos_pilot_uniq):
    _shared_ids = np.intersect1d(_shared_ids, _other_ids)
sample = _shared_ids[0]

# print the rows of the example video from each table under its own heading
for _heading, _table in (("Video Metadata\n", video_metadata),
                         ("\nVideo Transcriptions\n", video_transcriptions),
                         ("\nCrowd Labels\n", crowd_labels),
                         ("\nCrowd Labels Pilot\n", crowd_labels_pilot),
                         ("\nExpert Results\n", expert_labels_pilot)):
    print(_heading + _rule)
    print(_table[_table['display_id']==sample])



Statistics
 - experts watched 58 videos
   58 of which are part of our 120 videos dataset
 - crowd watched 120 videos
   120 of which are part of our 120 videos dataset
   58 of which are also labeled by our experts


Video Metadata
        display_id                                          fulltitle  \
10390  -2xBhpFi9JU  exporting chaos west spent billion destabilizi...   

                                             description  \
10390  despite previously supporting self determinati...   

                                                    tags  
10390  rt russia today kiev crimea anastasia churkina...  

Video Transcriptions
       display_id                                         clean_text
6575  -2xBhpFi9JU  despite previously supporting self determinati...

Crowd Labels
     display_id framing_type
12  -2xBhpFi9JU     thematic

Crowd Labels Pilot
     display_id framing_type
12  -2xBhpFi9JU     thematic

Expert Results
    display_id framing_type
0  -2xBhpFi9JU     Themati

# Datasets

Our input data consist of the concatenation of transcriptions, titles, descriptions, and tags---all of which can be acquired from the video. For our traditional machine learning models we need a 1-dimensional fixed-length input vector per sequence, which is learned using Doc2Vec. A 2-dimensional variable-width vector per sequence will be used for our deep models, for which we use pre-trained Word2Vec vectors. In all cases we use the dominant framing score as labels. We treat the expert labels as the gold standard.

In [7]:
## convert text to tensor of size NUM_SEQUENCES x MAX_SEQUENCE_LENGTH x 300
def vectorize_sequences3D(sequences, sequence_length, vector_length=300, model=None):
    """Turn token sequences into a zero-padded tensor of word vectors.

    Parameters
    ----------
    sequences : sequence of token sequences
        One sequence of string tokens per document.
    sequence_length : int
        Size of the time axis. Longer sequences are truncated, shorter ones
        are padded with zero vectors.
    vector_length : int, default 300
        Number of leading components taken from every word vector.
    model : mapping-like, optional
        Source of the word vectors: anything that supports ``model[term]``
        and raises ``KeyError`` for unknown terms, or an object exposing
        such a mapping as ``.wv``. Defaults to the module-level
        ``w2v_model``.
        NOTE(review): ``w2v_model`` is not defined in the visible part of
        this notebook -- confirm where it is supposed to come from.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(sequences), sequence_length, vector_length)``.
    """
    if model is None:
        model = w2v_model  # fallback keeps the original call signature working
    # In gensim >= 4 the model object is not subscriptable; the vectors live
    # on its `.wv` attribute. Plain mappings / KeyedVectors are used as-is.
    vectors = getattr(model, 'wv', model)

    n = len(sequences)
    a = np.zeros((n, sequence_length, vector_length))  # time on vertical axis; zero padding
    unseen = dict()
    for i, terms in enumerate(sequences):
        # truncate so an over-long sequence cannot index past the time axis
        for j, term in enumerate(terms[:sequence_length]):
            if term == '':
                # padding token produced by mkdata(): keep the zero vector
                continue
            try:
                wv = vectors[term][:vector_length]
            except KeyError:
                # words for which we don't have a vector get a random one;
                # the same word always receives the same random vector
                if term not in unseen:
                    unseen[term] = np.random.rand(vector_length)
                wv = unseen[term]

            a[i, j, :] = wv

    return a

## convert text to matrix of size NUM_SEQUENCES x 300
def vectorize_sequences2D(model, idx, vector_length=300):
    """Collect the document vectors of the requested documents in a matrix.

    ``model`` must expose a ``dv`` lookup (as a trained Doc2Vec model does)
    and ``idx`` lists the document tags to fetch. Row ``k`` of the returned
    ``len(idx) x vector_length`` array holds ``model.dv[idx[k]]``.
    """
    doc_vectors = np.zeros((len(idx), vector_length))
    for row, tag in enumerate(idx):
        doc_vectors[row] = model.dv[tag]

    return doc_vectors

## learn word embeddings using Word2vec
def train_word_embeddings(sequences, vector_length=300):
    """Train and return a Word2Vec model on ``sequences``.

    The embedding size is ``vector_length`` and training uses four worker
    threads; no other Word2Vec option is passed.
    """
    options = dict(vector_size=vector_length, workers=4)
    return Word2Vec(sequences, **options)

## learn sequence vectors using Doc2Vec
def train_sequence_embeddings(sequences, vector_length=300):
    """Train a Doc2Vec model with one document vector per input sequence.

    Parameters
    ----------
    sequences : iterable of token sequences
        One sequence of string tokens per document. Empty-string tokens
        (the padding mkdata() appends) are removed before training.
    vector_length : int, default 300
        Dimensionality of the learned document vectors.

    Returns
    -------
    Doc2Vec
        The trained model. Document ``i`` of ``sequences`` is tagged with
        the integer ``i``, so its vector is available as ``model.dv[i]``.
    """
    corpus = list()
    for i, terms in enumerate(sequences):
        # Drop padding tokens. The test has to look at each *term*: testing
        # the length of the whole sequence keeps every '' token, which would
        # put the padding symbol into the vocabulary of every document.
        terms = [term for term in terms if len(term) > 0]
        corpus.append(TaggedDocument(terms, [i]))

    model = Doc2Vec(vector_size=vector_length, min_count=2, epochs=300)
    model.build_vocab(corpus)
    model.train(corpus, total_examples=model.corpus_count, epochs=model.epochs)

    return model

def create_splits(n):
    """Randomly split the indices ``0 .. n-1`` into two disjoint parts.

    The indices are shuffled with NumPy's global random generator; the
    first ``int(n*0.8)`` shuffled indices form the first part and the
    remainder the second. Returns the tuple ``(first_part, second_part)``.
    """
    shuffled = np.arange(n)
    np.random.shuffle(shuffled)

    cut = int(n*0.8)
    first_part, second_part = shuffled[:cut], shuffled[cut:]
    return (first_part, second_part)

def mklists(*args):
    """Tokenise every non-empty sentence of every given collection.

    Each positional argument is an iterable of sentences (strings). The
    result is one flat list holding the whitespace-split tokens of each
    sentence, in input order; empty sentences are left out.
    """
    return [sentence.split()
            for collection in args
            for sentence in collection
            if len(sentence) > 0]

def mkdata(*args):
    """Build a row-aligned token matrix from one or more text columns.

    Every positional argument is an array of strings with one entry per
    document; ``args[0].shape[0]`` determines the number of rows. Each
    column is split on whitespace and padded with ``''`` tokens to the
    length of its longest entry. The padded token lists of all columns are
    then concatenated per row.

    Row ``i`` of the result always belongs to entry ``i`` of the inputs:
    an empty (or non-string, e.g. NaN) entry contributes padding only. It
    must not be skipped, because skipping would shift every later document
    one row up and break any index-based lookup into the result.

    Returns
    -------
    numpy.ndarray
        Object array with one row of tokens per document.
    """
    data = [list() for _ in range(args[0].shape[0])]
    for a in args:
        # tokenise the whole column first so the padding width is known
        tokenized = [row.split() if isinstance(row, str) else [] for row in a]
        max_length = max((len(tokens) for tokens in tokenized), default=0)

        for i, tokens in enumerate(tokenized):
            tokens.extend([''] * (max_length - len(tokens)))
            data[i].extend(tokens)

    return np.array(data, dtype=object)

In [8]:
## sort using same index so video_metadata[i] matches video_transcriptions[i]
video_metadata = video_metadata.sort_values(by='display_id').set_index('display_id')
# line the transcriptions up with the (sorted) metadata order
video_transcriptions = (video_transcriptions.set_index('display_id')
                                            .reindex(index=video_metadata.index)
                                            .reset_index())
video_metadata = video_metadata.reset_index()

## create mappings
idx_video_map = dict(enumerate(video_metadata['display_id'].values))  # i:display_id
video_idx_map = dict()  # display_id:i
for _position, _display_id in idx_video_map.items():
    video_idx_map[_display_id] = _position

# all video_ids we have annotations for (crowd and/or experts)
labeled_samples_ids = np.union1d(crowd_videos_uniq, expert_videos_uniq)
labeled_video_idx_map = dict()  # display_id:i, annotated videos only
for _display_id, _position in video_idx_map.items():
    if _display_id in labeled_samples_ids:
        labeled_video_idx_map[_display_id] = _position

## get idx of labeled samples in unfiltered list
labeled_idx = list(labeled_video_idx_map.values())
labeled_idx.sort()  # ensure natural ordering

In [9]:
# One Doc2Vec model is trained per text field on all videos; afterwards only
# the vectors of the labeled videos are collected.
# (field, text column, embedding size)
_embedding_jobs = (
    ('title', video_metadata['fulltitle'], 4),
    ('transcriptions', video_transcriptions['clean_text'], 25),
    ('descriptions', video_metadata['description'], 10),
)

_embedded = dict()
for _field, _column, _dim in _embedding_jobs:
    # create data matrix for this field only
    data = mkdata(_column.values)
    print("Data matrix size: %d" % data.shape[0])

    ## train doc2vec
    print("training Doc2Vec model")
    d2v_model = train_sequence_embeddings(data, vector_length=_dim)

    print("vectorizing 2D data")
    _embedded[_field] = vectorize_sequences2D(d2v_model,
                                              labeled_idx,
                                              vector_length=_dim)

X_2D_title = _embedded['title']
X_2D_transcriptions = _embedded['transcriptions']
X_2D_descriptions = _embedded['descriptions']

Data matrix size: 10493
training Doc2Vec model
vectorizing 2D data
Data matrix size: 10493
training Doc2Vec model
vectorizing 2D data
Data matrix size: 10493
training Doc2Vec model
vectorizing 2D data


In [10]:
# remap to subset (assumes natural ordering):
# display_id -> row position within the labeled subset
labeled_video_idx_map = dict()
for _subset_row, _full_idx in enumerate(labeled_idx):
    labeled_video_idx_map[idx_video_map[_full_idx]] = _subset_row

In [11]:
num_samples = X_2D_title.shape[0]
print("%d labeled samples" % num_samples)

# numeric class per framing type; -1 is used throughout for "no usable label"
_framing_classes = {"thematic": 0, "episodic": 1}

# display_id -> framing type, built once instead of filtering the frame for
# every video; drop_duplicates() keeps the first row per video
_expert_framing = (expert_labels_pilot.drop_duplicates(subset='display_id')
                                      .set_index('display_id')['framing_type']
                                      .to_dict())
_crowd_framing = (crowd_labels.drop_duplicates(subset='display_id')
                              .set_index('display_id')['framing_type']
                              .to_dict())

y_experts = -np.ones(num_samples)
for display_id, idx in labeled_video_idx_map.items():
    if display_id not in _expert_framing:  # not in pilot
        continue

    framing_type = _expert_framing[display_id].lower()
    if framing_type not in _framing_classes:
        # report and leave the entry at -1; assigning here would reuse the
        # label of whichever video happened to be processed before
        print("Unexpected expert label: %s" % framing_type)
        continue

    y_experts[idx] = _framing_classes[framing_type]

y_crowd = -np.ones(num_samples)
for display_id, idx in labeled_video_idx_map.items():
    if display_id not in _crowd_framing:
        # video was annotated by experts only; stays -1
        continue

    framing_type = _crowd_framing[display_id].lower()
    if framing_type in _framing_classes:
        y_crowd[idx] = _framing_classes[framing_type]
    elif framing_type == "episodic, thematic":
        # let expert vote solve stalemate (since we don't have a gold standard for this case anyway)
        # defaults to -1 if no expert vote exists for this video
        y_crowd[idx] = y_experts[idx]
    else:
        # report and leave the entry at -1 (see the expert loop above)
        print("Unexpected crowd label: %s" % framing_type)


## combined set - no distinction between experts and crowd
# expert labels are favoured over crowd labels if both are provided
y_combined = np.copy(y_experts)
copy_idx = np.where(y_combined == -1)[0]  # supplement missing expert labels with crowd labels
y_combined[copy_idx] = y_crowd[copy_idx]

NameError: name 'X_2D' is not defined

# Save data

In [None]:
# Everything that gets written to the compressed archive, keyed by the name
# under which it is stored.
_arrays_to_save = {
    'X_2D_title': X_2D_title,
    'X_2D_descriptions': X_2D_descriptions,
    'X_2D_transcriptions': X_2D_transcriptions,
    'y_crowd': y_crowd,
    'y_experts': y_experts,
    'y_combined': y_combined,
}
np.savez_compressed(DATA_NPZ, **_arrays_to_save)

In [None]:
# one row per labeled video, 4 columns (the vector_length used for titles)
X_2D_title.shape

In [None]:
# one row per labeled video, 25 columns (the vector_length used for transcriptions)
X_2D_transcriptions.shape

In [None]:
# one row per labeled video, 10 columns (the vector_length used for descriptions)
X_2D_descriptions.shape