# Classifying framing in videos

Our aim is to investigate whether we can determine the kind of framing, episodic or thematic, that is used in news videos. 

## Limitations

Only a small number of labeled samples are available, even fewer of which have been labeled by experts (as opposed to the crowd). This places an upper bound on the generalizability of our models, and makes it more challenging to train deep models. Therefore, this will serve as a proof-of-concept study.

In [1]:
## prerequisites
#%pip install pandas
#%pip install numpy
#%pip install gensim
#%pip install nltk

## libraries
from collections import Counter
from math import log
import os
import os.path
import random
import re
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from gensim.models import Word2Vec
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

## project structure
DATA_DIR = "/data/projects/capturingBias/research/framing/data/"  # change to "./" for current directory

def _data_path(filename):
    """Return the path of `filename` inside DATA_DIR.

    os.path.join is used so that DATA_DIR works with or without a trailing
    path separator.
    """
    return os.path.join(DATA_DIR, filename)

VIDEO_METADATA = _data_path("2014_metadata.csv")
VIDEO_TRANSCRIPTIONS = _data_path("2014_transcripts_months_1to4.csv")
CROWD_RESULTS = _data_path("120CSexperimentCrowdResults.csv")
CROWD_FILTERS = _data_path("crowd_data_filtered_worker_ip_and_gender_and_type_and_title.csv")
EXPERT_RESULTS = _data_path("expert_annotations_aggregated.csv")
DATA_NPZ = _data_path("data.npz")  # output: vectorized data and labels
WORDVECTORS_KV = _data_path("wordvectors.kv")  # output: trained word vectors

## load files
## (the metadata and crowd result files are read with ';' as delimiter,
##  the remaining files with the pandas default)
video_metadata = pd.read_csv(VIDEO_METADATA, delimiter=';')
video_transcriptions = pd.read_csv(VIDEO_TRANSCRIPTIONS)
crowd_results = pd.read_csv(CROWD_RESULTS, delimiter=';')
expert_results = pd.read_csv(EXPERT_RESULTS)
crowd_filters = pd.read_csv(CROWD_FILTERS)

## download the wordnet vocabulary and the stop word list used in
## preprocessing the transcriptions
nltk.download('wordnet')
nltk.download('stopwords')
stops = set(stopwords.words("english"))  # set: membership is tested once per token

[nltk_data] Downloading package wordnet to /home/xander/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /home/xander/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
def set_seed(seed=-1):
    """Seed the global random number generators of Python and NumPy.

    Parameters
    ----------
    seed : int, optional
        Seed to apply. A negative value (the default) draws a random seed.

    Returns
    -------
    int
        The seed that was applied, so that a randomly drawn seed can be
        logged and reused to reproduce a run.
    """
    if seed < 0:
        # ask for a 64-bit draw explicitly: where NumPy's default integer is
        # 32 bits wide, an upper bound of 2**32-1 is out of range
        seed = np.random.randint(0, 2**32-1, dtype=np.int64)

    # plain Python int: NumPy integer scalars are not accepted as seed by
    # random.seed() on recent Python versions
    seed = int(seed)

    # NOTE: PYTHONHASHSEED is read at interpreter start-up; setting it here
    # only affects processes that are started afterwards
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    np.random.seed(seed)

    return seed
    
#set_seed(47)  # make reproducible

# Preprocess Data

In [3]:
## Filter crowd?
FILTER_CROWD = True  # set to False to keep the responses of all crowd workers

print("Crowd responses: %i" % len(crowd_results))
if FILTER_CROWD:
    # only keep the responses of workers that are listed in the filter file
    good_raters = np.unique(crowd_filters['_worker_id'].values)
    # explicit copy: this frame is assigned into further on, which pandas
    # would otherwise flag as writing to a slice of the unfiltered frame
    crowd_results = crowd_results[crowd_results['_worker_id'].isin(good_raters)].copy()
    print("Crowd responses remain after filtering: %i" % len(crowd_results))

Crowd responses: 1860
Crowd responses remain after filtering: 930


In [4]:
## preprocess sequences
stemmer = WordNetLemmatizer()  # NOTE: a lemmatizer, despite the variable name
def prep_text(s):
    """Normalise a piece of raw text.

    Non-word characters and single-letter tokens are removed, whitespace is
    collapsed, the text is lowercased, and the remaining tokens are
    lemmatized with stop words filtered out.

    Parameters
    ----------
    s : str
        Raw text. Missing values (None or NaN) are treated as empty text.

    Returns
    -------
    tuple of (str, int)
        The cleaned text, with tokens separated by single spaces, and the
        number of tokens counted before lemmatization and stop word removal.
    """
    # treat missing values as empty text instead of failing inside re.sub
    if s is None or (isinstance(s, float) and s != s):
        return ('', 0)

    # Remove all the special characters
    document = re.sub(r'\W', ' ', str(s))

    # remove all single characters; the lookahead leaves the trailing
    # whitespace in place, so that runs of single characters ("a b c") are
    # removed completely rather than every other one
    document = re.sub(r'\s+[a-zA-Z](?=\s)', '', document)

    # Remove single characters from the start
    # ('^' must be unescaped to anchor at the start of the text)
    document = re.sub(r'^[a-zA-Z]\s+', ' ', document)

    # Substituting multiple spaces with single space
    document = re.sub(r'\s+', ' ', document)

    # Removing prefixed 'b'
    document = re.sub(r'^b\s+', '', document)

    # Converting to Lowercase
    document = document.lower()

    tokens = document.split()
    doc_length = len(tokens)

    # Stop words are filtered before lemmatization as well as after it:
    # the lemmatizer may alter a stop word (presumably e.g. 'was' -> 'wa'
    # -- TODO confirm) so that it no longer matches the stop list afterwards
    tokens = [word for word in tokens if word not in stops]

    # Lemmatization
    tokens = [stemmer.lemmatize(word) for word in tokens]
    document = ' '.join([word for word in tokens if word not in stops])

    return (document, doc_length)

## fix missing video IDs
## rows whose ID reads '#NAME?' get their ID back from the video link
## NOTE: str.lstrip() strips a *set of characters* rather than a prefix, and
## would also remove leading ID characters such as 'h', 't', 'w' or 'o';
## the link is therefore split on the query marker instead
for index, link in crowd_results.loc[crowd_results['display_id'] == '#NAME?', 'link'].items():
    # everything after 'watch?v='; a link without that marker is kept as is
    video_id = link.split('watch?v=', 1)[-1]
    crowd_results.loc[index, 'display_id'] = video_id

## drop duplicates -- if a video occurs more than once, its last entry is kept
video_transcriptions = video_transcriptions.drop_duplicates(keep='last', subset='display_id')
video_metadata = video_metadata.drop_duplicates(keep='last', subset='display_id')

## remove rows with missing transcriptions
has_transcription = video_transcriptions['clean_text'].notna()
video_transcriptions = video_transcriptions[has_transcription]

## only keep the videos that occur in both the meta data and the transcriptions
video_ids = np.intersect1d(video_metadata['display_id'].values,
                           video_transcriptions['display_id'].values)
metadata_in_both = video_metadata['display_id'].isin(video_ids)
transcriptions_in_both = video_transcriptions['display_id'].isin(video_ids)
video_metadata = video_metadata[metadata_in_both]
video_transcriptions = video_transcriptions[transcriptions_in_both]

## both frames should now describe the same number of videos
assert len(video_metadata) == len(video_transcriptions)

## process text
## Every text column is cleaned in one pass instead of writing cell by cell
## inside an iterrows() loop. prep_text returns (text, number of words); only
## the text is kept. Missing values are replaced by empty text first.
def _clean_text(text):
    """Return only the cleaned text of prep_text's (text, length) result."""
    return prep_text(text)[0]

video_transcriptions['clean_text'] = video_transcriptions['clean_text'].fillna('').map(_clean_text)

for label in ['fulltitle', 'description']:
    video_metadata[label] = video_metadata[label].fillna('').map(_clean_text)

## tags are separated by '+'; turn them into whitespace-separated text first
video_metadata['tags'] = video_metadata['tags'].fillna('').map(
    lambda tags: _clean_text(tags.replace('+', ' ')))

# Inspect data

In [5]:
## show the second row of every table as an example
sections = [
    ("Video Metadata\n", video_metadata),
    ("\nVideo Transcriptions\n", video_transcriptions),
    ("\nCrowd Results\n", crowd_results),
    ("\nExpert Results\n", expert_results),
]
for header, frame in sections:
    print(header + '='*70)
    print(frame.iloc[1])

print("\nStatistics\n" + '='*70)

## number of labels per video, and the distinct videos per group of annotators
crowd_labels_per_video = crowd_results['display_id'].value_counts().values
expert_labels_per_video = expert_results['display_id'].value_counts().values
crowd_videos_uniq = np.unique(crowd_results['display_id'].values)
expert_videos_uniq = np.unique(expert_results['display_id'].values)

transcribed_ids = video_transcriptions['display_id']
expert_average = expert_labels_per_video.sum()/expert_labels_per_video.shape[0]
crowd_average = crowd_labels_per_video.sum()/crowd_labels_per_video.shape[0]
experts_in_dataset = np.isin(expert_videos_uniq, transcribed_ids).sum()
crowd_in_dataset = np.isin(crowd_videos_uniq, transcribed_ids).sum()
labeled_by_both = np.isin(expert_videos_uniq, crowd_videos_uniq).sum()

print(" - experts watched {} videos ({} average labels per video)".format(expert_videos_uniq.shape[0],
                                                                          expert_average))
print("   {} of which are part of our 120 videos dataset".format(experts_in_dataset))
print(" - crowd watched {} videos ({} average labels per video)".format(crowd_videos_uniq.shape[0],
                                                                        crowd_average))
print("   {} of which are part of our 120 videos dataset".format(crowd_in_dataset))
print("   {} of which are also labeled by our experts".format(labeled_by_both))

Video Metadata
display_id                                                    oO-k-9ZLLLk
title                               Drive it! from 22.04.2014 | Drive it!
fulltitle                                          drive 22 04 2014 drive
description             drive week mercedes truck rally racer latest a...
upload_date                                                    2014-04-23
duration                                                             1219
uploader                                                       DW English
thumbnail               https://i.ytimg.com/vi/oO-k-9ZLLLk/maxresdefau...
tags                    audi s1 audi quattro s1 racing car honda civic...
categories                                               Autos & Vehicles
average_rating                                                          5
view_count                                                            484
like_count                                                              7
dislike_count          

# Datasets

Our input data consist of the concatenation of transcriptions, titles, descriptions, and tags---all of which can be acquired from the video. For our traditional machine learning models we need a 1-dimensional fixed-length input vector per sequence, which is learned using Doc2Vec. A 2-dimensional variable-width vector per sequence will be used for our deep models, for which we use pre-trained Word2Vec vectors.

In all cases we use the framing score as labels, which are defined on a Likert scale from 1 (certainly thematic) to 7 (certainly episodic). We use a majority vote when labels for the same video differ. If no majority exists, then we take either the mid point (e.g., 3 and 5 results in 4), or a random choice from the equal splits if that is not possible.

## Incremental learning

With incremental learning, we further train a model as more data becomes available. In our case, we first train a model using purely the labels provided by experts. Once trained and tested, we move over to the crowd data, and use their labels to improve the model. As experts are expensive and crowd workers are cheap, we hope to see that we can lower costs by giving our model a jump start with few but high-quality expensive labels, and then finetune it with larger amounts of lower-quality but much cheaper labels. 

We also train another model using purely the labels from the crowd, and test on those from the experts. Here, we treat the expert labels as gold standard. This experiment will give us an idea of the quality of crowd sourced labels, and whether we can do without expert annotations by accepting a (small?) loss in accuracy.

In [6]:
## convert text to tensor of size NUM_SEQUENCES x MAX_SEQUENCE_LENGTH x 300
def vectorize_sequences3D(sequences, sequence_length, vector_length=300, model=None):
    """Convert token sequences to a zero-padded tensor of word vectors.

    Parameters
    ----------
    sequences : sequence of token sequences
        One row of tokens per sample.
    sequence_length : int
        Size of the time axis. Rows that are longer are truncated.
    vector_length : int, optional
        Number of leading components of every word vector that is kept.
    model : optional
        Object whose `wv` attribute maps a token to its vector and raises
        KeyError for unknown tokens. Defaults to the module-level `w2v_model`.

    Returns
    -------
    numpy.ndarray
        Array of shape (len(sequences), sequence_length, vector_length).
        Positions without a known token are left at zero.
    """
    if model is None:
        model = w2v_model  # module-level Word2Vec model

    # vectors are looked up via `.wv`; indexing the model itself is deprecated
    word_vectors = model.wv

    n = len(sequences)
    a = np.zeros((n, sequence_length, vector_length))  # time on vertical axis; zero padding
    for i, terms in enumerate(sequences):
        # never write past the time axis
        for j, term in enumerate(terms[:sequence_length]):
            try:
                wv = word_vectors[term][:vector_length]
            except KeyError:
                # token is not in the vocabulary: keep the zero vector
                continue

            a[i, j, :] = wv

    return a

## convert text to matrix of size NUM_SEQUENCES x 300 
def vectorize_sequences2D(sequences, model, vector_length=300):
    """Infer one fixed-length vector per token sequence.

    Empty tokens are dropped before a sequence is handed to
    `model.infer_vector`. The result has shape
    (number of sequences, vector_length).
    """
    doc_vectors = np.zeros((sequences.shape[0], vector_length))
    for row, terms in enumerate(sequences):
        tokens = []
        for term in terms:
            if len(term) > 0:
                tokens.append(term)
        doc_vectors[row] = model.infer_vector(tokens)

    return doc_vectors

## learn word embeddings using Word2vec
def train_word_embeddings(sequences, vector_length=25):
    """Train a Word2Vec model on the given token sequences.

    Parameters
    ----------
    sequences : iterable of token sequences
        Training corpus, one sequence of tokens per document.
    vector_length : int, optional
        Dimensionality of the word vectors.

    Returns
    -------
    gensim.models.Word2Vec
        The trained model.
    """
    # gensim 4 renamed the `size` keyword to `vector_size`; try the new name
    # first and fall back to the old one for older gensim releases
    try:
        return Word2Vec(sequences, vector_size=vector_length, workers=4)
    except TypeError:
        return Word2Vec(sequences, size=vector_length, workers=4)

## learn sequence vectors using Doc2Vec
def train_sequence_embeddings(sequences, train_idx, test_idx, vector_length=300):
    """Train a Doc2Vec model on the sequences selected by `train_idx`.

    Parameters
    ----------
    sequences : iterable of token sequences
        One sequence of tokens per document.
    train_idx : iterable of int
        Positions in `sequences` of the documents to train on. Every
        document is tagged with its position.
    test_idx : iterable of int
        Accepted for interface compatibility; not used for training.
    vector_length : int, optional
        Dimensionality of the document vectors.

    Returns
    -------
    gensim.models.doc2vec.Doc2Vec
        The trained model.
    """
    # a set makes the membership test per document constant time
    train_positions = set(np.asarray(train_idx).tolist())

    train_corpus = list()
    for i, terms in enumerate(sequences):
        if i not in train_positions:
            continue

        # drop the empty padding tokens (test each token, not the whole row)
        tokens = [term for term in terms if len(term) > 0]
        train_corpus.append(TaggedDocument(tokens, [i]))

    model = Doc2Vec(vector_size=vector_length, min_count=2, epochs=100)
    model.build_vocab(train_corpus)
    model.train(train_corpus, total_examples=model.corpus_count, epochs=model.epochs)

    return model

## return majority class per video or, if no majority, return mid point if exists, 
## else return random selection from most common score
def create_targets(video_ids, annotations):
    """Aggregate the annotations of every video into a single label.

    The most frequent label wins. If several labels share the highest count,
    their mean is used when it is a whole number, and otherwise one of the
    tied labels is picked at random (via numpy's global random state).

    Parameters
    ----------
    video_ids : iterable
        IDs of the videos to create a target for.
    annotations : pandas.DataFrame
        One annotation per row, with the video ID in column 'display_id' and
        the label in column 'Frame'.

    Returns
    -------
    dict
        Maps a video ID to its aggregated label. Videos without any
        annotation are left out.

    Raises
    ------
    KeyError
        If an annotation refers to a video that is not in `video_ids`.
    """
    labels = {display_id: [] for display_id in video_ids}
    # iterate over the two columns directly; cheaper than building a Series
    # for every row with iterrows()
    for display_id, frame in zip(annotations['display_id'], annotations['Frame']):
        labels[display_id].append(frame)

    targets = dict()
    for display_id, label_set in labels.items():
        if len(label_set) <= 0:
            # nothing to vote on; max() below would fail on an empty sequence
            continue

        ct = Counter(label_set)
        ct_max = max(ct.values())
        majority_vote = [label for label, count in ct.items() if count == ct_max]

        if len(majority_vote) == 1:
            targets[display_id] = majority_vote[0]
        else:  # different labels with same number of votes
            mid_point = sum(majority_vote)/len(majority_vote)
            if float(mid_point).is_integer():  # whole number
                targets[display_id] = int(mid_point)
            else:  # random selection
                targets[display_id] = np.random.choice(majority_vote)

    return targets

def create_splits(n):
    """Randomly split the indices 0..n-1 into a train and a test part.

    Returns a tuple (train indices, test indices) in which the first
    int(n*0.8) shuffled indices form the train part. The shuffle uses
    numpy's global random state.
    """
    order = np.arange(n)
    np.random.shuffle(order)

    cut = int(n*0.8)
    train_part = order[:cut]
    test_part = order[cut:]

    return (train_part, test_part)

def tf_idf_ranked(sequences, top_k=25, at_least=2):
    """Return the top_k terms per sequence, weighted by TF-IDF.

    Terms are ordered from high to low, so that index 0 of each sequence
    holds the most relevant term.

    Parameters
    ----------
    sequences : sequence of token sequences
        One row of tokens per document. Empty tokens are ignored.
    top_k : int, optional
        Maximum number of terms to return per sequence.
    at_least : int, optional
        Minimum number of occurrences within a sequence for a term to be
        ranked for that sequence. Document frequencies count every term.

    Returns
    -------
    numpy.ndarray
        A 2-dimensional array of terms if every sequence yields the same
        number of terms, else a 1-dimensional object array holding one list
        of terms per sequence.
    """
    tf_list = list()
    df = dict()
    num_sequences = len(sequences)
    for seq in sequences:
        c = Counter(seq)
        n = sum(c.values())

        tf = dict()
        for term, freq in c.items():
            if len(term) <= 0:
                continue

            if freq >= at_least:
                tf[term] = freq/n
            df[term] = df.get(term, 0) + 1

        tf_list.append(tf)

    idf = {term: log(num_sequences / term_doc_freq) for term, term_doc_freq in df.items()}

    tf_idf_list = list()
    for tf in tf_list:
        tf_idf = {term: freq*idf[term] for term, freq in tf.items()}

        # highest score first, so that the slice keeps the most relevant terms
        ranked = sorted(tf_idf.items(), key=lambda item: item[1], reverse=True)
        tf_idf_list.append([term for term, _ in ranked[:top_k]])

    if len({len(terms) for terms in tf_idf_list}) <= 1:
        return np.array(tf_idf_list)

    # rows of unequal length cannot form a regular array; store one list per
    # sequence in an object array instead
    ragged = np.empty(len(tf_idf_list), dtype=object)
    for i, terms in enumerate(tf_idf_list):
        ragged[i] = terms

    return ragged

def mkdata_list(*args):
    """Concatenate several text fields into one padded token row per sample.

    Every argument holds one text per sample, with sample i at position i in
    all arguments. Each text is split on whitespace and padded with empty
    strings to the longest text of its field. The padded fields are then
    concatenated per sample, so that all rows have the same length.

    Parameters
    ----------
    *args : sequences of str
        One sequence per text field. Entries that are not strings (such as
        missing values) are treated as empty text.

    Returns
    -------
    numpy.ndarray
        Object array of shape (number of samples, summed field widths)
        holding the tokens, with '' as padding.
    """
    data = [list() for _ in range(len(args[0]))]
    for a in args:
        # keep one (possibly empty) token list per sample, so that position i
        # keeps referring to sample i
        rows = [row.split() if isinstance(row, str) else [] for row in a]
        max_length = max(map(len, rows), default=0)

        for i, tokens in enumerate(rows):
            tokens.extend(['' for k in range(max_length-len(tokens))])
            # every row is added, also the ones that needed no padding
            data[i].extend(tokens)

    # object dtype stores references to the tokens instead of reserving the
    # width of the longest token for every cell
    return np.array(data, dtype=object)

In [7]:
## sort using same index so video_metadata[i] matches video_transcriptions[i]
## (the transcriptions are reindexed by the sorted display_ids of the meta data)
video_metadata = video_metadata.sort_values(by=['display_id'])
video_transcriptions = video_transcriptions.set_index('display_id')
video_transcriptions = video_transcriptions.reindex(index=video_metadata['display_id'])
video_transcriptions = video_transcriptions.reset_index()

## create mappings
## video_idx_map: display_id -> row position; idx_video_map is its inverse
video_idx_map = {display_id: i for i, display_id in enumerate(video_metadata['display_id'].values)}
idx_video_map = {i: display_id for display_id, i in video_idx_map.items()}
## IDs of all videos that were labeled by the crowd or by an expert
labeled_samples_ids = np.union1d(crowd_videos_uniq, expert_videos_uniq)
## row positions of the labeled videos; a labeled video without a row in
## video_metadata does not occur in video_idx_map and is therefore left out
labeled_samples_idx = [idx for video_id, idx in video_idx_map.items()
                            if video_id in labeled_samples_ids]

## vectorize text sequences
## one row of tokens per video: title, description, tags, and transcription
data = mkdata_list(video_metadata['fulltitle'].values,
                   video_metadata['description'].values,
                   video_metadata['tags'].values,
                   video_transcriptions['clean_text'].values)
## the embeddings are trained on all videos, labeled or not
print("training Word2Vec model")
w2v_model = train_word_embeddings(data)

print("training Doc2Vec model")
## NOTE(review): the split is drawn over all videos; test_idx is not used
## anywhere else in this cell -- confirm that this is intended
train_idx, test_idx = create_splits(data.shape[0])
## vector_length is taken from the Word2Vec model, so that both embeddings
## have the same dimensionality
d2v_model = train_sequence_embeddings(data,
                                      train_idx=train_idx, test_idx=test_idx,
                                      vector_length=w2v_model.vector_size)

#data = tf_idf_ranked(data)  # only use most relevant words

data = data[labeled_samples_idx]  # we no longer need the unlabeled data
data_length = max(map(len, data))  # maximum row length
## labeled_videos_idx_map: display_id -> row position within the labeled
## subset; idx_labeled_videos_map is its inverse
labeled_videos_idx_map = {idx_video_map[i]: j for j, i in enumerate(labeled_samples_idx)}
idx_labeled_videos_map = {i: display_id for display_id, i in labeled_videos_idx_map.items()}

print("vectorizing 2D data")
X_2D = vectorize_sequences2D(data,
                             d2v_model,
                             vector_length=w2v_model.vector_size)
        
print("vectorizing 3D data")
## vectorize_sequences3D reads the module-level w2v_model
X_3D = vectorize_sequences3D(data,
                             sequence_length=data_length,
                             vector_length=w2v_model.vector_size)

training Word2Vec model
training Doc2Vec model
vectorizing 2D data
vectorizing 3D data


  wv = w2v_model[term][:vector_length]


In [14]:
num_samples = data.shape[0]

## generate labels - 7 point Likert scale
## a value of -1 marks a sample without a label
y_likert_crowd = np.full(num_samples, -1.0)
crowd_targets = create_targets(crowd_videos_uniq, crowd_results)
for video_id, label in crowd_targets.items():
    y_likert_crowd[labeled_videos_idx_map[video_id]] = label - 1  # 0-based

## alternate labels - binary classification of framing type
## crowd: scores below the mid point become 0 and scores above it become 1;
## the mid point itself and unlabeled samples remain -1
y_dominant_crowd = np.select([(y_likert_crowd >= 0) & (y_likert_crowd < 3),
                              y_likert_crowd > 3],
                             [0.0, 1.0],
                             default=-1.0)

## experts: the Likert score and the framing type are read from the same rows
y_likert_experts = np.full(num_samples, -1.0)
y_dominant_experts = np.full(num_samples, -1.0)
for _, row in expert_results.iterrows():
    sample = labeled_videos_idx_map[row.display_id]
    y_likert_experts[sample] = row.framing_score - 1  # 0-based
    y_dominant_experts[sample] = 0 if row.framing_type == "Thematic" else 1  # if episodic

## combined set - no distinction between experts and crowd
y_likert_combined = y_likert_experts.copy()  # expert labels are preferred
copy_idx = np.flatnonzero(y_likert_combined == -1)
y_likert_combined[copy_idx] = y_likert_crowd[copy_idx]

y_dominant_combined = y_dominant_experts.copy()  # expert labels are preferred
copy_idx = np.flatnonzero(y_dominant_combined == -1)
y_dominant_combined[copy_idx] = y_dominant_crowd[copy_idx]

# Save data

In [16]:
## store the feature arrays and all label variants in one compressed archive
arrays_to_save = {
    'X_2D': X_2D,
    'X_3D': X_3D,
    'y_likert_crowd': y_likert_crowd,
    'y_likert_experts': y_likert_experts,
    'y_dominant_crowd': y_dominant_crowd,
    'y_dominant_experts': y_dominant_experts,
    'y_likert_combined': y_likert_combined,
    'y_dominant_combined': y_dominant_combined,
}
np.savez_compressed(DATA_NPZ, **arrays_to_save)

## store the word vectors of the Word2Vec model next to the data
w2v_model.wv.save(WORDVECTORS_KV)