In [1]:
## prerequisites
#%pip install pandas
#%pip install numpy

## libraries
from collections import Counter
from math import log
import os
import os.path
import random
import re
import pandas as pd
import numpy as np

## project structure
# Directory that holds the input CSV files and receives the output archive.
# Change to "./" (or ".") to work in the current directory.
DATA_DIR = "/data/projects/capturingBias/research/framing/data/"
# os.path.join only inserts a separator when one is missing, so the paths
# below stay valid whether or not DATA_DIR ends with a slash.
VIDEO_METADATA = os.path.join(DATA_DIR, "2014_metadata.csv")
VIDEO_TRANSCRIPTIONS = os.path.join(DATA_DIR, "2014_transcripts_months_1to4.csv")
DATA_NPZ = os.path.join(DATA_DIR, "sequences_raw.npz")

## read both input tables into DataFrames
# The metadata file is parsed with ';' as the field separator; the
# transcription file is parsed with read_csv's default separator.
video_metadata = pd.read_csv(filepath_or_buffer=VIDEO_METADATA, sep=';')
video_transcriptions = pd.read_csv(filepath_or_buffer=VIDEO_TRANSCRIPTIONS)

In [2]:
## one row per display_id in each table (the last occurrence wins);
## transcription rows without any clean_text are discarded as well
video_metadata = video_metadata.drop_duplicates(subset='display_id', keep='last')
video_transcriptions = (
    video_transcriptions
    .drop_duplicates(subset='display_id', keep='last')
    .dropna(subset=['clean_text'])
)

## restrict both tables to the display_ids they have in common
# np.intersect1d yields the sorted, unique ids present in both columns.
video_ids = np.intersect1d(
    video_metadata['display_id'].values,
    video_transcriptions['display_id'].values,
)
video_metadata = video_metadata.loc[video_metadata['display_id'].isin(video_ids)]
video_transcriptions = video_transcriptions.loc[video_transcriptions['display_id'].isin(video_ids)]

## sanity check: both tables must now contain the same number of rows
assert len(video_metadata) == len(video_transcriptions)

In [3]:
## put both tables in the same display_id order, so that row i of
## video_metadata and row i of video_transcriptions (by position) describe
## the same video
# Note: sort_values keeps video_metadata's original index labels, so the
# correspondence is positional (.iloc / .values), not label-based.
video_metadata = video_metadata.sort_values(by='display_id')
video_transcriptions = (
    video_transcriptions
    .set_index('display_id')
    .reindex(index=video_metadata['display_id'])  # adopt the metadata ordering
    .reset_index()                                # display_id becomes a column again
)

## position -> display_id lookup array
video_idx = video_metadata['display_id'].values

In [4]:
## write the aligned columns to a single compressed .npz archive
# Each key becomes the name of one array inside the archive.
# NOTE(review): if these columns are object-dtype (typical for text columns),
# numpy pickles them and np.load will need allow_pickle=True -- confirm.
arrays_to_save = {
    'video_idx': video_idx,
    'titles': video_metadata['fulltitle'].values,
    'descriptions': video_metadata['description'].values,
    'tags': video_metadata['tags'].values,
    'transcriptions': video_transcriptions['clean_text'].values,
}
np.savez_compressed(DATA_NPZ, **arrays_to_save)