In [1]:
## prerequisites
#%pip install pandas
#%pip install numpy
#%pip install nltk

## libraries
from collections import Counter
from math import log
import os
import os.path
import random
import re
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer


## project structure
# Root folder holding the input CSV files and receiving the preprocessed output.
DATA_DIR = "/data/projects/capturingBias/research/framing/data/"  # change to "./" for current directory
# os.path.join only inserts a separator when one is missing, so DATA_DIR may be
# given with or without a trailing slash (plain string concatenation would
# silently produce a wrong file name in the latter case).
VIDEO_METADATA = os.path.join(DATA_DIR, "2014_metadata.csv")
VIDEO_TRANSCRIPTIONS = os.path.join(DATA_DIR, "2014_transcripts_months_1to4.csv")
DATA_NPZ = os.path.join(DATA_DIR, "sequences_preprocessed.npz")

## load files
# the metadata file is semicolon-separated; the transcriptions use the default comma
video_metadata = pd.read_csv(VIDEO_METADATA, delimiter=';')
video_transcriptions = pd.read_csv(VIDEO_TRANSCRIPTIONS)

## download wordnet vocabulary used in preprocessing the transcriptions
nltk.download('wordnet')
nltk.download('stopwords')
# a set gives constant-time membership tests when filtering tokens
stops = set(stopwords.words("english"))

[nltk_data] Downloading package wordnet to /home/xander/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /home/xander/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
## preprocess sequences
stemmer = WordNetLemmatizer()
def prep_text(s):
    """Clean a piece of free text and reduce it to lemmatized content words.

    Steps: replace non-word characters by spaces, drop isolated single
    letters, collapse whitespace, lowercase, remove English stop words and
    lemmatize what is left.

    Parameters
    ----------
    s : str
        Raw text. ``None`` and NaN are treated as empty text; any other
        non-string scalar is converted with ``str`` first.

    Returns
    -------
    tuple of (str, int)
        The cleaned text (lemmas joined by single spaces) and the number of
        tokens that were present before stop word removal.
    """
    # Missing values (e.g. NaN cells coming from pandas) would make re.sub raise.
    if not isinstance(s, str):
        if s is None or pd.isna(s):
            return ('', 0)
        s = str(s)

    # Remove all the special characters
    document = re.sub(r'\W', ' ', s)

    # Remove all single characters. The lookahead leaves the trailing
    # whitespace unconsumed so that runs such as "a b c" are removed entirely
    # instead of every other letter surviving.
    document = re.sub(r'\s+[a-zA-Z](?=\s)', ' ', document)

    # Remove a single character at the start of the text. The caret must be an
    # unescaped anchor; an escaped one only matches a literal '^', which can
    # no longer occur after the special-character removal above. This also
    # covers a leading 'b' left over from a bytes-literal prefix.
    document = re.sub(r'^[a-zA-Z]\s+', ' ', document)

    # Substituting multiple spaces with single space
    document = re.sub(r'\s+', ' ', document)

    # Converting to Lowercase
    document = document.lower()

    tokens = document.split()
    doc_length = len(tokens)

    # Stop words are checked on the raw token as well as on the lemma: the
    # lemmatizer can strip the trailing 's' from words such as "was" or "has",
    # after which they would no longer be recognised as stop words.
    kept = []
    for word in tokens:
        if word in stops:
            continue
        lemma = stemmer.lemmatize(word)
        if lemma not in stops:
            kept.append(lemma)

    return (' '.join(kept), doc_length)

## drop duplicates
video_transcriptions = video_transcriptions.drop_duplicates(subset='display_id', keep='last')
video_metadata = video_metadata.drop_duplicates(subset='display_id', keep='last')

## remove rows with missing transcriptions
video_transcriptions = video_transcriptions[video_transcriptions['clean_text'].notna()]

## keep only the videos that occur in both the metadata and the transcriptions
video_ids = np.intersect1d(video_metadata['display_id'].values, video_transcriptions['display_id'].values)
# .copy() makes the filtered frames independent objects, so the column
# assignments below do not write into a slice of the original frames
video_metadata = video_metadata[video_metadata['display_id'].isin(video_ids)].copy()
video_transcriptions = video_transcriptions[video_transcriptions['display_id'].isin(video_ids)].copy()

assert len(video_metadata) == len(video_transcriptions)

## process text
def _clean_column(column):
    """Run prep_text over a text column, treating missing cells as empty text."""
    # Only 'clean_text' was filtered for missing values; title, description
    # and tags may still contain NaN, which prep_text/str methods cannot handle.
    return column.fillna('').astype(str).map(lambda text: prep_text(text)[0])

video_transcriptions['clean_text'] = _clean_column(video_transcriptions['clean_text'])

for label in ['fulltitle', 'description']:
    video_metadata[label] = _clean_column(video_metadata[label])

# tags are stored as one '+'-separated string; turn them into space-separated words
video_metadata['tags'] = _clean_column(
    video_metadata['tags'].fillna('').astype(str).str.replace('+', ' ', regex=False)
)

In [3]:
def mkdata_list(*args):
    """Merge several aligned columns of text into one token list per row.

    Parameters
    ----------
    *args : sequence of str
        Equally long sequences (e.g. the ``.values`` of DataFrame columns).
        Element ``i`` of every sequence belongs to the same row. Elements
        that are not strings (such as None or NaN) and empty strings
        contribute no tokens.

    Returns
    -------
    numpy.ndarray
        One-dimensional object array; entry ``i`` is the list of
        whitespace-separated tokens of row ``i``, in the order of ``args``.
        With no arguments an empty object array is returned.
    """
    if not args:
        return np.empty(0, dtype=object)

    n_rows = len(args[0])
    data = [[] for _ in range(n_rows)]
    for column in args:
        for i, row in enumerate(column):
            if not isinstance(row, str) or not row:
                continue

            data[i].extend(row.split())

    # Fill a pre-allocated object array element by element. np.array(data)
    # rejects ragged nested lists on recent NumPy versions and would build a
    # 2-D string array whenever all rows happen to have the same length.
    sequences = np.empty(n_rows, dtype=object)
    for i, tokens in enumerate(data):
        sequences[i] = tokens

    return sequences

In [4]:
## bring both frames into the same row order, so that
## video_metadata row i and video_transcriptions row i describe the same video
video_metadata = video_metadata.sort_values(by='display_id')
video_transcriptions = (
    video_transcriptions
    .set_index('display_id')
    .reindex(index=video_metadata['display_id'])
    .reset_index()
)

## position -> display_id lookup as an array
video_idx = video_metadata['display_id'].values

## dictionary lookups in both directions
video_idx_map = {}
for position, video_id in enumerate(video_metadata['display_id'].values):
    video_idx_map[video_id] = position
idx_video_map = {position: video_id for video_id, position in video_idx_map.items()}

## create dataset: per video the tokens of title, description, tags and transcription
text_columns = [
    video_metadata['fulltitle'].values,
    video_metadata['description'].values,
    video_metadata['tags'].values,
    video_transcriptions['clean_text'].values,
]
data = mkdata_list(*text_columns)

In [11]:
## write the token sequences and the matching display ids to one compressed archive
# NOTE(review): if `data` is an object array of token lists, reading it back
# needs np.load(..., allow_pickle=True) -- confirm in the consuming notebook
np.savez_compressed(
    DATA_NPZ,
    sequences=data,
    video_idx=video_idx,
)