## From chunk-level judgments to time-matched ratings

Zizhuang Miao

This script is used to map chunk-level judgments (e.g., annotations on indoor and outdoor scenes for each sentence) to word-level annotations so that they are comparable with other features. The specific method for matching is nearest neighbor -- we find the word closest to each sample of continuous ratings (of social interactions, ToM, and multi-person presence), and then find the start and end word of each chunk in that time series. All words in between will bear the same annotations.

This is how we got all features matched at the time points of online ratings.

Below we provide example code using the indoor versus outdoor annotations.

### Indoor/Outdoor

In [1]:
import pandas as pd
import numpy as np
import os

In [None]:
# Characters stripped from both ends of a word before comparison. Written as a
# raw string so the backslash is literal (in a non-raw string, a backslash
# followed by "]" is an invalid escape sequence and triggers a warning on
# recent Python versions).
PUNCTUATION = r"""!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~"""


def clean_word(word):
    """Return *word* lower-cased with leading/trailing punctuation removed."""
    return word.strip(PUNCTUATION).lower()


def run_end(words, idx):
    """Return the last row of the run of identical entries in *words* that contains row *idx*."""
    end = idx
    while end + 1 < len(words) and words[end + 1] == words[end]:
        end += 1
    return end


def find_chunk_end(words, first_idx, chunk_words):
    """Locate the last row of a chunk that starts at row *first_idx*.

    Parameters
    ----------
    words : list of str
        Lower-cased word of every row (one row per rating sample), so the same
        word is normally repeated over several consecutive rows.
    first_idx : int
        Row at which the chunk starts.
    chunk_words : list of str
        Cleaned (lower-cased, punctuation-stripped) words of the chunk.

    Returns
    -------
    int or None
        Index of the last row belonging to the chunk, or ``None`` when neither
        the last nor the second-to-last chunk word can be found among the
        candidate rows (or when the chunk has no words).
    """
    if not chunk_words:
        return None

    # First row of each new word, collecting as many distinct consecutive
    # words as the chunk contains (or fewer if the rows run out).
    starts = [first_idx]
    row = first_idx + 1
    while len(starts) < len(chunk_words) and row < len(words):
        if words[row] != words[row - 1]:
            starts.append(row)
        row += 1

    candidates = [words[start].strip(PUNCTUATION) for start in starts]

    # Direct match: the last collected word is the last word of the chunk.
    if candidates[-1] == chunk_words[-1]:
        return run_end(words, starts[-1])

    # Some chunk words are missing from (or repeated in) the rows, so the
    # window is misaligned. Fall back to the latest occurrence of the chunk's
    # last word, then of its second-to-last word (if the chunk has one).
    for target in chunk_words[-1:-3:-1]:
        for pos in range(len(candidates) - 1, -1, -1):
            if candidates[pos] == target:
                return run_end(words, starts[pos])
    return None


def annotate_narrative(df, ratings, chunk_idx):
    """Copy chunk-level indoor/outdoor annotations onto the rows of *df*.

    Adds (or resets) the columns ``'indoor'`` and ``'outdoor'`` in place.
    Chunks are consumed from *ratings* in order, starting at row *chunk_idx*;
    every row of *df* between the first and last word of a chunk receives that
    chunk's ``indoor`` value and ``1 - indoor`` as ``outdoor``.

    Both data frames are assumed to carry the default 0..n-1 integer index
    (as produced by ``pd.read_csv``), because rows are addressed by position
    through ``.loc``.

    Parameters
    ----------
    df : pandas.DataFrame
        Time-matched ratings with a ``'word'`` column.
    ratings : pandas.DataFrame
        Chunk-level ratings with ``'Story'`` and ``'indoor'`` columns.
    chunk_idx : int
        Index of the first chunk in *ratings* that belongs to this narrative.

    Returns
    -------
    int
        Index of the first chunk that was not consumed, i.e. the starting
        chunk for the next narrative.
    """
    import warnings

    df['indoor'], df['outdoor'] = np.nan, np.nan
    words = [word.lower() for word in df['word']]

    first_idx = 0
    while first_idx < len(words):
        if chunk_idx >= len(ratings):
            # No chunks left: the remaining rows keep their NaN annotations.
            warnings.warn(
                f"ratings exhausted; rows from {first_idx} onwards are left unannotated",
                stacklevel=2,
            )
            break

        chunk_words = [clean_word(word) for word in ratings.loc[chunk_idx, 'Story'].split()]
        last_idx = find_chunk_end(words, first_idx, chunk_words)

        if last_idx is None:
            # Skip the chunk explicitly instead of reusing a stale end index;
            # the next chunk is tried from the same starting row.
            warnings.warn(
                f"chunk {chunk_idx} could not be located from row {first_idx}; skipped",
                stacklevel=2,
            )
        else:
            indoor = ratings.loc[chunk_idx, 'indoor']
            df.loc[first_idx:last_idx, 'indoor'] = indoor
            df.loc[first_idx:last_idx, 'outdoor'] = 1 - indoor
            first_idx = last_idx + 1    # move to the next chunk in the df

        chunk_idx += 1    # move to the next chunk

    return chunk_idx


# The guard is true both in a notebook kernel and when run as a script; it only
# keeps the file I/O below from running when the helpers above are imported.
if __name__ == "__main__":
    dataDir = ""    # directory holding the online ratings, with a word matched to each time point
    outputDir = ""

    ratings = pd.read_csv('')    # the chunk-level ratings file with 'Story' and 'indoor' columns, with each row being a sentence
    chunk_idx = 0    # index of the current chunk in the ratings file; carried across narratives

    for n in range(1, 9):
        df = pd.read_csv(os.path.join(dataDir, f"narrative{n}.csv"))
        chunk_idx = annotate_narrative(df, ratings, chunk_idx)

        # save the df to a csv file
        df.to_csv(os.path.join(outputDir, f"narrative{n}.csv"), index=False)