In [1]:
import csv
import io
import random

import numpy as np
import pandas as pd

from google.colab import files

# Open Colab's upload dialog for the annotated story workbooks. The result is
# iterated later as (file name -> file content) pairs by the loading cell.
uploaded = files.upload()

Saving A_Christmas_Carol.xlsx to A_Christmas_Carol.xlsx
Saving Collision_of_Worlds_1.xlsx to Collision_of_Worlds_1.xlsx
Saving Collision_of_Worlds_2.xlsx to Collision_of_Worlds_2.xlsx
Saving Collision_of_Worlds_3.xlsx to Collision_of_Worlds_3.xlsx
Saving Collision_of_Worlds_4.xlsx to Collision_of_Worlds_4.xlsx
Saving Collision_of_Worlds_5.xlsx to Collision_of_Worlds_5.xlsx
Saving Falling_Sentences.xlsx to Falling_Sentences.xlsx
Saving Frankenstein.xlsx to Frankenstein.xlsx
Saving In_Amundsen’s_Tent.xlsx to In_Amundsen’s_Tent.xlsx
Saving Leiningen_Versus_the_Ants.xlsx to Leiningen_Versus_the_Ants.xlsx
Saving Observer_1-A_Warm_Home.xlsx to Observer_1-A_Warm_Home.xlsx
Saving Observer_2-Charity.xlsx to Observer_2-Charity.xlsx
Saving Observer_3-One_of_Us.xlsx to Observer_3-One_of_Us.xlsx
Saving Observer_4-Legends.xlsx to Observer_4-Legends.xlsx
Saving Prince_and_the_Pauper.xlsx to Prince_and_the_Pauper.xlsx
Saving The_Demon_King.xlsx to The_Demon_King.xlsx
Saving The_Exiled_Queen.xlsx to Th

In [53]:
# Cleaned per-story DataFrames keyed by uploaded file name.
# Column 0 holds the scene-transition flag (1 = transition), column 1 the sentence.
stories = {}
dataset = []
NEIGHBORHOOD_RADIUS = 4

for file_name, file_data in uploaded.items():
    try:
        # The upload hands back raw bytes; wrap them in a file-like object because
        # passing bytes directly to read_excel is deprecated in recent pandas.
        df = pd.read_excel(io.BytesIO(file_data), header=None)

        # Fill any missed annotation entries with 0 (no transition)
        df[0] = df[0].fillna(0)

        # Remove rows where the sentence is empty
        df = df.dropna(subset=[1])
        df = df[df[1] != 0]
        df = df[df[1] != '0']

        # Renumber AFTER dropping rows so that index labels equal row positions.
        # Previously the first-transition label was taken before the drop and then
        # used with .iloc, which skipped too far whenever empty rows preceded it.
        df = df.reset_index(drop=True)

        # Check for annotation (on the cleaned rows, so a story whose only flags
        # sat on empty sentences is skipped instead of breaking later stages)
        if 1 not in df[0].values:
            print(f"No scene transition found in {file_name}, skipping...")
            continue

        # Find the position of the first '1' in the first column
        first_scene_transition_index = df[df[0] == 1].index[0]

        # Keep only the rows from the first transition onwards, so row 0 of every
        # stored story is a scene transition.
        df = df.iloc[first_scene_transition_index:].reset_index(drop=True)

        stories[file_name] = df
    except Exception as e:
        # Best effort: report the broken workbook and carry on with the others.
        print(f"Error reading {file_name}: {e}")



No scene transition found in Prince_and_the_Pauper.xlsx, skipping...
No scene transition found in The_Legend_of_Sleepy_Hollow.xlsx, skipping...
No scene transition found in There_Will_Come_Soft_Rains.xlsx, skipping...


In [56]:
def generate_sentence_neighborhoods(df, radius=NEIGHBORHOOD_RADIUS):
    """
    Build flattened sentence neighborhoods around every scene transition in a story.

    For each transition up to three windows are produced: one centered on the
    transition, one shifted forward and one shifted backward by a third of the
    window span. A window is never allowed to include another transition.

    Parameters:
    - df (pandas.DataFrame): column 0 is the transition flag (1 = transition,
      0 = ordinary sentence), column 1 is the sentence text. The first row is
      assumed to be a scene transition and the index is assumed to be 0..n-1,
      because index labels are used as positions.
    - radius (int): number of sentences to take on each side of the transition
      (before offsets are applied).

    Returns:
    - list of pandas.DataFrame: one single-cell DataFrame per neighborhood, holding
      the neighborhood's sentences joined into one string.

    Notes:
    - A transition directly adjacent to another transition (or at the start of the
      story) yields only one neighborhood, extending up to `radius` rows away from
      the conflicting side.
    - A transition squeezed between two adjacent transitions yields nothing.
    - Only windows containing at least one non-transition sentence are emitted.
    """
    neighborhoods = []
    length = (radius * 2) + 1

    transition_indices = df.index[df[0] == 1].tolist()  # Scene transition indices

    for idx in transition_indices:
        # Look up each distance once; both derived quantities reuse it.
        distance_to_prev = get_distance_to_prev_transition(df, idx)
        distance_to_next = get_distance_to_next_transition(df, idx)

        # Rows we can crawl up and down without hitting another transition
        max_prev_distance = distance_to_prev - 1
        max_next_distance = distance_to_next - 1

        # Hemmed in on both sides: there is no context to capture.
        if max_next_distance == 0 and max_prev_distance == 0:
            continue

        # Positions of the neighboring transitions (-1 / len(df) act as virtual
        # transitions at the story boundaries).
        prev_transition_index = idx - distance_to_prev
        next_transition_index = idx + distance_to_next

        # Symmetric window around the transition, limited by the nearer neighbor
        reach = min(radius, max_prev_distance, max_next_distance)
        start_idx = idx - reach
        end_idx = idx + reach

        # How far the shifted windows slide away from the centered one
        # NOTE(review): for reach <= 1 the offset is 0, so all three windows
        # coincide and the same neighborhood is emitted three times - confirm
        # whether those duplicates are wanted.
        neighborhood_size = end_idx - start_idx
        offset = neighborhood_size // 3

        # With an adjacent transition on one side, extend up to a full radius
        # towards the free side so the transition sits on the window's edge.
        backward_bonus = radius if max_next_distance == 0 else 0
        forward_bonus = radius if max_prev_distance == 0 else 0

        # Truncate the shifted windows if the offset would capture another transition
        if prev_transition_index >= (start_idx - offset):
            last_third_start = prev_transition_index + 1
        else:
            last_third_start = start_idx - offset

        if next_transition_index <= (end_idx + offset + 1):
            first_third_end = next_transition_index - 1
        else:
            first_third_end = end_idx + offset

        # Clamp the bonus-extended bounds to the neighboring transitions. Without
        # this the backward bound could go negative (iloc then wraps around from
        # the end and the window is lost) and either bound could run across
        # another transition.
        first_third_stop = min(first_third_end + 1 + forward_bonus, next_transition_index)
        last_third_begin = max(last_third_start - backward_bonus, prev_transition_index + 1)

        # Capture neighborhoods
        first_third = df.iloc[start_idx + offset:first_third_stop]
        mid_third = df.iloc[start_idx:end_idx + 1]
        last_third = df.iloc[last_third_begin:end_idx - offset + 1]

        # Make sure the transition isn't alone, then flatten
        for window in (first_third, mid_third, last_third):
            if (window[0] == 0).any():
                neighborhoods.append(flatten(pad_neighborhood(window, length)))

    return neighborhoods


def get_distance_to_prev_transition(df, index):
    """
    Return how many rows lie between `index` and the closest earlier transition.

    Parameters:
    - df (pandas.DataFrame): column 0 holds the transition flag (1 = transition).
    - index (int): positional row number to search backwards from.

    Returns:
    - int: `index - i` for the nearest earlier row `i` flagged 1. When no earlier
      transition exists, the start of the story is treated as a virtual
      transition just before row 0, giving `index + 1` (so row 0 yields 1).
      This mirrors how the end of the story is handled for the next transition
      and avoids returning None, which callers cannot do arithmetic on.
    """
    for i in range(index - 1, -1, -1):
        if df.iloc[i, 0] == 1:
            return index - i
    # No earlier transition: count the start of the story as one.
    return index + 1

def get_distance_to_next_transition(df, index):
    """
    Return how many rows lie between `index` and the closest later transition.

    Parameters:
    - df (pandas.DataFrame): column 0 holds the transition flag (1 = transition).
    - index (int): positional row number to search forwards from.

    Returns:
    - int: `i - index` for the nearest later row `i` flagged 1, or
      `len(df) - index` when there is none (the end of the story counts as the
      next scene transition).
    """
    total_rows = len(df)
    gaps = (
        row - index
        for row in range(index + 1, total_rows)
        if df.iloc[row, 0] == 1
    )
    # Fall back to the distance to the end of the story
    return next(gaps, total_rows - index)

def pad_neighborhood(neighborhood, length):
    """
    Extend a neighborhood with all-NaN rows until it has `length` rows.

    Parameters:
    - neighborhood (pandas.DataFrame): rows to pad.
    - length (int): desired number of rows.

    Returns:
    - pandas.DataFrame: the input itself when it already has at least `length`
      rows; otherwise a new frame with a fresh 0..length-1 index whose trailing
      rows are NaN in every column.
    """
    shortfall = length - len(neighborhood)

    # Already long enough: hand the frame back untouched
    if shortfall <= 0:
        return neighborhood

    filler = pd.DataFrame(np.nan, index=range(shortfall), columns=neighborhood.columns)
    return pd.concat([neighborhood, filler], ignore_index=True)

def flatten(neighborhood):
    """
    Collapse a neighborhood into a single-cell DataFrame of concatenated text.

    Parameters:
    - neighborhood (pandas.DataFrame): column 1 holds the sentences; NaN entries
      (e.g. padding rows) are ignored.

    Returns:
    - pandas.DataFrame: one row, one column, containing the remaining sentences
      joined with single spaces.
    """
    # Coerce to str so a sentence cell that Excel stored as a number does not
    # make the join raise TypeError and discard the whole story.
    sentences = [str(sentence) for sentence in neighborhood[1].dropna()]

    # Join the sentences into a single string with spaces
    concatenated_sentences = ' '.join(sentences)

    # Create and return DataFrame with a single entry
    return pd.DataFrame([concatenated_sentences])

In [55]:
# Collect the flattened neighborhoods of every loaded story
all_neighborhoods = []

for file_name, df in stories.items():
    print(f"Processing {file_name}...")
    try:
        neighborhoods = generate_sentence_neighborhoods(df, NEIGHBORHOOD_RADIUS)
        all_neighborhoods += neighborhoods
    except Exception as e:
        # Best effort: a story that fails is reported and skipped
        print(f"Error processing {file_name}: {e}")

if not all_neighborhoods:
    print("No neighborhoods to export.")
else:
    # Stack the single-cell frames into one column and write it out
    neighborhoods_df = pd.concat(all_neighborhoods, ignore_index=True)
    neighborhoods_df.to_csv('all_neighborhoods.csv', index=False)
    print("All neighborhoods have been exported to 'all_neighborhoods.csv'.")

Processing A_Christmas_Carol.xlsx...
Processing Collision_of_Worlds_1.xlsx...
Processing Collision_of_Worlds_2.xlsx...
Processing Collision_of_Worlds_3.xlsx...
Processing Collision_of_Worlds_4.xlsx...
Processing Collision_of_Worlds_5.xlsx...
Processing Falling_Sentences.xlsx...
Processing Frankenstein.xlsx...
Processing In_Amundsen’s_Tent.xlsx...
Processing Leiningen_Versus_the_Ants.xlsx...
Processing Observer_1-A_Warm_Home.xlsx...
Processing Observer_2-Charity.xlsx...
Processing Observer_3-One_of_Us.xlsx...
Processing Observer_4-Legends.xlsx...
Processing The_Demon_King.xlsx...
Processing The_Exiled_Queen.xlsx...
Processing The_Human_Chair.xlsx...
Processing The_Landlady.xlsx...
Processing The_Man_In_The_Well.xlsx...
Processing The_Monkey_s_Paw.xlsx...
Processing The_Most_Dangerous_Game.xlsx...
Processing The_Necklace.xlsx...
Processing The_Night_Rider.xlsx...
Processing The_Night_Wire.xlsx...
Processing The_Old_Man_And_The_Sea.xlsx...
Processing The_Ransom_of_Red_Chief.xlsx...
Proces