One cool extension would be to take the stacked images from different videos and meld them into one image by pinning them to each other horizontally. Stacking frames already makes the cost almost negligible compared to analyzing the video itself; submitting several stacked-frame images as one image and then splitting up the text afterwards would be close to as cheap as possible.

You would then have to make sure not to merge so many stacked-frame images that the JPG becomes too big to submit to the API, keep track of the order in which they were pinned, and note the width of each video so that the text can be grouped by frame and by video afterwards.

In the second half of the final version, the loops to create text_loc_list are unnecessary because the steps after can be done with the dictionary created by reading the json file.

### Not Final Version

In [None]:
#@title 
from scipy.ndimage import filters
from numpy import *
import cv2
import pandas as pd
from scipy.signal import find_peaks
from PIL import Image
import os
import io

! pip install google-cloud-vision
from google.cloud import vision

! pip install fuzzywuzzy
! pip install python-Levenshtein

from fuzzywuzzy import fuzz
from fuzzywuzzy import process
import random

os.environ["GOOGLE_APPLICATION_CREDENTIALS"]="/content/drive/My Drive/AFoote/google_application_credentials.json"


In [None]:
#@title 
def harris_corner_counter(im, sigma, corner_threshold):
  """Count the pixels of a BGR image whose Harris response exceeds a threshold.

  im:               image array in OpenCV's BGR channel order
  sigma:            standard deviation used for every Gaussian filter below;
                    a larger sigma gives a blurrier image
  corner_threshold: pixels with a Harris value above this count as corners

  Returns the number of pixels classified as corners.
  """
  # Greyscale copy of the image, blurred slightly to smooth out noise.
  smoothed = filters.gaussian_filter(cv2.cvtColor(im, cv2.COLOR_BGR2GRAY),
                                     sigma)

  # Image gradients from first-order Gaussian derivative filters: order (0,1)
  # differentiates along the second axis (x), order (1,0) along the first (y).
  # (Programming Computer Vision with Python, around page 33, explains image
  # derivatives, gradient vectors and gradient angles.)
  grad_x = zeros(smoothed.shape)
  grad_y = zeros(smoothed.shape)
  filters.gaussian_filter(smoothed, (sigma, sigma), order = (0, 1),
                          output = grad_x)
  filters.gaussian_filter(smoothed, (sigma, sigma), order = (1, 0),
                          output = grad_y)

  # Smoothed products of the gradients, then determinant over trace as the
  # Harris value of every pixel.
  smooth_xx = filters.gaussian_filter(grad_x*grad_x, sigma)
  smooth_xy = filters.gaussian_filter(grad_x*grad_y, sigma)
  smooth_yy = filters.gaussian_filter(grad_y*grad_y, sigma)
  response = (smooth_xx*smooth_yy - smooth_xy**2)/(smooth_xx + smooth_yy)

  # Comparing gives a boolean image; summing it (True == 1, False == 0) counts
  # the pixels that are corners (or at least more corner-ish).
  return sum(response > corner_threshold)

In [None]:
def corner_to_text(vid_path, sigma, corner_threshold):
  """Count the Harris corners in every frame of a video.

  vid_path:         path of the video file to read
  sigma:            Gaussian sigma passed on to harris_corner_counter
  corner_threshold: Harris value above which a pixel counts as a corner

  Returns a DataFrame with one row per decoded frame and the columns "Frame"
  (0-based position in decoding order) and "Corners" (corner count).
  """
  video = cv2.VideoCapture(vid_path)
  corner_list = []

  try:
    while True:
      success, frame = video.read()
      if not success:
        break
      corner_list.append(harris_corner_counter(frame, sigma, corner_threshold))
  finally:
    # Always hand the file back, even if a frame could not be processed.
    video.release()

  # Number the frames from what was actually decoded. The frame count the
  # container reports can differ from the number of readable frames, and
  # columns of different lengths would make the DataFrame constructor fail.
  frame_list = list(range(len(corner_list)))

  return pd.DataFrame(data = {"Frame": frame_list,
                              "Corners": corner_list})

def calc_peak_frames(corner_df, fps = 30):
  """Pick the frames whose corner counts are peaks, at most one per second.

  corner_df: DataFrame with a "Corners" column holding one corner count per
             frame (as produced by corner_to_text)
  fps:       frames per second of the video; the number of rows divided by
             fps (rounded down) is the maximum number of peaks returned

  Returns the array of row positions of the selected peaks.
  """
  corners = corner_df['Corners']

  # Want a peak no more than every second
  desired_peaks = int(corner_df.shape[0]/fps)

  prominence = 0
  distance = 2
  peaks, _ = find_peaks(x = corners,
                        prominence = prominence,
                        distance = distance)

  # Too many peaks: demand more prominence and more spacing, then try again.
  while len(peaks) > desired_peaks:
    prominence += .25
    distance += 6
    peaks, _ = find_peaks(x = corners,
                          prominence = prominence,
                          distance = distance)

  return peaks

def stack_peak_frames(peak_seq, vid_path):
  """Save the peak frames of a video and stack them into one tall JPEG.

  peak_seq: frame numbers to grab from the video
  vid_path: path of the video file

  Each frame is written to /content/temp/<frame number>.jpg and the frames are
  then pasted underneath each other, in frame order, into
  /content/temp/key_frames.jpg. Reading stops at the first frame that cannot
  be decoded; the frames read up to that point are still stacked.

  Returns the path of the stacked image.
  Raises ValueError if not a single frame could be read.
  """
  temp_dir = "/content/temp/"
  # Create the absolute directory that is written to below (a relative
  # "mkdir temp" only matches when the working directory is /content).
  os.makedirs(temp_dir, exist_ok = True)

  video = cv2.VideoCapture(vid_path)
  frame_paths = []
  try:
    for frame_no in peak_seq:
      video.set(cv2.CAP_PROP_POS_FRAMES, int(frame_no))
      success, frame_im = video.read()
      if not success:
        break
      frame_path = temp_dir + "{:07d}.jpg".format(int(frame_no))
      cv2.imwrite(frame_path, frame_im)
      frame_paths.append(frame_path)
  finally:
    video.release()

  if not frame_paths:
    raise ValueError("No frames could be read from {}".format(vid_path))

  # Only stack the frames written by this call; anything else lying around in
  # the temp directory (e.g. an earlier key_frames.jpg) must not be included.
  # The zero-padded names make the sorted order the frame order.
  frame_paths.sort()
  images = [Image.open(frame_path) for frame_path in frame_paths]
  try:
    widths, heights = zip(*(i.size for i in images))

    max_width = max(widths)
    total_height = sum(heights)

    stack_im = Image.new('RGB', (max_width, total_height))

    y_offset = 0
    for im in images:
      stack_im.paste(im, (0,y_offset))
      y_offset += im.size[1]
  finally:
    for im in images:
      im.close()

  im_path = temp_dir + 'key_frames.jpg'
  stack_im.save(im_path)

  return im_path
  

def text_coords_list(path):
  """Run Google Cloud Vision text detection on an image file.

  path: path of the image to send to the API

  Returns a list with one [description, vertices] entry per text annotation in
  the response, where vertices is the list of (x, y) corners of the
  annotation's bounding polygon.

  Side effect: once the annotations have been collected, the /content/temp
  directory is deleted.

  Raises RuntimeError if the API response carries an error message; the temp
  directory is left in place in that case so the request can be retried.
  """
  import shutil

  with io.open(path, 'rb') as image_file:
    content = image_file.read()

  client = vision.ImageAnnotatorClient()
  image = vision.Image(content=content)

  response = client.text_detection(image=image)
  if response.error.message:
    # Do not let a failed request pass for an image without any text.
    raise RuntimeError(
        "Vision API text detection failed: {}".format(response.error.message))

  text_loc_list = [
      [text.description,
       [(vertex.x,vertex.y) for vertex in text.bounding_poly.vertices]]
      for text in response.text_annotations
  ]

  # Same effect as "rm -rf /content/temp", without needing notebook magic.
  shutil.rmtree("/content/temp", ignore_errors = True)

  return text_loc_list

def group_text_by_frame(text_list, vid_path):
  """Join the text elements of a stacked image into one string per frame.

  text_list: list of [description, vertices] elements as returned by
             text_coords_list
  vid_path:  path of the video the stacked image was built from; only its
             frame height is read

  An element belongs to the frame given by the y coordinate of its fourth
  vertex divided by the frame height. Returns the list of space-joined
  strings, one per frame, in the order the frames first occur in text_list.

  Raises ValueError if there is text to group but the frame height of the
  video cannot be read.

  NOTE(review): every element is binned, including any annotation whose
  bounding box spans several frames -- confirm that text_list holds only
  per-word annotations.
  """
  video = cv2.VideoCapture(vid_path)
  frame_h = video.get(cv2.CAP_PROP_FRAME_HEIGHT)
  video.release()

  if text_list and not frame_h:
    raise ValueError(
        "Could not read the frame height of {}".format(vid_path))

  text_dict = {}
  for element in text_list:
    # Determine which frame the text belongs to
    frame_to_add_to = int(element[1][3][1]/frame_h)
    text_dict.setdefault(frame_to_add_to, []).append(element[0])

  return [' '.join(words) for words in text_dict.values()]

def group_frames(text_list, threshold):
  """Group frame texts that are near-duplicates of each other.

  text_list: list of strings, one per frame
  threshold: minimum fuzzywuzzy score (0-100) for a text to join the group of
             the current query text

  A not-yet-grouped text is picked at random as the query, every remaining
  text scoring at least `threshold` against it joins its group, and this
  repeats until every text is in a group.

  Returns a dict mapping "Group <n>" to the list of texts in that group (the
  query text first).
  """
  random.seed()

  group_dict = {}
  group = 0
  index_list = list(range(len(text_list)))

  while index_list:
    index = random.choice(index_list)
    index_list.remove(index)

    query = text_list[index]
    members = [query]

    # Key the choices by their position in text_list: with dict-like choices
    # process.extract returns (text, score, key) triples, so the key says
    # which entries to take out of index_list.
    choices = {i: text_list[i] for i in index_list}
    if choices:
      similarities = process.extract(query = query, choices = choices,
                                     limit = len(choices))
      for text, score, key in similarities:
        # Results are sorted by score, so the first miss ends the group.
        if score < threshold:
          break
        members.append(text)
        index_list.remove(key)

    group_dict["Group {}".format(group)] = members
    group += 1

  return group_dict

### Fold up

In [None]:
! rm -rf /content/temp

In [None]:
# Check how many frames OpenCV reports for the test video.
video = cv2.VideoCapture("/content/drive/MyDrive/AFoote/videos/x_241249350213043.mp4")
video.get(cv2.CAP_PROP_FRAME_COUNT)


240.0

In [None]:
# Run the whole "not final" pipeline on one test video: count the corners per
# frame, pick the peak frames, stack them into one image, OCR that image,
# regroup the text by frame and finally group frames with similar text.
df = corner_to_text(vid_path = "/content/drive/MyDrive/AFoote/videos/x_241249350213043.mp4",
                    sigma = 2,
                    corner_threshold = .075)
peaks = calc_peak_frames(corner_df = df)
im_path = stack_peak_frames(peak_seq = peaks,
                            vid_path = "/content/drive/MyDrive/AFoote/videos/x_241249350213043.mp4")
tl_sep = text_coords_list(path = im_path)
tl_join = group_text_by_frame(text_list = tl_sep,
                         vid_path = "/content/drive/MyDrive/AFoote/videos/x_241249350213043.mp4")
group_dict = group_frames(text_list = tl_join, threshold = 90)



[('TEXT "TRUMP" TO 88022 2020 CENSUS DON\'T LET THE FAKE NEWS AND DEMOCRATS DEFEAT PRESIDENT TRUMP! TAKE THE SURVEY PAID FOR BY TRUMP MAKE AMERICA GREAT AGAIN COMMITE.|', 100), ('TEXT "TRUMP" TO 88022 2020 CENSUS DON\'T LET THE FAKE NEWS AND DEMOCRATS DEFEAT PRESIDENT TRUMP! TAKE THE SURVEY PAIO FOR BY TRUMP MAKE AMERICA GREAT AGAIN COMMITTEE', 99), ('TEXT "TRUMP" TO 88022 2020 CENSUS DON\'T LET THE FAKE NEWS AND DEMOCRATS DEFEAT PRESIDENT TRUMP! TAKE THE SURVEY PAID FOR BY TAUMP MAKE AMERICA GREAT AGAIN COMMITEE', 99), ('TEXT "TRUMP" TO 88022 2020 CENSUS DON\'T LET THE FAKE NEWS AND', 90), ('TEXT "TRUMP" TO 88022 2020 CENSUS DON\'T LET THE FAKE NEWS AND DEMOCRATS DEFEAT PRESIDENT', 90), ('TEXT "TRUMP" TO 88022\n2020\nCENSUS\nDON\'T LET\nTHE FAKE\nNEWS AND\nTEXT "TRUMP" TO 88022\n2020\nCENSUS\nDON\'T LET\nTHE FAKE\nNEWS AND\nDEMOCRATS\nDEFEAT\nPRESIDENT\nTEXT "TRUMP" TO 88022\n2020\nCENSUS\nDON\'T LET\nTHE FAKE\nNEWS AND\nDEMOCRATS\nDEFEAT\nPRESIDENT\nTRUMP!\nTAKE THE SURVEY\nPAIO FOR 

In [None]:
tl_join

['TEXT "TRUMP" TO 88022\n2020\nCENSUS\nDON\'T LET\nTHE FAKE\nNEWS AND\nTEXT "TRUMP" TO 88022\n2020\nCENSUS\nDON\'T LET\nTHE FAKE\nNEWS AND\nDEMOCRATS\nDEFEAT\nPRESIDENT\nTEXT "TRUMP" TO 88022\n2020\nCENSUS\nDON\'T LET\nTHE FAKE\nNEWS AND\nDEMOCRATS\nDEFEAT\nPRESIDENT\nTRUMP!\nTAKE THE SURVEY\nPAIO FOR BY TRUMP MAKE AMERICA GREAT AGAIN COMMITTEE\nTEXT "TRUMP" TO 88022\n2020\nCENSUS\nDON\'T LET\nTHE FAKE\nNEWS AND\nDEMOCRATS\nDEFEAT\nPRESIDENT\nTRUMP!\nTAKE THE SURVEY\nPAID FOR BY TAUMP MAKE AMERICA GREAT AGAIN COMMITEE\nTEXT "TRUMP" TO 88022\n2020\nCENSUS\nDON\'T LET\nTHE FAKE\nNEWS AND\nDEMOCRATS\nDEFEAT\nPRESIDENT\nTRUMP!\nTAKE THE SURVEY\nPAID FOR BY TRUMP MAKE AMERICA GREAT AGAIN COMMITE.|\nTEXT "TRUMP" TO 88022\n2020\nCENSUS\nDON\'T LET\nTHE FAKE\nNEWS AND\nDEMOCRATS\nDEFEAT\nPRESIDENT\nTRUMP!\nTAKE THE SURVEY\nPAID FOR BY TRUMP MAKE AMERICA GREAT AGAIN COMMITEE\nTEXT "TRUMP" TO 88022\n2020\nCENSUS\nDON\'T LET\nTHE FAKE\nNEWS AND\nDEMOCRATS\nDEFEAT\nPRESIDENT\nTRUMP!\nTAKE THE SURV

# Final Version First Part

In [None]:
import cv2
from scipy.ndimage import filters
from numpy import *
import pandas as pd
from scipy.signal import find_peaks
from PIL import Image
import re
import os

In [None]:
def _count_harris_corners(frame, sigma, corner_threshold):
  """Return how many pixels of a BGR frame have a Harris value above corner_threshold."""
  gray = cv2.cvtColor(frame,cv2.COLOR_BGR2GRAY)
  gray_blur = filters.gaussian_filter(gray,sigma)

  # Gaussian derivative filters give the x and y gradients of the image.
  imx = zeros(gray_blur.shape)
  filters.gaussian_filter(gray_blur, (sigma,sigma), (0,1), imx)

  imy = zeros(gray_blur.shape)
  filters.gaussian_filter(gray_blur, (sigma,sigma), (1,0), imy)

  Ixx = filters.gaussian_filter(imx*imx,sigma)
  Ixy = filters.gaussian_filter(imx*imy,sigma)
  Iyy = filters.gaussian_filter(imy*imy,sigma)

  har_val_im = (Ixx*Iyy - Ixy**2)/(Ixx + Iyy)
  return sum(har_val_im > corner_threshold)


def _select_key_frames(corner_list, fps):
  """Return the frame numbers with peak corner counts, about one per second at most."""
  # Always allow one key frame, so clips shorter than a second still produce
  # an image.
  desired_peaks = max(1, int(len(corner_list)/fps))

  prominence = 0
  distance = 2
  peaks, _ = find_peaks(x = corner_list,
                        prominence = prominence,
                        distance = distance)
  # Too many peaks: demand more prominence and more spacing, then try again.
  while len(peaks) > desired_peaks:
    prominence += .25
    distance += 6
    peaks, _ = find_peaks(x = corner_list,
                          prominence = prominence,
                          distance = distance)

  if len(peaks) == 0:
    # No peak at all (for example when the corner count never changes): fall
    # back to the frame with the most corners rather than stacking nothing.
    return [corner_list.index(max(corner_list))]
  return [int(peak) for peak in peaks]


def corner_to_stack(video_path_dir, target_directory, bookkeep_file_path,
                   fps = 30, sigma = 2, corner_threshold = .075):
  """Stack the key frames of every video in a directory into one JPEG each.

  video_path_dir:     directory holding the videos
  target_directory:   directory the stacked images are written to
  bookkeep_file_path: CSV file (columns "Video" and "Frame Height") written
                      with one row per stacked video; the frame height is
                      needed to assign the OCR text to frames afterwards
  fps:                frames per second used to limit the key frames to
                      about one per second of video
  sigma:              Gaussian sigma for the Harris corner computation
  corner_threshold:   Harris value above which a pixel counts as a corner

  For every video the corners of each frame are counted, the frames with
  peak corner counts are chosen and pasted underneath each other, and the
  result is saved under the video's name with a .jpg extension. Files from
  which no frame can be read are reported and skipped.

  Returns None.
  """
  with open(bookkeep_file_path, "w") as bookkeep:
    bookkeep.write("Video,Frame Height\n")

    for video_name in os.listdir(video_path_dir):
      video_name = video_name.rstrip()
      video_path = video_path_dir + "/" + video_name

      # Count the corners in every frame of the video.
      video = cv2.VideoCapture(video_path)
      # NOTE(review): every frame is still decoded by the loop below; setting
      # CAP_PROP_FPS on a file-backed capture appears to have no effect, so
      # fps only limits the number of key frames -- confirm.
      video.set(cv2.CAP_PROP_FPS, fps)
      corner_list = []
      try:
        while True:
          success,frame = video.read()
          if not success:
            break
          corner_list.append(_count_harris_corners(frame, sigma,
                                                   corner_threshold))
      finally:
        video.release()

      if not corner_list:
        # Not a readable video (or an empty one): there is nothing to stack.
        print("Could not read any frames from {}".format(video_path))
        continue

      peaks = _select_key_frames(corner_list, fps)

      # Grab the key frames from the video and stack them into one image.
      video_stack = cv2.VideoCapture(video_path)
      video_stack.set(cv2.CAP_PROP_FPS, fps)
      try:
        frame_width = int(video_stack.get(cv2.CAP_PROP_FRAME_WIDTH))
        frame_height = int(video_stack.get(cv2.CAP_PROP_FRAME_HEIGHT))

        stack_im = Image.new('RGB', (frame_width, frame_height*len(peaks)))
        y_offset = 0

        for indices in peaks:
          video_stack.set(cv2.CAP_PROP_POS_FRAMES, indices)
          success, im = video_stack.read()
          if not success:
            print("Could not read frame {} in the video located at {}".format(indices,
                                                                              video_path))
            break
          im = Image.fromarray(cv2.cvtColor(im,cv2.COLOR_BGR2RGB))
          stack_im.paste(im, (0,y_offset))
          y_offset += im.size[1]
      finally:
        video_stack.release()

      # The stacked image is named after the video, with a .jpg extension.
      im_file_name = os.path.splitext(video_name)[0] + ".jpg"
      stack_im.save(os.path.join(target_directory, im_file_name))

      # Record the video only once its stacked image exists.
      bookkeep.write("{},{}\n".format(video_name, frame_height))

      print(im_file_name)
      print(video_path + " key frames stacked")

  return

In [None]:
# Stack the key frames of every video in the videos folder and record each
# video's frame height in /content/cbk.txt.
corner_to_stack(video_path_dir = "/content/drive/MyDrive/AFoote/videos",
               target_directory = "/content/drive/MyDrive/AFoote/More Stacked Images/", 
               bookkeep_file_path = "/content/cbk.txt")

In [None]:
# For testing image stacking for trump video
# (writes the stacked image to /content/ and overwrites /content/cbk.txt)
corner_to_stack(video_path_dir = "/content/drive/MyDrive/AFoote/trump video",
               target_directory = "/content/", 
               bookkeep_file_path = "/content/cbk.txt", 
               fps = 30)



/content/drive/MyDrive/AFoote/trump video/x_241249350213043.mp4 key frames stacked


Still to Do:
Need to implement the bookkeeping stuff where a CSV is used to keep track of properties of the video that are needed once the OCR has been run on all of the videos (done)

Also figure out the issue with saving the file and the erroneous file name (done)

Something that doesn't really make sense to me is that the run time of the function doesn't seem to depend on the frames per second at which the videos are read. When the videos are read at 5, 10, 15, and 30 fps, the function stacks the frames of the four videos and writes to the txt file in 94.598, 92.951, 94.114, and 94.225 seconds respectively. A likely explanation (still to be confirmed) is that setting the FPS property on a capture opened from a file does not change how the frames are decoded, so every frame is read and processed no matter what fps is given, and fps only changes how many peaks are kept. Because the number of peaks, and therefore the number of frames stacked, depends on the length of the video, a higher fps also doesn't increase the size of the output images after stacking the frames. For that reason, increasing fps doesn't seem to have any performance downsides that I can tell, although I haven't done a very in-depth analysis.

# Final Version Second Part

The necessary inputs are the JSON file with the text elements and their bounding boxes, the name of the video, and the output file to which the text in the video along with the name of the video will be written as a CSV. One thing I am not sure of right now is how I'm going to know which video I am looking at currently. Maybe the JSON file includes the image from which it was generated?

In [None]:
import pandas as pd
import json
import os

! pip install fuzzywuzzy
! pip install python-Levenshtein

from fuzzywuzzy import fuzz
from fuzzywuzzy import process
import random

In [None]:
#@title Use if the order of the words in the output do not need to be the same as the video
def stack_to_text(json_file_dir, bookkeep_file_path, output_file_path,
                  similarity_threshold = 97):
  """Turn the OCR output of stacked key-frame images into text per video.

  Use this version if the order of the words in the output does not need to
  be the same as in the video.

  json_file_dir:        directory with one OCR JSON file per stacked image,
                        named after the video it belongs to
  bookkeep_file_path:   CSV with the columns "Video" and "Frame Height"
  output_file_path:     CSV the result is written to (index "Video", columns
                        "Frame Height" and "Text")
  similarity_threshold: minimum fuzzywuzzy score (0-100) for two frames to be
                        put in the same group

  For every video the words are assigned to frames by the y coordinate of
  their bounding box, frames with similar text are grouped, and each group is
  reduced to its most common words. A video's "Text" entry is the list of
  those strings, one per group; videos without a JSON file get an empty list.

  Returns None.
  """
  import re  # this notebook section's import cell does not import re

  nh_df = pd.read_csv(bookkeep_file_path)
  nh_df = nh_df.set_index("Video", drop = False)

  text_by_video = {}
  for json_file in os.listdir(json_file_dir):
    # Load the json file as a dictionary
    with open(json_file_dir + "/" + json_file, encoding = "utf-8") as f:
      j_dict = json.load(f)

    # Determine the frame height for the video's json file being examined
    vid_name = json_file.split(".")[0] + ".mp4"
    frame_height = nh_df.loc[vid_name,"Frame Height"]

    # The first annotation is skipped; the rest are binned by frame. An image
    # without any text has no "textAnnotations" entry at all.
    texts = j_dict["responses"][0].get("textAnnotations", [])[1:]

    text_dict = {}
    for text in texts:
      # Determine which frame the text belongs to. Coordinates equal to zero
      # can be left out of the JSON, hence the default.
      bottom_y = text['boundingPoly']['vertices'][3].get('y', 0)
      frame_to_add_to = int(bottom_y/frame_height)
      text_dict.setdefault(frame_to_add_to, []).append(text['description'])

    frame_texts = [' '.join(words) for words in text_dict.values()]

    # Group the frames based on the similarity of the text in the frames
    # relative to other frames. A random ungrouped frame is the query; every
    # ungrouped frame scoring at least the threshold against it joins its
    # group.
    random.seed()
    groups = []
    ungrouped = list(range(len(frame_texts)))

    while ungrouped:
      index = random.choice(ungrouped)
      ungrouped.remove(index)
      query = frame_texts[index]
      members = [query]

      # With dict-like choices process.extract returns (text, score, key).
      choices = {i: frame_texts[i] for i in ungrouped}
      if choices:
        similarities = process.extract(query = query, choices = choices,
                                       limit = len(choices))
        for match_text, score, key in similarities:
          # Results are sorted by score, so the first miss ends the group.
          if score < similarity_threshold:
            break
          members.append(match_text)
          ungrouped.remove(key)

      # Record the group unconditionally, so the last one is not lost.
      groups.append(members)

    # Find the most common words in each group. Also calculate the average
    # length of string in each group, rounding down. Output that many of the
    # most common words as the string for the frames in that group.
    s_group = []
    for members in groups:
      hist_dict = {}
      total_words = 0
      for s in members:
        # Get rid of non alpha-numeric characters and of a leading space.
        s = re.sub(pattern = r'[^a-zA-Z\d\s]', repl = "", string = s)
        s = re.sub(pattern = r'^ ', repl = "", string = s)

        s_list = s.split()
        total_words += len(s_list)
        for words in s_list:
          hist_dict[words] = hist_dict.get(words, 0) + 1
      avg_len = int(total_words/len(members))

      hist_dict_sorted = sorted(hist_dict.items(), key = lambda x: x[1],
                                reverse = True)
      s_group.append("".join([words[0] + " "
                              for words in hist_dict_sorted[:avg_len]]))

    text_by_video[vid_name] = s_group

  # Match the text to the bookkeeping rows by video name; the order in which
  # os.listdir returns the JSON files need not be the row order.
  nh_df['Text'] = [text_by_video.get(video, []) for video in nh_df["Video"]]
  nh_df = nh_df.drop(columns = ['Video'])
  nh_df.to_csv(output_file_path)

In [None]:
#title Use If Output Text Must be in Same Order as Video
def _frame_texts_in_order(annotations, frame_height):
  """Join word annotations into one string per stacked frame, in order of first appearance."""
  words_by_frame = {}
  for text in annotations:
    # The y coordinate of the fourth vertex decides the frame. Coordinates
    # equal to zero can be left out of the JSON, hence the default.
    bottom_y = text['boundingPoly']['vertices'][3].get('y', 0)
    frame_to_add_to = int(bottom_y/frame_height)
    words_by_frame.setdefault(frame_to_add_to, []).append(text['description'])
  return [' '.join(words) for words in words_by_frame.values()]


def _remove_banner_words(string_list):
  """Drop words that hold the same word position in over 60% of the strings; empty results are removed."""
  split_strings = [strings.split() for strings in string_list]
  min_length = min([len(words) for words in split_strings])

  cross_group_words = []
  for i in range(0,min_length):
    hist = {}
    for words in split_strings:
      hist[words[i]] = hist.get(words[i], 0) + 1

    top_word, top_count = sorted(hist.items(), key=lambda item: item[1],
                                 reverse= True)[0]
    if top_count > len(string_list)*.6:
      cross_group_words.append(top_word)

  new_string_list = []
  for list_el in split_strings:
    for keys in cross_group_words:
      if keys in list_el:
        list_el.remove(keys)
    new_string_list.append(" ".join(list_el))

  return [strings for strings in new_string_list if strings != ""]


def _group_similar_texts(frame_texts, threshold):
  """Group frame texts in order: each ungrouped text collects the ungrouped texts scoring at least threshold."""
  groups = []
  ungrouped = list(range(len(frame_texts)))

  while ungrouped:
    index = ungrouped.pop(0)
    query = frame_texts[index]
    members = [query]

    # Only offer frames that are not in a group yet. With dict-like choices
    # process.extract returns (text, score, key) triples.
    choices = {i: frame_texts[i] for i in ungrouped}
    if choices:
      similarities = process.extract(query = query, choices = choices,
                                     limit = len(choices))
      for match_text, score, key in similarities:
        # Results are sorted by score, so the first miss ends the group.
        if score < threshold:
          break
        members.append(match_text)
        ungrouped.remove(key)

    # Record the group unconditionally, so no group is lost.
    groups.append(members)

  return groups


def _majority_text(lists):
  """Reduce a group of strings to the words that win a majority at their word position."""
  if len(lists) == 1:
    return str(lists[0])

  split_strings = [strings.split() for strings in lists]
  min_length = min([len(words) for words in split_strings])

  group_text = []
  for i in range(0,min_length):
    hist = {}
    for words in split_strings:
      hist[words[i]] = hist.get(words[i], 0) + 1

    top_word, top_count = sorted(hist.items(), key = lambda item: item[1],
                                 reverse = True)[0]
    if top_count > len(lists)*0.5:
      group_text.append(top_word)

  return " ".join(group_text)


def _drop_contained_texts(texts):
  """Remove every string that is (almost) contained in another, longer string of the list."""
  dropped = set()
  for i, strings1 in enumerate(texts):
    for j in range(i + 1, len(texts)):
      if i in dropped:
        break
      strings2 = texts[j]
      if j in dropped or strings1 == strings2:
        continue
      if fuzz.partial_ratio(strings1, strings2) > 95:
        if len(strings1) > len(strings2):
          dropped.add(j)
        else:
          dropped.add(i)
  return [strings for k, strings in enumerate(texts) if k not in dropped]


def stack_to_text(json_file_dir, bookkeep_file_path, output_file_path,
                  similarity_threshold = 95):
  """Turn the OCR output of stacked key-frame images into ordered text per video.

  Use this version if the output text must be in the same order as the video.

  json_file_dir:        directory with one OCR JSON file per stacked image,
                        named after the video it belongs to
  bookkeep_file_path:   CSV with the columns "Video" and "Frame Height"
  output_file_path:     CSV the result is written to (index "Video", columns
                        "Frame Height" and "Text")
  similarity_threshold: minimum fuzzywuzzy score (0-100) for two frames to be
                        put in the same group

  For every video the words are assigned to frames by the y coordinate of
  their bounding box, frames with similar text are grouped, each group is
  reduced to one string by a per-position majority vote, strings contained in
  another string are dropped, and the rest is joined with "||".

  Returns the resulting DataFrame (which is also written to output_file_path).
  """
  import re  # this notebook section's import cell does not import re

  nh_df = pd.read_csv(bookkeep_file_path)
  nh_df = nh_df.set_index("Video", drop = False)
  nh_df["Text"] = [""] * nh_df.shape[0]

  for json_file in os.listdir(json_file_dir):
    # Load the json file as a dictionary
    with open(json_file_dir + "/" + json_file, encoding = "utf-8") as f:
      j_dict = json.load(f)

    # Determine the frame height for the video's json file being examined
    vid_name = json_file.split(".")[0] + ".mp4"
    frame_height = nh_df.loc[vid_name,"Frame Height"]

    # The first annotation is skipped; an image without any text has no
    # "textAnnotations" entry at all.
    texts = j_dict["responses"][0].get("textAnnotations", [])[1:]
    frame_texts = _frame_texts_in_order(texts, frame_height)

    # Words that are present throughout the entire video (usually some kind
    # of banner) raise the similarity between all frames. Removing them
    # allows a lower grouping threshold, so misreadings are more likely to be
    # grouped together. For short videos the cleaning also removes actual
    # words, so it only happens for videos with more than ten key frames.
    # The threshold is kept per video, so the caller's value is used for
    # every file.
    threshold = similarity_threshold
    if len(frame_texts) > 10:
      frame_texts = _remove_banner_words(frame_texts)
      threshold = min(similarity_threshold, 90)

    groups = _group_similar_texts(frame_texts, threshold)

    vid_text = []
    for lists in groups:
      # Remove pairs of adjacent characters that are neither alphanumeric
      # nor whitespace, then a leading space.
      cleaned = []
      for strings in lists:
        strings = re.sub(pattern = r'[^a-zA-Z\d\s][^a-zA-Z\d\s]',
                         repl = "",
                         string = strings)
        strings = re.sub(pattern = r'^ ', repl = "", string = strings)
        cleaned.append(strings)
      vid_text.append(_majority_text(cleaned))

    vid_text = _drop_contained_texts(vid_text)

    # dict.fromkeys drops exact duplicates while keeping the order.
    nh_df.loc[vid_name,"Text"] = "||".join(dict.fromkeys(vid_text))

  nh_df = nh_df.drop(columns = ['Video'])
  nh_df.to_csv(output_file_path)
  return nh_df

In [None]:
# Convert the OCR JSON files of the stacked images into one text entry per
# video and write the result to /content/cbk.csv.
nhdf = stack_to_text(json_file_dir = "/content/drive/MyDrive/AFoote/Stacked Frame Images",
              bookkeep_file_path = "/content/cbk.txt",
              output_file_path = "/content/cbk.csv")

In [None]:
# Quick check of how partial_ratio scores a string against a longer string
# that contains it. The recorded output below is a NameError, so `fuzz` was
# not defined when this cell ran.
fuzz.partial_ratio("dog", "dog eat dog world")

NameError: ignored

In [None]:
nhdf

Unnamed: 0_level_0,Frame Height,Text
Video,Unnamed: 1_level_1,Unnamed: 2_level_1
608582669747744.mp4,400,ALERT TIME IS RUNNING OUT TO RETURN YOUR BALLO...
x_241249350213043.mp4,400,"TEXT ""TRUMP"" TO 88022 2020 CENSUS DON'T LET TH..."
x_272336370493044.mp4,400,I'm Susan Collins. June 24th marks our pre-pri...
preserve_america_pac.mp4,224,JOHN GILLIS RET. LT LAPD||LOUARNA GILLIS||BIDE...


We care about keeping everything as ordered strings, so maybe concatenate the strings of each group into one string so that you have one string per group.

This whole grouping approach might need to be reworked. Maybe, before grouping, go through the words and get rid of all tokens that are not real words, and then group? (I did this in the implementation.)

In [None]:
''' Don't run so output is preserved!! 

This output is useful for getting an idea of how to index a level or two
deep into the json file

This output is the result of:
print(j_dict["responses"][0]["textAnnotations"][1:])
print("\n")

stack_to_text(json_file_dir = "/content/drive/MyDrive/AFoote/Stacked Frame Images",
              bookkeep_file_path = "/content/cbk.txt",
              output_file_path = "/content/drive")
'''

[{'description': 'BALLOT', 'boundingPoly': {'vertices': [{'x': 157, 'y': 89}, {'x': 237, 'y': 89}, {'x': 237, 'y': 104}, {'x': 157, 'y': 104}]}}, {'description': 'RETURN', 'boundingPoly': {'vertices': [{'x': 156, 'y': 105}, {'x': 237, 'y': 106}, {'x': 237, 'y': 121}, {'x': 156, 'y': 120}]}}, {'description': 'ALERT', 'boundingPoly': {'vertices': [{'x': 134, 'y': 204}, {'x': 266, 'y': 205}, {'x': 266, 'y': 236}, {'x': 134, 'y': 235}]}}, {'description': 'TIME', 'boundingPoly': {'vertices': [{'x': 81, 'y': 284}, {'x': 129, 'y': 284}, {'x': 129, 'y': 299}, {'x': 81, 'y': 299}]}}, {'description': 'IS', 'boundingPoly': {'vertices': [{'x': 139, 'y': 284}, {'x': 154, 'y': 284}, {'x': 154, 'y': 299}, {'x': 139, 'y': 299}]}}, {'description': 'RUNNING', 'boundingPoly': {'vertices': [{'x': 164, 'y': 284}, {'x': 266, 'y': 285}, {'x': 266, 'y': 301}, {'x': 164, 'y': 300}]}}, {'description': 'OUT', 'boundingPoly': {'vertices': [{'x': 275, 'y': 284}, {'x': 318, 'y': 284}, {'x': 318, 'y': 299}, {'x': 27