## Setup required for all notebooks & data preparation for VTT files
---------------------
*This notebook works best with the conda_python3 kernel on an ml.t3.medium machine*.

**This step of our solution design covers the setup steps that must be run before any other notebook.**

1. Prerequisite: a `Python 3.11` conda environment with the packages in `requirements.txt` installed.

1. In this notebook, we chapterize meeting transcripts provided as `VTT` files.

In [None]:
# import libraries
import os
import json
import yaml
import glob
import logging
import pandas as pd
from typing import List
from chapterize import chapterize
from IPython.display import display, HTML 

Set up a logger to log all messages while the code runs

In [None]:
# Configure logging at INFO level; each record carries the timestamp, process id,
# source file and line number, level name and the message itself.
logging.basicConfig(
    level=logging.INFO,
    format='[%(asctime)s] p%(process)s {%(filename)s:%(lineno)d} %(levelname)s - %(message)s',
)

# Logger named after the current module, used by the cells that follow.
logger = logging.getLogger(__name__)

In [None]:
 ## load the config file
# Relative path, so the file is resolved against the current working directory when it is opened.
CONFIG_FILE_PATH: str = "config.yml"

In [None]:
 # read the config yaml file
fpath = CONFIG_FILE_PATH
# Read the file as UTF-8 explicitly so parsing does not depend on the platform's default encoding.
with open(fpath, 'r', encoding='utf-8') as yaml_in:
    # safe_load builds only plain Python objects (dicts, lists, scalars) from the YAML document
    config = yaml.safe_load(yaml_in)
# Echo the parsed configuration so each run records the settings it used
logger.info(f"config read from {fpath} -> {json.dumps(config, indent=2)}")

In [None]:
def process_vtt_data(vtt_file: str, max_chapter_length: int, header_lines_to_skip: int = 2, num_lines_per_message: int = 3) -> pd.DataFrame:
    """
    Convert a single VTT transcript into a data frame with one row per message.

    After the header lines are skipped and blank lines are dropped, the file is read as
    consecutive groups of `num_lines_per_message` lines. Within a group the first line is
    expected to look like `"<name>" (<speaker id>)`, the second like `<start> --> <end>`,
    and the last line is taken as the spoken text. Messages are numbered into chapters
    starting at 1, moving to the next chapter after every `max_chapter_length` messages.

    Args:
        vtt_file: path of the VTT file to parse.
        max_chapter_length: number of messages after which the chapter id is incremented.
        header_lines_to_skip: number of leading lines of the file to discard.
        num_lines_per_message: number of non-blank lines that make up one message.

    Returns:
        A data frame with the columns file_name, chapter_id, name, speaker_id, start_time,
        end_time and text. The name and text values are wrapped in double quotes. The frame
        is empty when the file holds no complete message.

    Raises:
        IndexError: if a name line has no quoted name or a timing line has no ' --> '.
    """
    logger.info(f"vtt_file=\"{vtt_file}\", max_chapter_length={max_chapter_length}, header_lines_to_skip={header_lines_to_skip}")
    # WebVTT files are UTF-8 encoded; be explicit instead of relying on the locale default.
    with open(vtt_file, encoding='utf-8') as f:
        lines = [line.rstrip() for line in f]
    # Drop the header, then every blank line, so messages sit back to back.
    desired_lines = [line for line in lines[header_lines_to_skip:] if line]
    file_name = os.path.basename(vtt_file)

    num_messages = len(desired_lines) // num_lines_per_message
    leftover = len(desired_lines) - num_messages * num_lines_per_message
    if leftover > 0:
        # Trailing lines that do not form a whole message are not parsed; say so instead of hiding it.
        logger.warning(f"ignoring {leftover} trailing line(s) in \"{file_name}\" that do not form a complete message")

    array_of_dicts = []
    chapter_count = 1
    for n in range(num_messages):
        offset = n * num_lines_per_message
        # '"Name" (id)' split on '"' gives ['', 'Name', ' (id)']
        speaker_parts = desired_lines[offset].split('"')
        timing_parts = desired_lines[offset + 1].split(' --> ')
        array_of_dicts.append({
            'file_name': file_name,
            'chapter_id': chapter_count,
            'name': f'"{speaker_parts[1]}"',
            'speaker_id': speaker_parts[2].strip().strip('()'),
            'start_time': timing_parts[0],
            'end_time': timing_parts[1].strip(),
            # the text is always the last line of the message group
            'text': f'"{desired_lines[offset + num_lines_per_message - 1]}"',
        })
        # Start a new chapter once the current one has collected max_chapter_length messages.
        if (n + 1) % max_chapter_length == 0:
            chapter_count += 1
    df = pd.DataFrame(array_of_dicts)
    logger.info(f"Converted {vtt_file} into dataframe of shape={df.shape} after processing vtt file=\"{file_name}\"")
    return df

#### To run this cell, ensure that the `file_type_to_process` in the `dir` section of the config file is set to `vtt`

In [None]:
# Parse every raw transcript of the configured file type and combine the results into one CSV

# Create the output directory if it doesn't exist
os.makedirs(config['dir']['processed'], exist_ok=True)

# Every file in the raw directory whose extension matches the configured file type
raw_files = glob.glob(os.path.join(config['dir']['raw'], f"*.{config['dir']['file_type_to_process']}"))
logger.info(f"there are {len(raw_files)} files to be processed ")
if not raw_files:
    # pd.concat fails with "No objects to concatenate" on an empty list; name the real cause instead
    raise ValueError(f"no *.{config['dir']['file_type_to_process']} files found in \"{config['dir']['raw']}\"")

max_chapter_length = config['title_generation_thresholds']['max_chapter_length']
df_list: List[pd.DataFrame] = [process_vtt_data(f, max_chapter_length) for f in raw_files]
# NOTE(review): each per-file frame keeps its own default index, so index labels repeat in the
# combined frame; this does not affect the CSV below because it is written with index=False.
df = pd.concat(df_list)

fpath: str = os.path.join(config['dir']['processed'], config['dir']['processed_file'])
# header takes a bool (or a list of column aliases); the string 'true' only worked because it is truthy
df.to_csv(fpath, encoding='utf-8', header=True, index=False)
logger.info(df.head())
logger.info(f"Final processed dataframe of shape={df.shape} written to \"{fpath}\"")

In [None]:
# Run the combined frame through `chapterize` (imported from the local chapterize module)
# and write the result next to the processed file.
df = chapterize(df)
fpath: str = os.path.join(config['dir']['processed'], config['dir']['chapterized_file'])
# header takes a bool (or a list of column aliases); the string 'true' only worked because it is truthy
df.to_csv(fpath, encoding='utf-8', header=True, index=False)
logger.info(df.head())
logger.info(f"Final processed dataframe of shape={df.shape} written to \"{fpath}\"")