## Data Preparation for already chapterized data 
---------------------
*This notebook works best with the `conda_python3` kernel on an `ml.t3.medium` machine*.

**Run this step if the data that is being used is already in chapterized format.**

#### Prerequisites:

1. Set up a `Python 3.11` conda environment with the packages in `requirements.txt` installed.

1. Set `file_type_to_process` in the `dir` section of the [`config.yml`](config.yml) file to the type of file to process (for example, `json`).

1. This notebook has been tested on `json` files that contain an array of `already chapterized data` and the `original full meeting transcript`.

#### Run this notebook if a JSON file already contains chapterized data

In [None]:
# import libraries
import os
import json
import yaml
import glob
import logging
import pandas as pd
from typing import List
from IPython.display import display, HTML 

Set up a logger to log all messages while the code runs

In [None]:
# Configure logging for the notebook: every record carries a timestamp,
# the process id, the source file/line and the level ahead of the message.
logging.basicConfig(
    level=logging.INFO,
    format='[%(asctime)s] p%(process)s {%(filename)s:%(lineno)d} %(levelname)s - %(message)s',
)

# Logger used by all the cells below.
logger = logging.getLogger(__name__)

In [None]:
 # Path of the YAML config file that the next cell loads
CONFIG_FILE_PATH: str = "config.yml"

In [None]:
 # Read the YAML config file into the `config` dict used by the cells below
fpath = CONFIG_FILE_PATH
# Open as UTF-8 explicitly (the JSON inputs are read the same way) so that
# parsing does not depend on the platform's default encoding.
with open(fpath, 'r', encoding='utf-8') as yaml_in:
    config = yaml.safe_load(yaml_in)
# default=str keeps this log line from raising TypeError when the config holds
# values json cannot encode natively (e.g. dates parsed by the YAML loader).
logger.info(f"config read from {fpath} -> {json.dumps(config, indent=2, default=str)}")

In [None]:
def process_json_data(json_file: str) -> pd.DataFrame:
    """
    Load a single chapterized json file and return its chapters as a dataframe.

    The value under the file's 'chapters' key is flattened with
    pd.json_normalize. The returned dataframe contains
    1. 'file_name': the base name of the chapterized file
    2. 'chapter_id': the position of the chapter within the file (0-based)
    3. 'text': the chapter content, renamed from the 'transcript' field
    plus any other per-chapter fields present in the file. The 'start_ms'
    field is dropped when present.

    Args:
        json_file: path to the json file to process.

    Returns:
        A dataframe with one row per chapter. A file with an empty 'chapters'
        list yields an empty dataframe with only the 'file_name' and
        'chapter_id' columns.

    Raises:
        KeyError: if the file has no top-level 'chapters' key.
    """
    logger.info(f"json_file=\"{json_file}\"")
    with open(json_file, 'r', encoding='utf-8') as file_object:
        data = json.load(file_object)
    file_name = os.path.basename(json_file)
    logger.info(f"Name of the file: {file_name}")
    df = pd.json_normalize(data['chapters'])
    # Log only the row count: dumping the whole payload floods the notebook
    # output and writes full transcript text into the logs.
    logger.info(f"chapter_data: {len(df)} chapter rows parsed")
    df.insert(0, 'file_name', file_name)
    df.insert(1, 'chapter_id', df.index)
    # errors='ignore' so that files without a 'start_ms' field (including files
    # with an empty chapter list) do not fail with a KeyError.
    df = (
        df.drop(columns=['start_ms'], errors='ignore')
          .rename(columns={'transcript': 'text'})
    )
    return df

In [None]:
# Convert every raw file of the configured type into one combined CSV.
# Create the output directory if it doesn't exist
os.makedirs(config['dir']['processed'], exist_ok=True)
df_list: List[pd.DataFrame] = []
# glob returns matches in arbitrary order; sort so the row order of the output
# file is the same on every run and every machine.
raw_files = sorted(glob.glob(os.path.join(config['dir']['raw'], f"*.{config['dir']['file_type_to_process']}")))
logger.info(f"there are {len(raw_files)} files to be processed ")

if config['dir']['file_type_to_process'] == 'json':
    if not raw_files:
        # pd.concat fails with an opaque "No objects to concatenate" on an empty
        # list; raise the same exception type with a message naming the cause.
        raise ValueError(f"no json files found in \"{config['dir']['raw']}\", nothing to process")
    df_list = [process_json_data(raw_file) for raw_file in raw_files]
    df = pd.concat(df_list)
    fpath: str = os.path.join(config['dir']['processed'], config['dir']['chapterized_file'])
    # header must be a bool (or a list of column aliases), not the string 'true'
    df.to_csv(fpath, encoding='utf-8', header=True, index=False)
    logger.info(df.head())
    logger.info(f"Final processed dataframe of shape={df.shape} written to \"{fpath}\"")
else:
    # Only json input is handled by this notebook; say so instead of silently doing nothing.
    logger.warning(f"file type \"{config['dir']['file_type_to_process']}\" is not supported by this notebook, no files were processed")