In [33]:
def breakdown_gc():
    """
    This function takes all of the webscraping, cleaning, and manipulating I have done in other notebooks, and combines the entire process into one huge function that asks for significantly more user input. 
    """


    ##### Setup #####
    


    # import necessary packages
    import pandas as pd
    import requests
    from bs4 import BeautifulSoup
    import os
    import time
    from selenium import webdriver
    from selenium.webdriver.chrome.service import Service
    from selenium.webdriver.common.by import By
    from selenium.webdriver.chrome.options import Options
    import glob
    from selenium.webdriver.chrome.service import Service as ChromeService
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC
    from webdriver_manager.chrome import ChromeDriverManager
    import re
    from datetime import datetime, timedelta

    # set working directory for accessing and saving files
    os.chdir('d:\\Faith and Religion Stuff\\Come, Follow Me\\come-follow-me-breakdown-builder')

    # ask for user input for link
    conf_link = input('Please paste the link to the landing page of the conference you would like to breakdown.')

    # ask for user input for year and month of conference
    month = input('Please enter whether the conference was held in April or October: ').lower().replace('il','',1).replace('ober','',1)
    year = input('Please enter the year of the conference: ').strip().replace('20','',1)
    month_year = month + year

    # ask for user input for the start date and end dates
    ini_start = str(input("What day would you like to start reading?"))
    ini_end = str(input("What day would you like to stop reading?"))

    # Convert user input into datetime objects
    ini_start_date = pd.to_datetime(ini_start,format='%m/%d/%Y')
    ini_end_date = pd.to_datetime(ini_end,format='%m/%d/%Y')
    
    # Get the total number of days for the breakdown plan
    ini_total_days = ini_end_date - (ini_start_date - timedelta(days=1))
    
    
    
    ##### Getting information such as author, title, and description of each link on the conference landing page. #####



    # get response
    response = requests.get(conf_link)

    # Define the path to the chromedriver executable
    chrome_driver_dir = r'D:\\Faith and Religion Stuff\\Come, Follow Me\\chromedriver-win64'
    chrome_driver_path = os.path.join(chrome_driver_dir, 'chromedriver.exe')

    # Set up the headless browser options
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--disable-gpu")
    chrome_options.add_argument("--window-size=1920x1080")

    # Set up the Chrome service
    service = Service(chrome_driver_path)    
        
    # Initialize the Chrome WebDriver
    driver = webdriver.Chrome(service=service, options=chrome_options)

    # Establish a try loop that tries to navigate to the provided link and find and store bits of info that we need 
    try:
        # Navigate to the page with your elements - in this case the April 2024 General Conference
        driver.get(conf_link)

        # Find all elements with the specified class name
        # Gotta use dots, not spaces, here because CSS considers each of those spaces to be defining a dif class object
        elements = driver.find_elements(By.CSS_SELECTOR, 'a.sc-omeqik-0.ewktus.list-tile.listTile-WHLxI')  

        # Initialize a list to store authors, titles, descriptions, and links
        primary_meta_list = []
        title_list = []
        description_list = []
        href_list = []

        # Iterate over each element
        # This for loop will, for all the videos/links to talks on the 2024 General Conference page, run through each of the following operations before moving onto the next
        for element in elements:
            # Try to get the author
            try:
                # Finds and stores the primary meta element (which is the author of the talk or report)
                primary_meta_element = element.find_element(By.CSS_SELECTOR,'p.primaryMeta')
                # Saves the stored author information as text
                primary_meta = primary_meta_element.text
            # If there is no author, save the author as None or Null
            except:
                primary_meta = None
            # Adds the author (or the None) to the list of authors in the appropriate row
            primary_meta_list.append(primary_meta)

            # Try to get the title - every link/video should have a title
            try:
                # Finds and stores the title element (the title of the video, talk, or report)
                title_element = element.find_element(By.CSS_SELECTOR,'p.title')
                # Saves the stored title as text
                title = title_element.text
            # If there is no title, save the title as None - THIS SHOULD NEVER BE THE CASE
            except:
                title = None
            # add the title (or the None) to the list of titles in the appropriate row
            title_list.append(title)

            # Try to get the description - the summary blurb about the video, talk, or report
            try:
                # Finds and stores the description element (the title of the video, talk, or report)
                description_element = element.find_element(By.CSS_SELECTOR,'p.description')
                # Saves the stored description as text
                description = description_element.text
            # If there is no description, save it as None - THIS SHOULD NEVER BE THE CASE
            except:
                description = None
            # add the title (or the None) to the list of descriptions in the appropriate row
            description_list.append(description)

            # Finds and stores the link (or href) to the video, talk, or report
            # This is ultimately going to be the information we use later to get the lengths (in paragraphs) of the talks and the lengths (in lines) of each of those paragraphs
            href = element.get_attribute('href')
            # Adds the stored href to the list of hrefs in the appropriate row
            href_list.append(href)

        # Creates a dataframe to store all the found and stored lists together
        ini_conf_df = pd.DataFrame({
            'Author': primary_meta_list,
            'Title': title_list,
            'Description': description_list,
            'Link': href_list
        })

    # If anything doesn't work for some reason, tell why
    except Exception as e:
        print(f"An error occurred: {e}")

    # After running everything, close the driver we opened to collect the data
    finally:
        # Close the browser
        driver.quit()



    ##### Removing things we don't need. #####
    


    # initialize empty list of rows that need to be dropped
    rows_to_drop = []

    # Adds the indexes (or row numbers) of rows to the list of rows to be dropped if there is either no Author or Description
        ## This exclusionary list is easy to edit
    for index,row in ini_conf_df.iterrows():
        if row['Author'] == None:
            rows_to_drop.append(index)
        elif row['Description'] == None:
            rows_to_drop.append(index)
        elif 'Sustaining' in row['Title']:
            rows_to_drop.append(index)
        elif 'Audit' in row['Title']:
            rows_to_drop.append(index)
    
    # Drops the rows in the list of rows to drop from the dataframe and resets the index
        ## This eliminates from the dataframe the session videos and the sustaining of the officers of the Church
    conf_df_1 = ini_conf_df.drop(rows_to_drop).reset_index(drop=True)
    
    
    
    ##### Getting additional information about each talk. ##### 



    # Define function for getting the total number of lines all talks 
        ## This function uses the urls stored in the dataframe
    def get_total_lines(url):
        """
        This function was designed specifically to run using an already active webdriver to gather paragraph and line length information about a general conference talk.
        First, it uses a webdriver to navigate to a url and then finds the paragraphs within a body block, and gets the size of the rectangles within which each of those paragraphs are assigned to appear. 
        Then, calculates the height of each line, saves that number as an integer, and calculates how many of those lines would fit into the assigned rectangle. 
        Then, it adds the paragraph number and the number of lines in that paragraph to the previously created dataframe.
        Finally, it calculates and returns the total number of lines in the talk by getting the sum of all paragraph lengths in lines. 
        """
        # initialize empty dataframe "data_list", with columns "paragraph" and "lines" being initially populated with NA values
        data_list = pd.DataFrame()
        data_list['paragraph'] = pd.NA
        data_list['lines'] = pd.NA

        # Find all elements containing the text
        paragraphs = driver.find_elements(By.CSS_SELECTOR, '.body-block p')

        # Iterate over each paragraph element
        for index, paragraph in enumerate(paragraphs, start=1):
            # Log paragraph number, since the paragraphs are not numbered. 
            paragraph_number = index

            # Get the bounding rectangle of the element
            rect = paragraph.rect

            # Calculate line height
            line_height_str = driver.execute_script("return window.getComputedStyle(arguments[0]).getPropertyValue('line-height');", paragraph)
            line_height_numeric = int(re.search(r'\d+', line_height_str).group())  # Extract numeric value from string

            # Calculate number of lines
            num_lines = rect['height'] // line_height_numeric

            # Append data dictionary to list
            data_list.at[index,'paragraph'] = paragraph_number
            data_list.at[index, 'lines'] = num_lines

            total_lines = sum(data_list['lines'])
        
        return total_lines
    
    # Initialize the Chrome WebDriver
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))

    # Copy conf_df_1 to create a reversion point if necessary (it won't be)
    conf_df_2 = conf_df_1.copy()
    
    # Initialize columns in the dataframe with NA values to later be filled
    conf_df_2['time'] = pd.NA
    conf_df_2['paragraphs'] = pd.NA
    conf_df_2['lines'] = pd.NA
    conf_df_2['role'] = pd.NA

    # iterate the following over each row in the apr_2024_df dataframe
    for index, row in conf_df_2.iterrows():
        # for each row, when the function calls for title, url, and author it is looking for the Title, Link, and Author columns in that row, respectively
        title = row['Title']
        url = row['Link']
        author = row['Author']

        # run the driver, navigating to the linked page in the row currently being worked on
        driver.get(url)

        # Wait for the page to load completely
        driver.implicitly_wait(10)

        # Simulate clicking the play button using the class attribute
        try:
            play_button = driver.find_element(By.CSS_SELECTOR, "button.sc-1g7hsbc-0.bCKkuP.sc-bvqtyr-3.eKGiZd")
            play_button.click()
            print(f"Clicked the play button for {title} to start the media.")
        except:
            print(f"Play button for {title} not found.")

        # Wait for the video element to be present in the DOM
        try:
            video_element = WebDriverWait(driver, 20).until(
                EC.presence_of_element_located((By.TAG_NAME, 'video'))
            )
            print("Video element found in the DOM.")
        except:
            print("No video element found.")

        # Wait for a short period to allow the video to start loading
        time.sleep(2)

        # Attempt to retrieve the video duration using JavaScript
        try:
            video_duration = driver.execute_script("""
                let video = document.querySelector('video');
                if (video) {
                    console.log('Video element is present, checking duration...');
                    return video.duration;
                } else {
                    let audio = document.querySelector('audio');
                    if (audio) {
                        console.log('Audio element is present, checking duration...');
                        return audio.duration;
                    }
                }
                return null;  // No media element found
            """)
            
            # if video_duration exists
            if video_duration:
                # print a message saying how long the talk is in seconds
                print(f"{title} duration: {video_duration:.2f} seconds")
                # save the duration into the dataframe in the same row
                conf_df_2.at[index, 'time'] = video_duration
            # otherwise, print a message saying no video or audio element was found for the talk
            else:
                print(f"No video or audio element found for {title}.")
        
        # If there is an error, say there was an error and what it was, and try to get the next piece of information       
        except Exception as e:
            print(f"Error retrieving video duration for {title}: {e}")

        # attempt to find the body block
        try:
            paragraphs = driver.find_elements(By.CSS_SELECTOR, '.body-block p')
            # if body block is found, find the number of paragraphs, and save that number to the dataframe in the same row
            conf_df_2.at[index, 'paragraphs'] = len(paragraphs)
            # print a message giving the length of the talk in paragraphs
            print(f"Paragraph length of {title}: {len(paragraphs)} paragraphs.")

            # Use the get_total_lines function to get the total number of lines in the talk
            num_lines = get_total_lines(url)
            
            # save the number of lines to the dataframe in the same row
            conf_df_2.at[index, 'lines'] = num_lines
            
            # print a message telling the number of lines in the talk
            print(f"Line length of {title}: {num_lines} lines.")

        # if there is an error or a problem, print a message saying what the problem was, and try to get the next piece of information 
        except Exception as e:
            print(f"Error calculating lines and paragraphs for {title}: {e}")
        
        # try to find the author role
        try:
            role = driver.find_element(By.CLASS_NAME, 'author-role')
            # if there is one, save it to the dataframe in the same row
            conf_df_2.at[index, 'role'] = role.text
            # print a message displaying the role of the author
            print(f"Role of {author}: {role.text}")
        # if there is an error or a problem, print a message saying what the problem was and then move onto the next row
        except Exception as e:
            print(f"Error retrieving role for {author}: {e}")

    # Close the browser after all rows have been iterated through
    driver.quit()


    # converts all numeric columns to integers for easier use later
    conf_df_2['time'] = conf_df_2['time'].astype(int).round(0)
    conf_df_2['paragraphs'] = conf_df_2['paragraphs'].astype(int)
    conf_df_2['lines'] = conf_df_2['lines'].astype(int)



    ##### Establishing a primary key column and getting read and day weights
    


    # copy conf_df_2 to establish a reversion point
    conf_df_3 = conf_df_2.copy()

    # copies the role column onto a newly created short_role column
    conf_df_3['short_role'] = conf_df_3['role']

    # initializes a replacement dictionary to shorten information in newly created 'short_role' column
    rep_dict = {}

    # Adds specific shortenings of each role to the replacement dictionary
        ## this list is also easily editable if any other office becomes prominently represented in future conferences
        ## this list also puts members of the Presidency of the Seventy and of any other member of any other Quorum of the Seventy on equal ground       
    for index, row in conf_df_3.iterrows():
        if 'President of The Church'in row['role']:
            rep_dict[row['short_role']] = 'President of the Church'
        elif 'First Presidency' in row['role']:
            rep_dict[row['short_role']] = 'First Presidency'
        elif 'Quorum of the Twelve' in row['role']:
            rep_dict[row['short_role']] = 'Quorum of the Twelve'
        elif 'the Seventy' in row['role']:
            rep_dict[row['short_role']] = 'Seventy'
        elif 'Relief Society' in row['role']:
            rep_dict[row['short_role']] = 'Relief Society Presidency'
        elif 'Presiding' in row['role']:
            rep_dict[row['short_role']] = 'Presiding Bishopric'
        elif 'Sunday School' in row['role']:
            rep_dict[row['short_role']] = 'Sunday School Presidency'
        elif 'Young Men' in row['role']:
            rep_dict[row['short_role']] = 'Young Men Presidency'
        elif 'Young Women' in row['role']:
            rep_dict[row['short_role']] = 'Young Women Presidency'
        elif 'Primary' in row['role']:
            rep_dict[row['short_role']] = 'Primary Presidency'
        else:
            rep_dict[row['short_role']] = 'other speakers'
    
    # uses replacement dictionary to replace (shorten) all the entries in the short_role column
    for words, replacement in rep_dict.items():
        conf_df_3['short_role'] = conf_df_3['short_role'].replace(words, replacement).str.strip()
    
    # defines function to get the initials of the speaker for use in creation of primary key column
    def get_initials(full_name):
        parts = full_name.split()
        initials = [part[0].lower() for part in parts]
        return ''.join(initials)

    # Create a new column with initials
    conf_df_3['initials'] = conf_df_3['Author'].apply(get_initials)

    # create a primary key column that combines the initials of the speaker and the month and year of the conference
    conf_df_3['pk'] = (conf_df_3['initials'] + "_" + month_year)

    # initialize an empty list of read weights
    read_weights = []

    # for every unique role code in the role_code column of the apr_2024_info dataframe...
    for short_role in conf_df_3.short_role.unique():
        # ... ask the user what the read weight should be and...
        read_weight = int(input(f"How many times would you like to read talks given by the {short_role}?"))
        # ... save both the role code and the read weight to the read_weights list
        read_weights.append({'short_role':short_role, 'read_weight':read_weight})

    # convert the read_weights list to a dataframe, save with the same name to replace the old item
    read_weights = pd.DataFrame(read_weights)

    # left-merge the read_weights dataframe to the apr_2024_info dataframe useing the role_code columns as a guide for merging
    # left-merge keeps everything in the dataframe being merged to, and only merges data from the second dataframe that has a corresponding value in the original dataframe
    conf_df_3 = conf_df_3.merge(read_weights, on='short_role',how='left')

    # initialize an empty list of day weights
    day_weights = []

    # for every unique role code in the role_code column of the apr_2024_info dataframe...
    for short_role in conf_df_3.short_role.unique():
        # ... ask the user what the day weight should be and...
        day_weight = int(input(f"How many more or fewer days would you like to spend on talks given by the {short_role}?\n"
                               f"\nIf you want to spend more days reading talks from the {short_role}, enter a number above 0.\n"
                               f"\nOr if you want to spend fewer days reading talks from the {short_role}, enter a number below 0 by using a minus sign or dash.\n"
                               f"\nIf you would rather spend a relatively the same amount of time on each talk from this organization as others, enter 0."))
        # ... save both the role code and the day weight to the read_weights list
        day_weights.append({'short_role':short_role, 'day_weight':day_weight})

    # convert the read_weights list to a dataframe, save with the same name to replace the old item
    day_weights = pd.DataFrame(day_weights)

    # left-merge the read_weights dataframe to the apr_2024_info dataframe useing the role_code columns as a guide for merging
    # left-merge keeps everything in the dataframe being merged to, and only merges data from the second dataframe that has a corresponding value in the original dataframe
    conf_df_3 = conf_df_3.merge(day_weights, on='short_role',how='left')
    


    ##### Getting information about each talk



    # save a copy of conf_df_3 as a reversion point
    conf_df_4 = conf_df_3.copy()

    # define a function that takes a link and gets the text and counts the lines of text of each talk given in the linked conference
    def get_talks(talk_link):
        """
        This function finds the description and text of a talk found at the talk link, numbers each paragraph of that talk and gets the text and length in lines of each paragraph. It returns as a dataframe all this information about each talk. 
        """

        # Define the path to the chromedriver executable
        chrome_driver_dir = r'D:\\Faith and Religion Stuff\\Come, Follow Me\\chromedriver-win64'
        chrome_driver_path = os.path.join(chrome_driver_dir, 'chromedriver.exe')

        # Set up the headless browser options
        chrome_options = Options()
        chrome_options.add_argument("--headless")
        chrome_options.add_argument("--disable-gpu")
        chrome_options.add_argument("--window-size=1920x1080")

        # Set up the Chrome service
        service = Service(chrome_driver_path)    
        
        # Initialize the Chrome WebDriver
        driver = webdriver.Chrome(service=service, options=chrome_options)

        # Run the driver
        driver.get(talk_link)

        # Initialize a list to store data dictionaries
        data_list = []

        # Try to get the description - the summary blurb about the video, talk, or report
        try:
            # Finds and stores the description (kicker) element (the title of the video, talk, or report)
            description_element = driver.find_element(By.CSS_SELECTOR,'p.kicker')
            # Saves the stored description as text
            description = description_element.text

            # set paragraph number for description as 0
            paragraph_number = 0
            
            # Get the bounding rectangle of the element
            rect = description_element.rect

            # Calculate line height
            line_height_str = driver.execute_script("return window.getComputedStyle(arguments[0]).getPropertyValue('line-height');", description_element)
            line_height_numeric = int(re.search(r'\d+', line_height_str).group())  # Extract numeric value from string

            # Calculate number of lines
            num_lines = rect['height'] // line_height_numeric

            # add text of and information about description to the data_list dictionary
            data_list.append({
                'paragraph_number': paragraph_number,
                'num_lines': num_lines,
                'text': description
            })

        # If there is no description, save it as None - THIS SHOULD NEVER BE THE CASE
        except:
            description = None

        # Find all elements containing the text
        paragraphs = driver.find_elements(By.CSS_SELECTOR, '.body-block p')

        # Iterate over each paragraph element
        for index, paragraph in enumerate(paragraphs, start=1):
            # Get the text of the element
            text = paragraph.text

            # Log paragraph number, since the paragraphs are not numbered. 
            paragraph_number = index

            # Get the bounding rectangle of the element
            rect = paragraph.rect

            # Calculate line height
            line_height_str = driver.execute_script("return window.getComputedStyle(arguments[0]).getPropertyValue('line-height');", paragraph)
            line_height_numeric = int(re.search(r'\d+', line_height_str).group())  # Extract numeric value from string

            # Calculate number of lines
            num_lines = rect['height'] // line_height_numeric

            # Append data dictionary to list
            data_list.append({
                'paragraph_number': paragraph_number,
                'num_lines': num_lines,
                'text': text
            })

        # Convert list of dictionaries to DataFrame
        df = pd.DataFrame(data_list)

        # Close the browser
        driver.quit()
        return df
    
    # Define directory path for saving CSV files
    dir_path = input(f'Please paste here the location of the folder in which you would like to store information from this general conference.\n'
                     f'\nExample:    D:\\Faith and Religion Stuff\\Come, Follow Me Breakdowns\\April 2024 GC Talks\n'
                     f'\nThis will require you to have already created a folder in which you want the information for the talks to be saved.')

    # establish a loop that iterates through every row of the conference dataframe
    for index, row in conf_df_4.iterrows():
        # save information from the 'Title' row as title
        title = row['Title']
        # save information from the 'Link' row as link
        link = row['Link']
        # save information from the 'pk' Primary Key row as foreign_key
        foreign_key = row['pk']

        # using the saved link, create a dataframe that contains the paragraph line counts information for the talk in the current row 
        talk_lines_df = get_talks(link)

        # if the created dataframe is not populated with None data and is not empty:
        if talk_lines_df is not None and not talk_lines_df.empty:
            # creates a new column in the dataframe that uses the primary key of the talk as the foreign key
            talk_lines_df['foreign_key'] = foreign_key
            # Uses a lambda function to create a primary key for each paragraph consisting of the foreign key + the paragraph number
            talk_lines_df['pk'] = talk_lines_df.apply(lambda x:f"{foreign_key}_{x.get('paragraph_number')}", axis=1)
        
            # very rudimentarily define which columns to keep, and add column names to that list in the desired order
            columns_to_keep = ['foreign_key','pk']
            columns_to_keep.append('paragraph_number')
            columns_to_keep.append('text')
            columns_to_keep.append('num_lines')

            # save dataframe with columns in the order specified in the columns_to_keep list
            talk_lines_df = talk_lines_df[columns_to_keep]
            
            # save file name
            csv_filename = f'{foreign_key}_lines.csv'
            # combine the file name and the user inputted folder location to create a complete save path
            full_path = os.path.join(dir_path,csv_filename)

            # Debugging line to state where files can be found
            print(f'Saving to: {full_path}')

            # export dataframe as a csv file to the location specified
            talk_lines_df.to_csv(full_path, index = False)
            # print message saying that csv file has been created
            print(f'CSV file for "{title}" saved successfully as {csv_filename}.')
        
        # otherwise, if the dataframe is filled with None values or is empty
        else:
            # print a message saying no data was found for the talk
            print(f'No data found for "{title}", skipping CSV creation.')
    
    ### stupidly import data I just exported because I don't have the bandwidth to come up with another solution and want to go to bed ###

    # define the beginning of the file location
        ## this is done by accessing the dir_path given by the user, and adding \\ to the end of it.
    path_start = f'{dir_path}\\'

    # initialize filenames list
    csv_files = []

    # get the names of all the csv files in the directory
    for file in os.listdir(path_start):
        if file.endswith(".csv"):
            csv_files.append(file)

    # initialize a dictionary to store the dataframes
    all_talks_dict = {}

    # import the csv files into pandas dataframes, store each dataframe in the dictionary
    for file in csv_files:
        talk = file[:-4]
        all_talks_dict[talk] = pd.read_csv(os.path.join(path_start, file))
        print(f'file string: {file}\n'
            f'talk string: {talk}')

    # iterate over every dataframe stored in the all_talks_df dictionary
    for talk, df in all_talks_dict.items():
        # create a new column in each dataframe that is the cumulative sum of the number of lines
        df['running_lines'] = df['num_lines'].cumsum()


    
    ##### Assigning a number of days for each readthrough of each talk #####


    
    # Since conf_df_4 was not altered in anyway during the previous major step, we don't need to save a copy

    # Get some information about the conference as a whole
    total_time = sum(conf_df_4['time'])
    total_lines = sum(conf_df_4['lines'])
    
    # create a new conference consumption column that gives a proportional weight to each talk based on it's length and user input
    conf_df_4['conf_cons'] = (
        ((1/32) +                                                                  # Each talk is 1 of 32 given, this treats each equally
        conf_df_4['time']/total_time +                                             # time weight - longer "heavier"
        conf_df_4['lines']/total_lines +                                           # lines weight - longer "heavier"
        ((conf_df_4['day_weight'] + 1)/(conf_df_4['day_weight'] + 1).sum()))       # preference weight - user input factors in here
        / 4                                                                        # Adding each of those and then dividing by 4 gets the average
    )
    
    # create a column with the total number of days to be spent on each talk
        ## multiply the number of days specified in the plan by the conference consumption ratio
    conf_df_4['tot_num_days'] = ini_total_days.days * conf_df_4['conf_cons']
    conf_df_4['tot_num_days'] = conf_df_4['tot_num_days'].round()

    # convert the newly created column into integers rather than floats
    conf_df_4['tot_num_days'] = conf_df_4['tot_num_days'].astype(int)

    # find and save the highest number in the read_weights column
    max_reads = conf_df_4.read_weight.max()

    # start a loop that, for every number between 1 and whatever the max_reads number is, inclusive...
    for i in range(1,max_reads+1):
        # create a new column of NA values titled "Readthrough # _(whatever number the loop is on)_"
        conf_df_4[f"Readthrough #{i}"] = pd.NA
        
    # convert all NA values to "0"
    conf_df_4.fillna(0, inplace=True)

    def distribute_days(conf_df):
        """
        This function takes a dataframe like the one I have crafted above and distributes the total number of days into the "Readthrough #_" columns.
        """
        # establish that the function needs to repeat for every row of the dataframe
        for index, row in conf_df.iterrows():
            # get total number of days for that talk
            total_days = row['tot_num_days']
            # initialize number of distributed days as 0
            dist_days = 0
            # establish that the function needs to proceed with the following operation until dist_days and total_days are equal
            while dist_days < total_days:
                # for every whole number between 1 and whatever the read_weight (or number of readthroughs) is...
                for i in range(1,row['read_weight']+1):
                    # if dist_days is still less than total_days...
                    if dist_days < total_days:
                        # add 1 to whatever value is in the "Readthrough #(number between 1 and number of readthroughs)" column and...
                        conf_df.at[index, f'Readthrough #{i}'] += 1
                        # add 1 to dist_days
                        dist_days += 1
                        # go back to add 1 to the next column until dist_days is no longer less than total_days
                    # if/when dist_days is equal to total_days
                    else:
                        # break the process of adding one to each column, and move on to the next row to start the process over
                        break
        # when everything is done, the output of this function is the same dataframe with all of the updated columns
        return conf_df

    # run the function on my dataframe
    conf_df_5 = distribute_days(conf_df_4)



    ##### Distributing lines from each talk across each day of each readthrough #####

    # Use a series of loops to create a line start, number of lines, line end, paragraph start, and paragraph end column for every day of every readthrough of every talk
    for i in range(1, conf_df_5['read_weight'].max()+1):
        for x in range(1, conf_df_5[f'Readthrough #{i}'].max()+1):
            for index, row in conf_df_5.iterrows():
                if row[f'Readthrough #{i}'] != 0:
                    conf_df_5.at[index, f'r{i}d{x}_l_start'] = int(0)
                    conf_df_5.at[index, f'r{i}d{x}_lines'] = int(0)
                    conf_df_5.at[index, f'r{i}d{x}_l_end'] = int(0)
                    conf_df_5.at[index, f'r{i}d{x}_p_start'] = int(0)
                    conf_df_5.at[index, f'r{i}d{x}_p_end'] = int(0)
                else:
                    conf_df_5.at[index, f'r{i}d{x}_l_start'] = pd.NA
                    conf_df_5.at[index, f'r{i}d{x}_lines'] = pd.NA
                    conf_df_5.at[index, f'r{i}d{x}_l_end'] = pd.NA
                    conf_df_5.at[index, f'r{i}d{x}_p_start'] = pd.NA
                    conf_df_5.at[index, f'r{i}d{x}_p_end'] = pd.NA
    
    # establishes a regular expression pattern
    pattern = r'\D\d\D\d+'
    # identifies columns that are floats and not integers
    float_cols = conf_df_5.select_dtypes(include=['float'])
    # targets specific float columns using regular expression pattern
    change_cols = [col for col in float_cols if re.search(pattern, col)]
    # converts targeted columns to integers
    conf_df_5[change_cols] = conf_df_5[change_cols].astype(int)

    # I struggled for about 5 hours because I forgot this step. Including it was as simple as a copy paste.
    # This also came in answer to my prayer for help. I prayed for God to help me know where to look, and this was the next thing I looked at. 
    # God is good. 
    
    def distribute_lines(conf_df):
        """
        This function takes a dataframe like the one I have crafted above and distributes the total number of days into the "Readthrough #_" columns.
        """
        # initialize readthrough count as zero
        readthrough = 0
        # establish maximum number of readthroughs so the loop I create below knows when to end or stop
        max_readthroughs = conf_df['read_weight'].max()
        # start a loop of action that will continue until readthroughs is bigger than max_readthroughs, at which point it will stop
        while readthrough <= max_readthroughs:
            # A - first action: add 1 to readthroughs, establishing which readthrough we are working with
            readthrough += 1
            # B - Check if the current readthrough exists in the DataFrame
            if f'Readthrough #{readthrough}' not in conf_df.columns:
                break  # If the column doesn't exist, exit the loop

            # C - second action: for every row in the dataframe do the following:
            for index, row in conf_df.iterrows():
                # D - second action, cont'd: save the info in the row 'lines' as the total number of lines to be distributed
                total_lines = row['lines']
                # E - second action, cont'd: initialize count of distributed lines as zero
                dist_lines = 0
                # debugging print line - remove later
                print(f"Row {index} - Readthrough {readthrough} has {row[f'Readthrough #{readthrough}']} days.")
                # F - check whether the number of days assigned to a talk in a particular readthrough is 0
                if row[f'Readthrough #{readthrough}'] == 0:
                    # debugging print line - remove later
                    print(f"Skipping row {index} - Readthrough {readthrough} because it has 0 days.")
                    # if that talk has zero days alloted for that readthrough, skip to the next talk
                    continue
                
                # G - second action, cont'd: open another while loop that will continue until the number of distributed lines is equal to the number of total lines
                while dist_lines < total_lines:
                    # H - first action of second loop: open another loop that operates i number of times, where i is the number of days in the readthrough
                    for i in range(1, row[f'Readthrough #{readthrough}']+1):
                        # debugging print line - remove later
                        print(f"Processing row {index}, readthrough {readthrough}, day {i}.")
                        # I - first action of second loop, cont'd: add 1 to the corresponding i day of the current readthrough
                        conf_df.at[index, f'r{readthrough}d{i}_lines'] += 1
                        # J - first action of second loop, cont'd: add 1 to the number of distributed lines
                        dist_lines += 1
                        # debugging print line - remove later
                        print(f"Distributed lines: {dist_lines}/{total_lines}")
                        # K - check if dist_lines is less than total_lines: 
                        if dist_lines >= total_lines:
                            break
                            # Unstated action: If it is less, return to point H and repeat this loop.
                            # If not, end *this* loop and return to point D for the next row.
                    
            # L - check if readthroughs is less than or equal to max_readthroughs
            if readthrough > max_readthroughs:
                # If it is, repeat this loop, starting with point A
                break
                # Unstated action: If not, end this loop and go to point M.
        
        # M - return the newly modified dataframe
        return conf_df
    
    conf_df_5 = distribute_lines(conf_df_5)
    
    ##### Assigning lines and paragraphs for each day of each readthrough. Final Major Step. #####


    # save a copy of conf_df_5 as a reversion point
    conf_df_6 = conf_df_5.copy()
    

    def get_paragraphs(conf_df,talks_dictionary):
        """
        This function assigns starting and ending lines and paragraphs for each day of each readthrough of each talk contained in the conference dataframe.
        """    
        # initialize readthrough count as 0
        rt = 0
        # establish maximum number of readthroughs so the loop I create below knows when to end or stop
        max_rts = conf_df['read_weight'].max()
        # create a list of the column names in the given dataframe
        cols = list(conf_df.columns)
        # start a loop of action that will continue until readthroughs is bigger than max_readthroughs, at which point it will stop
        while rt <= max_rts:
            # A - first action: add 1 to readthroughs, establishing which readthrough we are working with
            rt += 1
            # debugging line - remove later
            print(f'Starting readthrough {rt}.')
            # B - Check if the current readthrough exists in the DataFrame
            if f'Readthrough #{rt}' not in conf_df.columns:
                # debugging line - remove later
                print(f'Readthrough {rt} not found. Function complete.')
                # if a column for the current readthrough number doesn't exist, exit the loop
                break

            # C-1 - Second action: start another loop that does the follow for the highest number of days in whatever readthrough number the function is on
            for day in range(1, conf_df[f'Readthrough #{rt}'].max()+1):
                
                # debugging line - remove later
                print(f'Working through readthrough {rt} day {day}.')

                # C-2 - set patterns for pulling out info for the start, lines, end, and start of next day
                rt_start_l_pattern = fr'r{rt}d{day}_l_start'
                rt_lines_pattern = fr'r{rt}d{day}_lines'
                rt_end_l_pattern = fr'r{rt}d{day}_l_end'
                rt_next_start_l_pattern = fr'r{rt}d{day+1}_l_start'
                rt_start_p_pattern = fr'r{rt}d{day}_p_start'
                rt_end_p_pattern = fr'r{rt}d{day}_p_end'
                rt_next_start_p_pattern = fr'r{rt}d{day+1}_p_start'

                # debugging line - remove later
                print(f'Readthrough {rt} day {day} patterns saved.')

                # C-3 - initialize variables as empty lists
                start_l_col = []
                lines_col = []
                end_l_col = []
                next_start_l_col = []
                start_p_col = []
                end_p_col = []
                next_start_p_col = []

                # debugging line - remove later
                print(f'Variables initialized.')

                # C-4 - search for and save columns defined in patterns
                for col in cols:
                    if re.search(rt_start_l_pattern, col):
                        start_l_col.append(col)
                    elif re.search(rt_lines_pattern, col):
                        lines_col.append(col)
                    elif re.search(rt_end_l_pattern, col):
                        end_l_col.append(col)
                    elif re.search(rt_next_start_l_pattern, col):
                        next_start_l_col.append(col)
                    elif re.search(rt_start_p_pattern, col):
                        start_p_col.append(col)
                    elif re.search(rt_end_p_pattern, col):
                        end_p_col.append(col)
                    elif re.search(rt_next_start_p_pattern, col):
                        next_start_p_col.append(col)

                # debugging line - remove later
                print(f'Columns found and saved.'
                    f'Starting column name: {start_l_col}'
                    f'Number of lines column: {lines_col}'
                    f'End column name: {end_l_col}'
                    f'Next start column name: {next_start_l_col}'
                    f'Starting column name: {start_p_col}'
                    f'Number of lines column: {lines_col}'
                    f'End column name: {end_p_col}'
                    f'Next start column name: {next_start_p_col}')
                
                # C-5-a -Start another loop
                for index, row in conf_df.iterrows():
                    # C-5-b - check if the number of days assigned for the current readthrough of the current talk is 0
                    if row[f'Readthrough #{rt}'] == 0:
                        # debugging print line - remove later
                        print(f"Skipping row {index} - Readthrough {rt} because it has 0 days.")
                        # if that talk has zero days alloted for that readthrough, skip to the next talk
                        continue
                    
                    # unstated action - if the number of days assigned for the current readthrough is greater than zero, proceed to C-5-c

                    # C-5-c - establish the connection between conf_df and talks_dict using the primary key column of the conf_df
                    talk = talks_dictionary[f"{conf_df.loc[index,'pk']}_lines"]

                    # debugging line - remove later
                    print(f'Connecting conf_df to talks_dict using primary key {conf_df.loc[index,"pk"]}.')

                    # C-5-d - check if the day number is 1
                    if day == 1:
                        # C-5-d-1 - if so, initialize the start column as 1 for every row
                        conf_df[start_l_col] = 1
                        conf_df[start_p_col] = 1
                        # debugging line - remove later
                        print(f'Readthrough {rt} day {day} set at 1.')
                        # C-5-d-1 - then save the end point as the start point (1) plus the number of lines to be read 
                        # the minus 1 at the end ensures that we end at the assigned reading line, not the line after
                        for start, read, end in zip(start_l_col, lines_col, end_l_col):
                            conf_df.loc[index, end] = conf_df.loc[index, start] + conf_df.loc[index, read] - 1
                        # C-5-d-2 - then save today's ending point as the starting point for the next day, to be accessed later
                        for end, next_start in zip(end_l_col, next_start_l_col):
                            conf_df.loc[index, next_start] = conf_df.loc[index, end]
                        # C-5-d-3 - save the end line just calculated as object 'end_line'
                        end_line = conf_df.loc[index, f'r{rt}d{day}_l_end']

                        # debugging line - remove later
                        print(f'Readthrough {rt} day {day} end line set. End line: {end_line}')

                        # C-5-d-4 - find in the talk the paragraph with a running total that is greater than or equal to the end line 
                        end_paragraph_1 = talk.loc[talk['running_lines'] >= end_line].index[0]
                        # C-5-d-5 - find in the talk the paragraph before the one above
                        end_paragraph_2 = talk.loc[talk['running_lines'] >= end_line].index[-1]
                        # C-5-d-6 - determine which paragraph would yield a number of lines being read closer to the target end line and save as end_paragraph
                        if abs(talk.loc[end_paragraph_1,'running_lines'] - end_line) < abs(talk.loc[end_paragraph_2,'running_lines']- end_line):
                            end_paragraph = end_paragraph_1
                        else:
                            end_paragraph = end_paragraph_2
                        # C-5-d-7 - save the selected end_paragraph as the paragraph ending point of the current day of the current readthrough
                        for end_p in end_p_col:
                            conf_df.loc[index, end_p] = end_paragraph
                        
                        for end_p, next_start_p in zip(end_p_col, next_start_p_col):
                            conf_df.loc[index, next_start_p] = conf_df.loc[index, end_p] + 1

                        # debugging line - remove later
                        print(f'Readthrough {rt} day {day} end paragraph set: End paragraph: {end_paragraph}')
                        

                        
                    
                    # C-5-e - if the day number IS NOT 1...
                    else:
                        # C-5-e-1 - access whatever the start column has already been saved as, add the number of lines, and save that as the end point
                        for start, read, end in zip(start_l_col, lines_col, end_l_col):
                            conf_df.loc[index, end] = conf_df.loc[index, start] + conf_df.loc[index, read]
                        # C-5-e-2 - then save today's ending point as the starting point for the next day, to be accessed later
                        for end, next_start in zip(end_l_col, next_start_l_col):
                            conf_df.loc[index, next_start] = conf_df.loc[index, end]
                        # C-5-e-3 - save the end line just calculated as object 'end_line'
                        end_line = conf_df.loc[index, f'r{rt}d{day}_l_end']

                        # debugging line - remove later
                        print(f'Readthrough {rt} day {day} end line set. End line: {end_line}')

                        # C-5-e-4 - find in the talk the paragraph with a running total that is greater than or equal to the end line 
                        end_paragraph_1 = talk.loc[talk['running_lines'] >= end_line].index[0]
                        # C-5-e-5 - find in the talk the paragraph before the one above
                        end_paragraph_2 = talk.loc[talk['running_lines'] >= end_line].index[-1]
                        # C-5-e-6 - determine which paragraph would yield a number of lines being read closer to the target end line and save as end_paragraph
                        if abs(talk.loc[end_paragraph_1,'running_lines'] - end_line) < abs(talk.loc[end_paragraph_2,'running_lines']- end_line):
                            end_paragraph = end_paragraph_1
                        else:
                            end_paragraph = end_paragraph_2
                        # C-5-e-7 - save the selected end_paragraph as the paragraph ending point of the current day of the current readthrough
                        for end_p in end_p_col:
                            conf_df.loc[index, end_p] = end_paragraph

                        for end_p, next_start_p in zip(end_p_col, next_start_p_col):
                            conf_df.loc[index, next_start_p] = conf_df.loc[index, end_p] + 1

                        # debugging line - remove later
                        print(f'Readthrough {rt} day {day} end paragraph set: End paragraph: {end_paragraph}')

                # debugging print line - remove later
                print(f'Readthrough {rt} columns day {day} saved. Looping back.')
            
            # debugging print line - remove later
            print(f'Readthrough {rt} loop finished.')
        
        return conf_df
    
    # runs the get paragraphs function, saves as conf_df_7
    conf_df_7 = get_paragraphs(conf_df_6, all_talks_dict)

    # establishes patterns for getting a list of start and end columns
    start_cols_pat = r'r\d+d\d+_p_start'
    end_cols_pat = r'r\d+d\d+_p_end'

    # create a list of columns names in conf_df_7 
    cols = list(conf_df_7.columns)

    # initialize empty lists for start and end columns
    final_start_cols = []
    final_end_cols = []

    # look at all of the column names in the column names list
    for col in cols:
        # if the column name matches the pattern for start columns, add it to the list of start columns
        if re.search(start_cols_pat, col):
            final_start_cols.append(col)
        # if the column name matches the pattern for end columns, add it to the list of end columns
        elif re.search(end_cols_pat, col):
            final_end_cols.append(col)

    # initialize an empty list for zipping the other two together
        ## zipping two lists together basically entails combining them in the order of list 1 item 1, list 2 item 1, list 1 item 2, list 2 item 2, list 1 item 3, list 2 item 3, etc. 
    zipped_cols = []

    # create a loop that 'zips' the start columns list and end columns list together
    for start_col, end_col in zip(final_start_cols, final_end_cols):
        zipped_cols.append(start_col)
        zipped_cols.append(end_col)
    
    # create a list of necessary information columns
    info_cols = ['Author', 'role', 'Title']

    # combine the info_cols list and the zipped_cols list
    final_cols = info_cols + zipped_cols

    # keep only the columns in the final_cols list, save as final_breakdown
    final_breakdown = conf_df_7[final_cols]

    # Get user input about saving breakdown to computer
    response_1 = input('Do you want to save the breakdown to your computer? ')
    
    # if the user wants to export the breakdown
    if response_1.lower() == 'yes':
        # ask the user where they want to store it
        input_path = input(f'Please paste the location of the folder you would like to save the breakdown in: \n'
                           f'\nFor example: D:\Faith and Religion Stuff\Come, Follow Me')
        # add \\ to that path to make it compatible
        path_start = f'{input_path}\\'
        # ask the user if they want to use a custom name
        response_2 = input('Would you like to save the file with a custom name?')
        # if they do, ask the user for the custom name
        if response_2.lower() == 'yes':
            custom_name = input('Please enter the name you would like to save the file as: ')
            name = f'{custom_name}.csv'
        # otherwise generate a generic file name
        else:
            name = f'{month_year}_breakdown.csv'
        
        # combine the destination folder with the name of the file
        final_path = os.path.join(path_start, name)
        
        # export final_breakdown as a csv to the destination folder
        final_breakdown.to_csv(final_path, index=False)
        
    # Display the final breakdown for viewing in this notebook
    return final_breakdown


Just to document my progress here. 

Importing packages and getting user input for the link works and the start and end dates works. 

Navigating to the link and pulling initial information about each item listed on the conference landing page works. 

Getting rid of fluff like the audit report, session videos, and sustaining of the officers of the Church works. 

The collection of additional information about each talk works. Current run-time is about 2m45s.

Adding primary keys and read and day weights works. Current run-time is about 3m19s.

Storing information about the paragraph line lengths of each talk works like the original notebook does. Current runtime is about 9m. 

Distributing days across readthroughs works. Current runtime is still about 9m.

Distributing lines across each day of each readthrough works. Current run time is about 9m40s. 

Distributing paragraphs across start and end columns based on distributed lines works. Current runtime is 9m3s. 

Subsetting the dataframe to only include Author, role, Title, and the start and end columns works. Current runtime is 9m25s. 

In [34]:
breakdown_gc()

Clicked the play button for The Triumph of Hope to start the media.
Video element found in the DOM.
The Triumph of Hope duration: 811.78 seconds
Paragraph length of The Triumph of Hope: 44 paragraphs.
Line length of The Triumph of Hope: 177 lines.
Role of Neil L. Andersen: Of the Quorum of the Twelve Apostles
Clicked the play button for Live Up to Your Privileges to start the media.
Video element found in the DOM.
Live Up to Your Privileges duration: 705.54 seconds
Paragraph length of Live Up to Your Privileges: 34 paragraphs.
Line length of Live Up to Your Privileges: 170 lines.
Role of Emily Belle Freeman: Young Women General President
Clicked the play button for God’s Favourite to start the media.
Video element found in the DOM.
God’s Favourite duration: 631.40 seconds
Paragraph length of God’s Favourite: 20 paragraphs.
Line length of God’s Favourite: 157 lines.
Role of Karl D. Hirst: Of the Seventy
Clicked the play button for “This Is My Gospel”—“This Is My Church” to start the med

Unnamed: 0,Author,role,Title,r1d1_p_start,r1d1_p_end,r1d2_p_start,r1d2_p_end,r1d3_p_start,r1d3_p_end,r1d4_p_start,...,r2d2_p_start,r2d2_p_end,r2d3_p_start,r2d3_p_end,r3d1_p_start,r3d1_p_end,r3d2_p_start,r3d2_p_end,r3d3_p_start,r3d3_p_end
0,Neil L. Andersen,Of the Quorum of the Twelve Apostles,The Triumph of Hope,1,9,10,23,24,34,35,...,14,32,33,44,1,,,,,
1,Emily Belle Freeman,Young Women General President,Live Up to Your Privileges,1,13,14,33,34,33,34,...,14,33,34,33,1,,,,,
2,Karl D. Hirst,Of the Seventy,God’s Favourite,1,11,12,20,21,20,21,...,12,20,21,20,1,,,,,
3,Dale G. Renlund,Of the Quorum of the Twelve Apostles,“This Is My Gospel”—“This Is My Church”,1,6,7,11,12,16,17,...,8,14,15,22,1,,,,,
4,David P. Homer,Of the Seventy,Trusting Our Father,1,15,16,36,37,36,37,...,16,36,37,36,1,,,,,
5,Gregorio E. Casillas,Of the Seventy,God Loves All His Children,1,7,8,15,16,15,16,...,16,15,16,15,1,,,,,
6,Dallin H. Oaks,First Counselor in the First Presidency,Following Christ,1,9,10,14,15,23,24,...,12,19,20,33,1,,,,,
7,D. Todd Christofferson,Of the Quorum of the Twelve Apostles,Burying Our Weapons of Rebellion,1,8,9,14,15,26,27,...,11,22,23,30,1,,,,,
8,José A. Teixeira,Of the Presidency of the Seventy,Bonded to Jesus Christ: Becoming the Salt of t...,1,10,11,21,22,21,22,...,11,21,22,21,1,,,,,
9,Juan Pablo Villar,Of the Seventy,His Hand Ready to Help Us,1,9,10,18,19,18,19,...,19,18,19,18,1,,,,,
