# GeneralConferenceTalk

> Given the URL of a General Conference talk, extract both its plain text and its metadata.

In [None]:
#| default_exp GeneralConferenceTalk

In [None]:
#| hide
from nbdev.showdoc import *

In [None]:
#| export
from bs4 import BeautifulSoup
import requests
import re

class GeneralConferenceTalk:
    """
    Represents a talk from the Church of Jesus Christ of Latter-day Saints' General Conference.

    The talk page is downloaded and parsed once; both the text and the
    metadata are extracted from that single parsed document.

    Args:
        url (str): The URL of the talk's transcript.
        title (bool, optional): Whether to extract and return the talk's title. Default is True.
        author (bool, optional): Whether to extract and return the talk's author's name. Default is True.
        calling (bool, optional): Whether to extract and return the talk's author's calling. Default is True.
        timeout (float, optional): Seconds to wait for the HTTP request before giving up. Default is 30.

    Attributes:
        url (str): The URL of the talk's transcript.
        title (bool): Flag to include the title or not.
        author (bool): Flag to include the author's name or not.
        calling (bool): Flag to include the author's calling or not.
        timeout (float): Timeout passed to the HTTP request.
        text (str): The extracted text from the talk's transcript.
        metadata (dict): A dictionary containing metadata about the talk, including title, author, calling, year, month, and URL.

    Methods:
        _extract_text(self) -> str: Extracts the text content from the talk's transcript.
        _extract_metadata(self) -> dict: Extracts metadata about the talk from the talk's transcript.
    """
    def __init__(self, url, title=True, author=True, calling=True, timeout=30):
        self.url = url
        self.title = title
        self.author = author
        self.calling = calling
        self.timeout = timeout
        # Parsed page, filled in lazily by _get_soup() so the URL is fetched only once.
        self._soup = None
        self.text = self._extract_text()
        self.metadata = self._extract_metadata()

    def _get_soup(self):
        """
        Downloads and parses the talk page, reusing the parsed document on later calls.

        Returns:
            BeautifulSoup: The parsed HTML of ``self.url``.
        """
        if self._soup is None:
            # A timeout keeps a stalled connection from blocking forever.
            response = requests.get(self.url, timeout=self.timeout)
            self._soup = BeautifulSoup(response.content, 'html.parser')
        return self._soup

    @staticmethod
    def _find_byline_tag(soup, element_id, css_class):
        """
        Finds a byline paragraph by id, falling back to its CSS class.

        Args:
            soup: The parsed page to search.
            element_id (str): The ``id`` of the ``<p>`` element to look for first.
            css_class (str): The class to look for when no element has that id.

        Returns:
            The matching ``<p>`` tag, or None when neither lookup matches.
        """
        tag = soup.find('p', id=element_id)
        if tag is None:
            tag = soup.find('p', class_=css_class)
        return tag

    @staticmethod
    def _clean_author_name(byline):
        """
        Reduces a byline such as "By Elder Alexander Dushku" to the bare name.

        Non-breaking spaces are normalised first so that the prefixes are
        recognised whichever kind of space follows them. Only a leading "By"
        and a leading "Sister", "Elder" or "President" are removed; the same
        words inside the name itself are kept.

        Args:
            byline (str): The raw text of the author paragraph.

        Returns:
            str: The author's name without the leading "By" and title.
        """
        name = byline.replace('\xa0', ' ').strip()
        name = re.sub(r"^By\s+", "", name)
        name = re.sub(r"^(?:Sister|Elder|President)\s+", "", name)
        return name.strip()

    @staticmethod
    def _parse_year_month(url):
        """
        Reads the conference year and month from a ``/YYYY/MM`` segment of the URL.

        Args:
            url (str): The talk URL.

        Returns:
            tuple or None: ``(year, month)`` as integers, or None when the URL
            contains no such segment.
        """
        match = re.search(r"/(\d{4})/(\d{2})", url)
        if match is None:
            return None
        return int(match.group(1)), int(match.group(2))

    def _extract_text(self):
        """
        Extracts the text of the talk from the parsed page.

        The title, author and calling lines are included only when the
        corresponding flag is set and the element exists on the page. Every
        body paragraph is always included.

        Returns:
            str: The extracted text content of the talk, with parts separated by blank lines.
        """
        soup = self._get_soup()

        title_tag = soup.find('h1')
        author_tag = self._find_byline_tag(soup, 'author1', 'author-name')
        calling_tag = self._find_byline_tag(soup, 'author2', 'author-role')

        # Heading lines, each one controlled by its own flag
        text_paragraphs = []
        if self.title and title_tag is not None:
            text_paragraphs.append(title_tag.get_text().strip())
        if self.author and author_tag is not None:
            text_paragraphs.append(author_tag.get_text().strip())
        if self.calling and calling_tag is not None:
            text_paragraphs.append(calling_tag.get_text().strip())

        # Body paragraphs are the <p> elements whose id starts with "p".
        # A byline found through the class fallback may carry such an id too,
        # so skip those tags by identity: they are emitted above (or omitted)
        # according to their flags and must not appear a second time.
        for paragraph in soup.find_all('p', id=lambda x: x and x.startswith('p')):
            if paragraph is author_tag or paragraph is calling_tag:
                continue
            text_paragraphs.append(paragraph.get_text())

        return '\n\n'.join(text_paragraphs)

    def _extract_metadata(self):
        """
        Extracts various metadata from the talk page.

        The 'title', 'author' and 'calling' keys are present only when the
        matching flag is set; their value is None when the element is missing
        from the page. 'year' and 'month' are present only when the URL
        contains a ``/YYYY/MM`` segment. 'url' is always present.

        Returns:
            dict: A dictionary containing metadata such as title, author, calling, year, month, and url.
        """
        soup = self._get_soup()

        # Initialize a dictionary to store metadata
        metadata = {}

        # Extract title if requested
        if self.title:
            title_header = soup.find('h1')
            metadata['title'] = title_header.get_text().strip() if title_header is not None else None

        # Extract author if requested
        if self.author:
            author_name = self._find_byline_tag(soup, 'author1', 'author-name')
            metadata['author'] = (
                self._clean_author_name(author_name.get_text())
                if author_name is not None else None
            )

        # Extract calling if requested
        if self.calling:
            author_calling = self._find_byline_tag(soup, 'author2', 'author-role')
            metadata['calling'] = author_calling.get_text().strip() if author_calling is not None else None

        # Extract year and month from URL
        year_month = self._parse_year_month(self.url)
        if year_month is not None:
            metadata['year'], metadata['month'] = year_month

        # Include URL
        metadata['url'] = self.url

        return metadata

In [None]:
# Example: harvest the talk at this URL with every optional field enabled
url = "https://www.churchofjesuschrist.org/study/general-conference/2024/04/15dushku?lang=eng"
talk = GeneralConferenceTalk(url=url, title=True, author=True, calling=True)

# Show the metadata dictionary under its banner, followed by blank lines
print("**** Metadata ****  \n", talk.metadata, "\n", sep="\n")

# Show only the first 300 characters of the extracted text
print("**** Extracted Text **** \n")
print(talk.text[:300])

**** Metadata ****  

{'title': 'Pillars and Rays', 'author': 'Alexander Dushku', 'calling': 'Of the Seventy', 'year': 2024, 'month': 4, 'url': 'https://www.churchofjesuschrist.org/study/general-conference/2024/04/15dushku?lang=eng'}


**** Extracted Text **** 

Pillars and Rays

By Elder Alexander Dushku

Of the Seventy

My message is for those who worry about their testimony because they haven’t had overwhelming spiritual experiences. I pray that I can provide some peace and assurance.

The Restoration of the gospel of Jesus Christ began with an explosion


In [None]:
#| hide
# Run nbdev's export step for this project
import nbdev

nbdev.nbdev_export()