In [106]:
import requests
from io import BytesIO
from zipfile import ZipFile
import re
import polars as pl

# Load data
url = "http://scriptures.nephi.org/downloads/lds-scriptures.csv.zip"
response = requests.get(url)
zipfile = ZipFile(BytesIO(response.content))
csv_file = zipfile.open('lds-scriptures.csv')
scriptures_data = pl.read_csv(csv_file)

# Load Parquet file
sav_names = pl.read_parquet("/Users/homeserver/PycharmProjects/replication_projects/savior_names/BoM_SaviorNames.parquet")

# Create pattern
jesuspat = "|".join(sav_names['name'].to_list())

# Filter for Book of Mormon
bofm = scriptures_data.filter(pl.col('volume_lds_url') == 'bm')

def get_text(book):
    return " ".join(bofm.filter(pl.col('book_lds_url') == book)['scripture_text'].to_list())

def break_sav(phrase, pattern):
    temp = re.split(pattern, phrase)
    word_count = [len(re.findall(r'\S+', segment)) for segment in temp]
    return pl.DataFrame({'text': temp, 'word_count': word_count})

def get_words(book):
    df = break_sav(get_text(book), jesuspat)
    df = df.with_columns(pl.lit(book).alias('book'))
    return df

bom_books = bofm['book_lds_url'].unique().to_list()
words_in_table = pl.concat([get_words(book) for book in bom_books])
words_in_table



Polars found a filename. Ensure you pass a path to the file instead of a python file object when possible for best performance.



text,word_count,book
str,i64,str
"""Now it came to pass that the n…",397,"""3-ne"""
""" in behalf of his people, yea,…",34,"""3-ne"""
""" all that day; and behold, the…",9,"""3-ne"""
""" came unto him, saying: Lift u…",97,"""3-ne"""
""" and of the """,3,"""3-ne"""
…,…,…
""" knoweth the things which we h…",75,"""morm"""
""", are according to the prayers…",19,"""morm"""
""" grant that their prayers may …",13,"""morm"""
""" remember the covenant which h…",24,"""morm"""


In [111]:
import plotly.express as px

# Assuming 'df' is your DataFrame and 'column_name' is the name of the column you want to group by
words_in_table.group_by('book').mean().sort('word_count')

# get rid of word_count outliers
words_in_table = words_in_table.filter(pl.col('word_count') <= 250)


fig = px.box(words_in_table,
             y='word_count',
             log_y=True,
             x='book',
             title="Distance Between Savior's Names by Book in the Book of Mormon",
             points=False,
             color='book',
             color_discrete_sequence=px.colors.qualitative.Pastel,
             labels={'word_count': 'Word Count', 'book': 'Book Name'})

fig.update_xaxes(categoryorder='array', 
                 categoryarray=words_in_table.group_by('book')
                                             .mean()
                                             .sort('word_count')['book'])
# make y axis normal
fig.update_yaxes(type='linear')

fig.update_layout(
    yaxis_title="Word Count",
    xaxis_title="Book name",
    template="plotly_white"
)

# Show the figure
fig.show()