# Get All Article Links

In [16]:
import requests as r
from bs4 import BeautifulSoup
import pandas as pd
import re

In [17]:
# Collect one record (url, title, author, year, month) per article tile found
# on each conference index page.
article_results = []
# To crawl the full archive, swap the two loops below for:
#for year in range(1971,2019):
#    for month in ['04','10']:
for year in [2018]:
    for month in ['04']:
        url = f'https://www.lds.org/general-conference/{year}/{month}?lang=eng'
        # requests has no default timeout; without one a single stalled
        # connection would hang the whole crawl indefinitely.
        response = r.get(url, timeout=30)
        soup = BeautifulSoup(response.text,'lxml')
        for tile_link in soup.find_all('a', {'class':'lumen-tile__link'}):
            article_url = 'https://www.lds.org' + tile_link['href']
            # The link text is split on newlines/tabs; empty fragments are dropped.
            article_info = [x for x in re.split(r'[\n\t]', tile_link.text) if x != '']
            if not article_info:
                # A tile with no text has neither title nor author; indexing
                # into the empty list would raise IndexError, so skip it.
                continue
            if len(article_info) == 1:
                # Only one fragment: treat it as the title, author unknown.
                article_title = article_info[0]
                article_author = None
            else:
                # Last fragment is the author; everything before it is the
                # title (a single leading fragment joins to itself).
                article_title = ' '.join(article_info[:-1])
                article_author = article_info[-1]
            article_results.append({
                'url':article_url,
                'title':article_title,
                'author':article_author,
                'year':year,
                'month':month
            })

# Append the freshly scraped links to the existing CSV, keeping a backup of
# the previous file first.
df = pd.DataFrame(article_results)
# 'text' starts empty; a later cell fills it in for rows where it is missing.
df['text'] = None
df_old = pd.read_csv('all_conference_articles.csv',encoding='utf-8')
# index=False keeps the backup column-for-column identical to the file it
# was read from (otherwise an extra unnamed index column is written).
df_old.to_csv('all_conference_articles_backup.csv',index=False,encoding='utf-8')
# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
df = pd.concat([df_old, df])
df.to_csv('all_conference_articles.csv',index=False,encoding='utf-8')

# Add All Article Text + Scripture References + Cross Ref

In [26]:
import requests as r
from bs4 import BeautifulSoup
import pandas as pd
import re

df = pd.read_csv('all_conference_articles.csv')


def _collect_links(soup, css_class):
    """Return [{'ref': <anchor text>, 'link': <href>}, ...] for every <a> with the given class."""
    return [
        {'ref': anchor.text, 'link': anchor['href']}
        for anchor in soup.find_all('a', {'class': css_class})
    ]


# Only rows whose text is still missing (NaN) are fetched.
indexes = df[df.text.isna()].index

for index in indexes:
    url = df.loc[index,'url']
    # requests has no default timeout; without one a single stalled
    # connection would hang the whole loop indefinitely.
    response = r.get(url, timeout=30)
    soup = BeautifulSoup(response.text,'lxml')

    article_body = soup.find('div',{'class':'body-block'})
    if article_body is None:
        # Sentinel so the row is not treated as missing (and re-fetched) next run.
        df.loc[index,'text'] = 'no_article_text'
        continue

    # Flatten newlines/tabs to spaces so the text fits on one CSV line.
    df.loc[index,'text'] = re.sub('[\n\t]', ' ', article_body.text)

    # Reference lists are stored as the string repr of a list of dicts.
    df.loc[index,'scriptures'] = str(_collect_links(soup, 'scripture-ref'))
    df.loc[index,'cross_refs'] = str(_collect_links(soup, 'cross-ref'))

In [30]:
# Write the table back over the same CSV, without the index column.
df.to_csv(
    'all_conference_articles.csv',
    encoding='utf-8',
    index=False,
)

# Analysis

In [31]:
import pandas as pd
# Load the article table for the analysis cells below.
df = pd.read_csv(
    'all_conference_articles.csv',
    encoding='utf-8',
)

In [2]:
#To pull the statistical reports, you will need to configure your download script to pull data differently, because that data is frequently in a table.
#df_stats = df[df.title.str.contains('Statistical')].copy()

In [32]:
# Drop every row whose title contains one of these fragments, one pass per fragment.
for title_fragment in (
    'Sustaining of Church Officers',
    'Committee Report',
    'Auditing Department',
    'Statistical',
    'Solemn Assembly',
):
    df = df[~df.title.str.contains(title_fragment)]

In [33]:
# Renumber the rows 0..n-1 after filtering; the old index is discarded.
df = df.reset_index(drop=True)

In [6]:
import ast
import numpy as np

# Flatten the per-article scripture lists into one list of dicts, tagging each
# reference with the URL of the article it came from.
scriptures = []
# Pair each cell with its own row's URL directly, rather than looking the URL
# up by a hand-maintained counter (which is only correct right after
# reset_index).
for article_url, raw_refs in zip(df['url'], df['scriptures']):
    # NaN is the only value not equal to itself; this also catches NaN objects
    # that are not the np.nan singleton.
    if raw_refs is None or raw_refs != raw_refs:
        continue
    # The cell holds the repr of a list of dicts. literal_eval parses Python
    # literals only, so text that originated from scraped pages can never be
    # executed as code (eval would run it).
    for script in ast.literal_eval(str(raw_refs)):
        script['url'] = article_url
        scriptures.append(script)

In [7]:
# One row per scripture reference dict (keys become columns).
scripturedf = pd.DataFrame(data=scriptures)

In [8]:
import numpy as np
# Split each link on '/' once and pick the path segments by position.
link_parts = scripturedf['link'].map(lambda href: href.split('/'))
scripturedf['standard_work'] = link_parts.map(lambda parts: parts[2])
scripturedf['book'] = link_parts.map(lambda parts: parts[3])
# The fifth segment is optional; links without it get NaN.
scripturedf['ref_link'] = link_parts.map(
    lambda parts: parts[4] if len(parts) > 4 else np.nan
)

In [9]:
import re
def _chapter_of(ref_link):
    """Return the part of the stringified ref_link before the first '.' or '?'."""
    return re.split('[.?]', str(ref_link))[0]

scripturedf['chapter'] = scripturedf['ref_link'].map(_chapter_of)

In [10]:
# Discard rows whose chapter contains any of these substrings
# ('nan' is what str() produced for a missing ref_link).
for unwanted in ('-', 'bofm', 'nan'):
    scripturedf = scripturedf[~scripturedf.chapter.str.contains(unwanted)]

In [11]:
def _raw_verse_of(ref_link):
    """Return the second piece of ref_link split on '.'/'?', or NaN if there is none."""
    pieces = re.split('[.?]', str(ref_link))
    if len(pieces) > 1:
        return pieces[1]
    return np.nan

scripturedf['raw_verse'] = scripturedf['ref_link'].map(_raw_verse_of)

In [12]:
def verse_parser(cell):
    """Expand a raw verse specification into a list of verses.

    NaN input is returned as NaN. If the text contains 'span', only the part
    after the first '=' is parsed. The text is split on ','; a piece containing
    '-' is expanded to every integer from its start to its end (inclusive),
    while any other piece is kept as the original string. The returned list
    can therefore mix ints (from ranges) and strings (single verses).
    """
    # NaN is the only value that is not equal to itself.
    if cell != cell:
        return np.nan
    if 'span' in cell:
        cell = cell.split('=')[1]

    verses = []
    # Without a comma, split(',') yields the whole cell as a single piece,
    # so one loop covers both the list and the single-item case.
    for piece in cell.split(','):
        if '-' not in piece:
            verses.append(piece)
            continue
        bounds = piece.split('-')
        verses.extend(range(int(bounds[0]), int(bounds[1]) + 1))
    return verses
            
# Each raw verse string becomes a list of verses (or NaN when missing).
scripturedf['ref_list'] = scripturedf['raw_verse'].map(verse_parser)

In [13]:
import numpy as np

# Emit one record per individual verse: "<book> <chapter>:<verse>" plus the
# scripture link and the URL of the citing article.
all_scriptures = []
for row_label in scripturedf.index:
    verses = scripturedf.loc[row_label,'ref_list']
    # Rows without a parsed verse list (None / NaN) contribute nothing.
    if verses is None or verses in [np.nan]:
        continue
    book = scripturedf.loc[row_label,'book']
    chapter = scripturedf.loc[row_label,'chapter']
    link = scripturedf.loc[row_label,'link']
    article_url = scripturedf.loc[row_label,'url']
    for verse in verses:
        all_scriptures.append({
            'reference':f'{book!s} {chapter!s}:{verse!s}',
            'link':link,
            'url':article_url
        })

In [14]:
#most cited scripture references
#31,102 verses in bible
#6604 verses in book of mormon
#3654 verses in d and c
#635 verses in p of g p

#41995 verses in total

#13606 verses have been cited in general conference
#about 1/3 of scriptures have been cited in general conference

#10108 scriptures have been cited less than 5 times
#only 3498 scriptures have been cited more than 5 times

#only 399 scriptures have been cited more than 25 times

#only 89 scriptures have been cited more than 50 times

#only 9 scriptures have been cited more than 100 times

In [15]:
%matplotlib inline
# One row per cited verse.
scripture_count_df = pd.DataFrame(data=all_scriptures)
# scripture_count_df.reference.value_counts().hist(bins=100)
# scripture_count_df.reference.value_counts()

In [16]:
# Left-join the per-verse citations onto the article table by URL, so every
# citation carries its talk's columns (speaker, title, year, ...) for
# filtering and trend analysis.
final_scripture_df = pd.merge(
    df,
    scripture_count_df,
    how='left',
    on='url',
)

In [24]:
# NOTE(review): the output path is an empty string, presumably a placeholder
# left behind; supply the intended CSV filename before relying on this cell.
# TODO confirm.
final_scripture_df.to_csv('')

Unnamed: 0,author,month,title,url,year,text,scriptures,link,reference
0,Joseph Fielding Smith,4,Out of the Darkness,https://www.lds.org/general-conference/1971/04...,1971,"My dear brothers and sisters: We welcome you,...","[{'ref': 'D&C 1:30', 'link': '/scriptures/dc-t...",/scriptures/dc-testament/dc/1.30?#29,dc 1:30
1,Joseph Fielding Smith,4,Out of the Darkness,https://www.lds.org/general-conference/1971/04...,1971,"My dear brothers and sisters: We welcome you,...","[{'ref': 'D&C 1:30', 'link': '/scriptures/dc-t...",/scriptures/nt/acts/10.34-35?#33,acts 10:34
2,Joseph Fielding Smith,4,Out of the Darkness,https://www.lds.org/general-conference/1971/04...,1971,"My dear brothers and sisters: We welcome you,...","[{'ref': 'D&C 1:30', 'link': '/scriptures/dc-t...",/scriptures/nt/acts/10.34-35?#33,acts 10:35
3,Spencer W. Kimball,4,"Voices of the Past, of the Present, of the Future",https://www.lds.org/general-conference/1971/04...,1971,"Beloved brothers and sisters and friends, I f...","[{'ref': '1 Cor. 14:8–10', 'link': '/scripture...",/scriptures/nt/1-cor/14.8-10?#7,1-cor 14:8
4,Spencer W. Kimball,4,"Voices of the Past, of the Present, of the Future",https://www.lds.org/general-conference/1971/04...,1971,"Beloved brothers and sisters and friends, I f...","[{'ref': '1 Cor. 14:8–10', 'link': '/scripture...",/scriptures/nt/1-cor/14.8-10?#7,1-cor 14:9


In [213]:
import string
# Characters to strip from titles: ASCII punctuation plus the curly quotes,
# em dash, ellipsis and apostrophe listed here.
remove = string.punctuation + '“”—…’'
# str.translate deletes every character whose code point maps to None.
strip_table = {ord(char): None for char in remove}
df['title_no_punctuation'] = df['title'].map(
    lambda title: title.translate(strip_table)
)

In [214]:
# Lower-case every cleaned title and split it on single spaces into one flat
# word list; missing titles (None / NaN) are skipped.
titles = []
for cleaned_title in df.title_no_punctuation:
    if cleaned_title is not None and cleaned_title == cleaned_title:
        titles.extend(cleaned_title.lower().split(' '))

In [215]:
from nltk.corpus import stopwords
# Set of the NLTK English stopwords.
english_stopwords = {word for word in stopwords.words('english')}

In [216]:
# Turn the word list into a Series and keep only the non-stopword entries.
titles = pd.Series(titles)
is_stopword = titles.isin(english_stopwords)
titles = titles[~is_stopword]

In [217]:
# Most frequently used words in titles, after removing punctuation and stopwords.
# value_counts() returns the words ordered from most to least frequent.
titles.value_counts()

god               149
lord              117
priesthood        109
christ             99
faith              97
church             80
love               80
power              71
ye                 71
jesus              65
gospel             62
family             56
life               55
us                 54
testimony          53
things             52
light              49
work               48
blessings          48
come               48
eternal            46
good               46
children           45
home               42
welfare            42
shall              41
service            40
time               40
way                40
one                40
                 ... 
lining              1
ends                1
age                 1
confirmed           1
sufficient          1
start               1
climb               1
television          1
circle              1
sifting             1
selfless            1
tool                1
emerge              1
broke               1
lifestep  