In [8]:
## Code to extract example sentences from Oxford Learners Dictionary
## James Fodor 2022
## Python 3.8

import numpy as np
from bs4 import BeautifulSoup, Tag, NavigableString
from selenium import webdriver
from selenium.webdriver.edge.options import Options
from selenium.webdriver.chrome.service import Service

# Define options and variables for selenium
options = Options()
# Raw string: the Windows path contains backslash sequences (\D, \A, \e, \m) that are
# not valid escapes in a normal string literal and trigger invalid-escape warnings.
# NOTE(review): Service is imported from selenium.webdriver.chrome.service but is used
# with webdriver.Edge below; selenium also provides an Edge-specific Service class --
# confirm which one is intended.
driver_path = Service(r'D:\Downloads\Archive\edgedriver_win64\msedgedriver.exe') # needed for selenium, must be updated version
options.add_argument("--headless") # run the browser without opening a window
driver = webdriver.Edge(service=driver_path, options=options)
# Base URL to which each word (plus an entry suffix such as '_1') is appended
oxford_url = 'https://www.oxfordlearnersdictionaries.com/definition/english/'

# File path for where vocab file is stored
# Written as a raw string so the Windows backslashes (\S, \Y, \D) are not parsed as
# escape sequences; a raw literal cannot end in a backslash, so the trailing
# separator is appended separately.
path_base = r'D:\Study and Projects\School Work\Year 25 - PhD 1\Data' + '\\'

In [10]:
## Define key functions

# Load word similarity dataset
def load_sim_dataset(model, base_path=None):
    """Load a word-similarity dataset from a whitespace-delimited text file.

    The file '<model>.txt' is read from the 'Word Similarities Final' folder
    inside the 'Word Similarity Data' folder under ``base_path``. Every
    non-blank line is split on whitespace; its third field is parsed as the
    rating.

    Args:
        model: Name of the dataset file without the '.txt' extension.
        base_path: Path prefix (including its trailing separator) that holds
            the 'Word Similarity Data' folder. When omitted, the
            notebook-level ``path_base`` is used.

    Returns:
        A tuple ``(wordpairs, ratings)``. ``wordpairs`` is a list with the
        whitespace-split fields of each non-blank line; ``ratings`` is a NumPy
        array with the third field of each of those lines converted to float.

    Raises:
        IndexError: If a non-blank line has fewer than three fields.
        ValueError: If the third field of a line cannot be parsed as a float.
    """
    if base_path is None:
        base_path = path_base
    # Backslashes are escaped explicitly; the resulting path is unchanged.
    path = base_path + 'Word Similarity Data\\Word Similarities Final\\'
    filename = path + model + '.txt'
    with open(filename) as file:
        lines = file.readlines()

    wordpairs = []
    ratings = []
    for line in lines:
        fields = line.split() # split at any whitespace chars (also drops the newline)
        if not fields:
            # blank line (e.g. a trailing one): nothing to parse, skip it
            continue
        wordpairs.append(fields)
        ratings.append(float(fields[2]))
    ratings = np.array(ratings)

    return (wordpairs, ratings)

# Open web page for a given url
def get_web_page(url):
    """Load a page with the selenium driver and pull out its sense blocks.

    Args:
        url: Address of the page to open.

    Returns:
        A tuple ``(all_results, part_of_speech)``. ``all_results`` holds every
        element with class "sense" found in the page source;
        ``part_of_speech`` is the text of the first element with class "pos",
        or an empty string when the page has no such element.
    """
    driver.get(url)
    soup = BeautifulSoup(driver.page_source, 'lxml')
    all_results = soup.find_all(class_="sense")
    pos_tags = soup.find_all(class_="pos")
    # A page without any "pos" element would otherwise raise IndexError on [0]
    # and abort the caller's whole loop; report "no part of speech" instead.
    part_of_speech = pos_tags[0].text if pos_tags else ''
    return (all_results, part_of_speech)

# Extract example sentences from a web page
def extract_sense_examples(raw_html):
    """Collect the example sentences of each sense element.

    Args:
        raw_html: Iterable of elements supporting ``find_all(class_=...)``,
            one per sense.

    Returns:
        A dict keyed by ``(position, definition_text)``, where ``position`` is
        the 1-based position of the sense in ``raw_html`` and
        ``definition_text`` is the text of its first "def" element. Each value
        is the list of texts of the sense's "x" elements followed by those of
        its "unx" elements. Senses without a "def" element are left out.
    """
    def texts_of(node, css_class):
        # Text of every descendant of `node` carrying the given class
        return [hit.text for hit in node.find_all(class_=css_class)]

    collected = {}
    for position, sense in enumerate(raw_html, start=1):
        sentences = texts_of(sense, "x") + texts_of(sense, "unx")
        definitions = sense.find_all(class_="def")
        if definitions:
            collected[position, definitions[0].text] = sentences
    return collected

# Save example sentences to file
def save_sense_examples(sense_dict, word):
    """Append the example sentences of each sense to its own text file.

    For every key in ``sense_dict`` a file named '<word>_<sense id>.txt' is
    opened in append mode (UTF-8) relative to the current working directory,
    and each example sentence is written on its own line. Because the file is
    appended to, calling this twice for the same word duplicates its lines.

    Args:
        sense_dict: Mapping whose keys are sequences with the sense id as
            first item and whose values are lists of example sentences.
        word: Word used as the file name prefix.
    """
    for sense, examples in sense_dict.items():
        sense_id = sense[0]
        file_name = word+'_'+str(sense_id)+'.txt'
        # `with` guarantees the handle is closed even if a write fails
        with open(file_name, "a", encoding='utf-8') as save_file:
            for line in examples:
                save_file.write(line)
                save_file.write('\n')

In [5]:
# Load vocab set
dataset_name = 'SimVerb_mod'
dataset, _ = load_sim_dataset('EN-SimVerb-3200-mod-uk')

# Flatten the first two fields of every dataset row into a single word list
vocab = []
for word_pair in dataset:
    vocab.extend((word_pair[0], word_pair[1]))

# Drop duplicates and keep the remaining words in sorted order
vocab_set = sorted(set(vocab))
print(f'{dataset_name} vocab loaded')
print(f'{len(vocab_set)} words')

SimVerb_mod vocab loaded
822 words


In [None]:
# Extract sense examples from the Oxford Learners Dictionary website
# A failure on one word is recorded and skipped rather than aborting the run:
# the save step appends to files, so restarting the loop from the beginning
# after a crash would duplicate the lines already written.
verb_tags = ('verb', 'linking verb')
skipped_words = [] # words for which no usable page or no senses were found
for word in vocab_set:
    try:
        url = oxford_url+word+'_1'
        all_results, part_of_speech = get_web_page(url)
        if part_of_speech not in verb_tags and word != 'pup': # 'pup' is a special case: its first entry is used whatever its tag
            # try another number if the first one isn't a verb
            # (the tag of this second entry is not checked)
            url = oxford_url+word+'_2'
            all_results, part_of_speech = get_web_page(url)
    except IndexError:
        # get_web_page may raise IndexError when a page has no part-of-speech element
        skipped_words.append(word)
        continue
    sense_dict = extract_sense_examples(all_results)
    if not sense_dict:
        # nothing to save for this word; keep a record of it
        skipped_words.append(word)
        continue
    save_sense_examples(sense_dict, word)

if skipped_words:
    print(str(len(skipped_words))+' words skipped: '+', '.join(skipped_words))