## get html content

In [14]:
import requests
from bs4 import BeautifulSoup

# Pages to scrape for the knowledge base; each URL is later used as a dictionary key.
urls = [
    "https://www.kennedy-center.org/education/resources-for-educators/classroom-resources/media-and-interactives/media/music/your-brain-on-music/your-brain-on-music/your-brain-on-music-tearjerkers/",
    "https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3779798/",
    "https://www.psypost.org/new-research-uncovers-atonal-musics-distinct-emotional-and-neural-effects/",
    "https://online.ucpress.edu/mp/article/40/3/202/195230/The-Perceptual-and-Emotional-Consequences-of",
    "https://dl.acm.org/doi/fullHtml/10.1145/3461615.3485419",
    "https://www.unprofesor.com/musica/tipos-de-cadencia-musical-3912.html?utm_source=chatgpt.com#anchor_1",
    "https://eldiaadiariomusica.wordpress.com/2013/06/23/sentido-y-personalidad-de-las-tonalidades/",
]

In [7]:
def extract_text(url, timeout=10):
    """Download ``url`` with ``requests`` and return the text of the page.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely on an unresponsive
            host, which would stall the whole scraping loop.

    Returns:
        The string produced by BeautifulSoup's ``get_text()``, or ``None`` when
        the download or the parsing raised an exception (the error is printed).

    Note:
        The HTTP status code is not inspected, so the body of an error response
        (for example a 403 page) is returned as ordinary text.
    """
    try:
        response = requests.get(url, timeout=timeout)
        soup = BeautifulSoup(response.content, 'html.parser')
        return soup.get_text()
    except Exception as e:
        # Best-effort scraping: report the failure and let the caller continue.
        print(f"Error procesando {url}: {e}")
        return None

In [8]:
# Map every source URL to the text fetched with the requests-based extractor
# (None marks a URL whose download raised an exception).
knowledge_base = dict()
for url in urls:
    knowledge_base.update({url: extract_text(url)})

In [11]:
# print(knowledge_base)
# output:
# {'https://www.kennedy-center.org/education/resources-for-educators/classroom-resources/media-and-interactives/media/music/your-brain-on-music/your-brain-on-music/your-brain-on-music-tearjerkers/': 'Just a moment...Enable JavaScript and cookies to continue',
#  'https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3779798/': '403403 Forbidden',
#  'https://www.psypost.org/new-research-uncovers-atonal-musics-distinct-emotional-and-neural-effects/': '\n403 Forbidden\n\n403 Forbidden\nnginx\n\n\n',

Since the output contains error pages that BeautifulSoup cannot work around, such as '403 Forbidden' or 'Just a moment... Enable JavaScript and cookies to continue', we will explore alternative methods to extract the URL content.

In [None]:
%pip install selenium

In [None]:
%pip install webdriver-manager selenium

In [12]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager

def extract_text_selenium(url):
    """Load ``url`` in headless Chrome and return the text of its ``<body>``.

    A fresh Chrome driver is created for every call and always shut down
    before returning.

    Args:
        url: Address of the page to load.

    Returns:
        The ``.text`` of the page's ``<body>`` element, or ``None`` if any step
        fails (driver download, browser start-up, page load or element lookup).
        The error is printed in that case.
    """
    options = Options()
    options.add_argument("--headless")  # Do not open a browser window
    # NOTE(review): presumably makes the browser look less like an automated
    # client to anti-bot checks -- confirm against Chromium documentation.
    options.add_argument("--disable-blink-features=AutomationControlled")

    # Start as None so the finally clause can tell whether a browser exists.
    driver = None
    try:
        # WebDriver Manager downloads the matching ChromeDriver version.
        # Driver set-up lives inside the try block so a start-up failure is
        # reported like any other error instead of aborting the caller's loop.
        service = Service(ChromeDriverManager().install())
        driver = webdriver.Chrome(service=service, options=options)

        driver.get(url)
        page_text = driver.find_element("tag name", "body").text
        return page_text
    except Exception as e:
        print(f"Error procesando {url}: {e}")
        return None
    finally:
        # Only quit a browser that was actually started.
        if driver is not None:
            driver.quit()



In [3]:
# Example: fetch one page that returned '403 Forbidden' with plain requests and preview it.
url = "https://www.psypost.org/new-research-uncovers-atonal-musics-distinct-emotional-and-neural-effects/"
text = extract_text_selenium(url)
# extract_text_selenium returns None on failure; fall back to "" so the slice cannot raise TypeError.
print((text or "")[:500])


SUBSCRIBE
The latest psychology and neuroscience discoveries.
MY ACCOUNT
MENTAL HEALTH
SOCIAL PSYCHOLOGY
COGNITIVE SCIENCE
PSYCHOPHARMACOLOGY
NEUROSCIENCE
ABOUT
Home Exclusive Music
New research uncovers atonal music’s distinct emotional and neural effects
by Eric W. Dolan May 30, 2024 in Music
(Photo credit: Adobe Stock)
Stay on top of the latest psychology findings: Subscribe now!
A recent study published in the journal Behavioral Neuroscience has shed light on how atonal music affects our emo


In [15]:
# Rebuild the knowledge base with the Selenium extractor: one entry per URL, in list order.
knowledge_base = dict(zip(urls, map(extract_text_selenium, urls)))

In [24]:
# Preview the first 100 characters stored for the PMC article.
knowledge_base['https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3779798/'][:100]

"An official website of the United States government\nHere's how you know\n\n\nSearch\nLog in\nSearch in PM"

Although we manage to retrieve the page content for some URLs, access to others is still denied.

In [None]:
# 'https://online.ucpress.edu/mp/article/40/3/202/195230/The-Perceptual-and-Emotional-Consequences-of': 
# 'online.ucpress.edu\nVerificar que usted es un ser humano. Esto podría tardar algunos segundos.\nonline.ucpress.edu necesita revisar la seguridad de su conexión antes de continuar.
# \nRay ID: 90bd3bb24cf22f8e\nRendimiento y seguridad de Cloudflare'

In [25]:
import json

# Persist the scraped text as UTF-8 JSON; ensure_ascii=False keeps accented characters readable.
with open("data/knowledge_base.json", mode="w", encoding="utf-8") as file:
    json.dump(knowledge_base, fp=file, ensure_ascii=False, indent=4)

## clean the text

In [29]:
# Reload the saved scrape so the cleaning steps start from the file on disk.
with open("data/knowledge_base.json", mode="r", encoding="utf-8") as file:
    raw_data = json.loads(file.read())

In [33]:
# raw_data

In [32]:
# Entries to discard before cleaning.
blocked_urls = (
    'https://online.ucpress.edu/mp/article/40/3/202/195230/The-Perceptual-and-Emotional-Consequences-of',
    'https://www.kennedy-center.org/education/resources-for-educators/classroom-resources/media-and-interactives/media/music/your-brain-on-music/your-brain-on-music/your-brain-on-music-tearjerkers/',
)

# pop(..., None) instead of del: re-running this cell (or a key that is already
# absent) no longer raises KeyError.
for blocked_url in blocked_urls:
    raw_data.pop(blocked_url, None)

In [34]:
import re

def preprocess_text(text):
    """Normalize scraped page text into a single clean line.

    Steps, applied in order:
      1. remove URLs (``http://``, ``https://`` or ``www.`` prefixes);
      2. remove anything that looks like an HTML/XML tag;
      3. drop every character that is not an ASCII letter or digit, a character
         in the range U+00C0-U+00FF, whitespace, or one of ``. , ! ? ; : ' -``;
      4. collapse runs of whitespace into one space and trim both ends.

    Args:
        text: Raw text to clean. ``None`` or an empty string is accepted.

    Returns:
        The cleaned string; ``""`` when ``text`` is ``None`` or empty.
    """
    # The extractors return None on failure (stored as null in the JSON file);
    # without this guard re.sub would raise TypeError on such entries.
    if not text:
        return ""

    text = re.sub(r'https?://\S+|www\.\S+', '', text)

    text = re.sub(r'<[^>]+>', '', text)

    text = re.sub(r"[^a-zA-Z0-9\u00C0-\u00FF\s.,!?;:'-]", '', text)

    text = re.sub(r'\s+', ' ', text).strip()

    return text

In [35]:
# Run every scraped page through preprocess_text, keeping the same URL keys.
clean_data = dict()
for key, value in raw_data.items():
    clean_data.update({key: preprocess_text(value)})

In [37]:
# clean_data

In [38]:
# Store the cleaned text as UTF-8 JSON next to the raw scrape.
with open("data/clean_data.json", mode="w", encoding="utf-8") as file:
    json.dump(clean_data, fp=file, ensure_ascii=False, indent=4)

## concatenate JSON data

We have hardcoded a JSON file that stores essential information about musical modes, including Ionian, Dorian, and others.

In [39]:
# Inputs for the merge: the hardcoded musical-modes file and the cleaned scrape.
path_music_modes = "data/music_modes.json"
path_clean_data = "data/clean_data.json"

# Parse each document from its full text.
with open(path_music_modes, mode="r", encoding="utf-8") as file:
    music_modes = json.loads(file.read())

with open(path_clean_data, mode="r", encoding="utf-8") as file:
    clean_data = json.loads(file.read())


In [43]:
# Start from a copy of the musical modes, then overlay the cleaned pages;
# on a key collision the cleaned-page value wins.
merged_data = dict(music_modes)
merged_data.update(clean_data)
# merged_data

In [44]:
# Destination of the combined knowledge base.
path_merge_data = "data/merged_data.json"

# Write it as UTF-8 JSON, leaving non-ASCII characters unescaped.
with open(path_merge_data, mode="w", encoding="utf-8") as file:
    json.dump(merged_data, fp=file, ensure_ascii=False, indent=4)