# Analysing the Danish Højskolesangbog
** An exam project for Cultural Data Science at the Faculty of Arts, AU **
\ 
This script contains the code for scraping data from højskolesangbogen.dk and transforming it into a clean data set containing song lyrics, title, metadata and word count vectors

In [180]:
# Importing libraries and dependencies
import numpy as np
import pandas as pd
import re

import joblib

import requests
from bs4 import BeautifulSoup

import string
import nltk
from nltk import word_tokenize
nltk.download('punkt')
from sklearn.feature_extraction.text import CountVectorizer

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/jakobgrohn/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Obtaining digital versions of song lyrics from højskolesangbogen.dk using automated webscraping

In [2]:
# Importing first column from csv file containing URLs from hojskolesangbogen
urls = pd.read_csv('Data/song_urls.csv', usecols=[0])

In [3]:
# Inspecting data to insure everything is correct
print(range(len(urls)))
urls.head()

range(0, 593)


Unnamed: 0,Link
0,https://www.hojskolesangbogen.dk/om-sangbogen/...
1,https://www.hojskolesangbogen.dk/om-sangbogen/...
2,https://www.hojskolesangbogen.dk/om-sangbogen/...
3,https://www.hojskolesangbogen.dk/om-sangbogen/...
4,https://www.hojskolesangbogen.dk/om-sangbogen/...


In [144]:
# Creating a smaller subset of the urls
urls_sub = urls.head(100)

In [230]:
# Creating empty df for lyrics
song_lyrics_df = pd.DataFrame(columns = ['lyrics', 'title'])

# Loop through each url
for n in range(len(urls)):
    song_url = requests.get(str(urls.iat[n,0])) 
    soup = BeautifulSoup(song_url.content, 'html.parser')
    song_title = soup.select('h1')[0].text.strip() # Find title (first h1)
    song_lyrics = '' # Create empty string for lyrics
    for i in range(len(soup.select('h2,.caption'))): # Loop through all h2 and .caption elements 
        element_text = soup.select('h2,.caption')[i].text.strip() 
        if element_text == song_title: # If element is identical to title, this must be the title of the box containing song lyrics
            song_lyrics += soup.select('h2,.caption')[i-1].text.strip() # Add the element before to the variable containing lyrics
    song_lyrics_df.loc[n] = [song_lyrics] + [song_title]

In [231]:
# Removing white spaces
song_lyrics_df['lyrics'] = song_lyrics_df['lyrics'].str.replace('\n',' ')
song_lyrics_df['lyrics'] = song_lyrics_df['lyrics'].str.replace('\r',' ')
song_lyrics_df['lyrics'] = song_lyrics_df['lyrics'].str.replace('\t',' ')

In [232]:
# Coutning number of songs missing lyrics
len(song_lyrics_df[song_lyrics_df['lyrics'] == ''])   

173

In [233]:
# Creating empty df for data about song composition
song_data_df = pd.DataFrame(columns = ['text','melody'])

# Loop through all URLS
for n in range(len(urls)):
    song_url = requests.get(str(urls.iat[n,0])) 
    soup = BeautifulSoup(song_url.content, 'html.parser')
    song_melody = '' # Create empty string for data about melody
    song_text = '' # Create empty string for data about writing of the song
    for i in range(len(soup.select('p'))): # Loop through all 'p' elements
        # If element starts with 'Melodi:' or 'Melodier:' or 'Komponist then
        if soup.select('p')[i].text.startswith('Melodi:') or soup.select('p')[i].text.startswith('Melodier:') or soup.select('p')[i].text.startswith('Komponist:'): 
            song_melody += soup.select('p')[i].text.strip() # add to the variable containing data about melody
         # Same procedure for data about the writing of the song
        if soup.select('p')[i].text.startswith('Tekst:') or soup.select('p')[i].text.startswith('Tekster:') or soup.select('p')[i].text.startswith('Forfatter:'):
            song_text += soup.select('p')[i].text.strip()
        # If element starts with 'Tekst og melodi:' than add same data to both variables
        if soup.select('p')[i].text.startswith('Tekst og melodi:'):
            song_text += soup.select('p')[i].text.strip()
            song_melody += soup.select('p')[i].text.strip()
    song_data_df.loc[n] = [song_text] + [song_melody] # Add strings containing relevant data to the data frame

In [234]:
# Inspecting the result
song_data_df.head(12)

Unnamed: 0,text,melody
0,"Tekst: N.F.S. Grundtvig, 1826","Melodi: C.E.F. Weyse, 1826 og Erik Sommer, 1983"
1,"Tekst: Hans Christensen Sthen, 1589","Melodi: August Winding, 1874"
2,"Tekst: Thomas Kingo, 1674","Melodier: H.O.C. Zinck, 1801 (eller som nr. 20)"
3,"Tekst: Thomas Kingo, 1674","Melodi: Hamburg, 1690"
4,"Tekst: N.F.S. Grundtvig, 1833","Melodi: Oluf Ring, 1932"
5,"Tekst: N.F.S. Grundtvig, 1853","Melodi: Thomas Laub, 1916"
6,"Tekst: B.S. Ingemann, 1937","Melodi: C.E.F. Weyse, 1937"
7,"Tekst: B.S. Ingemann, 1837","Melodi: C.E.F. Weyse, 1837"
8,"Forfatter: B. S. Ingemann, 1837","Komponist: C.E.F. Weyse, 1837"
9,"Tekst: B.S. Ingemann, 1837","Melodi: C.E.F. Weyse, 1837"


In [235]:
# Removing unwanted parts of stirngs
song_data_df['melody'] = song_data_df['melody'].str.replace('Tekst og melodi:', '').str.replace('Melodi:', '').str.replace('Melodier:','').str.replace('Komponist:','')
song_data_df['text'] = song_data_df['text'].str.replace('Tekst og melodi:', '').str.replace('Tekst:', '').str.replace('Tekster:','').str.replace('Forfatter:','').str.replace('fra 2. århundrede', '200')

# Loop through and remove secondary composer and writer
#for i in range(len(song_data_df['melody'])):
#    if 'og' in song_data_df['melody'][i]:
#        song_data_df['melody'][i] = song_data_df['melody'][i].split(' og ', 1)[0]
#    if 'og' in song_data_df['text'][i]:
#        song_data_df['text'][i] = song_data_df['text'][i].split(' og ', 1)[0]
#    if 'oversat af' in song_data_df['text'][i]:
#        song_data_df['text'][i] = song_data_df['text'][i].split('oversat af', 1)[0]

In [236]:
# Inspecting data to see if it looks fine
song_data_df.head(11)

Unnamed: 0,text,melody
0,"N.F.S. Grundtvig, 1826","C.E.F. Weyse, 1826 og Erik Sommer, 1983"
1,"Hans Christensen Sthen, 1589","August Winding, 1874"
2,"Thomas Kingo, 1674","H.O.C. Zinck, 1801 (eller som nr. 20)"
3,"Thomas Kingo, 1674","Hamburg, 1690"
4,"N.F.S. Grundtvig, 1833","Oluf Ring, 1932"
5,"N.F.S. Grundtvig, 1853","Thomas Laub, 1916"
6,"B.S. Ingemann, 1937","C.E.F. Weyse, 1937"
7,"B.S. Ingemann, 1837","C.E.F. Weyse, 1837"
8,"B. S. Ingemann, 1837","C.E.F. Weyse, 1837"
9,"B.S. Ingemann, 1837","C.E.F. Weyse, 1837"


In [237]:
# Splitting each column into two seperate columns
song_data_df = pd.concat([song_data_df['text'].str.split(',', expand=True), song_data_df['melody'].str.split(',', expand=True)], axis=1)

In [240]:
song_data_df

Unnamed: 0,0,1,0.1,1.1
0,N.F.S. Grundtvig,1826,C.E.F. Weyse,1826 og Erik Sommer
1,Hans Christensen Sthen,1589,August Winding,1874
2,Thomas Kingo,1674,H.O.C. Zinck,1801 (eller som nr. 20)
3,Thomas Kingo,1674,Hamburg,1690
4,N.F.S. Grundtvig,1833,Oluf Ring,1932
...,...,...,...,...
588,Ole Hyltoft,2008,Michael Bojesen,2009
589,Suzanne Brøgger,2004,Katrine Muff Enevoldsen,2016
590,,,,
591,Halfdan Rasmussen,1954,Bent Fabricius-Bjerre,2015


In [239]:
# Delete potenital rows containing leftover values from ','-split
if len(song_data_df.columns) > 4:
    del song_data_df[2]
if len(song_data_df.columns) > 4:
    del song_data_df[3]
if len(song_data_df.columns) > 4:
    del song_data_df[4]


In [241]:
# Rename columns to something more understandable
song_data_df.columns = ['songwriter', 'year_written', 'composer', 'year_composed']

In [242]:
# Keep only one number in the 'year' columns
for i in range(len(song_data_df)):
    if song_data_df['year_composed'][i] == None:
        song_data_df['year_composed'][i] = 'No data'
    if song_data_df['year_written'][i] == None:
        song_data_df['year_written'][i] = 'No data'
    if re.search('\d\d\d\d', song_data_df['year_composed'][i]):
        only_year_composed = re.search('\d\d\d\d', song_data_df['year_composed'][i]) 
        song_data_df['year_composed'][i] = only_year_composed.group()
    if re.search('\d\d\d\d', song_data_df['year_written'][i]):
        only_year_written = re.search('\d\d\d\d', song_data_df['year_written'][i]) 
        song_data_df['year_written'][i] = only_year_written.group()

In [243]:
# Merging data with title and lyrics data
clean_song_data = pd.merge(song_data_df, song_lyrics_df, left_index=True, right_index=True)

In [244]:
clean_song_data

Unnamed: 0,songwriter,year_written,composer,year_composed,lyrics,title
0,N.F.S. Grundtvig,1826,C.E.F. Weyse,1826,Den signede dag 1. Den signede dag med fryd...,Den signede dag
1,Hans Christensen Sthen,1589,August Winding,1874,Den mørke nat forgangen er 1. Den mørke nat...,Den mørke nat forgangen er
2,Thomas Kingo,1674,H.O.C. Zinck,1801,Nu rinder solen op 1. Nu rinder solen opaf ...,Nu rinder solen op
3,Thomas Kingo,1674,Hamburg,1690,Vågn op og slå på dine strenge 1. Vågn op o...,Vågn op og slå på dine strenge
4,N.F.S. Grundtvig,1833,Oluf Ring,1932,Morgenhanen atter gol 1. Morgenhanen atter ...,Morgenhanen atter gol
...,...,...,...,...,...,...
588,Ole Hyltoft,2008,Michael Bojesen,2009,Nirvana 1. Aftenhavets brede bueer mit alte...,Nirvana
589,Suzanne Brøgger,2004,Katrine Muff Enevoldsen,2016,"Så, min sol, gå bare ned 1. Så, min sol, gå...","Så, min sol, gå bare ned"
590,,No data,,No data,Dagen slipper grebet 1. Dagen slipper grebe...,Dagen slipper grebet
591,Halfdan Rasmussen,1954,Bent Fabricius-Bjerre,2015,Aftenbøn 1. Kom nat med søvn og hvile.Gør s...,Aftenbøn


In [245]:
# Extracting to pdf
clean_song_data.to_csv('Data/cleaned_data.csv')

# Tokenising and creating BoW count matrix

In [246]:
# Define tokenizer function that simultaneously strips away punctuation
def tokenizer_better(text):    
    punc_list = string.punctuation+'0123456789'
    t = str.maketrans(dict.fromkeys(punc_list, ' '))
    text = text.lower().translate(t)
    tokens = word_tokenize(text)
    return tokens

In [247]:
# Defining vecotriser
bow_vect = CountVectorizer(tokenizer = tokenizer_better)

# Fit vectorizer to song lyrics
bow_vect.fit(clean_song_data['lyrics'].values)

# Transform lyrics into word count matrix
bow_vect_lyrics = bow_vect.transform(clean_song_data['lyrics'].values)

In [248]:
print('Note after vectorization: \n{}'.format(bow_vect_lyrics[1]))

Note after vectorization: 
  (0, 29)	5
  (0, 139)	4
  (0, 174)	1
  (0, 313)	1
  (0, 499)	1
  (0, 731)	1
  (0, 1219)	1
  (0, 1404)	1
  (0, 1407)	2
  (0, 1414)	2
  (0, 1522)	2
  (0, 1553)	3
  (0, 1561)	1
  (0, 1585)	1
  (0, 1613)	1
  (0, 1632)	2
  (0, 1658)	1
  (0, 1964)	1
  (0, 2221)	3
  (0, 2250)	1
  (0, 2270)	1
  (0, 2370)	1
  (0, 2798)	3
  (0, 3055)	1
  (0, 3077)	1
  :	:
  (0, 8518)	1
  (0, 8666)	1
  (0, 8900)	5
  (0, 8912)	1
  (0, 9293)	1
  (0, 9326)	6
  (0, 9372)	1
  (0, 9528)	1
  (0, 10059)	1
  (0, 10570)	1
  (0, 10760)	1
  (0, 11070)	2
  (0, 11359)	1
  (0, 11386)	2
  (0, 11428)	5
  (0, 11463)	1
  (0, 11475)	1
  (0, 11991)	1
  (0, 12272)	1
  (0, 12275)	1
  (0, 12453)	2
  (0, 12463)	1
  (0, 12493)	1
  (0, 12695)	1
  (0, 12863)	1


In [249]:
# Saving  vocabulary pkl file
vocab_path = "Data/song_vocabulary.pkl"
with open(vocab_path, 'wb') as fw:
     joblib.dump(bow_vect.vocabulary_, fw)

In [250]:
# Loading vacbulary back in and changing it to data frame
vocab = np.load('Data/song_vocabulary.pkl', allow_pickle=True)

# Changing vocabulary to pandas df
vocab = pd.DataFrame(vocab.items())

# Chaning column names
vocab.columns = ['Vocab', 'Word']

# Saving vocabulary df
vocab.to_csv('Data/song_vocabulary.csv')

In [251]:
# Turning sparse matrix into array and then pandas data frame
bow_vect_lyrics_df = pd.DataFrame(bow_vect_lyrics.toarray())

In [252]:
# Merging words counts with general song data
clean_song_data_with_word_counts = pd.merge(clean_song_data, bow_vect_lyrics_df, left_index=True, right_index=True)

In [253]:
# Saving full data set as csv file
clean_song_data_with_word_counts.to_csv('Data/clean_song_data_with_word_counts.csv')