In [1]:
#1. Basic data cleaning: 
#    - Remove newlines, multiple whitespaces, idiosyncratic phrases ('Det talte ord gælder')
#2. Filter out speeches not in Danish
#3. Calculate mean word entropy of the speeches
#(4. Calculate other metrics (readability, mean word length etc) using TextDescriptives) 

import os
import re
import json
import math
# import textdescriptives as td
import pandas as pd

from collections import Counter, defaultdict

In [None]:
data = pd.read_csv("new_data.csv")


In [3]:
data

Unnamed: 0.1,Unnamed: 0,artist,gender,index,song,year,genre,lyrics,lang
0,0,fleetwood-mac,M,124916,without-you,1968,Rock,I'm crazy for my baby\nBut my baby she don't l...,en
1,1,fleetwood-mac,M,125115,temporary-one,1997,Rock,Written by christine mcvie and eddy quintela.\...,en
2,2,fleetwood-mac,M,125116,say-you-love-me,1997,Rock,"Have mercy, baby on a poor girl like me,\nYou ...",en
3,3,fleetwood-mac,M,125117,sweet-girl,1997,Rock,"Written by stevie nicks.\nAnd he says, ""what d...",en
4,4,fleetwood-mac,M,125118,don-t-stop,1997,Rock,If you wake up and don't want to smile\nIf it ...,en
...,...,...,...,...,...,...,...,...,...
127291,127291,doyle-bramhall-ii,M,108707,new-faith,2016,Rock,So much life wasted in love and war\nNever kno...,en
127292,127292,emmi,F,233835,my-kinda-swag,2016,Rock,"[Intro]\nYou got, my my kind of swag\nI dig th...",en
127293,127293,emmi,F,233834,you-said-you-loved-me,2016,Rock,"Eh, eeeh eeh yo\nEh, eeeh eeh yo\nNot a chink ...",en
127294,127294,dessa,F,332044,congratulations,2016,Hip-Hop,[Intro]\nAlexander\n[Verse 1]\nCongratulations...,en


In [35]:
data1.song.describe()

count     141084
unique     97351
top        intro
freq          97
Name: song, dtype: object

In [4]:
#data = data[data['year'] != 1968] #let's remove thie one song from 1968

In [3]:
def text_process(text):
    text = text.lower()
    # remove newlines
    text = text.replace("\n", " ")
    # remove multiple whitespaces
    text = re.sub("\s\s+" , " ", text)
    # Remove det talte ord gælder
    #text = re.sub("^\s*Det talte ord gælder\\.*\s*", "", text)
    # remove starting whitespace
    text = re.sub("^\s*", "", text)

    return text

In [4]:
str(data['lyrics']).lower()

"0         you followed me to texas,\\nyou followed me to ...\n1         baby i'm yours\\nbaby i'm yours\\nand i'll be yo...\n2         raindrops keep falling on my head\\nand just li...\n3         i may not always love you\\nbut long as there a...\n4         there's a story told of a very gentle boy\\nand...\n                                ...                        \n119361    her high heels on the red carpet\\nher perfect ...\n119362    i always knew that you were hiding something f...\n119363    been a-sittin' 'round a coward, broken all tha...\n119364    he has one that locks the office,\\nand there's...\n119365    trap money good but the wrath only pilin'\\nhe ...\nname: lyrics, length: 119366, dtype: object"

In [5]:
def load_word_freqs(path):
    "Loads word frequency files and calculates lower-cased counts"
    paths = os.listdir(path)
    total_counter = Counter()
    
    for i in paths:
        if i.endswith(".csv"):
            with open(os.path.join(path, i), "r") as f:
                file_loaded = Counter(pd.read_csv(f))
            total_counter += file_loaded

    lowercase_counts = defaultdict(int)
    for k, v in total_counter.items():
        lowercase_counts[k.lower()] += v

    # freeze dict (i.e. make lowercase_counts a regular dict)
    lowercase_counts.default_factory = None

    return lowercase_counts

In [12]:
data.head()

Unnamed: 0.1,Unnamed: 0,artist,gender,index,song,year,genre,lyrics,lang
1,1,fleetwood-mac,M,125115,temporary-one,1997,Rock,Written by christine mcvie and eddy quintela.\...,en
2,2,fleetwood-mac,M,125116,say-you-love-me,1997,Rock,"Have mercy, baby on a poor girl like me,\nYou ...",en
3,3,fleetwood-mac,M,125117,sweet-girl,1997,Rock,"Written by stevie nicks.\nAnd he says, ""what d...",en
4,4,fleetwood-mac,M,125118,don-t-stop,1997,Rock,If you wake up and don't want to smile\nIf it ...,en
5,5,fleetwood-mac,M,125119,my-little-demon,1997,Rock,"My little demon, comin'on down.\nMy little dem...",en


In [6]:
def absolute_entropy(lyrics, ent_dict, unk_ent):
    """Calculates mean absolute entropy. Sets entropy of unknown words 
    to unk_ent.
    Also returns proportion unknown words"""
    lyrics = lyrics.lower()
    lyrics = str(lyrics).split()   #tokenizing
    running_sum = 0
    n_unk = 0

    for lyric in lyrics:
        for word in lyric:
        # when using dict.get you can set a default value if the key does not exist
            running_sum += ent_dict.get(word, max_ent)
            if word not in ent_dict:
                n_unk += 1

    mean_ent = running_sum / len(lyrics)
    prop_unk = n_unk / len(lyrics)

    return (mean_ent, prop_unk)

In [8]:
if __name__ == '__main__':
    df = pd.read_csv("new_data.csv")

    #print("[INFO] running language detection...")
    # Mild preprocessing
    df['lyrics'] = df["lyrics"].map(text_process)

    print("[INFO] getting entropy measures...")
    ### Get entropy per speech
    path = "entropy"
    # load lowercased dictionary of word frequency counts
    counts = load_word_freqs(path)
    # calculate the sum of the counts
    total_sum = sum(counts.values())
    # Mapping to entropy dict using the formula for entropy (1 / p(w))
    ent_dict = dict({i: math.log(1/(counts.get(i)/total_sum)) for i in counts})
    # Setting unknown words to the maximum entropy 
    max_ent = max(ent_dict.values())

 
    df["ents"] = df['lyrics'].apply(lambda x : absolute_entropy(x, ent_dict, max_ent))
    # the absolute entropy returned a tuple (absolute_entropy, prop_unknown_words)
    # separating to two columns
    df[["absolute_entropy", "prop_unknown_words"]] = df["ents"].apply(pd.Series)


    df.to_csv(os.path.join("lyrics_entropy.csv"), index=False)


[INFO] getting entropy measures...


In [9]:
entropy = pd.read_csv('lyrics_entropy.csv')

In [10]:
entropy

Unnamed: 0.1,Unnamed: 0,song,year,artist,genre,lyrics,gender,ents,absolute_entropy,prop_unknown_words
0,1,my-elusive-dreams,1970,bobby-vinton,Pop,"you followed me to texas, you followed me to u...",M,"(2.884654434066845, 4.161676646706587)",2.884654,4.161677
1,2,baby-i-m-yours,1970,bobby-vinton,Pop,baby i'm yours baby i'm yours and i'll be your...,M,"(2.7579447677209155, 3.9788732394366195)",2.757945,3.978873
2,3,raindrops-keep-fallin-on-my-head,1970,bobby-vinton,Pop,raindrops keep falling on my head and just lik...,M,"(2.8810139795340794, 4.156424581005586)",2.881014,4.156425
3,4,god-only-knows,1970,beach-boys,Rock,i may not always love you but long as there ar...,M,"(2.633959286127798, 3.8)",2.633959,3.800000
4,5,their-hearts-were-full-of-spring,1970,beach-boys,Rock,there's a story told of a very gentle boy and ...,M,"(2.833748767583312, 4.088235294117647)",2.833749,4.088235
...,...,...,...,...,...,...,...,...,...,...
119361,5798,country-ain-t-never-been-pretty,2015,cam,Other,her high heels on the red carpet her perfect h...,F,"(3.0036377824263023, 4.333333333333333)",3.003638,4.333333
119362,5799,runaway-train,2015,cam,Other,i always knew that you were hiding something f...,F,"(2.910268641665984, 4.198630136986301)",2.910269,4.198630
119363,5800,want-it-all,2015,cam,Other,"been a-sittin' 'round a coward, broken all tha...",F,"(2.5920384087405535, 3.7395209580838324)",2.592038,3.739521
119364,5829,pocket-full-of-keys,2015,dale-ann-bradley,Country,"he has one that locks the office, and there's ...",F,"(2.8799345245388444, 4.154867256637168)",2.879935,4.154867
