In [2]:
import string
import syllapy
import numpy as np
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
from collections import defaultdict
import pandas as pd
import os

sw = stopwords.words('english')
sentiment_analyzer = SentimentIntensityAnalyzer()

duration_file = open('song_durations.txt','r',encoding='utf-8')
duration_dict = {}
next(duration_file)
for line in duration_file:
    song_id, duration = line.split()
    song_id = int(song_id)
    duration = float(duration)
    if duration != 0:
        duration_dict[song_id]=duration
    else:
        duration_dict[song_id]=np.nan
    


In [3]:
def lyric_features(song,sw,sentiment_analyzer,song_id, durations):
    feature_dict={} #store our features in a dictionary
    
    duration = durations[song_id]
    
    song = song.lower().translate(str.maketrans('', '', string.punctuation))
    lines = song.split('\n')#split by line
    lines = [line for line in lines if line != ''] #remove blank lines
    number_lines=len(lines)
    if number_lines==0:
        number_lines=np.nan
    
    syllables = [syllapy.count(line) for line in lines] #syllable estimate by line
    syllables_per_line = sum(syllables)/number_lines
    syllables_variance = np.var(syllables)
    
    tokens = [line.split() for line in lines] #split line into tokens
    tokens_per_line = [len(line) for line in tokens]
    tokens=[token for line in tokens for token in line] #flatten our token list
    
    unique_tokens = set([token for token in tokens if token not in sw])
    number_tokens = len(tokens)
    if number_tokens==0:
        number_tokens=np.nan
    lexical_diversity = len(unique_tokens)/number_tokens
    
    sentiment = sentiment_analyzer.polarity_scores(song) #sentiment analysis
    sentiment = sentiment['compound']
    
    tokens_per_second = len(tokens)/duration
    syllables_per_second = sum(syllables)/duration
    
    #construct dictionary
    feature_dict['duration']=duration
    feature_dict['number_lines']=number_lines
    feature_dict['syllables_per_line']=syllables_per_line
    feature_dict['syllables_variance']=syllables_variance
    feature_dict['number_tokens']=number_tokens
    feature_dict['unique_tokens'] = len(unique_tokens)
    feature_dict['lexical_diversity']=lexical_diversity
    feature_dict['sentiment']=sentiment
    feature_dict['tokens_per_second']=tokens_per_second
    feature_dict['syllables_per_second']=syllables_per_second
    return feature_dict
    
    

In [9]:
directory = r'Lyrics\\'

song_dict={}
n=0
for filename in os.listdir(directory):
    song_id = int(filename.split('_')[0])
    song=open('Lyrics\\'+filename,'r',encoding='utf-8').read()
    song_features = lyric_features(song,sw,sentiment_analyzer,song_id,duration_dict)
    song_dict[song_id]=song_features
    
    if n%5000==0:
        print(str(n)+' songs completed')
        
    n+=1

0 songs completed


  return _methods._var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(
  ret = ret.dtype.type(ret / rcount)


5000 songs completed
10000 songs completed
15000 songs completed
20000 songs completed
25000 songs completed
30000 songs completed
35000 songs completed
40000 songs completed
45000 songs completed
50000 songs completed
55000 songs completed
60000 songs completed
65000 songs completed
70000 songs completed
75000 songs completed
80000 songs completed
85000 songs completed
90000 songs completed
95000 songs completed
100000 songs completed
105000 songs completed
110000 songs completed


In [17]:
feature_df = pd.DataFrame.from_dict(song_dict, orient='index')

feature_df.to_csv('song_features.csv') #make sure to use index_col =0 when reading in file