In [1]:
import string
import glob
import re
import nltk
import random
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import KFold
from sklearn.cross_validation import cross_val_score



In [2]:
stop_words = set(stopwords.words('english'))
port = PorterStemmer()

In [3]:
# Dataset file lists, 4 total, one for each emotion

# Train Data Ingestion
happy_filelist = glob.glob('./data/Happy/Train/happy*.txt')
angry_filelist = glob.glob('./data/Angry/Train/angry*.txt')
relaxed_filelist = glob.glob('./data/Relaxed/Train/relaxed*.txt')
sad_filelist = glob.glob('./data/Sad/Train/sad*.txt')

# Test Data Ingestion
happy_filelist2 = glob.glob('./data/Happy/Test/happy*.txt')
angry_filelist2 = glob.glob('./data/Angry/Test/angry*.txt')
relaxed_filelist2 = glob.glob('./data/Relaxed/Test/relaxed*.txt')
sad_filelist2 = glob.glob('./data/Sad/Test/sad*.txt')

# Combine Train and Test Data For the Lexicon-Based Analysis
happy_filelist = happy_filelist + happy_filelist2
angry_filelist = angry_filelist + angry_filelist2
relaxed_filelist = relaxed_filelist + relaxed_filelist2
sad_filelist = sad_filelist + sad_filelist2

In [4]:
# read() function returns:
# lyrics(text tokenized, and lowercased without stopwords per song) and
# all_words(the lyrics output of all the songs as one list)
# based on the input of a file list and emotion tag

def read(filelist, tag):
    lyrics = []
    all_words = []
    
    for f in filelist:
        try:
            with open(f,'r') as file:
                song = file.read()
                file.close()
                #added
                #song = song.decode("utf-8-sig")
                song = re.sub(r"(\\n|\\u....|\t)", "", song)
                song = re.sub(r"(\[\d\d:\d\d\.\d\d\])","",song)
                song = song.lower()
                #song = nltk.word_tokenize(song)
                song = nltk.word_tokenize(song.translate(str.maketrans('','',string.punctuation)))
                song = [w for w in song if not w in string.punctuation]
                song = [w for w in song if not w in stop_words]
                song = [w for w in song if w != "'"]
                #song = [w for w in song if not "\ufeff" in w]
                #song = [w for w in song if w != "'"]
                song_tag = (song, tag)
                lyrics.append(song_tag)
                
                for word in song:
                    all_words.append(word)
        except:
            break
    return lyrics, all_words

In [5]:
# This function finds the features for a song

def find_features(song):
    words = set(song)
    features = {}
    for w in word_features:
        features[w] = (w in words)

    return features

In [6]:
# Create lyrics list for the 4 emotions seperately, 
# the entire lyric database, and all_words(a single list of all lyric tokens)

happy_lyrics, happy_words = read(happy_filelist, 'happy')
angry_lyrics, angry_words = read(angry_filelist, 'angry')
relaxed_lyrics, relaxed_words = read(relaxed_filelist, 'relaxed')
sad_lyrics, sad_words = read(sad_filelist, 'sad')

data = happy_lyrics + angry_lyrics + relaxed_lyrics + sad_lyrics
all_words = happy_words + angry_words + relaxed_words + sad_words

all_words = nltk.FreqDist(all_words)
word_features = list(all_words.keys())[:100] #i changed this to 25 from 100

In [7]:
random.shuffle(data)

featuresets = [(find_features(song), tag) for (song, tag) in data]

train_set = featuresets[:680]
test_set = featuresets[681:]

model = nltk.NaiveBayesClassifier.train(train_set)
                                    
print("Classifier accuracy percent:",(nltk.classify.accuracy(model, test_set))*100)

Classifier accuracy percent: 41.23711340206185


In [8]:
#                              #
#                              #
#         Lexicon-Based        #
#      Sentiment Analysis      #
#                              #
#                              #


# Classifies the emotion of song lyrics based on the similarity of lyrics
# and that class's lexicon.  The lexicon is created with the seed word of 
# the class and is increased in size with synonyms.  Each word in a given
# song lyric is compared to each emotion synonym in the class and the sum
# is taken.  This sum of emotion similarity is calculated for all 4 emotions
# for each test data song.  The maximum of the 4 is chosen as the class
# label for that song. Uses wordnet to build the lexicons.

In [9]:
# wordnet
from nltk.corpus import wordnet as wn

In [10]:
# Find synonyms and antonyms of the four emotions (in bag of words format)

# Relaxed
synonymsR = []
antonymsR = []

for syn in wn.synsets("relaxed"):
    for l in syn.lemmas():
        synonymsR.append(l.name())
        if l.antonyms():
            antonymsR.append(l.antonyms()[0].name())

#print(set(synonymsR))
#print(set(antonymsR))


# Happy
synonymsH = []
antonymsH = []

for syn in wn.synsets("happy"):
    for l in syn.lemmas():
        synonymsH.append(l.name())
        if l.antonyms():
            antonymsH.append(l.antonyms()[0].name())


# Happy: adding "pleasant" to increase the lexicon
for syn in wn.synsets("pleasant"):
    for l in syn.lemmas():
        synonymsH.append(l.name())

# Happy: adding "laugh" to increase the lexicon size
for syn in wn.synsets("laugh"):
    for l in syn.lemmas():
        synonymsH.append(l.name())
        
# Sad
synonymsS = []
antonymsS = []

for syn in wn.synsets("sad"):
    for l in syn.lemmas():
        synonymsS.append(l.name())
        if l.antonyms():
            antonymsS.append(l.antonyms()[0].name())

#print(set(synonymsS))
#print(set(antonymsS))


# Angry
synonymsA = []
antonymsA = []

for syn in wn.synsets("angry"):
    for l in syn.lemmas():
        synonymsA.append(l.name())
        if l.antonyms():
            antonymsA.append(l.antonyms()[0].name())

# Angry: adding "hate" to increase the lexicon
for syn in wn.synsets("hate"):
    for l in syn.lemmas():
        synonymsA.append(l.name())

#print(set(synonymsA))
#print(set(antonymsA))

In [24]:
#synonymsA[0] # gives the 1st element in the list

# Get unique values only
synonymsA = set(synonymsA) 
synonymsR = set(synonymsR)
synonymsH = set(synonymsH)
synonymsS = set(synonymsS)

# Convert sets back to lists
synonymsA = list(synonymsA) 
synonymsR = list(synonymsR)
synonymsH = list(synonymsH)
synonymsS = list(synonymsS)

#print(synonymsH)
#j = len(synonymsH) # need to loop through synonyms from 1 to j for Angry
#print(j)

#synonymsH[0]
print(synonymsH)
print(synonymsR)
print(synonymsS)
print(synonymsA)

#sum = 2
#sum/len(synonymsH)

['gag', 'express_mirth', 'jest', 'well-chosen', 'glad', 'laughter', 'felicitous', 'jape', 'happy', 'laugh', 'express_joy', 'joke', 'pleasant']
['slack_up', 'unstrain', 'unwind', 'slack', 'loosen_up', 'slacken', 'unlax', 'relax', 'unbend', 'loosen', 'loose', 'relaxed', 'slow_down', 'make_relaxed', 'decompress']
['lamentable', 'pitiful', 'sorry', 'distressing', 'sad', 'deplorable']
['furious', 'hatred', 'angry', 'wild', 'hate', 'tempestuous', 'detest', 'raging']


In [25]:
#                              #
#                              #
#        ENTER TEST DATA       #
#                              #
#                              #

#testdata = test_setRelaxed
#testdata = alldocs
testdata = data
testdata[1][1]

'happy'

In [26]:
# create a dataframe to store the test data's emotions similarity sum 
# scores for each of the four emotions. Rows = songs and 4 cols = emotion, one is for max class label 
# similarity sums. df#rows = #rows in the test data.
import pandas, numpy

#currently, the test input is test_setHappy
#df1 = pandas.DataFrame(0, index=np.arange(len(test_setHappy)), columns = ['Happy', 'Relaxed', 'Sad', 'Angry', 'MaxClass', 'ActualClass'])
df1 = pandas.DataFrame(index=np.arange(len(testdata)), columns = ['happy', 'relaxed', 'sad', 'angry', 'MaxClass', 'ActualClass'])
#df1.iloc[0,0] = 2  #19 rows: 0-18, cols: 0-3, 0=happy 1=relaxed 2=sad 3=angry, 4=predictedclass 5=actualclass
#df1

# to find the class of the song, we take the max of the row

In [27]:
# this is the class label: testdata[songIndex][1]
# need to add this to the matrix for each index of testdata[index][1] at df1[index, 5]
  
# add "actual" class labels to the matrix for each song
for songIndex in range(len(testdata)):
    df1.iloc[songIndex, 5] = testdata[songIndex][1]

#df1

In [28]:
# Compare each lyric word in a song to every word in the Synonym list 
#  for each emotion

# HAPPY COLUMN CALCULATIONS

#this loop will assess every song
songind = 0  #j will be used as the row value for inputting sim sums into the df

x = len(testdata)  #number of items in the test set

#loop through every song:
for i in range(x):
    #songind+=1
    firstsong = testdata[songind][0]

    sum = 0  # initialize the sum value, per song, per emotion. If the simw1w2 is not 0, i.e. not "None", we want to keep track of it

    for w in firstsong:
        #if w1 is not null, then we do the rest of this cell....
        if wn.synsets(w):
            w1 = wn.synsets(w)[0]
            #print('w1')
            #print(w1)
            # For each of the synonyms of Happy, if similarity b/t w1 and w2 is 
            for j in range(len(synonymsH)):
                w2 = synonymsH[j]
                w2 = wn.synsets(w2)[0]
                #print('w2')
                #print(w2)
                w1w2sim = w1.wup_similarity(w2)
                #print(w1w2sim)
                if w1w2sim != None:
                    if w1w2sim > 0.7:
                        sum += 1
                        sum = sum / len(synonymsH) #normalize the sum by dividing by the number of synonymns
                        sum = sum / len(firstsong) #normalize the sum by dividing by the number of words in the song
                        #print('sum')
                        #print(sum)   # this is the sum for the first song's happy similarities
    df1.iloc[songind,0] = sum
    songind += 1


In [29]:
# RELAXED CALCULATIONS

#this loop will assess every song
songind = 0  #j will be used as the row value for inputting sim sums into the df

for i in range(x):
    #songind+=1
    firstsong = testdata[songind][0]

    sum = 0  # initialize the sum value, per song, per emotion. If the simw1w2 is not 0, i.e. not "None", we want to keep track of it

    for w in firstsong:
        #if w1 is not null, then we do the rest of this cell....
        if wn.synsets(w):
            w1 = wn.synsets(w)[0]
            #print('w1')
            #print(w1)
            # For each of the synonyms of Relaxed, if similarity b/t w1 and w2 is 
            for j in range(len(synonymsR)):
                w2 = synonymsR[j]
                w2 = wn.synsets(w2)[0]
                #print('w2')
                #print(w2)
                w1w2sim = w1.wup_similarity(w2)
                #print(w1w2sim)
                if w1w2sim != None:
                    if w1w2sim > 0.7:
                        sum += 1
                        sum = sum / len(synonymsR) #normalize the sum by dividing by the number of synonymns
                        sum = sum / len(firstsong) #normalize the sum by dividing by the number of words in the song
                        #print('sum')
                        #print(sum)   # this is the sum for the first song's Relaxed similarities
    df1.iloc[songind,1] = sum
    songind += 1

In [30]:
# SAD CALCULATIONS

#this loop will assess every song
songind = 0  #j will be used as the row value for inputting sim sums into the df

for i in range(x):
    #songind+=1
    firstsong = testdata[songind][0]

    sum = 0  # initialize the sum value, per song, per emotion. If the simw1w2 is not 0, i.e. not "None", we want to keep track of it

    for w in firstsong:
        #if w1 is not null, then we do the rest of this cell....
        if wn.synsets(w):
            w1 = wn.synsets(w)[0]
            #print('w1')
            #print(w1)
            # For each of the synonyms of Sad, if similarity b/t w1 and w2 is 
            for j in range(len(synonymsS)):
                w2 = synonymsS[j]
                w2 = wn.synsets(w2)[0]
                #print('w2')
                #print(w2)
                w1w2sim = w1.wup_similarity(w2)
                #print(w1w2sim)
                if w1w2sim != None:
                    if w1w2sim > 0.7:
                        sum += 1
                        sum = sum / len(synonymsS) #normalize the sum by dividing by the number of synonymns
                        sum = sum / len(firstsong) #normalize the sum by dividing by the number of words in the song
                        #print('sum')
                        #print(sum)   # this is the sum for the first song's Sad similarities
    df1.iloc[songind,2] = sum
    songind += 1

In [31]:
# ANGRY CALCULATIONS

#this loop will assess every song
songind = 0  #j will be used as the row value for inputting sim sums into the df

x = len(testdata)
for i in range(x):
    #songind+=1
    firstsong = testdata[songind][0]

    sum = 0  # initialize the sum value, per song, per emotion. If the simw1w2 is not 0, i.e. not "None", we want to keep track of it

    for w in firstsong:
        #if w1 is not null, then we do the rest of this cell....
        if wn.synsets(w):
            w1 = wn.synsets(w)[0]
            #print('w1')
            #print(w1)
            # For each of the synonyms of Angry, if similarity b/t w1 and w2 is 
            for j in range(len(synonymsA)):
                w2 = synonymsA[j]
                w2 = wn.synsets(w2)[0]
                #print('w2')
                #print(w2)
                w1w2sim = w1.wup_similarity(w2)
                #print(w1w2sim)
                if w1w2sim != None:
                    if w1w2sim > 0.7:
                        sum += 1
                        sum = sum / len(synonymsA) #normalize the sum by dividing by the number of synonymns
                        sum = sum / len(firstsong) #normalize the sum by dividing by the number of words in the song
                        #print('sum')
                        #print(sum)   # this is the sum for the first song's Angry similarities
    df1.iloc[songind,3] = sum
    songind += 1

In [32]:
# Find the max similarity value of each row and assign the class of that
# value as the Maximum Class Output class label

indx = 0

for i in range(x):
    #print(df1.iloc[indx].argmax())
    df1.iloc[indx, 4] = df1.iloc[indx, 0:3].argmax()
    #df1.iloc[indx, 4] = df1.iloc[indx].argmax()
    
    indx += 1

In [33]:
df1  #This is final collection of calculations and predictions

Unnamed: 0,happy,relaxed,sad,angry,MaxClass,ActualClass
0,0,0,0,0.000571102,happy,happy
1,0,0,0,0.000806452,happy,happy
2,0,0,0,0.000498008,happy,happy
3,0.000846024,0,0,0.00137552,happy,sad
4,0.000460617,0,0,0.000749064,happy,sad
5,0,0,0,0.000919118,happy,happy
6,0.000740192,0,0,0.00120337,happy,relaxed
7,0.000474834,0.000411523,0,0.000772201,happy,happy
8,0.000484027,0.000419463,0,0.000786782,happy,happy
9,0,0,0,0.00186915,happy,relaxed


In [34]:
# Lexicon Evaluation

from nltk.metrics import precision, recall, accuracy, ConfusionMatrix

preds = list(df1.iloc[:, 4])    #predicted class labels to feed into the confusion matrix
actuals = list(df1.iloc[:, 5])  #actual class labels

#print(list(preds))
print(ConfusionMatrix(actuals, preds))

        |           r     |
        |           e     |
        |   a   h   l     |
        |   n   a   a     |
        |   g   p   x   s |
        |   r   p   e   a |
        |   y   y   d   d |
--------+-----------------+
  angry |  <.>140  12  20 |
  happy |   .<181> 13  12 |
relaxed |   . 181  <9> 11 |
    sad |   . 162  14 <23>|
--------+-----------------+
(row = reference; col = test)



In [67]:
# DEPRECATED
# Create train and test sets for each emotion

random.shuffle(happy_lyrics)
train_setHappy = happy_lyrics[:80]
test_setHappy = happy_lyrics[81:]

random.shuffle(angry_lyrics)
train_setAngry = angry_lyrics[:80]
test_setAngry = angry_lyrics[81:]

random.shuffle(relaxed_lyrics)
train_setRelaxed = relaxed_lyrics[:80]
test_setRelaxed = relaxed_lyrics[81:]

random.shuffle(sad_lyrics)
train_setSad = sad_lyrics[:80]
test_setSad = sad_lyrics[81:]

training_docs = train_setHappy + train_setAngry + train_setRelaxed + train_setSad
testing_docs = test_setHappy + test_setAngry + test_setRelaxed + test_setSad

alldocs = training_docs + testing_docs
#print(len(train_setHappy))