In [104]:
#Lyric Sentiment Breakdown

In [105]:
import nltk
from nltk.corpus import stopwords 
from nltk.stem.wordnet import WordNetLemmatizer
from stop_words import get_stop_words
nltk.download('wordnet')
import string
stop = get_stop_words('en')
exclude = set(string.punctuation) 
lemma = WordNetLemmatizer()
import pandas as pd
import gensim
from gensim import corpora
import nltk.classify.util
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import names


[nltk_data] Downloading package wordnet to /home/jovyan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [106]:
DATA_PATH = '../data/'
fname = DATA_PATH + 'billboard_1970_current.csv'
df = pd.read_csv(fname,sep='@')

In [107]:
def clean(doc):
    stop_free = " ".join([i for i in doc.lower().split() if i not in stop])
    punc_free = ''.join(ch for ch in stop_free if ch not in exclude)
    normalized = " ".join(lemma.lemmatize(word) for word in punc_free.split())
    normalized_split = normalized.split()
    # deviding the lyric into verse 1 and verse 2
    return [normalized_split[:int(len(normalized_split)/2)],normalized_split[int(len(normalized_split)/2):]]

In [108]:
def corpusExtraction(lyric):
    tmp = clean(lyric)
    # Creating the term dictionary of our courpus, where every unique term is assigned an index. 
    dictionary = corpora.Dictionary(tmp)

    # Converting list of documents (corpus) into Document Term Matrix using dictionary prepared above.
    doc_term_matrix = [dictionary.doc2bow(doc) for doc in tmp]

    # Creating the object for LDA model using gensim library
    Lda = gensim.models.ldamodel.LdaModel

    # Running and Trainign LDA model on the document term matrix.
    ldamodel = Lda(doc_term_matrix, num_topics=3, id2word = dictionary, passes=50)
    print(ldamodel.print_topics(num_topics=4, num_words=3))
    
# https://www.analyticsvidhya.com/blog/2016/08/beginners-guide-to-topic-modeling-in-python/

In [109]:
def topicModelling(artist, title):
    lyric = df.loc[(df.artist == artist) & (df.title == title),['lyric']].values[0][0]
    corpusExtraction(lyric)

In [110]:
topicModelling('Michael Jackson', 'Man In The Mirror')

[(0, '0.072*"na" + 0.062*"change" + 0.054*"make"'), (1, '0.009*"widow" + 0.009*"that" + 0.009*"blind"'), (2, '0.009*"one" + 0.009*"street" + 0.009*"widow"')]


In [111]:
positive_vocab = [ 'awesome', 'outstanding', 'fantastic', 'terrific', 'good', 'nice', 'great', ':)', 'heart' ]
negative_vocab = [ 'bad', 'terrible','useless', 'hate', ':(' ]
neutral_vocab = [ 'movie','the','sound','was','is','actors','did','know','words','not' ]

In [112]:
def word_feats(words):
    return dict([(word, True) for word in words])

def sentimentAnalysisHelper(lyric):
    # train positive and negative features
    positive_features = [(word_feats(pos), 'pos') for pos in positive_vocab]
    negative_features = [(word_feats(neg), 'neg') for neg in negative_vocab]
    neutral_features = [(word_feats(neu), 'neu') for neu in neutral_vocab]
    
    train_set = negative_features + positive_features + neutral_features
    classifier = NaiveBayesClassifier.train(train_set)

    # setting up lyric materials
    lyric = lyric.lower()
    words = lyric.split(' ')
    
    # Sentiment analysis
    neg = 0
    pos = 0
    for word in words:
        classResult = classifier.classify( word_feats(word))
        if classResult == 'neg':
            neg = neg + 1
        if classResult == 'pos':
            pos = pos + 1

    print('Positive: ' + str(round(float(pos)/len(words)*100,2)) + "%")
    print('Negative: ' + str(round(float(neg)/len(words)*100,2)) + "%")

In [113]:
def sentimentAnalysis(artist, title):
    lyric = df.loc[(df.artist == artist) & (df.title == title),['lyric']].values[0][0]
    sentimentAnalysisHelper(lyric)

In [114]:
sentimentAnalysis('Michael Jackson', 'Man In The Mirror')

Positive: 69.18%
Negative: 7.06%


In [115]:
topicModelling('Michael Jackson', 'Man In The Mirror')

[(0, '0.072*"na" + 0.062*"change" + 0.054*"make"'), (1, '0.009*"somebody" + 0.009*"disregard" + 0.009*"deeply"'), (2, '0.009*"home" + 0.009*"soul" + 0.009*"somebody"')]
