In [167]:
import nltk, re, pprint
import pandas as pd
import numpy as np

In [168]:
corpus = nltk.corpus.nps_chat

corpus.ensure_loaded()
posts = corpus.xml_posts()

#Training to establish dialogue acts:

d_acts = []

for p in posts:
    if p.get('class') not in d_acts:
            d_acts.append(p.get('class'))
sorted(d_acts)

#### Text Preprocessing:

Import the dataset:

In [169]:
data = pd.read_csv("D:\ML\Datasets\labeled_lyrics_cleaned.csv")

In [170]:
data.head()

Unnamed: 0.1,Unnamed: 0,artist,seq,song,label
0,0,Elijah Blake,"No, no\r\nI ain't ever trapped out the bando\r...",Everyday,0.63
1,1,Elijah Blake,"The drinks go down and smoke goes up, I feel m...",Live Till We Die,0.63
2,2,Elijah Blake,She don't live on planet Earth no more\r\nShe ...,The Otherside,0.24
3,3,Elijah Blake,"Trippin' off that Grigio, mobbin', lights low\...",Pinot,0.54
4,4,Elijah Blake,"I see a midnight panther, so gallant and so br...",Shadows & Diamonds,0.37


Rename columns:

In [171]:
#data = data.round(decimals=2)
#data.drop(labels="Unnamed: 0", axis=1, inplace=True)
data.rename(columns={"Unnamed: 0": "index", "seq": "lyrics", "label": "valency"}, inplace=True)

In [172]:
data.head()

Unnamed: 0,index,artist,lyrics,song,valency
0,0,Elijah Blake,"No, no\r\nI ain't ever trapped out the bando\r...",Everyday,0.63
1,1,Elijah Blake,"The drinks go down and smoke goes up, I feel m...",Live Till We Die,0.63
2,2,Elijah Blake,She don't live on planet Earth no more\r\nShe ...,The Otherside,0.24
3,3,Elijah Blake,"Trippin' off that Grigio, mobbin', lights low\...",Pinot,0.54
4,4,Elijah Blake,"I see a midnight panther, so gallant and so br...",Shadows & Diamonds,0.37


Check summary statistics:

In [173]:
data.describe(include='all')

Unnamed: 0,index,artist,lyrics,song,valency
count,158353.0,158353,158353,158353,158353.0
unique,,14691,135991,99031,
top,,Elvis Presley,"Somewhere over the rainbow, way up high\r\nThe...",Have Yourself a Merry Little Christmas,
freq,,821,167,162,
mean,79176.0,,,,0.49
std,45712.72,,,,0.25
min,0.0,,,,0.0
25%,39588.0,,,,0.29
50%,79176.0,,,,0.48
75%,118764.0,,,,0.69


Remove cover songs and format decimal places for summary statistics display:

In [174]:
#data = data.drop_duplicates(subset=['lyrics', 'song'])
data.sort_values(by=['song', 'valency'], ascending=False, inplace=True) # to keep highest value valency 
data = data.drop_duplicates(subset='lyrics')
pd.set_option('display.float_format', lambda x: '%.2f' % x) # round everything to 2 decimal places

In [175]:
data.describe(include='all')

Unnamed: 0,index,artist,lyrics,song,valency
count,135991.0,135991,135991,135991,135991.0
unique,,10777,135991,95714,
top,,Elvis Presley,Something's happened what's gone wrong\r\nGoin...,Intro,
freq,,753,1,127,
mean,79291.03,,,,0.5
std,45624.37,,,,0.25
min,0.0,,,,0.0
25%,39957.5,,,,0.3
50%,79445.0,,,,0.5
75%,118768.5,,,,0.7


Spot-checking three random entries to confirm data integrity:

In [176]:
data = data.reset_index() 
data

Unnamed: 0,level_0,index,artist,lyrics,song,valency
0,54911,54911,Simon & Milo,"Hello, this is Stacy, the computer\nGood morni...",www.nevergetoveryou,0.68
1,82479,82479,Hippo Campus,See how the western kids\r\nHave silicon insid...,western kids,0.52
2,82478,82478,Hippo Campus,"Wisconsin pines, collaborating with the day gl...",way it goes,0.52
3,82477,82477,Hippo Campus,"I see meaning where you don't, where you don't...",vines,0.66
4,82476,82476,Hippo Campus,My thoughts are a battlefield of sub-surreal a...,vacation,0.55
...,...,...,...,...,...,...
135986,109667,109667,The Beach Boys,"Hi, this is Al this scene takes place at a typ...","""Cassius"" Love Vs. ""Sonny"" Wilson",0.49
135987,55096,55096,Simple Minds,"Cry cry cry\r\nCry like a baby\r\n""see"" Moon ""...","""C"" Moon Cry Like a Baby",0.77
135988,41838,41838,The Blues Brothers,Caught a ride into South Dakota\r\nWith two gi...,"""B"" Movie Box Car Blues",0.50
135989,81217,81217,The Gaslight Anthem,Have you seen my hands?\nJust look at 'em shak...,"""45""",0.42


### Attempting sentiment classification using Vader:

In [177]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk import tokenize

Loading a song lyrics:

In [236]:
lyrics = data.at[202, 'lyrics'].strip()
#text = tokenize.sent_tokenize(lyrics)[0]
print(lyrics)

Tonight I hold to nothing 
but the feelings in my soul
My heart overflows with emotion 
I just can't control
There's someone you are going to 
But tonight 
I just can't tell right from wrong

Let me be yours until tomorrow 
Let me be yours until tomorrow
Give me one night of your life
Just be mine 'til the dawn

Tomorrow the real world 
Will come crashing down on me
I know I must lose you and that's the way it has to be
But tonight I see no boundries
So I beg you before my chance is gone

Let me be yours until tomorrow 
Let me be yours until tomorrow
Give me one night of your life
So I can go on.

Till the dawn's intrusion 
Ends this sweet illusion
Make believe you're mine, love
This is not goodbye, love,
All I ask is one night, till the morning sunlight
Won't you stay here in my arms
And baby, let me be yours until tomorrow
Let me be yours until tomorrow.


#### Analysing valency and extracting compund score:

In [253]:
def valency(text):
    sid = SentimentIntensityAnalyzer()
    ss = sid.polarity_scores(text)
    #ss.pop('compound')
    compound_score = ss.get('compound')
    if compound_score > 0.3 and compound_score <= 1:
        valency = 'positive'
    elif compound_score >= -1 and compound_score < -0.3:
        valency = 'negative'
    else:
        valency = 'neutral'
    return valency, compound_score
    
valency(lyrics)

('positive', 0.8847)

User situtation test:

In [255]:
s = "Today is finally my day off! The weather is amazing and I'm going to the beach"
valency(s)

('positive', 0.6239)

If max score required instead:

In [None]:
#max_value = max(ss.values())
#max_value
#max_key = [k for k, v in ss.items() if v == max_value][0]
#max_key

### Attempting to assign one of the pre-defined categories to lyrics: