In [67]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import nltk
import re
import spacy
from gensim import corpora, models
import gensim
from gensim.matutils import hellinger
from scipy.spatial.distance import cosine
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Load spaCy's small English pipeline; later cells use it to read each token's lemma.
nlp = spacy.load('en_core_web_sm')

In [3]:
# Read the lyrics dataset (it contains songs in several languages) into a DataFrame.
# NOTE(review): this is an absolute, machine-specific Windows path, so the notebook
# will not run on another machine as-is — consider a configurable/relative data path.
df = pd.read_csv(r'C:\Users\trevo\NLP\data\lyrics-data.csv')

In [4]:
# Preview the first five rows of the unfiltered dataset.
df.head(5)

Unnamed: 0,ALink,SName,SLink,Lyric,language
0,/ivete-sangalo/,Arerê,/ivete-sangalo/arere.html,"Tudo o que eu quero nessa vida,\nToda vida, é\...",pt
1,/ivete-sangalo/,Se Eu Não Te Amasse Tanto Assim,/ivete-sangalo/se-eu-nao-te-amasse-tanto-assim...,Meu coração\nSem direção\nVoando só por voar\n...,pt
2,/ivete-sangalo/,Céu da Boca,/ivete-sangalo/chupa-toda.html,É de babaixá!\nÉ de balacubaca!\nÉ de babaixá!...,pt
3,/ivete-sangalo/,Quando A Chuva Passar,/ivete-sangalo/quando-a-chuva-passar.html,Quando a chuva passar\n\nPra quê falar\nSe voc...,pt
4,/ivete-sangalo/,Sorte Grande,/ivete-sangalo/sorte-grande.html,A minha sorte grande foi você cair do céu\nMin...,pt


In [5]:
# Keep only the rows whose `language` column equals 'en'.
df_en = df.loc[df['language'] == 'en']

In [6]:
# Preview the first five English rows (the original row labels are preserved by the filter).
df_en.head(5)

Unnamed: 0,ALink,SName,SLink,Lyric,language
69,/ivete-sangalo/,Careless Whisper,/ivete-sangalo/careless-whisper.html,I feel so unsure\nAs I take your hand and lead...,en
86,/ivete-sangalo/,Could You Be Loved / Citação Musical do Rap: S...,/ivete-sangalo/could-you-be-loved-citacao-musi...,"Don't let them fool, ya\nOr even try to school...",en
88,/ivete-sangalo/,Cruisin' (Part. Saulo),/ivete-sangalo/cruisin-part-saulo.html,"Baby, let's cruise, away from here\nDon't be c...",en
111,/ivete-sangalo/,Easy,/ivete-sangalo/easy.html,"Know it sounds funny\nBut, I just can't stand ...",en
140,/ivete-sangalo/,For Your Babies (The Voice cover),/ivete-sangalo/for-your-babies-the-voice-cover...,You've got that look again\nThe one I hoped I ...,en


In [8]:
# Number of English-language songs.
df_en.shape[0]

191814

In [9]:
# Print the raw lyric at position 2; the same song is used as the running example below.
print(df_en['Lyric'].iloc[2])

Baby, let's cruise, away from here
Don't be confused, the way is clear
And if you want it you got it forever
This is not a one night stand, baby

Let the music take your mind
Just release & you will find
You're gonna fly away
Glad you're goin' my way
I love it when we're cruisin' together
Music is played for love
Cruisin' is made for love
I love it when we're cruisin' together

Baby, tonight belongs to us
Everything's right, do what you must
And inch by inch we get closer & closer

To every little part of each other, ooh, baby, yeah, so

Let the music take your mind
Just release & you will find
You're gonna fly away
Glad you're going my way
I love it when we're cruisin' together
Music is played for love
Cruisin' is made for love
I love it when we're cruisin' together

You're gonna fly away

Cruise with me, baby


In [10]:
# Download the NLTK stop-word corpus (NLTK reports "already up-to-date" when it is present)
nltk.download('stopwords')

# Tokenizer used by normalize_document to split the cleaned text into tokens
wpt = nltk.WordPunctTokenizer()

# English stop-word list; normalize_document drops every token found in it
stop_words = nltk.corpus.stopwords.words('english')

def normalize_document(doc):
    """Normalize one raw lyric string for bag-of-words processing.

    Steps, in order: remove every character that is not an ASCII letter,
    whitespace or an apostrophe; lowercase; strip leading/trailing
    whitespace; tokenize with the module-level ``wpt`` tokenizer; drop
    tokens found in ``stop_words`` and purely numeric tokens; re-join the
    survivors with single spaces; finally remove each remaining apostrophe
    together with any whitespace that directly follows it.

    Parameters
    ----------
    doc : str
        Raw lyric text.

    Returns
    -------
    str
        The cleaned, space-separated document.
    """
    # The original call passed ``re.I | re.A`` as the fourth positional
    # argument of re.sub, which is ``count``, not ``flags``. The flag value
    # was therefore treated as a cap on the number of substitutions, so long
    # lyrics kept their later punctuation and digits. No flags are needed:
    # the character class already lists both letter cases, and leaving re.A
    # off keeps whitespace matching exactly as it effectively was before.
    doc = re.sub(r"[^a-zA-Z\s']", '', doc)
    doc = doc.lower()
    doc = doc.strip()

    # Tokenize document
    tokens = wpt.tokenize(doc)
    filtered_tokens = [token for token in tokens if token not in stop_words and not token.isdigit()]
    # Re-create the document from filtered tokens
    doc = ' '.join(filtered_tokens)

    # Apostrophes survived the first substitution; remove them now, along
    # with the whitespace that follows each one.
    doc = re.sub(r"'\s*", "", doc)
    return doc

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\trevo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [11]:
# Wrap normalize_document so it can be applied element-wise to a whole column of lyrics.
normalize_corpus = np.vectorize(normalize_document)
# Normalize every English lyric. Later cells index the result by position and call
# `.item()` on its elements, i.e. they treat it as a NumPy array of strings.
norm_corpus = normalize_corpus(df_en['Lyric'])

In [12]:
# Normalized version of the example song printed earlier (position 2).
print(norm_corpus[2])

baby let cruise away confused way clear want got forever one night stand baby let music take mind release find gonna fly away glad goin way love cruisin together music played love cruisin made love love cruisin together baby tonight belongs us everything right must inch inch get closer closer every little part ooh baby yeah let music take mind release find gonna fly away glad going way love cruisin together music played love cruisin made love love cruisin together gonna fly away cruise baby


In [14]:
# Run the example song through spaCy and list each token next to its lemma.
doc = nlp(str(norm_corpus[2]))

for token in doc:
    print(f"{token.text} {token.lemma_}")

baby baby
let let
cruise cruise
away away
confused confused
way way
clear clear
want want
got get
forever forever
one one
night night
stand stand
baby baby
let let
music music
take take
mind mind
release release
find find
gon gon
na na
fly fly
away away
glad glad
goin goin
way way
love love
cruisin cruisin
together together
music music
played play
love love
cruisin cruisin
made make
love love
love love
cruisin cruisin
together together
baby baby
tonight tonight
belongs belong
us we
everything everything
right right
must must
inch inch
inch inch
get get
closer close
closer close
every every
little little
part part
ooh ooh
baby baby
yeah yeah
let let
music music
take take
mind mind
release release
find find
gon gon
na na
fly fly
away away
glad glad
going go
way way
love love
cruisin cruisin
together together
music music
played play
love love
cruisin cruisin
made make
love love
love love
cruisin cruisin
together together
gon gon
na na
fly fly
away away
cruise cruise
baby baby


In [16]:
# Lemmatize the first 500 normalized songs. Each entry of `lemmatized_corpus` is the
# song's lemmas joined by single spaces, in the same order as `norm_corpus`.
lemmatized_corpus = []

# Stream the texts through nlp.pipe, spaCy's batch-processing API, rather than
# calling nlp() once per document. str() turns each NumPy string into a plain
# Python str before it reaches spaCy.
for doc in nlp.pipe(str(text) for text in norm_corpus[0:500]):
    lemmatized_corpus.append(" ".join(token.lemma_ for token in doc))

In [18]:
# Lemmatized version of the example song (position 2).
print(lemmatized_corpus[2])

baby let cruise away confused way clear want get forever one night stand baby let music take mind release find gon na fly away glad goin way love cruisin together music play love cruisin make love love cruisin together baby tonight belong we everything right must inch inch get close close every little part ooh baby yeah let music take mind release find gon na fly away glad go way love cruisin together music play love cruisin make love love cruisin together gon na fly away cruise baby


In [53]:
# Split each lemmatized lyric on whitespace to get one token list per song.
tokenized_corpus = [text.split() for text in lemmatized_corpus]

# Map tokens to integer ids and convert every song to a bag-of-words vector.
dictionary = corpora.Dictionary(tokenized_corpus)
corpus = [dictionary.doc2bow(text) for text in tokenized_corpus]

# Train a 20-topic LDA model; random_state is fixed so repeated runs agree.
lda_model = gensim.models.LdaModel(corpus, num_topics=20, id2word=dictionary, passes=35, random_state=42)

# Read the topic count back from the model instead of repeating the literal 20,
# so this loop cannot drift out of sync with the `num_topics` argument above.
for topic_id in range(lda_model.num_topics):
    topic = lda_model.print_topic(topic_id)
    print(f"Topic {topic_id}: {topic}")

Topic 0: 0.048*"work" + 0.047*"like" + 0.034*"halo" + 0.030*"girl" + 0.030*"diamond" + 0.025*"run" + 0.023*"shine" + 0.022*"bright" + 0.020*"get" + 0.020*"see"
Topic 1: 0.022*"diva" + 0.022*"think" + 0.021*"get" + 0.021*"feel" + 0.017*"turn" + 0.017*"like" + 0.016*"stand" + 0.015*"love" + 0.015*"make" + 0.014*"man"
Topic 2: 0.033*"top" + 0.026*"get" + 0.024*"love" + 0.024*"like" + 0.021*"put" + 0.019*"right" + 0.016*"one" + 0.016*"baby" + 0.015*"ooh" + 0.011*"see"
Topic 3: 0.037*"bad" + 0.025*"oh" + 0.024*"daddy" + 0.020*"say" + 0.020*"get" + 0.019*"one" + 0.016*"like" + 0.016*"shoot" + 0.016*"bum" + 0.013*"girl"
Topic 4: 0.040*"know" + 0.020*"love" + 0.019*"get" + 0.019*"wait" + 0.019*"thing" + 0.017*"never" + 0.017*"time" + 0.014*"want" + 0.014*"think" + 0.014*"go"
Topic 5: 0.030*"na" + 0.026*"love" + 0.021*"time" + 0.021*"like" + 0.019*"feel" + 0.018*"way" + 0.018*"get" + 0.016*"baby" + 0.015*"know" + 0.015*"say"
Topic 6: 0.097*"oh" + 0.048*"get" + 0.031*"go" + 0.022*"uh" + 0.017*"k

In [54]:
# Infer a topic mixture for every bag-of-words song. Each entry is a list of
# (topic_id, probability) pairs; as the outputs for songs 496 and 497 show, a song
# may list only a single topic rather than all 20.
song_topic_distribution = [lda_model[bow] for bow in corpus]

In [62]:
# Topic mixture of the song at position 496 (first song of the compared pair).
song_topic_distribution[496]

[(0, 0.99547607)]

In [63]:
# Topic mixture of the song at position 497 (second song of the compared pair).
song_topic_distribution[497]

[(0, 0.9966545)]

In [65]:
# Compare the topic mixtures of the songs at positions 496 and 497
# (named song_0 / song_1 here) using the Hellinger distance.
song_0_dist, song_1_dist = (song_topic_distribution[idx] for idx in (496, 497))

hellinger_distance = hellinger(song_0_dist, song_1_dist)
print(hellinger_distance)

0.0004174645345004054


In [41]:
# Raw lyric of the English song at position 0, for manual inspection.
print(df_en['Lyric'].iloc[0])

I feel so unsure
As I take your hand and lead you to the dance floor
As the music dies, something in your eyes
Calls to mind a silver screen
And all those sad goodbyes

I'm never gonna dance again
Guilty feet have got no rhythm
Though it's easy to pretend
I know you're not a fool

Should've known better than to cheat a friend
And waste the chance that I've been given
So I'm never gonna dance again
The way I danced with you

Time can never mend
The careless whispers of a good friend
To the heart and mind
Ignorance is kind
There's no comfort in the truth
Pain is all you'll find

I'm never gonna dance again
Guilty feet have got no rhythm
Though it's easy to pretend
I know you're not a fool

Should've known better than to cheat a friend
And waste this chance that I've been given
So I'm never gonna dance again
The way I danced with you

Never without your love

Tonight the music seems so loud
I wish that we could lose this crowd
Maybe it's better this way
We'd hurt each other with the thing

In [42]:
# Raw lyric of the English song at position 1, for manual inspection.
print(df_en['Lyric'].iloc[1])

Don't let them fool, ya
Or even try to school, ya! Oh, no!
We've got a mind of our own
So go to hell if what you? re thinking is not right!
Love would never leave us alone
A-yin the darkness there must come out to light

Could you be loved and be loved?
Could you be loved and be loved?

Don't let them change ya, oh!
Or even rearrange ya! Oh, no!
We've got a life to live
They say: only, only
Only the fittest of the fittest shall survive
Stay alive! Oh!

Could you be loved and be loved?
Could you be loved, wo now! And be loved?

Could you be
Could you be
Could you be loved?

Say something!

Se ligue na ternura
Se ligue no amor
Se ligue na ternura
Se ligue na cor
Se ligue na alegria
Se ligue no prazer
Se ligue, fique atento, se ligue, fique astral

Could you be loved and be loved?


In [60]:
# Raw lyric at position 497 — one of the two songs compared by Hellinger distance.
print(df_en['Lyric'].iloc[497])

Shine bright like a diamond
(C'mon)
Shine bright like a diamond
(Yeah)

You glisten so beautiful, priceless
Listen to me, I need you to know
How you can change my whole world
New life, new love, I'm a new girl
Guess you did what they coudn't
Stayed when I pushed away and they wouldn't
Said "trust" and I thought I shouldn't
Carved your initials in my heart like wood and
Now I'm floating through air
Brightest in the whole sky, we both there
Freedom in the universe, now is so clear
I waited a lifetime
Now you right here
And I'm never gonna give it up
Shinning like a million spot lights, live it up
Blinding, diamonds, forever that's us
And I can never get enough
Yep

Find light in the beautiful sea
I choose to be happy
You and I, you and I
We're like diamonds in the sky

You're a shooting star I see
A vision of ecstasy
When you hold me
I'm alive
We're like diamonds in the sky

I knew that we'd become one right away
Oh, right away
At first sight I felt the energy of sun rays
I saw the life 

In [61]:
# Raw lyric at position 496 — the other song of the compared pair.
print(df_en['Lyric'].iloc[496])

Shine bright like a diamond
Shine bright like a diamond

Find light in the beautiful sea
I choose to be happy
You and I, you and I
We're like diamonds in the sky

You're a shooting star I see
A vision of ecstasy
When you hold me, I'm alive
We're like diamonds in the sky

I knew that we'd become one right away
Oh, right away
At first sight, I felt the energy of sunrays
I saw the life inside your eyes

So shine bright, tonight, you and I
We're beautiful, like diamonds in the sky
Eye to eye, so alive
We're beautiful, like diamonds in the sky

Shine bright like a diamond
Shine bright like a diamond
Shine bright like a diamond
We're beautiful, like diamonds in the sky

Shine bright like a diamond
Shine bright like a diamond
Shine bright like a diamond
We're beautiful, like diamonds in the sky

Palms rise to the universe
As we moonshine and molly
Feel the warmth, we'll never die
We're like diamonds in the sky

You're a shooting star I see
A vision of ecstasy
When you hold me, I'm alive
We're

In [68]:
# Fetch the VADER lexicon (NLTK reports "already up-to-date" when present) and build
# the analyzer that get_sentiment_score reads as a module-level object.
nltk.download('vader_lexicon')
analyzer = SentimentIntensityAnalyzer()

def get_sentiment_score(lyrics):
    """Return the 'compound' entry of the analyzer's polarity scores for `lyrics`.

    Uses the module-level ``analyzer`` object.
    """
    return analyzer.polarity_scores(lyrics)['compound']

# Compound sentiment score for each song of the pair (positions 496 and 497).
# NOTE(review): the scores are computed on the lemmatized, stop-word-stripped text
# rather than the raw lyrics; if the analyzer relies on punctuation or negation
# words, scoring the raw text may be more faithful — confirm.
sentiment1, sentiment2 = (get_sentiment_score(lemmatized_corpus[idx]) for idx in (496, 497))

# TF-IDF vectors fitted on just these two lyrics, followed by pairwise cosine similarity.
tfidf_vectorizer = TfidfVectorizer()
lyrics_matrix = tfidf_vectorizer.fit_transform([lemmatized_corpus[idx] for idx in (496, 497)])
lyrics_similarity = cosine_similarity(lyrics_matrix)

print("Sentiment Score for Song 1:", sentiment1)
print("Sentiment Score for Song 2:", sentiment2)
# Entry [0, 1] is the similarity between the first and the second lyric.
print("Cosine Similarity between Lyrics:", lyrics_similarity[0, 1])

Sentiment Score for Song 1: 0.9998
Sentiment Score for Song 2: 0.9999
Cosine Similarity between Lyrics: 0.9865365982984073


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\trevo\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [None]:
## Preprocessing
# Tokenize
# Lowercase
# punctuation removal
# number removal
# Stop word removal

## Additional Preprocessing
# Lemmatization and/or stemming
# Analyze word frequencies to determine whether high-frequency, low-value words need to be removed
# Stop word removal

## Data Check
# Explore word stats
# Everything is clean

## NLP Processing
# Sentiment Analysis
# LDA
# POS tagging/Similarity

## Front End
# Django or Flask additions

Test
ANOTHER TEST


FINAL TEST!