## Training song embeddings with Doc2Vec 

In [1]:
# Standard library
import re
from collections import Counter

# Third-party: numerics / data handling
import numpy as np
import pandas as pd

# Third-party: NLP tooling (tokenizer, stop words, Doc2Vec)
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

# Uncomment on first run if the NLTK stop-word corpus is not installed yet:
# nltk.download('stopwords')

# English stop words, kept as a set for fast membership checks.
remove_these = set(stopwords.words('english'))

In [2]:
# Anything that is not a lowercase ASCII letter is stripped from each token.
_NON_LETTERS = re.compile('[^a-z]')

# Tokens containing any of these substrings are dropped. Presumably these are
# lyric-sheet section labels / performer names -- verify against the raw data.
# NOTE(review): matching is by *substring*, so words such as "tomorrow",
# "bottom", "universe" or "remark" are discarded as well.
_DEFAULT_BANNED_SUBSTRINGS = ('chorus', 'verse', 'tom', 'mark', 'travis')


def list_of_words(i, data, stop_words=None,
                  banned_substrings=_DEFAULT_BANNED_SUBSTRINGS):
    """Return the cleaned lyrics of song ``i`` as one space-separated string.

    The text found at ``data['text'][i]`` is split into lines, stripped,
    lower-cased and split on single spaces. Every token then has all
    characters other than ``a``-``z`` removed and is kept only if it

    * contains none of ``banned_substrings``,
    * is longer than one character (or is exactly ``'a'`` / ``'i'``), and
    * is not a stop word (stop words get the same letter-only normalisation
      before comparison, e.g. ``"don't"`` -> ``"dont"``).

    Parameters
    ----------
    i : hashable
        Key used to look up the song in ``data['text']``.
    data : mapping or DataFrame
        Object for which ``data['text'][i]`` yields the raw lyrics string.
    stop_words : iterable of str, optional
        Words to remove. Defaults to the notebook-level ``remove_these`` set.
    banned_substrings : iterable of str, optional
        Substrings that disqualify a token. Defaults to the values that were
        previously hard-coded.

    Returns
    -------
    str
        The surviving tokens joined by single spaces.
    """
    if stop_words is None:
        stop_words = remove_these
    # Normalise the stop words once per call; the set gives O(1) lookups
    # instead of rebuilding and scanning a list for every token.
    normalized_stop_words = {_NON_LETTERS.sub('', word) for word in stop_words}
    banned_substrings = tuple(banned_substrings)

    kept = []
    for raw_line in data['text'][i].split('\n'):
        line = raw_line.strip().lower()
        if not line:
            continue
        for token in line.split(' '):
            word = _NON_LETTERS.sub('', token)
            if any(marker in word for marker in banned_substrings):
                continue
            # Drops empty strings and stray single letters other than a / i.
            if len(word) <= 1 and word not in ('a', 'i'):
                continue
            if word in normalized_stop_words:
                continue
            kept.append(word)
    return ' '.join(kept)

In [3]:
# Read the lyrics table and give its five columns short, fixed names
# (assigned by position, so the CSV must have exactly five columns).
column_names = ['song', 'year', 'artist', 'genre', 'text']
full_df = pd.read_csv('tom.csv')
full_df.columns = column_names

# Quick sanity check: dimensions, then a preview of the first rows.
print(full_df.shape)
full_df.head()

(193, 5)


Unnamed: 0,song,year,artist,genre,text
0,13-miles,2007,blink-182,Rock,13 miles down the road lives a young boy\nHe's...
1,21-days,2006,blink-182,Rock,My mind wanders as I'm trying not to fall in l...
2,a-little-s-enough,2006,angels-and-airwaves,Rock,When all is said and done\nWill we still feel ...
3,a-new-hope,2006,blink-182,Rock,I've got her in my head\nAt night when I go to...
4,adam-s-song,2000,blink-182,Rock,I never thought I'd die alone\nI laughed the l...


In [4]:
# Run the cleaning function once per row label and store the results
# alongside the raw lyrics.
full_df['cleaned_text'] = [
    list_of_words(row_label, data=full_df)
    for row_label in full_df.index
]

# The frame should now have one extra column.
print(full_df.shape)
full_df.head()

(193, 6)


Unnamed: 0,song,year,artist,genre,text,cleaned_text
0,13-miles,2007,blink-182,Rock,13 miles down the road lives a young boy\nHe's...,miles road lives young boy hes got jet black h...
1,21-days,2006,blink-182,Rock,My mind wanders as I'm trying not to fall in l...,mind wanders im trying fall love cause every t...
2,a-little-s-enough,2006,angels-and-airwaves,Rock,When all is said and done\nWill we still feel ...,said done still feel pain inside scars go away...
3,a-new-hope,2006,blink-182,Rock,I've got her in my head\nAt night when I go to...,ive got head night go bed know sounds lame gir...
4,adam-s-song,2000,blink-182,Rock,I never thought I'd die alone\nI laughed the l...,never thought id die alone laughed loudest who...


In [5]:
# get training data
data = full_df['cleaned_text'].tolist()
train_corpus = [TaggedDocument(words=word_tokenize(_d.lower()), tags=[str(i)]) for i, _d in list(enumerate(data))]
# build model
model = Doc2Vec(vector_size=50, min_count=2, epochs=40)
model.build_vocab(train_corpus)
# train model
model.train(train_corpus, total_examples=model.corpus_count, epochs=model.epochs)

In [6]:
# for each song, store ranks of all songs by how similar they are to current song
ranks = {}
for doc_id in range(len(train_corpus)):
    # fit lyrics with model weights
    inferred_vector = model.infer_vector(train_corpus[doc_id].words)
    # order by similarity
    sims = model.docvecs.most_similar([inferred_vector], topn=len(model.docvecs))
    ranks[doc_id] = sims

In [7]:
# start with a song, and get n more
n = 20
doc_id = 88 # np.random.choice(range(len(full_df)))
songs = [doc_id]
for i in range(n):
    current_song = songs[i]
    for j in range(len(ranks[current_song])):
        next_song = int(ranks[current_song][j][0])
        if next_song != current_song:
            if next_song not in songs:
                break
    songs.append(next_song)
    
for song_id in songs:
    print(song_id)
    print(full_df.loc[song_id,'song'])
#     print(full_df.loc[song_id,'text'])
    print('---------------------------------------------')

88
kings-of-the-weekend
---------------------------------------------
176
up-all-night
---------------------------------------------
191
young-london
---------------------------------------------
84
it-hurts
---------------------------------------------
77
hey-im-sorry
---------------------------------------------
81
i-m-sorry
---------------------------------------------
45
does-my-breath-smell
---------------------------------------------
64
fighting-the-gravity
---------------------------------------------
140
start-the-machine
---------------------------------------------
94
lifeline
---------------------------------------------
161
the-war
---------------------------------------------
43
distraction
---------------------------------------------
124
rite-of-spring
---------------------------------------------
115
paralyzed
---------------------------------------------
106
my-heroine
---------------------------------------------
172
tunnels
------------------------------------------