In [247]:
import pandas as pd
from pymongo import  MongoClient

In [248]:
client = MongoClient('localhost', 27017)
db = client.lyrics
coll = db.yearly

### load data

In [249]:
# load data from mongodb into pandas
data = coll.find()
song_lyrics = pd.DataFrame(list(data))
song_lyrics.head()

Unnamed: 0,_id,artist,lyrics,title,track_id,year
0,5827dca77aa2eb0ad91b8fdc,Bukka White,I was over in Aberdeen\nOn my way to New Orlea...,Aberdeen Mississippi Blues,TRHRKYP128F4280BB1,1940
1,5827dca77aa2eb0ad91b8fdd,Bukka White,When a man gets trouble in his mind\nHe wanna ...,Sleepy Man Blues,TRCAHZD128F4280BC1,1940
2,5827dca77aa2eb0ad91b8fde,Bessie Smith,Woke up this mornin' when chickens was crowin'...,Young Woman's Blues,TRJBDVE128F9306FDB,1940
3,5827dca77aa2eb0ad91b8fdf,Bukka White,I'm taken down with the fever and it won't let...,High Fever Blues,TRRRGCS128F4280BB6,1940
4,5827dca77aa2eb0ad91b8fe2,Bukka White,"Hey-eee, come on you women\nLet's a do the the...",Bukka's Jitterbug Swing,TRXZHEC128F4280BB2,1940


### Split training/testing

In [250]:
from sklearn.cross_validation import train_test_split
#from sklearn.pipeline import Pipeline
train, test = train_test_split(song_lyrics, test_size = 0.1)

In [174]:
!pip install langdetect

Collecting langdetect
  Downloading langdetect-1.0.7.zip (998kB)
[K    100% |████████████████████████████████| 1.0MB 1.4MB/s 
Building wheels for collected packages: langdetect
  Running setup.py bdist_wheel for langdetect ... [?25l- \ done
[?25h  Stored in directory: /home/ubuntu/.cache/pip/wheels/6f/8c/3b/ffa8151e27effd7de2a7d3194650d78fe6e4d4a3c175a74867
Successfully built langdetect
Installing collected packages: langdetect
Successfully installed langdetect-1.0.7
[33mYou are using pip version 8.1.1, however version 9.0.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


### pre-processing pipeline

In [251]:
# nltk processing
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk.stem
from string import punctuation
import re
def lyric_preprocessor(lyric, stem=False):
    stop_words = set(stopwords.words('english') + list(punctuation))
    re_replace = {
        r"\bdon't\b": "do not",
        r"\bdoesn't\b": "does not",
        r"\bdidn't\b": "did not",
        r"\bhasn't\b": "has not",
        r"\bhaven't\b": "have not",
        r"\bhadn't\b": "had not",
        r"\bwon't\b": "will not",
        r"\bwouldn't\b": "would not",
        r"\bcan't\b": "can not",
        r"\bcannot\b": "can not",
        r"\bain't\b": "is not"
    }
    
    lyric = lyric.lower()
    for r, replacement in re_replace.items():
        lyric = re.sub(r, replacement, lyric)
        
    lyric_words = word_tokenize(lyric)
    lyric_words_clean = [word for word in lyric_words if word not in stop_words]
    if stem:
        stemmer = nltk.stem.SnowballStemmer('english')
        return [stemmer.stem(word) for word in lyric_words_clean]
    
    # not stemmed
    return lyric_words_clean

    
# build a scikit-learn transformer so things play nicely with sklearn
from sklearn.base import BaseEstimator, TransformerMixin
class LyricPreprocessor(BaseEstimator, TransformerMixin):
    
    def __init__(self, stop_words=None, lower=True, strip=True, repeat_replacer=None):
        self.lower = lower
        self.strip = strip
        self.stopwords = stop_words or set(stopwords.words('english') + list(punctuation))
        self.stemmer = nltk.stem.SnowballStemmer('english')
        self.repeat_replacer = repeat_replacer
        self.re_replace = {
            r"\bdon't\b": "do not",
            r"\bdoesn't\b": "does not",
            r"\bdidn't\b": "did not",
            r"\bhasn't\b": "has not",
            r"\bhaven't\b": "have not",
            r"\bhadn't\b": "had not",
            r"\bwon't\b": "will not",
            r"\bwouldn't\b": "would not",
            r"\bcan't\b": "can not",
            r"\bcannot\b": "can not",
            r"\bain't\b": "is not"
            }
        
    def fit(self, X, y=None):
        return self
    
    def inverse_transform(self, X):
        return [" ".join(doc) for doc in X]
    
    def transform(self, X):
        return [
            list(self.pre_process(doc)) for doc in X
            ]
        
    def pre_process(self, lyric):
        # replacements
        for r, replacement in self.re_replace.items():
            lyric = re.sub(r, replacement, lyric)
        
        for token in word_tokenize(lyric):
            token = token.lower() if self.lower else token
            token = token.strip() if self.strip else token
            token = self.repeat_replacer(token) if self.repeat_replacer else token
            if token in self.stopwords:
                continue
                
            stemmed_token = self.stemmer.stem(token)
            yield stemmed_token

# normalize by removing repeating characters
from nltk.corpus import wordnet
class RepeatReplacer(object):
    def __init__(self):
        self.repeat_regexp = re.compile(r'(\w*)(\w)\2(\w*)')
        self.repl = r'\1\2\3'
    def replace(self, word):
        if wordnet.synsets(word):
            return word
        repl_word = self.repeat_regexp.sub(self.repl, word)
        if repl_word != word:
            return self.replace(repl_word) # keep replacing
        else:
            return repl_word

### English only

In [288]:
print(song_lyrics['lyrics'][43])

Ah il sait tout mon petit doigt
Tes parties avec mes revenus
Que d'allées venues

Vers quel crayon s'est-elle taillée désormais
Que vais-je faire de cet abandon
À qui en faire don ?

Bombez le torse bombez !
Prenez des forces bombez !
Bombez le torse bombez !
Ca c'est my way

I know, I know
Sa turne a l'air habitée
Alors qu'on sait que personne n'y vit
À qui se fier ?

Bombez le torse bombez !
Prenez des forces bombez !
Ca c'est my way

Les paras sont normaux sous la tonnelle où rôde
Où rôde le Japon
Fidèle à ses traditions

Dans un dernier effort
L'empereur se soulève
Donne à boire au dragon
Et scrute les environs

Ah l'enfant que j'ai dans le dos
Fait se retourner tous les badauds
Piler les autos

Bombez le torse bombez !
Prenez des forces bombez !
Ca c'est my way

ouistiti
T'as pas souri quand elle a ri
Tant pis
Les alterts et les égaux
Ca m'est égal ça m'est ego

Bombez le torse bombez !


In [289]:
print(song_lyrics['lyrics'][6])

Ê
Onde anda você
Faz tempo que a gente não se vê
Ê
Batumaré
É sempre bom voltar
A luz se acendeu de novo
A porta nunca vai
Se fechar
Ê
Batumaré
Qualquer palavra serve
Pra dizer
Dessa alegria
A luz se acendeu de novo
A porta nunca vai se fechar
Ê
Batumaré


In [252]:
# remove non-english rows
#english_only = song_lyrics[is_english_sent(song_lyrics[['lyrics']]) ]
#is_english_sent(song_lyrics.lyrics)
#is_english_sent(song_lyrics.lyrics[12638])
from langdetect import detect
#song_lyrics[['lyrics']].apply(detect, axis=1)
#song_lyrics[['lyrics']]
song_lyrics.lyrics[1045]

'1...\n2...\n3...\n4...\n5...\n6...\n7...\n8...\n9...\n10...\n11...\n12...\n13...\n14...\n15...\n16...\n17...\n18...\n19...\n20...\n21...\n22...\n23...\n24...\n25...\n26...\n27...\n28...\n29...\n30...\n31...\n32...\n33...\n34...\n35...\n36...\n37...\n38...\n39'

In [253]:
song_lyrics = song_lyrics.drop(1045)

In [254]:
song_lyrics.lyrics[5338]

'1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20\n21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39'

In [255]:
song_lyrics = song_lyrics.drop(5338)

In [256]:
song_lyrics_en = song_lyrics[song_lyrics.apply(lambda x: detect(x['lyrics']) == 'en', axis=1)]

In [257]:
song_lyrics_en.count()

_id         11600
artist      11600
lyrics      11600
title       11600
track_id    11600
year        11600
dtype: int64

In [258]:
docs = song_lyrics_en['lyrics']

In [287]:
docs.index # some indices are missing! ouch!

Int64Index([    0,     1,     2,     3,     4,     5,     7,     8,     9,
               10,
            ...
            12654, 12655, 12656, 12657, 12658, 12659, 12660, 12661, 12662,
            12663],
           dtype='int64', length=11600)

In [301]:
# resetting index
docs = docs.reset_index(drop=True)

# Features 

#### create & dump vectorizer

In [259]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
def identity(arg):
    """
    Simple identity function works as a passthrough.
    """
    return arg
vectorizer = Pipeline([
        ('preprocessor', LyricPreprocessor()),
        ('vectorizer', TfidfVectorizer(
            tokenizer = identity, preprocessor=None, lowercase=False))
    ])

In [260]:
# dumping the vectorizer
joblib.dump(vectorizer, 'vectorizer.pkl')

['vectorizer.pkl']

#### generate & dump feature vectors

In [261]:
from sklearn.externals import joblib
lyrics_features = vectorizer.fit_transform(docs)
joblib.dump(lyrics_features, 'lyrics_features.pkl')

['lyrics_features.pkl',
 'lyrics_features.pkl_01.npy',
 'lyrics_features.pkl_02.npy',
 'lyrics_features.pkl_03.npy']

# K-Means

In [364]:
# initial model
num_clusters = 30
from sklearn.cluster import KMeans
km = KMeans(n_clusters = num_clusters, init = 'k-means++', max_iter=100, n_init=1, verbose = True)

In [365]:
# Generate clusters
km.fit(lyrics_features)

Initialization complete
Iteration  0, inertia 19485.615
Iteration  1, inertia 10851.962
Iteration  2, inertia 10765.908
Iteration  3, inertia 10724.681
Iteration  4, inertia 10704.917
Iteration  5, inertia 10694.394
Iteration  6, inertia 10688.806
Iteration  7, inertia 10684.509
Iteration  8, inertia 10680.408
Iteration  9, inertia 10675.986
Iteration 10, inertia 10671.749
Iteration 11, inertia 10667.199
Iteration 12, inertia 10664.571
Iteration 13, inertia 10662.201
Iteration 14, inertia 10659.946
Iteration 15, inertia 10658.868
Iteration 16, inertia 10658.497
Iteration 17, inertia 10658.319
Iteration 18, inertia 10658.197
Iteration 19, inertia 10658.080
Iteration 20, inertia 10657.991
Iteration 21, inertia 10657.904
Iteration 22, inertia 10657.815
Iteration 23, inertia 10657.707
Iteration 24, inertia 10657.614
Iteration 25, inertia 10657.526
Iteration 26, inertia 10657.455
Iteration 27, inertia 10657.388
Iteration 28, inertia 10657.349
Iteration 29, inertia 10657.302
Iteration 30, in

KMeans(copy_x=True, init='k-means++', max_iter=100, n_clusters=30, n_init=1,
    n_jobs=1, precompute_distances='auto', random_state=None, tol=0.0001,
    verbose=True)

#  Cluster analysis

In [366]:
import numpy as np
np.unique(km.labels_, return_counts=True) # get number of articles in each cluster

(array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
        17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29], dtype=int32),
 array([ 207,  155,  157,  189, 1239,  274,   86,  369,  189,  600,  313,
         308,   54,   73,  323,  217,  203,  610,  208,  130,  350, 2875,
         167,  128,  156,  102,  259,   82, 1514,   63]))

### Separate documents by cluster

In [367]:
documents = {}
for i, cluster in enumerate(km.labels_):
    doc = docs[i]
    if cluster not in documents.keys():
        documents[cluster] = doc
    else:
        documents[cluster] += doc

### Get words in clusters

In [368]:
## analyze words from each cluster (topics)
from nltk.probability import FreqDist
from collections import defaultdict
from heapq import nlargest
word_sent = word_tokenize(documents[0].lower())
word_sent = [word for word in word_sent]
freq = FreqDist(word_sent)
freq

FreqDist({'69': 1,
          'cried': 5,
          'loved': 13,
          "n't": 621,
          'friendstonight': 1,
          'parted': 3,
          'plane': 1,
          'handed': 1,
          'letters': 4,
          'thing': 16,
          'changin': 1,
          'malo': 1,
          'voix': 2,
          'passing': 2,
          'pretense': 1,
          'patriotic': 1,
          'everybody': 4,
          'mat': 2,
          'price': 1,
          'pursuit': 1,
          'fever': 2,
          'lights': 7,
          'wait': 29,
          'deaf': 1,
          'sweethearts': 2,
          'bridge': 6,
          'claimed': 1,
          'seek': 5,
          'nigth': 1,
          'has': 28,
          'down': 130,
          'illusion': 1,
          'conscience': 1,
          'feelings': 10,
          'reminding': 1,
          'start': 72,
          'ate': 3,
          'driving': 2,
          'headache': 2,
          'fair': 2,
          'thundering': 1,
          'dried': 1,
          'bullet':

In [369]:
nlargest(100, freq, key=freq.get)

['i',
 'you',
 ',',
 'the',
 'heart',
 'my',
 'and',
 'to',
 'a',
 'it',
 'me',
 "n't",
 'in',
 'love',
 'your',
 "'s",
 'of',
 'that',
 'do',
 'on',
 "'m",
 'be',
 'for',
 'but',
 'so',
 'break',
 'all',
 'know',
 'is',
 'can',
 "'ll",
 '.',
 'just',
 'with',
 'this',
 'we',
 "'re",
 'when',
 'if',
 'one',
 "'ve",
 'take',
 ')',
 '(',
 "'",
 'never',
 'no',
 'like',
 'now',
 'there',
 'come',
 'what',
 'up',
 'let',
 'only',
 'will',
 'was',
 'have',
 'ca',
 'tell',
 'are',
 'he',
 'down',
 '?',
 'got',
 'time',
 'oh',
 'go',
 'she',
 'not',
 'out',
 'could',
 'say',
 'from',
 'mine',
 'at',
 'baby',
 'make',
 'see',
 'want',
 'feel',
 'as',
 'how',
 'wo',
 'had',
 'again',
 'little',
 'lonely',
 'they',
 'hearts',
 'back',
 'would',
 "'d",
 'way',
 'where',
 'away',
 'her',
 'been',
 'still',
 'too']

In [370]:
## analyze words from each cluster (topics)
from nltk.probability import FreqDist
from collections import defaultdict
from heapq import nlargest

stp_words = set(stopwords.words('english') + list(punctuation))
keywords = {}
counts = {}
for cluster in range(num_clusters):
    word_sent = word_tokenize(documents[cluster].lower())
    word_sent = [word for word in word_sent if word not in stp_words] # all words in the cluster
    freq = FreqDist(word_sent)
    keywords[cluster] = nlargest(100, freq, key=freq.get)
    counts[cluster] = freq

## get unique words from each cluster

In [371]:
list(set(range(10)) - set([0]))

[1, 2, 3, 4, 5, 6, 7, 8, 9]

In [372]:
### get unique keywords from each cluster
themes = {}
for cluster in range(num_clusters):
    other = list(set(range(10)) - set([cluster]))
    keywords_other = set([])
    for idx, oc in enumerate(other):
        keywords_other.union( set(keywords[other[idx]]))
    unique = set(keywords[cluster]) - keywords_other
    themes[cluster] = nlargest(15, unique, key=counts[cluster].get)

In [373]:
import pprint
pp = pprint.PrettyPrinter(indent=4)
#pp.pprint(themes)
#print(themes)

def print_themes(theme):
    for idx, t in theme.items():
        print( "cluster: ", str(idx))
        print( "theme: ", t)
        print("\n")
        
print_themes(themes)

cluster:  0
theme:  ['heart', "n't", 'love', "'s", "'m", 'break', 'know', "'ll", "'re", 'one', 'take', "'ve", 'never', 'like', 'come']


cluster:  1
theme:  ['sun', "'s", "n't", 'shine', 'let', "'m", 'see', 'know', "'re", 'like', 'love', 'come', 'light', "'ll", 'day']


cluster:  2
theme:  ['na', 'wan', "n't", "'s", 'know', 'love', "'m", 'baby', 'go', "'re", 'get', 'want', "'ll", 'like', 'see']


cluster:  3
theme:  ['...', "'s", "n't", 'love', "'m", 'know', 'oh', 'never', 'baby', 'yeah', "'re", "'ll", 'like', 'go', 'got']


cluster:  4
theme:  ["n't", "'s", "'m", "'ll", 'know', "'re", 'never', "'ve", 'love', 'one', 'like', 'see', 'go', 'away', 'say']


cluster:  5
theme:  ["''", "'s", '``', "n't", 'said', "'m", "'ll", 'got', 'know', 'say', 'like', "'re", 'one', 'go', 'back']


cluster:  6
theme:  ['shake', 'train', "n't", "'s", 'baby', "'m", 'get', 'got', 'right', 'know', 'see', 'home', "'em", 'time', 'na']


cluster:  7
theme:  ['got', "n't", "'s", "'ve", 'ta', "'m", 'know', 'get', '

### Conclusion
More iteration on KMeans to discover a good K for this corpus is a good next step.

# KNN Classification of new lyrics

In [374]:
# build a classifier using KNN
new_lyric = docs[0]
print(new_lyric)

I was over in Aberdeen
On my way to New Orlean
I was over in Aberdeen
On my way to New Orlean
Them Aberdeen women told me
Will buy my gasoline

Hey, two little women
That I ain't ever seen
They has two little women
That I ain't never seen
These two little women
Just from New Orlean

Ooh, sittin' down in Aberdeen
With New Orlean on my mind
I'm sittin' down in Aberdeen
With New Orlean on my mind
Well, I believe them Aberdeen women
Gonna make me lose my mind, yeah

Aber-deen is my home
But the mens don't want me around
Aberdeen is my home
But the men don't want me around
They know I will take these women
An take them outta town

Listen, you Aberdeen women
You know I ain't got no dime
Oh-oh listen you women
You know'd I ain't got no dime
They been had the po' boy
All up and down


#### vectorize new lyrics

In [375]:
new_lyric_vector = vectorizer.transform([new_lyric])
new_lyric_vector

<1x29300 sparse matrix of type '<class 'numpy.float64'>'
	with 41 stored elements in Compressed Sparse Row format>

#### build classifier from currenly vectorized lyrics and labels

In [376]:
from sklearn.neighbors import KNeighborsClassifier
classifier = KNeighborsClassifier(n_neighbors=10)
classifier.fit(lyrics_features, km.labels_)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=10, p=2,
           weights='uniform')

In [377]:
# dump
joblib.dump(classifier, 'knn_classifier.pkl')

['knn_classifier.pkl',
 'knn_classifier.pkl_01.npy',
 'knn_classifier.pkl_02.npy',
 'knn_classifier.pkl_03.npy',
 'knn_classifier.pkl_04.npy',
 'knn_classifier.pkl_05.npy']

#### make a prediction

In [378]:
new_lyric_label = classifier.predict(new_lyric_vector)

In [379]:
new_lyric_label # label 2: "love", "baby", "make", "like", "take"

array([1], dtype=int32)

### We can now do recommendations
We know which cluster this song is in so we can check out the other songs in that cluster.

In [380]:
similar_indx = (km.labels_==new_lyric_label).nonzero()[0]

In [381]:
len(similar_indx)

155

### get recommendations
Get the closest songs in vector space

In [382]:
import scipy as sp
recommendations = []
for i in similar_indx:
    dist = sp.linalg.norm((new_lyric_vector - lyrics_features[i]).toarray())
    recommendations.append((dist, docs[i]))
recommendations = sorted(recommendations)
print(len(recommendations))

155


In [384]:
def print_recommendations(records, num=5):
    """ prints a table of recommended lyrics"""
    for i in ['distance', 'recommended']:
        print('{:>20s}'.format(i), end=' ')
    print()
    for cnt in range(num):
        dist, song = records[cnt]
        print('{:>20s}'.format(str(dist)), end='')
        print('{:>20s}'.format(song), end='')
        print("\n\n")
        
 

print_recommendations(recommendations)

            distance          recommended 
       1.23670444121There is a house in New Orleans
They call the Rising Sun
And it's been the ruin of many a poor boy
And, God, I know I'm one

Oh mother tell my sister
Don't do what I have done
Tell her to show me how down in New Orleans
They call it the Rising Sun
They call it the Rising Sun

My daddy was a tailor
He sews on them new bluejeans
And my mama, she was a drunkard, lord
Drinkin' down in New Orleans
Drinkin' down in New Orleans

Now the only thing a gambler needs
Is a suitcase and trunk
And the only time he's satisfied
Is when he's on a drunk

I'm going back to New Orleans
My race is almost run
I don't want to spend the rest of my live long days
Beneath the Rising Sun
Beneath the Rising Sun

There is a house in New Orleans
They call the Rising Sun
And it's been the ruin of many a poor boy
And, God, I know I'm one


       1.25522589042There is a house in New Orleans
They call the Rising Sun
And it's been the ruin of many a poor bo

In [385]:
print(recommendations[-1][1]) # last one in that cluster of 4080

The morning sun comes thru my window
All night long I have been waiting
We who are constantly moving
Leaving part of us behind

She moves across the room with easy grace
Mona lisa smiles up on her face
I who am completely mesmerized
By the sunlight in her eyes

Morning sun comes thru my windows
All night long I have been waiting
We who are constantly moving
Leaving part of us behind

Moves across the room with easy grace
Mona lisa smiles up on her face
I who am completely mesmerized
By the sunlight in her eyes

And the morning sun comes thru my window
All night long I have been waiting
We who are constantly moving
Leaving part of us behind


# Conclusions

1. data cleaning and exploration is an iterative process. The problems I found in my data include: foreign languages, 
    non alpha-numberic character strings, repeated records, hard to remove stop words and more.
2. Trying to make sense of the outcome and iterating on it. For instance, I rebuilt the clustering model a number of
    times with different k values to come up with meaningful 'topics'
3. Model evaluation is not straightforward for unsupervised learning. However, we can probably use a supervised approach 
   such as done above with knn and look at the outcome, or visualize the clusters as was done in topic modeling
4. ...