In [4]:
import pandas as pd
from pymongo import  MongoClient

In [5]:
client = MongoClient('localhost', 27017)
db = client.lyrics
coll = db.yearly

### load data

In [6]:
# load data from mongodb into pandas
data = coll.find()
song_lyrics = pd.DataFrame(list(data))
song_lyrics.head()

Unnamed: 0,_id,artist,lyrics,title,track_id,year
0,5827dca77aa2eb0ad91b8fdc,Bukka White,I was over in Aberdeen\nOn my way to New Orlea...,Aberdeen Mississippi Blues,TRHRKYP128F4280BB1,1940
1,5827dca77aa2eb0ad91b8fdd,Bukka White,When a man gets trouble in his mind\nHe wanna ...,Sleepy Man Blues,TRCAHZD128F4280BC1,1940
2,5827dca77aa2eb0ad91b8fde,Bessie Smith,Woke up this mornin' when chickens was crowin'...,Young Woman's Blues,TRJBDVE128F9306FDB,1940
3,5827dca77aa2eb0ad91b8fdf,Bukka White,I'm taken down with the fever and it won't let...,High Fever Blues,TRRRGCS128F4280BB6,1940
4,5827dca77aa2eb0ad91b8fe2,Bukka White,"Hey-eee, come on you women\nLet's a do the the...",Bukka's Jitterbug Swing,TRXZHEC128F4280BB2,1940


### pre-processing pipeline

In [87]:
# nltk processing
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk.stem
from string import punctuation
import re
def lyric_preprocessor(lyric, stem=False):
    stop_words = set(stopwords.words('english') + list(punctuation))
    re_replace = {
        r"\bdon't\b": "do not",
        r"\bdoesn't\b": "does not",
        r"\bdidn't\b": "did not",
        r"\bhasn't\b": "has not",
        r"\bhaven't\b": "have not",
        r"\bhadn't\b": "had not",
        r"\bwon't\b": "will not",
        r"\bwouldn't\b": "would not",
        r"\bcan't\b": "can not",
        r"\bcannot\b": "can not",
        r"\bain't\b": "is not"
    }
    
    lyric = lyric.lower()
    for r, replacement in re_replace.items():
        lyric = re.sub(r, replacement, lyric)
        
    lyric_words = word_tokenize(lyric)
    lyric_words_clean = [word for word in lyric_words if word not in stop_words]
    if stem:
        stemmer = nltk.stem.SnowballStemmer('english')
        return [stemmer.stem(word) for word in lyric_words_clean]
    
    # not stemmed
    return lyric_words_clean


# build a scikit-learn transformer so things play nicely with sklearn
from sklearn.base import BaseEstimator, TransformerMixin
class LyricPreprocessor(BaseEstimator, TransformerMixin):
    
    def __init__(self, stop_words=None, lower=True, strip=True):
        self.lower = lower
        self.strip = strip
        self.stopwords = stop_words or set(stopwords.words('english') + list(punctuation))
        self.stemmer = nltk.stem.SnowballStemmer('english')
        self.re_replace = {
            r"\bdon't\b": "do not",
            r"\bdoesn't\b": "does not",
            r"\bdidn't\b": "did not",
            r"\bhasn't\b": "has not",
            r"\bhaven't\b": "have not",
            r"\bhadn't\b": "had not",
            r"\bwon't\b": "will not",
            r"\bwouldn't\b": "would not",
            r"\bcan't\b": "can not",
            r"\bcannot\b": "can not",
            r"\bain't\b": "is not"
            }
        
    def fit(self, X, y=None):
        return self
    
    def inverse_transform(self, X):
        return [" ".join(doc) for doc in X]
    
    def transform(self, X):
        return [
            list(self.pre_process(doc)) for doc in X
            ]
        
    def pre_process(self, lyric):
        # replacements
        for r, replacement in self.re_replace.items():
            lyric = re.sub(r, replacement, lyric)
        
        for token in word_tokenize(lyric):
            token = token.lower() if self.lower else token
            token = token.strip() if self.strip else token
            if token in self.stopwords:
                continue
                
            stemmed_token = self.stemmer.stem(token)
            yield stemmed_token

In [91]:
docs = song_lyrics['lyrics'] # a series

In [92]:
processed_lyric = lyric_preprocessor(docs[0], True)

### generate features 

In [93]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
def identity(arg):
    """
    Simple identity function works as a passthrough.
    """
    return arg
vectorizer = Pipeline([
        ('preprocessor', LyricPreprocessor()),
        ('vectorizer', TfidfVectorizer(
            tokenizer = identity, preprocessor=None, lowercase=False))
    ])

In [95]:
# Feature extraction (vectorization)
lyric_vec = vectorizer.fit_transform(docs)

In [97]:
# check
lyric_vec # 54k features!

<12664x54829 sparse matrix of type '<class 'numpy.float64'>'
	with 742245 stored elements in Compressed Sparse Row format>

In [98]:
# initial model
from sklearn.cluster import KMeans
km = KMeans(n_clusters = 10, init = 'k-means++', max_iter=100, n_init=1, verbose = True)

In [99]:
# Generate cluster
km.fit(lyric_vec)

Initialization complete
Iteration  0, inertia 22558.591
Iteration  1, inertia 12058.019
Iteration  2, inertia 11998.160
Iteration  3, inertia 11969.858
Iteration  4, inertia 11957.127
Iteration  5, inertia 11949.572
Iteration  6, inertia 11944.732
Iteration  7, inertia 11941.894
Iteration  8, inertia 11939.977
Iteration  9, inertia 11937.260
Iteration 10, inertia 11934.707
Iteration 11, inertia 11932.411
Iteration 12, inertia 11930.304
Iteration 13, inertia 11928.741
Iteration 14, inertia 11927.388
Iteration 15, inertia 11926.279
Iteration 16, inertia 11924.990
Iteration 17, inertia 11923.009
Iteration 18, inertia 11919.705
Iteration 19, inertia 11916.276
Iteration 20, inertia 11915.106
Iteration 21, inertia 11914.586
Iteration 22, inertia 11914.210
Iteration 23, inertia 11913.987
Iteration 24, inertia 11913.803
Iteration 25, inertia 11913.653
Iteration 26, inertia 11913.547
Iteration 27, inertia 11913.457
Iteration 28, inertia 11913.340
Iteration 29, inertia 11913.285
Iteration 30, in

KMeans(copy_x=True, init='k-means++', max_iter=100, n_clusters=10, n_init=1,
    n_jobs=1, precompute_distances='auto', random_state=None, tol=0.0001,
    verbose=True)

In [100]:
import numpy as np
np.unique(km.labels_, return_counts=True) # get number of articles in each cluster

(array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], dtype=int32),
 array([2088,  317, 1567,  787, 3899,  839,  251,  492, 1859,  565]))

In [103]:
text = {}
for i, cluster in enumerate(km.labels_):
    doc = docs[i]
    if cluster not in text.keys():
        text[cluster] = doc
    else:
        text[cluster] += doc

In [105]:
## analyze words from each cluster (topics)
from nltk.probability import FreqDist
from collections import defaultdict
from heapq import nlargest

keywords = {}
counts = {}
for cluster in range(10):
    word_sent = word_tokenize(text[cluster].lower())
    word_sent = [word for word in word_sent]
    freq = FreqDist(word_sent)
    keywords[cluster] = nlargest(100, freq, key=freq.get)
    counts[cluster] = freq

In [109]:
### get unique keywords from each cluster
unique_keys = {}
for cluster in range(10):
    other = list(set(range(20)) - set([cluster]))
    keys_other = set(keywords[other[0]]).union(set(keywords[other[1]]))
    unique = set(keywords[cluster]) - keys_other
    unique_keys[cluster] = nlargest(20, unique, key=counts[cluster].get)

In [110]:
unique_keys

{0: ['away',
  'night',
  'day',
  'eyes',
  'through',
  'our',
  'dream',
  'heart',
  'us',
  'light',
  'hear',
  'would',
  'only',
  'again',
  'feel'],
 1: ["''", '``', 'said', 'had', 'then', 'him', "'d", 'did', ':'],
 2: ['right', 'good', 'make', 'too', 'ta', 'girl', "'cause", 'or', 'baby'],
 3: ['la',
  'de',
  'que',
  'un',
  'y',
  'e',
  'en',
  'el',
  'tu',
  'se',
  'le',
  'che',
  'te',
  'il',
  'les',
  'et',
  'mi',
  'non',
  'je',
  'di'],
 4: ['baby',
  'little',
  'hey',
  'want',
  'make',
  'right',
  'girl',
  'da',
  'too',
  'gon',
  '-'],
 5: ['baby',
  'want',
  'need',
  'give',
  'why',
  'make',
  'more',
  'girl',
  'ever',
  'been',
  'loving'],
 6: ['long',
  'been',
  'gone',
  'too',
  'baby',
  'gon',
  'home',
  'wrong',
  'wo',
  'lord',
  'alone',
  'make'],
 7: ['baby',
  'want',
  'girl',
  'good',
  'right',
  'need',
  'hey',
  'make',
  'gon',
  'been',
  'give',
  'please',
  'little',
  'wo',
  'wan',
  "'cause"],
 8: ['want',
  'make'

### Build a classifier for new lyrics

In [116]:
# build a classifier using KNN
new_lyric = docs[0]
print(new_lyric)

I was over in Aberdeen
On my way to New Orlean
I was over in Aberdeen
On my way to New Orlean
Them Aberdeen women told me
Will buy my gasoline

Hey, two little women
That I ain't ever seen
They has two little women
That I ain't never seen
These two little women
Just from New Orlean

Ooh, sittin' down in Aberdeen
With New Orlean on my mind
I'm sittin' down in Aberdeen
With New Orlean on my mind
Well, I believe them Aberdeen women
Gonna make me lose my mind, yeah

Aber-deen is my home
But the mens don't want me around
Aberdeen is my home
But the men don't want me around
They know I will take these women
An take them outta town

Listen, you Aberdeen women
You know I ain't got no dime
Oh-oh listen you women
You know'd I ain't got no dime
They been had the po' boy
All up and down


In [120]:
new_lyric_vector = vectorizer.transform([new_lyric])
new_lyric_vector

<1x54829 sparse matrix of type '<class 'numpy.float64'>'
	with 41 stored elements in Compressed Sparse Row format>

In [125]:
from sklearn.neighbors import KNeighborsClassifier
classifier = KNeighborsClassifier(n_neighbors=10)
classifier.fit(lyric_vec, km.labels_)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=10, p=2,
           weights='uniform')

In [130]:
new_lyric_label = classifier.predict(new_lyric_vector)

In [131]:
new_lyric_label

array([4], dtype=int32)

In [132]:
similar_indx = (km.labels_==new_lyric_label).nonzero()[0]

In [133]:
len(similar_indx)

3899

In [135]:
import scipy as sp
recommendations = []
for i in similar_indx:
    dist = sp.linalg.norm((new_lyric_vector - lyric_vec[i]).toarray())
    recommendations.append((dist, docs[i]))
recommendations = sorted(recommendations)
print(len(recommendations))

3899


In [137]:
print(recommendations[0])

(2.9897696156037272e-16, "I was over in Aberdeen\nOn my way to New Orlean\nI was over in Aberdeen\nOn my way to New Orlean\nThem Aberdeen women told me\nWill buy my gasoline\n\nHey, two little women\nThat I ain't ever seen\nThey has two little women\nThat I ain't never seen\nThese two little women\nJust from New Orlean\n\nOoh, sittin' down in Aberdeen\nWith New Orlean on my mind\nI'm sittin' down in Aberdeen\nWith New Orlean on my mind\nWell, I believe them Aberdeen women\nGonna make me lose my mind, yeah\n\nAber-deen is my home\nBut the mens don't want me around\nAberdeen is my home\nBut the men don't want me around\nThey know I will take these women\nAn take them outta town\n\nListen, you Aberdeen women\nYou know I ain't got no dime\nOh-oh listen you women\nYou know'd I ain't got no dime\nThey been had the po' boy\nAll up and down")


In [138]:
print(recommendations[1])

(1.1986508173334078, "It's time I'm walkin' to New Orleans\nI'm walkin' to New Orleans\nI'm going to need two pair of shoes\nWhen I get through walkin' these blues\nWhen I get back to New Orleans\n\nI've got my suitcase in my hand\nNow, ain't that a shame\nI'm leavin' here today\nYes, I'm goin' back home to stay\nYes, I'm walkin' to New Orleans\n\nYou used to be my honey\nTill you spent all my money\nNo use for you to cry\nI'll see you bye and bye\nCause I'm walkin' to New Orleans\n\nI've got no time for talkin'\nI've got to keep on walkin'\nNew Orleans is my home\nThat's the reason why I'm goin'\nYes, I'm walkin' to New Orleans\n\nI'm walkin' to New Orleans\nI'm walkin' to New Orleans\nI'm walkin' to New Orleans")


In [139]:
print(recommendations[-1])

(1.4142135623730956, 'Yaralı dizlerim koşamam ki\nKapalı yollarından alkamam ki\nUnutkan nehrimin \nYolumu sormadan bulamam ki\nKarlı dağlarında doğamam ki\nSaklı kentinin\n\nÇok üzülme, çok susma\n  Çok terleme, çok susma\n  Çok da kitap okuma     dedi annem\n \nÇok terleme çok yorulma\n  Girdaplarında boğulma\n  Yalnızlığına da çok alışma\n\nGüneşim olmaden göremem ki\nAy tutulurken uyuyamam ki\nKaranlık olsa da\nBen herkesi sevemem ki\nSevmeden de yaşayamam ki\nYanlış olsa da')
