In [229]:
import boto3
import random
import os
import pandas as pd
import pickle
from nltk.corpus import stopwords
from nltk import RegexpTokenizer
from gensim.models import doc2vec
from collections import namedtuple
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestNeighbors

# Download Model and Lyrics Dataframe from S3

In [230]:
# Build the S3 client from credentials held in environment variables so no
# secret is stored in the notebook. A missing AWS_ACCESS_KEY or
# AWS_SECRET_KEY raises KeyError in this cell.
s3 = boto3.client('s3', 
                  aws_access_key_id=os.environ['AWS_ACCESS_KEY'], 
                  aws_secret_access_key=os.environ['AWS_SECRET_KEY'])

In [231]:
# Fetch the pickled lyrics DataFrame from the project bucket. The destination
# is relative to the notebook's working directory.
# NOTE(review): assumes ../Data_Files already exists — confirm.
s3.download_file('metis-project-kojak-bucket', 'all_lyrics_dataframe.pickle', '../Data_Files/all_lyrics_dataframe.pickle')

In [232]:
# Fetch the saved model artifact from the same bucket into ../Data_Files.
s3.download_file('metis-project-kojak-bucket', 'song2vec_model_v2', '../Data_Files/song2vec_model_v2')

In [233]:
# Unpickle the downloaded model; later cells read its `docvecs` attribute.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file,
# so this is only safe while the bucket contents are trusted.
# NOTE(review): presumably a gensim Doc2Vec model — confirm.
with open('../Data_Files/song2vec_model_v2', 'rb') as f:
    model = pickle.load(f)

In [234]:
# Unpickle the downloaded lyrics frame (track metadata plus a lyrics column).
# NOTE(review): pickle.load can execute arbitrary code embedded in the file,
# so this is only safe while the bucket contents are trusted.
with open('../Data_Files/all_lyrics_dataframe.pickle', 'rb') as f:
    df = pickle.load(f)

In [235]:
# Number of rows (tracks) in the lyrics frame.
len(df)

19917

# Get Weights Matrix

In [238]:
# One row per document vector held by the model, one 'w<i>' column per
# embedding dimension. add_prefix names the columns from the actual vector
# width instead of assuming exactly 150 dimensions, so this cell keeps working
# if the model is retrained with a different vector size.
weights_df = pd.DataFrame(data=list(model.docvecs)).add_prefix('w')
weights_df.head()

Unnamed: 0,w0,w1,w2,w3,w4,w5,w6,w7,w8,w9,...,w140,w141,w142,w143,w144,w145,w146,w147,w148,w149
0,1.309852,-0.722063,-1.845626,0.060433,0.802953,0.797232,-0.046447,-0.896688,2.006723,-1.299478,...,-1.857087,-1.695151,-1.52,-3.002474,1.011228,-0.745409,1.602316,-2.012885,-4.402058,2.129371
1,1.389885,3.419014,1.563861,-0.173261,1.999009,-1.408875,-1.12061,0.380121,-0.079164,-1.228119,...,-0.098532,-0.225048,-1.280354,-1.92677,-0.512352,1.328039,0.284323,2.168764,0.410956,-2.421606
2,1.228872,-0.976769,1.294796,0.69412,0.009372,0.755142,-0.628633,0.224527,2.397843,-0.188266,...,-0.910294,-0.624909,0.025676,-0.487623,-0.066341,-0.045428,-0.240223,1.385183,-2.112195,2.928612
3,-2.127066,0.229084,0.896153,-0.338795,1.786532,0.173069,-1.157274,-0.097621,2.368323,-0.558968,...,1.202439,4.292933,-3.682325,-2.141923,2.101008,1.668861,-1.67015,0.830344,-0.764181,4.75446
4,-2.147542,-2.364923,-0.580354,-3.391496,1.528835,-0.597462,-1.709017,-0.451522,0.661615,-1.160594,...,-0.865367,-1.663213,0.004108,-3.587945,1.608335,2.78728,-1.481623,-1.066067,0.175758,1.857074


# Dimensionality Reduction

In [277]:
# Number of SVD components to keep when reducing the doc-vector matrix.
components = 25

In [278]:
# Reduce the doc-vector columns to `components` dimensions. random_state is
# fixed so repeated runs give the same projection.
svd = TruncatedSVD(n_components=components, n_iter=10, random_state=42)
weights_svd = svd.fit_transform(weights_df)

In [279]:
# Wrap the reduced matrix in a frame whose columns are labelled
# x0 .. x<components-1>, then preview the first rows.
weights_svd_df = pd.DataFrame(
    data=weights_svd,
    columns=['x{}'.format(col) for col in range(components)],
)
weights_svd_df.head()

Unnamed: 0,x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,...,x15,x16,x17,x18,x19,x20,x21,x22,x23,x24
0,6.318724,1.9465,4.130989,-2.038212,2.771502,0.942558,-0.39546,-3.703319,-2.462871,0.647212,...,0.948141,-0.709242,0.04911,-2.760571,-1.172213,-0.431424,0.601739,1.239263,1.015561,0.552901
1,7.765625,-0.506461,1.944812,0.769629,-3.114601,-2.757187,0.440005,0.327375,-5.751104,0.666494,...,-3.052673,0.5336,-1.786256,1.00872,2.356063,-2.909463,-2.17223,3.243688,0.447866,0.450233
2,5.742418,-1.509559,1.542711,-1.679113,2.38528,-0.502519,-0.843322,0.443783,0.426616,-0.987517,...,-0.208419,0.342447,0.395179,-0.452791,1.116307,1.17894,0.510454,-0.501519,0.444036,1.220786
3,11.007117,-1.90207,-1.56686,-3.731142,-0.989067,0.608182,1.941592,2.460689,2.168383,0.497704,...,5.354533,0.643897,1.015109,-1.325302,1.822555,5.479691,2.851597,2.290047,0.090016,2.984782
4,6.636757,-1.833661,-1.459982,3.947194,-0.039765,-2.359881,2.378551,1.507767,-3.28971,1.03235,...,-1.825478,-0.312048,0.21472,-0.19224,-0.212977,-0.349321,-2.401229,1.801834,0.50329,-0.339052


# Other Features

In [280]:
# List the available columns before choosing extra (non-lyric) features.
df.columns

Index(['album_art', 'album_name', 'duration_ms', 'energy', 'liveness',
       'loudness', 'speechiness', 'tempo', 'track_id', 'track_name',
       'artist_popularity', 'artist_name', 'artist_id', 'lyrics'],
      dtype='object')

In [281]:
# Keep three audio features to use alongside the lyric embedding.
track_metadata = df[['energy', 'speechiness', 'tempo']]
track_metadata.head()

Unnamed: 0,energy,speechiness,tempo
0,0.809,0.566,146.216
1,0.649,0.455,127.635
2,0.813,0.226,102.926
3,0.665,0.146,138.036
4,0.586,0.216,89.998


In [282]:
# Standardise the audio features so that tempo (values in the 90-150 range
# above) does not dominate energy and speechiness (values below 1).
# The scaled values are wrapped in a new DataFrame without passing df's index.
# NOTE(review): assumes fit_transform returns a plain array, so the new frame
# gets a default 0..n-1 index — confirm.
ss = StandardScaler()
track_metadata_ss = pd.DataFrame(data=ss.fit_transform(track_metadata), columns=track_metadata.columns)

In [283]:
# Join the reduced lyric embedding and the scaled audio features side by side.
# pd.concat(axis=1) aligns rows on index labels; neither input was given an
# explicit index.
# NOTE(review): this assumes row i of the doc-vector matrix and row i of df
# describe the same track — confirm against how the model was trained.
tracks = pd.concat([weights_svd_df, track_metadata_ss], axis=1)
tracks.head()

Unnamed: 0,x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,...,x18,x19,x20,x21,x22,x23,x24,energy,speechiness,tempo
0,6.318724,1.9465,4.130989,-2.038212,2.771502,0.942558,-0.39546,-3.703319,-2.462871,0.647212,...,-2.760571,-1.172213,-0.431424,0.601739,1.239263,1.015561,0.552901,0.667303,2.047631,1.010028
1,7.765625,-0.506461,1.944812,0.769629,-3.114601,-2.757187,0.440005,0.327375,-5.751104,0.666494,...,1.00872,2.356063,-2.909463,-2.17223,3.243688,0.447866,0.450233,-0.317251,1.335577,0.430864
2,5.742418,-1.509559,1.542711,-1.679113,2.38528,-0.502519,-0.843322,0.443783,0.426616,-0.987517,...,-0.452791,1.116307,1.17894,0.510454,-0.501519,0.444036,1.220786,0.691917,-0.133433,-0.339307
3,11.007117,-1.90207,-1.56686,-3.731142,-0.989067,0.608182,1.941592,2.460689,2.168383,0.497704,...,-1.325302,1.822555,5.479691,2.851597,2.290047,0.090016,2.984782,-0.218795,-0.646625,0.75506
4,6.636757,-1.833661,-1.459982,3.947194,-0.039765,-2.359881,2.378551,1.507767,-3.28971,1.03235,...,-0.19224,-0.212977,-0.349321,-2.401229,1.801834,0.50329,-0.339052,-0.704919,-0.197582,-0.742269


# Nearest Neighbors

In [290]:
# Fit a ball-tree neighbour index over the combined feature matrix, set up to
# return 10 neighbours per query.
nn = NearestNeighbors(n_neighbors=10, algorithm='ball_tree').fit(tracks)

In [291]:
# Query the index with the same rows it was fitted on. In the sample output
# further down, track 13737 leads its own neighbour list, so each list
# presumably holds the query track plus 9 others — confirm this is intended.
distances, indices = nn.kneighbors(tracks)

In [292]:
# Map each row position to its array of neighbour positions.
nn_d = dict(enumerate(indices))

# Get Similar Tracks

In [293]:
def get_similar_tracks(artist_index):
    """Print the precomputed neighbours of one track.

    ``artist_index`` is used as a key into ``nn_d``, and the positions found
    there are looked up with ``df.iloc``; despite its name it is therefore a
    row position in ``df``. The neighbour positions are printed first,
    followed by the title, artist and lyrics of every neighbouring row.
    Returns None.
    """
    neighbor_positions = nn_d[artist_index]
    print(neighbor_positions, '\n')
    for _, row in df.iloc[neighbor_positions].iterrows():
        artist, title, text = row['artist_name'], row['track_name'], row['lyrics']
        print(title)
        print(artist)
        print('===============\n')
        print(text)
        print('\n--------------------------\n')

In [294]:
# Draw a random valid row position. randrange excludes its upper bound, so the
# result is always in 0 .. len(df) - 1. The previous randint(0, 19917) could
# return 19917, one past the last row of the 19917-row frame, and hard-coded
# the row count.
rand_artist = random.randrange(len(df))
print(rand_artist)
# The lookup below is pinned to track 13737 so it can be compared with the
# doc2vec-only results later on; pass rand_artist instead to explore.
get_similar_tracks(13737)

10901
[13737 18581 17079  1834  8388   726  2340 10185 13256 14145] 

Real Friends
Kanye West

Real friends, how many of us?
How many of us, how many jealous? Real friends
It's not many of us, we smile at each other
But how many honest? Trust issues
Switched up the number, I can't be bothered
I cannot blame you for havin' an angle
I ain't got no issues, I'm just doin' my thing
Hope you're doin' your thing too
I'm a deadbeat cousin, I hate family reunions
Fuck the church up by drinkin' after communion
Spillin' free wine, now my tux is ruined
In town for a day, what the fuck we doin'?
Who your real friends? We all came from the bottom
I'm always blamin' you, but what's sad, you not the problem
Damn I forgot to call her, shit I thought it was Thursday
Why you wait a week to call my phone in the first place?
When was the last time I remembered a birthday?
When was the last time I wasn't in a hurry?

Tell me you want your tickets when it's gametime
Even to call your daughter on her FaceTime

In [214]:
def get_similar_tracks_2(artist_index, topn=10):
    """Print a track and its most similar tracks according to the model alone.

    Uses ``model.docvecs.most_similar`` directly, without the SVD reduction or
    audio features used for the nearest-neighbour lookup.

    Parameters
    ----------
    artist_index : int
        Passed to ``most_similar`` and to ``df.iloc``; despite its name it is
        a row position in ``df``.
    topn : int, optional
        Number of similar tracks to print. Defaults to 10, matching the
        previous fixed behaviour.

    Returns
    -------
    None
        Everything is printed.
    """
    # Request only the matches that will be shown. The earlier version ranked
    # every document (topn=model.docvecs.count) and then kept the first 10.
    sims = model.docvecs.most_similar(artist_index, topn=topn)

    main_track = df.iloc[artist_index]
    print(main_track['artist_name'])
    print(main_track['track_name'])
    print('===============\n')
    print(main_track['lyrics'])
    print('%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%\n')

    # NOTE(review): assumes the ids returned by most_similar are integer row
    # positions in df (they are fed to df.iloc) — confirm against the tags
    # used at training time.
    for doc_id, similarity in sims:
        track = df.iloc[doc_id]
        artist_name = track['artist_name']
        track_name = track['track_name']
        lyrics = track['lyrics']
        print(track_name)
        print(artist_name)
        print(similarity)
        print('===============\n')
        print(lyrics)
        print('\n--------------------------\n')

In [215]:
# Same track as the nearest-neighbour lookup (13737), for side-by-side comparison.
get_similar_tracks_2(13737)

Kanye West
Real Friends

Real friends, how many of us?
How many of us, how many jealous? Real friends
It's not many of us, we smile at each other
But how many honest? Trust issues
Switched up the number, I can't be bothered
I cannot blame you for havin' an angle
I ain't got no issues, I'm just doin' my thing
Hope you're doin' your thing too
I'm a deadbeat cousin, I hate family reunions
Fuck the church up by drinkin' after communion
Spillin' free wine, now my tux is ruined
In town for a day, what the fuck we doin'?
Who your real friends? We all came from the bottom
I'm always blamin' you, but what's sad, you not the problem
Damn I forgot to call her, shit I thought it was Thursday
Why you wait a week to call my phone in the first place?
When was the last time I remembered a birthday?
When was the last time I wasn't in a hurry?

Tell me you want your tickets when it's gametime
Even to call your daughter on her FaceTime
Even when we was young I used to make time
Now we be way too busy jus

In [295]:
# Persist the row-position -> neighbour-positions mapping for reuse outside
# this notebook.
# NOTE(review): the values are rows of the kneighbors result, presumably NumPy
# arrays, so whatever unpickles this file will need NumPy available — confirm.
with open('../Data_Files/neighbors', 'wb') as f:
    pickle.dump(nn_d, f)