In [39]:
#Import Packages

# Data analysis
import pandas as pd
import numpy as np
import requests

# Data cleaning
import re

# Tokenizing words
import spacy
from spacy.tokenizer import Tokenizer
from collections import Counter

# TFIDF
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Encoding
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Wrangling data

In [3]:
spotify_data = pd.read_csv('/Users/yinmialas/Desktop/dspt7_u4_spotify_proyect/archive_data1/SpotifyAudioFeaturesApril2019.csv')
print(spotify_data.shape)
spotify_data.head()

(130663, 17)


Unnamed: 0,artist_name,track_id,track_name,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence,popularity
0,YG,2RM4jf1Xa9zPgMGRDiht8O,"Big Bank feat. 2 Chainz, Big Sean, Nicki Minaj",0.00582,0.743,238373,0.339,0.0,1,0.0812,-7.678,1,0.409,203.927,4,0.118,15
1,YG,1tHDG53xJNGsItRA3vfVgs,BAND DRUM (feat. A$AP Rocky),0.0244,0.846,214800,0.557,0.0,8,0.286,-7.259,1,0.457,159.009,4,0.371,0
2,R3HAB,6Wosx2euFPMT14UXiWudMy,Radio Silence,0.025,0.603,138913,0.723,0.0,9,0.0824,-5.89,0,0.0454,114.966,4,0.382,56
3,Chris Cooq,3J2Jpw61sO7l6Hc7qdYV91,Lactose,0.0294,0.8,125381,0.579,0.912,5,0.0994,-12.118,0,0.0701,123.003,4,0.641,0
4,Chris Cooq,2jbYvQCyPgX3CdmAzeVeuS,Same - Original mix,3.5e-05,0.783,124016,0.792,0.878,7,0.0332,-10.277,1,0.0661,120.047,4,0.928,0


In [4]:
spotify_data = spotify_data.sample(frac=.2, axis = 0)
spotify_data.shape

(26133, 17)

In [5]:
# Copy dataframe
spotify_data2 = spotify_data.copy()

In [6]:
# Combine text columns for tokenization
col_combine = ['artist_name', 'track_name']

# Lowercase and regex
for each in col_combine:
  spotify_data2[each]= spotify_data2[each].apply(lambda x:x.lower())
  spotify_data2[each]= spotify_data2[each].apply(lambda x: re.sub('[^a-zA-Z 0-9]', ' ', x))

# Combine two columns with text
spotify_data2['combined_text'] = spotify_data2['combined_text'] = spotify_data2['artist_name'] + spotify_data2['track_name'] 

# Remove repetitive columns
spotify_data2= spotify_data2.drop(['artist_name', 'track_name', 'track_id'], axis = 1)

In [7]:
spotify_data2.head()

Unnamed: 0,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence,popularity,combined_text
57763,0.0252,0.817,178363,0.664,0.0,5,0.126,-6.372,1,0.2,107.945,4,0.592,58,s waveywavey
1194,0.258,0.791,225250,0.698,0.0,8,0.214,-4.062,1,0.264,119.992,3,0.511,0,sean paultouch pop mix
28690,0.972,0.477,198026,0.141,5.7e-05,7,0.0792,-10.516,1,0.0803,76.777,5,0.301,4,jesget me through the night acoustic mix
81233,0.0467,0.408,360253,0.906,3e-06,1,0.0914,-6.335,0,0.0858,172.17,4,0.283,4,burnsink together
109081,0.0411,0.761,193565,0.896,5.8e-05,1,0.0725,-3.888,1,0.0587,114.994,4,0.295,41,why don t wehooked borgeous remix


In [8]:
# dropping combine column
spotify_data2_sub = spotify_data2.drop(['combined_text'], axis = 1)
spotify_data2_sub.shape

(26133, 14)

In [9]:
spotify_data2_sub.head()

Unnamed: 0,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence,popularity
57763,0.0252,0.817,178363,0.664,0.0,5,0.126,-6.372,1,0.2,107.945,4,0.592,58
1194,0.258,0.791,225250,0.698,0.0,8,0.214,-4.062,1,0.264,119.992,3,0.511,0
28690,0.972,0.477,198026,0.141,5.7e-05,7,0.0792,-10.516,1,0.0803,76.777,5,0.301,4
81233,0.0467,0.408,360253,0.906,3e-06,1,0.0914,-6.335,0,0.0858,172.17,4,0.283,4
109081,0.0411,0.761,193565,0.896,5.8e-05,1,0.0725,-3.888,1,0.0587,114.994,4,0.295,41


In [10]:
scaler = StandardScaler()
scaler.fit(spotify_data2_sub)
scaled_df = pd.DataFrame(scaler.transform(spotify_data2_sub))
print(scaled_df.shape)
scaled_df.head()

(26133, 14)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,-0.920431,1.241942,-0.300879,0.367912,-0.620502,-0.061184,-0.414756,0.553341,0.79857,0.696899,-0.381228,0.235739,0.590167,1.701698
1,-0.248033,1.104754,0.122282,0.498612,-0.620502,0.772894,0.111216,0.908205,0.79857,1.208537,0.017183,-1.72,0.276764,-1.235341
2,1.814221,-0.552055,-0.123418,-1.642566,-0.620343,0.494868,-0.694478,-0.083264,0.79857,-0.260025,-1.411997,2.191479,-0.535761,-1.032787
3,-0.858333,-0.916131,1.340702,1.29819,-0.620493,-1.173288,-0.621559,0.559025,-1.252239,-0.216056,1.742782,0.235739,-0.605406,-1.032787
4,-0.874507,0.94646,-0.163679,1.259749,-0.620342,-1.173288,-0.734524,0.934935,0.79857,-0.432703,-0.148107,0.235739,-0.558976,0.840841


# Spacy

In [11]:
from pandas import Panel
from tqdm import tqdm
tqdm.pandas()

In [12]:
!python -m spacy download en_core_web_lg

You should consider upgrading via the '/Users/yinmialas/.local/share/virtualenvs/dspt7_u4_spotify_proyect-hy7WYrSs/bin/python -m pip install --upgrade pip' command.[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_lg')


In [13]:
#lemmas with spacy
nlp = spacy.load('en_core_web_lg', disable=['tagger', 'parser'])

# Wrap it all in a function
def get_lemmas(text):
    lemmas = []
    doc = nlp(text)
    # Something goes here :P
    doc_tokens = []
    for token in doc: 
        if ((token.is_stop == False) and (token.is_punct == False)) and (token.pos_!= 'PRON'):
            lemmas.append(token.lemma_)
    return lemmas

In [15]:
# Get lemmas

spotify_data2['lemmas'] = spotify_data2['combined_text'].progress_apply(get_lemmas)
spotify_data2['lemmas'].head()

100%|██████████| 26133/26133 [00:58<00:00, 445.21it/s]


57763                             [s, waveywavey]
1194               [sean, paultouch,  , pop, mix]
28690          [jesget, night,   , acoustic, mix]
81233                                  [burnsink]
109081    [don, t, wehooked,   , borgeous, remix]
Name: lemmas, dtype: object

# TFIDF

In [17]:
# Set up TFIDF
# Instantiate vectorizer object

def tokenize(document):
    
    doc = nlp(document)
    
    return [token.lemma_.strip() for token in doc if (token.is_stop != True) and (token.is_punct != True)]

tfidf = TfidfVectorizer(
    stop_words = 'english',
    # tokenizer = tokenize,
    ngram_range = (1,1),
    min_df = 1, 
    max_df = 0.9,
    max_features = 1000)

In [18]:
# Create a vocabulary and tf-idf score per document
text = spotify_data2['combined_text']
dtm = tfidf.fit_transform(text)

In [20]:
spotify_data2.shape

(26133, 16)

In [21]:
# Get feature names to use as dataframe column headers
dtm = pd.DataFrame(dtm.todense(), columns=tfidf.get_feature_names())

# View Feature Matrix as DataFrame
print(dtm.shape)
dtm.head()

(26133, 1000)


Unnamed: 0,03,10,100,1008,1080,11,117,12,13,14,...,yellow,yfn,ynw,yo,young,youngboy,yung,yuuki,zauberfl,zu
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [22]:
combined_df=pd.concat([scaled_df, dtm], axis = 1)
combined_df.shape

(26133, 1014)

# Similarity Recommender
# Nearest Neighbors

In [24]:
# Calculate Distance of TF-IDF Vectors
from sklearn.metrics.pairwise import cosine_similarity

In [25]:
# Calculate Distance of TF-IDF Vectors
dist_matrix  = cosine_similarity(dtm)

In [26]:
# Turn it into a DataFrame
cosine_df = pd.DataFrame(dist_matrix)
print(cosine_df.shape)
cosine_df.head()

(26133, 26133)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,26123,26124,26125,26126,26127,26128,26129,26130,26131,26132
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,1.0,0.30249,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.30249,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.362343
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.218518,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.248431


In [29]:
# Verify it was added
leng = len(spotify_data)-1
spotify_data.iloc[leng]

artist_name                                     Panic! At The Disco
track_id                                     0mhHHf2dn1mVXACRpmN6Zb
track_name          Say Amen (Saturday Night) - Sweater Beats Remix
acousticness                                                0.00291
danceability                                                  0.549
duration_ms                                                  188571
energy                                                        0.713
instrumentalness                                             0.0016
key                                                               6
liveness                                                     0.0406
loudness                                                     -5.798
mode                                                              0
speechiness                                                   0.041
tempo                                                       167.961
time_signature                                  

In [30]:
# # Grab the top 5 most similar strains to the custom strain at the start.
last_cosine = len(cosine_df)-1
cosine_results = cosine_df[cosine_df[0] < 1][last_cosine].sort_values(ascending=False)[1:6]
cosine_results =  pd.DataFrame(cosine_results)
cosine_results = cosine_results.reset_index()
cos_results = cosine_results['index'].values.tolist()
cos_results

[1238, 17361, 11059, 15492, 13055]

In [32]:
# Check results
print('----------------------------')
print('----------------------------')
print(f"Seed song:") 
print(f"{spotify_data.iloc[leng]}")
print('----------------------------')
print('----------------------------')
print('Similar songs:')
print('----------------------------')
for each in cos_results:
  print(spotify_data.iloc[each])

----------------------------
----------------------------
Seed song:
artist_name                                     Panic! At The Disco
track_id                                     0mhHHf2dn1mVXACRpmN6Zb
track_name          Say Amen (Saturday Night) - Sweater Beats Remix
acousticness                                                0.00291
danceability                                                  0.549
duration_ms                                                  188571
energy                                                        0.713
instrumentalness                                             0.0016
key                                                               6
liveness                                                     0.0406
loudness                                                     -5.798
mode                                                              0
speechiness                                                   0.041
tempo                                          