In [1]:
import pandas as pd

In [2]:
df = pd.read_csv("spotify_millsongdata.csv")

In [3]:
df.head(5)

Unnamed: 0,artist,song,link,text
0,ABBA,Ahe's My Kind Of Girl,/a/abba/ahes+my+kind+of+girl_20598417.html,"Look at her face, it's a wonderful face \r\nA..."
1,ABBA,"Andante, Andante",/a/abba/andante+andante_20002708.html,"Take it easy with me, please \r\nTouch me gen..."
2,ABBA,As Good As New,/a/abba/as+good+as+new_20003033.html,I'll never know why I had to go \r\nWhy I had...
3,ABBA,Bang,/a/abba/bang_20598415.html,Making somebody happy is a question of give an...
4,ABBA,Bang-A-Boomerang,/a/abba/bang+a+boomerang_20002668.html,Making somebody happy is a question of give an...


In [4]:
df.tail(5)

Unnamed: 0,artist,song,link,text
57645,Ziggy Marley,Good Old Days,/z/ziggy+marley/good+old+days_10198588.html,Irie days come on play \r\nLet the angels fly...
57646,Ziggy Marley,Hand To Mouth,/z/ziggy+marley/hand+to+mouth_20531167.html,Power to the workers \r\nMore power \r\nPowe...
57647,Zwan,Come With Me,/z/zwan/come+with+me_20148981.html,all you need \r\nis something i'll believe \...
57648,Zwan,Desire,/z/zwan/desire_20148986.html,northern star \r\nam i frightened \r\nwhere ...
57649,Zwan,Heartsong,/z/zwan/heartsong_20148991.html,come in \r\nmake yourself at home \r\ni'm a ...


In [5]:
df.shape

(57650, 4)

In [6]:
### checking the total of null values
df.isnull().sum()

artist    0
song      0
link      0
text      0
dtype: int64

In [7]:
### Droping link column
df = df.drop('link', axis=1).reset_index(drop=True)

In [8]:
df.head(5)

Unnamed: 0,artist,song,text
0,ABBA,Ahe's My Kind Of Girl,"Look at her face, it's a wonderful face \r\nA..."
1,ABBA,"Andante, Andante","Take it easy with me, please \r\nTouch me gen..."
2,ABBA,As Good As New,I'll never know why I had to go \r\nWhy I had...
3,ABBA,Bang,Making somebody happy is a question of give an...
4,ABBA,Bang-A-Boomerang,Making somebody happy is a question of give an...


In [9]:
### Lyrics of the first song
df['text'][0]

"Look at her face, it's a wonderful face  \r\nAnd it means something special to me  \r\nLook at the way that she smiles when she sees me  \r\nHow lucky can one fellow be?  \r\n  \r\nShe's just my kind of girl, she makes me feel fine  \r\nWho could ever believe that she could be mine?  \r\nShe's just my kind of girl, without her I'm blue  \r\nAnd if she ever leaves me what could I do, what could I do?  \r\n  \r\nAnd when we go for a walk in the park  \r\nAnd she holds me and squeezes my hand  \r\nWe'll go on walking for hours and talking  \r\nAbout all the things that we plan  \r\n  \r\nShe's just my kind of girl, she makes me feel fine  \r\nWho could ever believe that she could be mine?  \r\nShe's just my kind of girl, without her I'm blue  \r\nAnd if she ever leaves me what could I do, what could I do?\r\n\r\n"

In [10]:
### Taking only 10,000 samples
df = df.sample(10000)

In [11]:
df.shape

(10000, 3)

### text cleaning / text preprocessing

In [12]:
### 1. changint the text into lower case, and then replacing special characters, numbers, and newline by white space
df['text'] = df['text'].str.lower().replace(r'^\w\s', ' ').replace(r'\n', ' ', regex=True)

In [13]:
df['text']

29748    what do you want, baby  \r one last piece of m...
47882    ho! the band stopped playin'  \r an' the coupl...
17125    god gave me back tomorrow  \r words by ray bol...
17696    gotta keep it on the down low  \r i'll never l...
35201    take these chains from my heart and set me fre...
                               ...                        
49389    i wreck your bed  \r the colour is honey  \r y...
5623     out of my depth  \r lost in the air  \r fallin...
25124    now i have nothing, so god give me strength,  ...
12243    knowing love the way i do  \r i can say for ce...
52554    girls don't go crazy  \r   \r woman, hold your...
Name: text, Length: 10000, dtype: object

In [14]:
### 2. then we need to stem the text 
import nltk
from nltk.stem.porter import PorterStemmer

In [15]:
stemmer = PorterStemmer()

In [16]:
### Then we need to convert the words into tokens
def token(text):
    token = nltk.word_tokenize(text)
    a = [stemmer.stem(w) for w in token]
    return " ".join(a)

In [25]:
### nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Administrator\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [17]:
### example
token("you are so beautiful, beauty")

'you are so beauti , beauti'

In [20]:
### Applying the token function to the text column
df['text'].apply(lambda x: token(x))

29748    what do you want , babi one last piec of my he...
47882    ho ! the band stop playin ' an ' the coupl sto...
17125    god gave me back tomorrow word by ray boltz , ...
17696    got ta keep it on the down low i 'll never let...
35201    take these chain from my heart and set me free...
                               ...                        
49389    i wreck your bed the colour is honey your fing...
5623     out of my depth lost in the air fall faster li...
25124    now i have noth , so god give me strength , 'c...
12243    know love the way i do i can say for certain t...
52554    girl do n't go crazi woman , hold your man tig...
Name: text, Length: 10000, dtype: object

In [19]:
### 3. then we need to vectorize the words
### for this task, we use TF-IDF vectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

### to calculate the distance between data points we use cosine similarity
from sklearn.metrics.pairwise import cosine_similarity


In [22]:
### Using TfidfVectorizer
tfidf = TfidfVectorizer(analyzer='word', stop_words='english')

In [23]:
tfidf.fit_transform(df['text'])

<10000x34779 sparse matrix of type '<class 'numpy.float64'>'
	with 538613 stored elements in Compressed Sparse Row format>

In [24]:
matrix  = tfidf.fit_transform(df['text'])

In [25]:
cosine_similarity(matrix)

array([[1.        , 0.02494892, 0.01744261, ..., 0.03900175, 0.10182097,
        0.07714549],
       [0.02494892, 1.        , 0.04437611, ..., 0.00908212, 0.0679513 ,
        0.02489329],
       [0.01744261, 0.04437611, 1.        , ..., 0.10436375, 0.06675931,
        0.01480936],
       ...,
       [0.03900175, 0.00908212, 0.10436375, ..., 1.        , 0.09095438,
        0.01749139],
       [0.10182097, 0.0679513 , 0.06675931, ..., 0.09095438, 1.        ,
        0.07654491],
       [0.07714549, 0.02489329, 0.01480936, ..., 0.01749139, 0.07654491,
        1.        ]])

In [26]:
similar = cosine_similarity(matrix)

In [27]:
similar[0]

array([1.        , 0.02494892, 0.01744261, ..., 0.03900175, 0.10182097,
       0.07714549])

In [29]:
df[df['song'] == 'Good Old Days']

Unnamed: 0,artist,song,text
48638,P!nk,Good Old Days,"1, 2, 3, 4, 5 years go by, \r i don't really ..."


In [32]:
df[df['song'] == 'Good Old Days'].index[0]

48638