### Import Statements

In [1]:
#import statements
import pandas as pd

# Import TfidfVectorizer from sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


### Convert CSV file to Dataframe

In [2]:
#convert CSV to Dataframe

In [3]:
songs_df = pd.read_csv ('data.csv')
print(songs_df.head())

                         artist          title  \
0                      Lil Nas,  Old Town Road   
1  Shawn Mendes, Camila Cabello       Senorita   
2                 Billie Eilish        Bad Guy   
3                        Khalid           Talk   
4     Ed Sheeran, Justin Bieber   I Don't Care   

                                               genre  \
0  Country,Atlanta,Alternative Country,Hip-Hop,Tr...   
1                                                Pop   
2  Hip-Hop,Dark Pop,House,Trap,Memes,Alternative ...   
3                                      Synth-Pop,Pop   
4                      Canada,UK,Dance,Dance-Pop,Pop   

                                              lyrics                      id  \
0  Old Town Road Remix \nOh, oh-oh\nOh\nYeah, I'm...  2YpeDb67231RjR0MgVLzsG   
1  Senorita \nI love it when you call me senorita...  0TK2YIli7K1leLovkQiNik   
2  bad guy \nWhite shirt now red, my bloody nose\...  2Fxmhks0bxGSBdJ92vM42m   
3  Talk \nCan we just talk? Can we just ta

### Analyse the Data

#### This dataset contains 124272 songs ,each having  fields artist name, title, genre, id, danceability, energer,key, loudness,mode, speechiness, acousticness, instrumentalness , liveness, valence, tempo, duration_ms, time_signature

In [10]:
#TF-IDF
songs_df.size

124272

#### Dropping the column time_signature since it is not used in building the models 

In [19]:
del songs_df['time_signature']


#### Create a playlist by randomly selecting songs from Dataset


In [20]:
playlist_df = songs_df.sample(n=3)


In [21]:
playlist_df

Unnamed: 0,artist,title,genre,lyrics,id,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms
3746,Alpha Rev,New Morning,"Adult Alternative,Indie Rock,Rock",New Morning \nI don't give a damn\n'Bout the c...,7aOH8ShpP7P64x9fyTHxyc,0.191,0.682,5,-7.188,1,0.0413,0.05,5e-06,0.156,0.407,178.071,224827
6025,Justin Timberlake,Like I Love You,"Rap,R&;B,Pop",Like I Love You \nJust something bout you...\n...,6W2Ef5Ph6ILTUAedoQ3QIv,0.853,0.811,6,-4.926,0,0.0646,0.0439,0.000304,0.0703,0.901,114.961,283627
4975,Field Mob,So What,"R&;B,Rap","So What \nLadies and GENTLEMEN!\nJazze Pha, Fi...",0Uc706myy6Th7I6KQ9xA1x,0.931,0.486,6,-10.558,1,0.139,0.0579,0.0,0.161,0.896,120.026,216000


In [None]:
# list of indexes for songs in the playlist created of 
marks_list = df['Marks'].tolist()

In [9]:
playlist_df['lyrics'][1661]



"Focus \nHey\nI know what I came to do\nAnd that ain't gonna change\nSo go ahead and talk your talk\n'Cause I won't take the bait\nI'm over here doing what I like\nI'm over here working day and night\nAnd if my real ain't real enough\nI'm sorry for you, bae\nLet's find a light inside our universe now\nWhere ain't nobody keep on holding us down\nJust come and get it, let them say what they say\n'Cause I'm about to put them all away, woo\nFocus on me, f-f-focus on me, woo\nFocus on me, f-f-focus on me, woo\nFocus on me Focus, f-f-focus on me, woo Focus on me\nFocus on me Focus, f-f-focus on me, woo Focus on me\nI can tell you're curious\nIt's written on your lips\nAin't no need to hold it back\nGo head and talk your shit Hey\nI know you're hoping that I'll react\nI know you're hoping I'm looking back\nBut if my real ain't real enough\nThen I don't know what is Hey, yeah\nLet's find a light inside our universe now Mmm, yeah\nWhere ain't nobody keep on holding us down Nobody keep on holdin

In [11]:
songs = songs_df.reset_index(drop=True)

# Replace \n present in the text
songs['text'] = songs['lyrics'].str.replace(r'\n', '')

# Initialize tfidf vectorizer
tfidf = TfidfVectorizer(analyzer='word', stop_words='english')

# Fit and transform 
tfidf_matrix = tfidf.fit_transform(songs['text'])

In [12]:
tfidf_matrix

<6904x159125 sparse matrix of type '<class 'numpy.float64'>'
	with 702069 stored elements in Compressed Sparse Row format>

### Building Model using Cosine Similarity

In [13]:
cosine_similarities = cosine_similarity(tfidf_matrix)


In [14]:
len(cosine_similarities)

6904

In [15]:
similarities = {}

for i in range(len(cosine_similarities)):
    # Now we'll sort each element in cosine_similarities and get the indexes of the songs. 
    similar_indices = cosine_similarities[i].argsort()[:-50:-1] 
    # After that, we'll store in similarities each name of the 50 most similar songs.
    # Except the first one that is the same song.
    similarities[songs['title'].iloc[i]] = [(cosine_similarities[i][x], songs['title'][x], songs['artist'][x]) for x in similar_indices][1:]

In [16]:
similarities

{'Old Town Road': [(0.23378393514335316, 'Save A Horse', 'Big, Rich'),
  (0.18792975593858896, 'Hanginaround', 'Counting Crows'),
  (0.18145816968554287, "Ain't Worried About Nothin", 'French Montana'),
  (0.16119984055718384, 'Wanksta', '50 Cent'),
  (0.16050261350428852, "Lettin' The Night Roll", 'Justin Moore'),
  (0.15591842340550816, "Like I'm Gonna Lose You", 'Meghan Trainor'),
  (0.14660165600362907, "This Ain't Nothin'", 'Craig Morgan'),
  (0.14547709560661237, 'Small Town Boy', 'Dustin Lynch'),
  (0.14326984280042177, "I'm Not Gonna Miss You", 'Glen Campbell'),
  (0.1419893901481013, 'How You Gonna Act Like That', 'Tyrese'),
  (0.13968663088395977,
   "Cruisin' For A Bruisin'",
   'Ross Lynch, Grace Phipps And Jason Evigan'),
  (0.13859286160756018, 'Slow Jamz', 'Twista'),
  (0.13468919608871624,
   "Everything's Gonna Be Alright",
   'David Lee Murphy, Kenny Chesney'),
  (0.1328005799694785, 'Jingle Bells', 'Frank Sinatra'),
  (0.13133233107119566, 'Black Widow', 'Iggy Azalea

In [34]:

class ContentBasedRecommender:
    def __init__(self, matrix):
        self.matrix_similar = matrix

    def _print_message(self, song, recom_song):
        rec_items = len(recom_song)
        
        print(f'The {rec_items} recommended songs for {song} are:')
        for i in range(rec_items):
            print(f"Number {i+1}:")
            print(f"{recom_song[i][1]} by {recom_song[i][2]} with {round(recom_song[i][0], 3)} similarity score") 
            print("--------------------")
        
    def recommend(self, recommendation):
        # Get song to find recommendations for
        song = recommendation['song']
        # Get number of songs to recommend
        number_songs = recommendation['number_songs']
        # Get the number of songs most similars from matrix similarities
        recom_song = self.matrix_similar[song][:number_songs]
        # print each item
        self._print_message(song=song, recom_song=recom_song)

In [36]:
recommedations = ContentBasedRecommender(similarities)
recommendation = {
    "song": songs['title'].iloc[10],
    "number_songs": 4 
}
recommedations.recommend(recommendation)


The 4 recommended songs for Truth Hurts are:
Number 1:
Bom Bidi Bom by Nick Jonas, Nicki Minaj with 0.255 similarity score
--------------------
Number 2:
Endless Love by Glee Cast with 0.245 similarity score
--------------------
Number 3:
I'm Out by Ciara with 0.198 similarity score
--------------------
Number 4:
One Of Us by Glee Cast with 0.168 similarity score
--------------------
