#### Compute Similarity
The goal of this notebook is to compute song similarity between a sample of all songs and create necessary files to create a graph in neo4j.

In [1]:
# import necessary libraries
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# import song data
song_df = pd.read_csv('spotify.csv')
song_df.drop('Unnamed: 0', inplace=True, axis=1)
song_df.head(3)

Unnamed: 0,track_id,artists,album_name,track_name,popularity,duration_ms,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre
0,5SuOikwiRyPMVoIQDJUgSV,Gen Hoshino,Comedy,Comedy,73,230666,False,0.676,0.461,1,-6.746,0,0.143,0.0322,1e-06,0.358,0.715,87.917,4,acoustic
1,4qPNDBW1i3p13qLCt0Ki3A,Ben Woodward,Ghost (Acoustic),Ghost - Acoustic,55,149610,False,0.42,0.166,1,-17.235,1,0.0763,0.924,6e-06,0.101,0.267,77.489,4,acoustic
2,1iJBSr7s7jYXzM8EGcbK5b,Ingrid Michaelson;ZAYN,To Begin Again,To Begin Again,57,210826,False,0.438,0.359,0,-9.734,1,0.0557,0.21,0.0,0.117,0.12,76.332,4,acoustic


In [3]:
def row_similarity(row1: pd.Series, row2: pd.Series) -> float:
    """
        Computes similarity between two songs by computing the cosine similarity on
        columns of interest.

        Args:
            row1 (Series): song 1 to compare similarity
            row2 (Series): song 2 to compare similarity
        
        Returns:
        sim_score (float): similarity between two songs
    """ 
    # compute values that represent if the songs have the same artists, album_name, or track_genre
    same_artists = 1 if row1['artists'] == row2['artists'] else 0
    same_album_name = 1 if row1['album_name'] == row2['album_name'] else 0
    same_track_genre = 1 if row1['track_genre'] == row2['track_genre'] else 0

    # compute cosine similarity between two songs
    sim_score = cosine_similarity([[same_artists, same_album_name, same_track_genre,
            row1['popularity'], row1['energy'], row1['loudness'], row1['acousticness'],
            row1['instrumentalness'], row1['liveness'], row1['valence'], row1['tempo']]],
            [[same_artists, same_album_name, same_track_genre,
            row2['popularity'], row2['energy'], row1['loudness'], row1['acousticness'],
            row2['instrumentalness'], row2['liveness'], row2['valence'], row2['tempo']]])[0][0]

    return sim_score

In [4]:
def compare_all_songs(df: pd.DataFrame) -> pd.DataFrame:
    """
        Computes similarity between every pair of unique songs in the given DataFrame.

        Args:
            df (DataFrame): dataframe of songs
        
        Returns:
        sim_df (DataFrame): dataframe of unique song pairs and their similarity score
    """ 
    # create unique pairs of song rows
    unique_pairs = [(i, row1, j, row2) for i, row1 in df.iterrows() for j, row2 in df.iterrows()if i<j]

    # create a list of similarity scores for each unique pair
    similarity_scores = [row_similarity(row1, row2) for _, row1, _, row2 in unique_pairs]

    # get track ids and similarity scores for each unique pair
    track_ids = [(row1['track_id'], row2['track_id']) for _, row1, _, row2 in unique_pairs]
    sim_scores = similarity_scores  

    # create a dataframe of similairty scores for each unique pair
    sim_df = pd.DataFrame({'track_id1': [t[0] for t in track_ids],
                        'track_id2': [t[1] for t in track_ids],
                        'sim_score': sim_scores})
    return sim_df

In [7]:
# take a random sample of all songs with a total of 100 songs
sample_df = song_df.sample(n=100, replace=False)
# get all Strokes songs available 
strokes_df = song_df[song_df['artists']=='The Strokes']
# add all Strokes songs to the sample
sample_df = pd.concat([sample_df, strokes_df])
# drop duplicate rows from the sample
sample_df= sample_df.drop_duplicates().reset_index(drop=True)
sample_df.head(3)

Unnamed: 0,track_id,artists,album_name,track_name,popularity,duration_ms,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre
0,3D8pyI6CPeOAOZmkavoei0,Blood Duster,Cunt,We Are the Word Police,12,74426,True,0.458,0.951,11,-9.676,0,0.144,0.00242,0.871,0.549,0.203,110.505,4,grindcore
1,18Q6eRrnN2AGuE3XiprbNg,Mini Pop Kids,Mini Pop Kids 18,Savage Love,41,171493,False,0.742,0.598,3,-6.534,1,0.0596,0.326,0.0,0.093,0.679,112.537,3,kids
2,3FQCJI2t5LTbsRPfYVBSVB,Linkin Park,One More Light,Battle Symphony,64,216281,False,0.653,0.839,10,-5.077,1,0.0685,0.0369,0.0,0.068,0.669,149.034,4,metal


In [9]:
# get similarity scores for each row in the sample
sim_df = compare_all_songs(sample_df)
sim_df.head(3)

Unnamed: 0,track_id1,track_id2,sim_score
0,3D8pyI6CPeOAOZmkavoei0,18Q6eRrnN2AGuE3XiprbNg,0.971183
1,3D8pyI6CPeOAOZmkavoei0,3FQCJI2t5LTbsRPfYVBSVB,0.955914
2,3D8pyI6CPeOAOZmkavoei0,7dwSZfYZgdwiYO3s4VBU61,0.763505


In [12]:
# save these scores to a csv
sim_df.to_csv('sample_song_similarity.csv', index=False)

In [13]:
# save songs in the sample to a csv
sample_df.to_csv('sample_songs.csv', index=False)