#### Compute Similarity
The goal of this notebook is to compute pairwise song similarity for a sample of songs and to create the node and relationship files needed to build a graph in Neo4j.

In [2]:
# import necessary libraries
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from itertools import combinations


In [3]:
# load the song data and discard the unnamed first column of the CSV
# (presumably a saved row index -- confirm against the raw file)
song_df = pd.read_csv('spotify.csv').drop(columns='Unnamed: 0')
song_df.head()


Unnamed: 0,track_id,artists,album_name,track_name,popularity,duration_ms,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre
0,5SuOikwiRyPMVoIQDJUgSV,Gen Hoshino,Comedy,Comedy,73,230666,False,0.676,0.461,1,-6.746,0,0.143,0.0322,1e-06,0.358,0.715,87.917,4,acoustic
1,4qPNDBW1i3p13qLCt0Ki3A,Ben Woodward,Ghost (Acoustic),Ghost - Acoustic,55,149610,False,0.42,0.166,1,-17.235,1,0.0763,0.924,6e-06,0.101,0.267,77.489,4,acoustic
2,1iJBSr7s7jYXzM8EGcbK5b,Ingrid Michaelson;ZAYN,To Begin Again,To Begin Again,57,210826,False,0.438,0.359,0,-9.734,1,0.0557,0.21,0.0,0.117,0.12,76.332,4,acoustic
3,6lfxq3CG4xtTiEg7opyCyx,Kina Grannis,Crazy Rich Asians (Original Motion Picture Sou...,Can't Help Falling In Love,71,201933,False,0.266,0.0596,0,-18.515,1,0.0363,0.905,7.1e-05,0.132,0.143,181.74,3,acoustic
4,5vjLSffimiIP26QG5WcN2K,Chord Overstreet,Hold On,Hold On,82,198853,False,0.618,0.443,2,-9.681,1,0.0526,0.469,0.0,0.0829,0.167,119.949,4,acoustic


In [4]:
#TODO should we normalize?
def normalize(df: pd.DataFrame, columns: list) -> pd.DataFrame:
    """
    Normalize specified columns in the DataFrame using Min-Max scaling.

    Each listed column is rescaled with ``(x - min) / (max - min)``, which maps
    its smallest value to 0 and its largest value to 1. A constant column
    (max == min) has no spread to scale, so its values are mapped to 0.0
    instead of dividing by zero and filling the column with NaN.

    Args:
        df (DataFrame): DataFrame containing the data to be normalized
        columns (list): List of column names to be normalized

    Returns:
        normalized_df (DataFrame): Copy of df with the specified columns
            normalized; df itself is not modified

    Raises:
        KeyError: If a name in columns is not a column of df
    """
    normalized_df = df.copy()
    for column in columns:
        # Compute the minimum and the spread of the column
        min_val = df[column].min()
        value_range = df[column].max() - min_val
        if value_range == 0:
            # Constant column: every value equals the minimum, so the scaled
            # value is 0.0 (missing entries stay NaN through the subtraction)
            normalized_df[column] = (df[column] - min_val).astype(float)
        else:
            # Perform Min-Max scaling
            normalized_df[column] = (df[column] - min_val) / value_range
    return normalized_df

# numeric audio features to rescale with Min-Max scaling
columns_to_normalize = [
    'popularity', 'energy', 'loudness', 'acousticness',
    'instrumentalness', 'liveness', 'valence', 'tempo',
]
norm_df = normalize(df=song_df, columns=columns_to_normalize)
norm_df.head()

Unnamed: 0,track_id,artists,album_name,track_name,popularity,duration_ms,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre
0,5SuOikwiRyPMVoIQDJUgSV,Gen Hoshino,Comedy,Comedy,0.73,230666,False,0.676,0.461,1,0.791392,0,0.143,0.032329,1e-06,0.358,0.718593,0.361245,4,acoustic
1,4qPNDBW1i3p13qLCt0Ki3A,Ben Woodward,Ghost (Acoustic),Ghost - Acoustic,0.55,149610,False,0.42,0.166,1,0.597377,1,0.0763,0.927711,6e-06,0.101,0.268342,0.318397,4,acoustic
2,1iJBSr7s7jYXzM8EGcbK5b,Ingrid Michaelson;ZAYN,To Begin Again,To Begin Again,0.57,210826,False,0.438,0.359,0,0.736123,1,0.0557,0.210843,0.0,0.117,0.120603,0.313643,4,acoustic
3,6lfxq3CG4xtTiEg7opyCyx,Kina Grannis,Crazy Rich Asians (Original Motion Picture Sou...,Can't Help Falling In Love,0.71,201933,False,0.266,0.0596,0,0.573701,1,0.0363,0.908635,7.1e-05,0.132,0.143719,0.746758,3,acoustic
4,5vjLSffimiIP26QG5WcN2K,Chord Overstreet,Hold On,Hold On,0.82,198853,False,0.618,0.443,2,0.737103,1,0.0526,0.470884,0.0,0.0829,0.167839,0.492863,4,acoustic


In [5]:
def row_similarity(row1: pd.Series, row2: pd.Series) -> float:
    """
    Computes similarity between two songs by computing the cosine similarity on
    columns of interest.

    Each song is turned into a vector of three 0/1 match flags (same artists,
    same album_name, same track_genre) followed by its own numeric features.
    The match flags are identical in both vectors, so only the numeric
    features differ between the two vectors.

    Args:
        row1 (Series): song 1 to compare similarity
        row2 (Series): song 2 to compare similarity

    Returns:
        sim_score (float): similarity between two songs
    """
    # columns compared for equality between the two songs
    match_columns = ['artists', 'album_name', 'track_genre']
    # numeric columns taken from each song individually
    numeric_columns = ['popularity', 'energy', 'loudness', 'acousticness',
                       'instrumentalness', 'liveness', 'valence', 'tempo']

    # 1 when the songs share the attribute, 0 otherwise
    match_flags = [1 if row1[column] == row2[column] else 0 for column in match_columns]

    # build both vectors from the same column list so every feature of the
    # second vector is read from row2 (loudness and acousticness were
    # previously read from row1 by mistake)
    vector1 = match_flags + [row1[column] for column in numeric_columns]
    vector2 = match_flags + [row2[column] for column in numeric_columns]

    # cosine_similarity takes 2D inputs and returns a 1x1 matrix here
    sim_score = cosine_similarity([vector1], [vector2])[0][0]

    return sim_score

In [6]:
# def compare_all_songs(df: pd.DataFrame) -> pd.DataFrame:
#     """
#         Computes similarity between every pair of unique songs in the given DataFrame.
# 
#         Args:
#             df (DataFrame): dataframe of songs
#         
#         Returns:
#         sim_df (DataFrame): dataframe of unique song pairs and their similarity score
#     """ 
#     # create unique pairs of song rows
#     unique_pairs = [(i, row1, j, row2) for i, row1 in df.iterrows() for j, row2 in df.iterrows()if i<j]
# 
#     # create a list of similarity scores for each unique pair
#     similarity_scores = [row_similarity(row1, row2) for _, row1, _, row2 in unique_pairs]
# 
#     # get track ids and similarity scores for each unique pair
#     track_ids = [(row1['track_id'], row2['track_id']) for _, row1, _, row2 in unique_pairs]
#     sim_scores = similarity_scores  
# 
#     # create a dataframe of similarity scores for each unique pair
#     sim_df = pd.DataFrame({'track_id1': [t[0] for t in track_ids],
#                         'track_id2': [t[1] for t in track_ids],
#                         'sim_score': sim_scores})
#     return sim_df


#TODO I edited yours by adding combinations from itertools to prevent redundant comparisons (like comparing song a to b and b to a)
def compare_all_songs(df: pd.DataFrame) -> pd.DataFrame:
    """
    Computes similarity between every pair of unique songs in the given DataFrame.

    Each unordered pair of rows is scored once with row_similarity, in the
    order the rows appear in df. Rows are selected by position, so the result
    does not depend on the index labels of df being unique.

    Args:
        df (DataFrame): DataFrame of songs; must contain a 'track_id' column
            plus the columns read by row_similarity

    Returns:
        sim_df (DataFrame): DataFrame of unique song pairs and their similarity
            score, with columns 'track_id1', 'track_id2' and 'sim_score'; it
            has no rows when df holds fewer than two songs
    """
    # Extract every row once, by position: label-based lookup returns a
    # DataFrame instead of a Series when an index label is repeated, and
    # extracting inside the pair loop would repeat the work for every pair
    rows = [df.iloc[position] for position in range(len(df))]

    # Score each unordered pair of rows exactly once
    records = [
        (row1['track_id'], row2['track_id'], row_similarity(row1, row2))
        for row1, row2 in combinations(rows, 2)
    ]

    # Create DataFrame from similarity data
    sim_df = pd.DataFrame(records, columns=['track_id1', 'track_id2', 'sim_score'])

    return sim_df

In [7]:
# get all Strokes songs available
strokes_df = song_df[song_df['artists'] == 'The Strokes']
# take a random sample of 100 songs (fixed seed so the sample and the CSVs
# exported from it are the same on every Restart & Run All), add all Strokes
# songs to it, and drop rows that ended up in the sample twice
# NOTE(review): the sample is drawn from song_df, not norm_df, so similarity
# is computed on the un-normalized features -- confirm this is intended
sample_df = (
    pd.concat([song_df.sample(n=100, replace=False, random_state=42), strokes_df])
    .drop_duplicates()
    .reset_index(drop=True)
)
sample_df.head(3)

Unnamed: 0,track_id,artists,album_name,track_name,popularity,duration_ms,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre
0,78ivvTBqkt3e6eBBkWMuV8,Wood & Wire,No Matter Where It Goes from Here,John,36,175466,False,0.388,0.47,5,-10.011,1,0.0377,0.584,2e-06,0.176,0.776,183.893,3,bluegrass
1,2RFt6ZWQbr9mPhsft9u9eX,Gabrielle Aplin;JP Cooper,Dear Happy,Losing Me,61,181760,False,0.66,0.407,7,-8.381,1,0.0571,0.236,0.0,0.109,0.257,133.86,4,acoustic
2,3ktgwCr0hZ1EbC99oHbgN8,Fejo,Ne Marenda,Ne Marenda,27,178387,False,0.565,0.654,5,-6.742,1,0.147,0.179,9.6e-05,0.0984,0.602,179.988,4,malay
3,4wzjNqjKAKDU82e8uMhzmr,The Red Jumpsuit Apparatus,Don't You Fake It,Face Down,76,192000,False,0.545,0.932,7,-2.189,0,0.0399,0.000665,0.0,0.127,0.464,92.956,4,punk
4,0INqAT51XyRjjH9WVMkwA5,Marc Anthony,Mended,I Need You,36,251146,False,0.536,0.72,0,-5.879,1,0.0508,0.0234,1e-06,0.537,0.395,90.526,4,salsa
5,6TWcSqCntZhH5vyc35kkob,Cigarettes After Sex,Cigarettes After Sex,John Wayne,65,258483,False,0.441,0.465,3,-8.389,1,0.0291,0.343,0.884,0.0982,0.173,123.933,4,indie-pop
6,1KG8SoycbII7iW3jVDCnE0,Ignacio Corsini,El Caballero Cantor del Tango,Betinotti,15,166320,False,0.542,0.332,7,-6.409,1,0.0606,0.98,0.000708,0.136,0.546,83.173,4,tango
7,0Ant6wBzTsOcH63OgzmaXU,Lionel Richie,Artsy & Colourful,Angel - Metro Mix Radio Edit,0,223040,False,0.725,0.822,9,-10.059,1,0.0402,0.011,0.0,0.0938,0.581,127.971,4,disco
8,2mLtn7zqcqalNZSfQZ62ko,Geraldo Azevedo,Raízes e Frutos,Táxi Lunar,40,231066,False,0.655,0.47,4,-11.395,0,0.0356,0.237,0.00163,0.128,0.577,145.703,4,mpb
9,1lAVCaEi47Fy5zwyxndYn4,Zeca Pagodinho,Ser Humano,Ser Humano,47,201880,False,0.671,0.689,5,-7.448,0,0.0585,0.504,0.0,0.177,0.834,155.973,4,pagode


In [ ]:
# score every unique pair of songs in the sample, then preview the result
sim_df = compare_all_songs(df=sample_df)
sim_df.head(n=3)

In [9]:
# relationship csv: write the pairwise similarity scores, leaving out the row index
sim_df.to_csv(path_or_buf='sample_song_similarity.csv', index=False)

In [10]:
# nodes csv: write the songs in the sample, leaving out the row index
sample_df.to_csv(path_or_buf='sample_songs.csv', index=False)