In [94]:
from extracting_data import extract
from utilis import change_date, change_genre
import pandas as pd

def change_string(x):
    x = x.replace('{', '')
    x = x.replace('}', '')
    x = x.split(',')
    return x


def tranform_column(column):
    d = column.apply(change_string)
    list_of_dictionaries = []
    for row in d:
        data_dict = {}
        for item in row:
            item = item.replace("'", "").strip()
            key, value = item.split(':', 1)
            data_dict[key.strip()] = value.strip()
        list_of_dictionaries.append(data_dict)

    return list_of_dictionaries


def create_new_column(column, df):
    l = tranform_column(column)

    danceability = []
    energy = []
    loudness = []
    speechiness = []
    acousticness = []
    instrumentalness = []
    liveness = []
    valence = []
    tempo = []

    for dic in l:
        danceability.append(dic['danceability'])
        energy.append(dic['energy'])
        loudness.append(dic['loudness'])
        speechiness.append(dic['speechiness'])
        acousticness.append(dic['acousticness'])
        instrumentalness.append(dic['instrumentalness'])
        liveness.append(dic['liveness'])
        valence.append(dic['valence'])
        tempo.append(dic['tempo'])

    df['danceability'] = danceability
    df['energy'] = energy
    df['loudness'] = loudness
    df['speechiness'] = speechiness
    df['acousticness'] = acousticness
    df['instrumentalness'] = instrumentalness
    df['liveness'] = liveness
    df['valence'] = valence
    df['tempo'] = tempo

    df.drop(columns=['other'], inplace=True)

    return df

def get_playlist_data(file_name):
    #extract(link, f'{file_name}.csv')
    new_data = pd.read_csv(f'{file_name}.csv')
    new_data.dropna(inplace=True, subset=['other'])
    create_new_column(new_data['other'], new_data)
    new_data.drop([new_data.columns[0]], axis=1, inplace=True)
    new_data['year'] = new_data['release date'].apply(change_date)
    new_data.drop(columns='release date', inplace=True)
    new_data['track genre'].fillna('Inne', inplace=True)
    new_data['new genre'] = new_data['track genre'].apply(change_genre)
    new_data.drop(columns='track genre', inplace=True)
    encoded_data = pd.get_dummies(new_data, columns=['new genre'], drop_first=True, dtype=int)
    return encoded_data

In [95]:
playlist = get_playlist_data('test')

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  new_data['track genre'].fillna('Inne', inplace=True)


In [121]:
playlist

Unnamed: 0,track,artist,album,artist pop,track pop,duration,danceability,energy,loudness,speechiness,...,valence,tempo,year,new genre_hip hop,new genre_indie,new genre_other,new genre_pop,new genre_rap,new genre_rock,new genre_soul
0,Beautiful Things,Benson Boone,Beautiful Things,84,95,180304,0.472,0.471,-5.692,0.0603,...,0.219,105.029,2024,0,0,0,1,0,0,0
1,Cruel Summer,Taylor Swift,Lover,100,94,178426,0.552,0.702,-5.707,0.157,...,0.564,169.994,2019,0,0,0,1,0,0,0
2,greedy,Tate McRae,greedy,81,93,131872,0.749,0.729,-3.19,0.0321,...,0.816,111.004,2023,0,0,0,1,0,0,0
3,Unwritten,Natasha Bedingfield,Unwritten,74,1,259333,0.706,0.8,-6.333,0.0399,...,0.629,100.011,2004,0,0,0,1,0,0,0
4,Stick Season,Noah Kahan,Stick Season,83,92,182346,0.664,0.5,-6.935,0.0651,...,0.801,117.896,2022,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,Tongue Tied,GROUPLOVE,Never Trust a Happy Song,63,81,218013,0.56,0.936,-5.835,0.0439,...,0.371,112.96,2011,0,0,0,0,0,1,0
96,Last Night,Morgan Wallen,3 Songs At A Time Sampler,88,81,163854,0.492,0.673,-5.431,0.0347,...,0.488,203.812,2023,0,0,1,0,0,0,0
97,STAY (with Justin Bieber),The Kid LAROI,STAY (with Justin Bieber),76,81,141805,0.591,0.764,-5.484,0.0483,...,0.478,169.928,2021,1,0,0,0,0,0,0
98,Karma,Taylor Swift,Midnights,100,81,204852,0.64,0.619,-7.108,0.065,...,0.121,90.008,2022,0,0,0,1,0,0,0


In [96]:
playlist_num = playlist.drop(columns=['artist', 'album', 'track'])

In [117]:
from sklearn.preprocessing import MinMaxScaler

def scale_data(data, column_names):
    df = pd.DataFrame(data, columns=column_names)
    scaler = MinMaxScaler()
    scaled_data = scaler.fit_transform(df)
    scaled_df = pd.DataFrame(scaled_data, columns=column_names)
    return scaled_df

In [118]:
scaled_playlist = scale_data(data=playlist_num, column_names=playlist_num.columns)

In [119]:
spotify_data = pd.read_csv('spotify_data_encoded2.csv')
spotify_data.drop([spotify_data.columns[0]], axis=1, inplace=True)
spotify_data_num = spotify_data.drop(columns=['track', 'artist', 'album'])
spotify_data_scaled = scale_data(spotify_data_num, spotify_data_num.columns)

# Liczenie średnich wartości

In [120]:
column_averages = scaled_playlist.mean()
averages_cosine_sim = pd.DataFrame([column_averages], index=['Average'])
averages_cosine_sim

Unnamed: 0,artist pop,track pop,duration,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,year,new genre_hip hop,new genre_indie,new genre_other,new genre_pop,new genre_rap,new genre_rock,new genre_soul
Average,0.474865,0.882766,0.405921,0.570971,0.593017,0.695442,0.101508,0.228562,0.043075,0.189823,0.564036,0.37681,0.77766,0.07,0.02,0.15,0.59,0.02,0.13,0.01


In [126]:
def synchronize_columns(df1, df2):
    # Znajdź kolumny, które są w df1, ale nie ma ich w df2
    missing_in_df2 = set(df1.columns) - set(df2.columns)
    # Znajdź kolumny, które są w df2, ale nie ma ich w df1
    missing_in_df1 = set(df2.columns) - set(df1.columns)
    
    # Dodaj brakujące kolumny do df2, wypełniając je zerami
    for col in missing_in_df2:
        df2[col] = 0
        
    # Dodaj brakujące kolumny do df1, wypełniając je zerami
    for col in missing_in_df1:
        df1[col] = 0
        
    # Upewnij się, że kolumny są w tej samej kolejności
    df1 = df1[sorted(df1.columns)]
    df2 = df2[sorted(df2.columns)]
    
    return df1, df2

In [130]:
synchronize_columns(spotify_data_scaled, averages_cosine_sim)
averages_cosine_sim = averages_cosine_sim.sort_index(axis=1)
spotify_data_scaled = spotify_data_scaled.sort_index(axis=1)

In [129]:
averages_cosine_sim

Unnamed: 0,artist pop,track pop,duration,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,...,new genre_other,new genre_pop,new genre_rap,new genre_rock,new genre_soul,new genre_dance,new genre_folk,new genre_classical,new genre_jazz,new genre_metal
Average,0.474865,0.882766,0.405921,0.570971,0.593017,0.695442,0.101508,0.228562,0.043075,0.189823,...,0.15,0.59,0.02,0.13,0.01,0,0,0,0,0


In [168]:
from sklearn.metrics.pairwise import cosine_similarity

similarity_scores = cosine_similarity(spotify_data_scaled, averages_cosine_sim)
 
spotify_data['similarity_score'] = similarity_scores
 
new_frame = pd.merge(spotify_data, playlist, how='left', on='track')
top_similarities_filtered = new_frame[new_frame.isna().any(axis=1)]
top_similarities_filtered = top_similarities_filtered.sort_values(by='similarity_score', ascending=False)
top_similarities_filtered = top_similarities_filtered.drop_duplicates(subset=['track', 'artist_x'])
top_similarities_filtered[['track', 'artist_x', 'similarity_score']].head(20)


Unnamed: 0,track,artist_x,similarity_score
1558,Rockabye (feat. Sean Paul & Anne-Marie),Clean Bandit,0.973744
68,TiK ToK,Kesha,0.970373
1161,I Took A Pill In Ibiza - Seeb Remix,Mike Posner,0.969824
67,Bang Bang,Jessie J,0.969438
1042,Stereo Hearts (feat. Adam Levine),Gym Class Heroes,0.968979
51,Classic,MKTO,0.968912
1153,There's Nothing Holdin' Me Back,Shawn Mendes,0.968827
38,"These Days (feat. Jess Glynne, Macklemore & Da...",Rudimental,0.968677
22,CAN'T STOP THE FEELING! (from DreamWorks Anima...,Justin Timberlake,0.968482
1515,Starships,Nicki Minaj,0.968181
