In [3]:
import numpy as np
import pandas as pd
## pandas depends on NumPy, so NumPy is loaded automatically, but it is best practice to import the full NumPy package explicitly
print(f"Numpy version: {np.__version__}")
print(f"pandas version: {pd.__version__}")

# visualisation
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
from plotly.subplots import make_subplots
pio.renderers.default = 'notebook'

# sklearn
import sklearn as sk
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, LabelEncoder
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.feature_selection import VarianceThreshold, SelectKBest, f_classif, f_regression
from sklearn.linear_model import LogisticRegression, LinearRegression, Lasso, Ridge
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.metrics import f1_score, classification_report
from sklearn.metrics import roc_curve, roc_auc_score, RocCurveDisplay
print(f"skLearn version: {sk.__version__}")

# scipy
import scipy
from scipy.stats import norm

# statsmodels
import statsmodels.api as sm

# other
import numbers
import requests
import json
from xgboost import XGBClassifier

# b3ta functions
import b3tafunc as b3


Numpy version: 1.24.3
pandas version: 2.0.3
skLearn version: 0.24.2


In [9]:

def _read_indexed(path):
    """Read a CSV export, using its first column as the DataFrame index."""
    return pd.read_csv(path, index_col=0)


# --- Track catalogue and raw user playlist ---------------------------------
# Earlier catalogue exports, kept for reference:
#df = pd.read_csv('../data/05-b3-exports/df.csv', index_col=0)
#df = pd.read_csv('../data/03-b3-exports/03-b3_df.csv', index_col=0)
df50 = _read_indexed('../data/06-b3-streamlit-git/06-b3_df50-sl.csv')
#df50['mode'] = np.where(df50['mode'] == 1, 'Major', 'Minor')
user_raw_df = _read_indexed('../data/04-b3-exports/04-b3-user_raw_df.csv')

# --- One catalogue subset per sound profile --------------------------------
london_snd = df50.loc[df50['sound_profile'].eq('london')]
manchester_snd = df50.loc[df50['sound_profile'].eq('manchester')]
ibiza_snd = df50.loc[df50['sound_profile'].eq('ibiza')]
berlin_snd = df50.loc[df50['sound_profile'].eq('berlin')]
kingston_snd = df50.loc[df50['sound_profile'].eq('kingston')]
nyc_snd = df50.loc[df50['sound_profile'].eq('nyc')]
la_snd = df50.loc[df50['sound_profile'].eq('la')]
# Every track whose sound profile is anything other than 'unknown'.
combined_snd = df50.loc[df50['sound_profile'].ne('unknown')]

# --- Upsampled exports, "rf" variant (one file per location) ---------------
london_upsamp_test_rf = _read_indexed('../data/04-b3-exports/london_upsamp_test_rf-mergedv1.csv')
manchester_upsamp_test_rf = _read_indexed('../data/04-b3-exports/manchester_upsamp_test_rf-mergedv1.csv')
ibiza_upsamp_test_rf = _read_indexed('../data/04-b3-exports/ibiza_upsamp_test_rf-mergedv1.csv')
berlin_upsamp_test_rf = _read_indexed('../data/04-b3-exports/berlin_upsamp_test_rf-mergedv1.csv')
kingston_upsamp_test_rf = _read_indexed('../data/04-b3-exports/kingston_upsamp_test_rf-mergedv1.csv')
nyc_upsamp_test_rf = _read_indexed('../data/04-b3-exports/nyc_upsamp_test_rf-mergedv1.csv')
la_upsamp_test_rf = _read_indexed('../data/04-b3-exports/la_upsamp_test_rf-mergedv1.csv')

# --- Upsampled exports, "cs" variant (one file per location) ---------------
london_upsamp_test_cs = _read_indexed('../data/04-b3-exports/london_upsamp_test_cs-mergedv1.csv')
manchester_upsamp_test_cs = _read_indexed('../data/04-b3-exports/manchester_upsamp_test_cs-mergedv1.csv')
ibiza_upsamp_test_cs = _read_indexed('../data/04-b3-exports/ibiza_upsamp_test_cs-mergedv1.csv')
berlin_upsamp_test_cs = _read_indexed('../data/04-b3-exports/berlin_upsamp_test_cs-mergedv1.csv')
kingston_upsamp_test_cs = _read_indexed('../data/04-b3-exports/kingston_upsamp_test_cs-mergedv1.csv')
nyc_upsamp_test_cs = _read_indexed('../data/04-b3-exports/nyc_upsamp_test_cs-mergedv1.csv')
la_upsamp_test_cs = _read_indexed('../data/04-b3-exports/la_upsamp_test_cs-mergedv1.csv')

In [10]:
def _g3o_candidate_pool(algorithm, location):
    """Return the candidate-track DataFrame for an algorithm/location pair.

    Algorithm 1 uses the per-profile subsets of ``df50`` (``*_snd``),
    algorithm 2 the ``*_upsamp_test_rf`` exports and algorithm 3 the
    ``*_upsamp_test_cs`` exports. ``'global'`` maps to the full ``df50``
    catalogue for every algorithm. The location match is case-insensitive.

    Raises:
        ValueError: if ``algorithm`` is not 1, 2 or 3, or if ``location`` is
            not one of the supported names.
    """
    if algorithm == 1:
        pools = {
            'london': london_snd,
            'manchester': manchester_snd,
            'ibiza': ibiza_snd,
            'berlin': berlin_snd,
            'kingston': kingston_snd,
            'nyc': nyc_snd,
            'la': la_snd,
        }
    elif algorithm == 2:
        pools = {
            'london': london_upsamp_test_rf,
            'manchester': manchester_upsamp_test_rf,
            'ibiza': ibiza_upsamp_test_rf,
            'berlin': berlin_upsamp_test_rf,
            'kingston': kingston_upsamp_test_rf,
            'nyc': nyc_upsamp_test_rf,
            'la': la_upsamp_test_rf,
        }
    elif algorithm == 3:
        pools = {
            'london': london_upsamp_test_cs,
            'manchester': manchester_upsamp_test_cs,
            'ibiza': ibiza_upsamp_test_cs,
            'berlin': berlin_upsamp_test_cs,
            'kingston': kingston_upsamp_test_cs,
            'nyc': nyc_upsamp_test_cs,
            'la': la_upsamp_test_cs,
        }
    else:
        raise ValueError("ENTER A RECOMMENDATION ALGORITHM 1, 2 or 3")

    pools['global'] = df50

    key = location.lower()
    if key not in pools:
        raise ValueError(
            "ENTER VALID LOCATION: Input one of the following 'London', 'Manchester', 'Ibiza', 'Berlin', 'Kingston', 'NYC', 'LA', 'Global'"
        )
    return pools[key]


def _g3o_mode_as_labels(frame):
    """Return ``frame`` with a numeric 'mode' column rendered as 'Major'/'Minor'.

    The type is judged from the first value of the column, as before. A frame
    whose 'mode' is already non-numeric (or that has no rows) is returned
    unchanged; otherwise a new frame is returned and the input is not modified.
    """
    if frame.empty or not isinstance(frame['mode'].iloc[0], numbers.Number):
        return frame
    return frame.assign(mode=np.where(frame['mode'] == 1, 'Major', 'Minor'))


def G3OCORRECT(playlist, algorithm, location):
    """Build a geo-corrected playlist: one candidate track per playlist track.

    For each track in ``playlist`` (in order), the most cosine-similar track
    still available in the chosen candidate pool is selected and removed from
    the pool, so no track is picked twice. Similarity is computed on the
    numeric columns returned by ``b3.df_numcat``, standardised with a
    ``StandardScaler`` fitted on the numeric columns of ``df50``.

    Args:
        playlist: DataFrame of tracks to match. Must contain a 'mode' column
            and whatever columns ``b3.df_numcat`` expects (presumably the same
            schema as ``df50`` - TODO confirm).
        algorithm: 1, 2 or 3; selects which family of candidate pools is used.
        location: 'London', 'Manchester', 'Ibiza', 'Berlin', 'Kingston',
            'NYC', 'LA' or 'Global' (case-insensitive).

    Returns:
        DataFrame with the selected candidate rows, in playlist order. Row
        labels are the positions each track had among the candidates that were
        still available when it was picked (same labels as before this
        rewrite), so they are not guaranteed to be unique. The result is empty
        when the playlist or the pool has no rows, and shorter than the
        playlist when the pool runs out of candidates.

    Raises:
        ValueError: for an unsupported ``algorithm`` or ``location``.
            Previously the message was only printed and the function then
            failed with an UnboundLocalError.

    Notes:
        Neither ``playlist`` nor the module-level ``df50`` is modified; numeric
        'mode' columns are converted on private copies. Progress information is
        printed to stdout, as before.
    """
    ### APPLY ALGORITHM
    pool = _g3o_candidate_pool(algorithm, location)
    search_global = location.lower() == 'global'

    # ACCOUNT FOR DIFFERENT MODE COL TYPES
    # Work on fresh frames so the caller's playlist and the shared tables stay
    # untouched (assigning into the caller's slice raised SettingWithCopyWarning).
    reference = _g3o_mode_as_labels(df50)
    playlist = _g3o_mode_as_labels(playlist.reset_index(drop=True))
    temp = _g3o_mode_as_labels(pool.reset_index(drop=True))

    # Nothing to match, or nothing to match against.
    if playlist.empty or temp.empty:
        return temp.iloc[0:0]

    ### PREP USER PLAYLIST
    # Split the playlist into label columns and numeric columns.
    playlist_labels, playlist_num = b3.df_numcat(playlist)
    print(f"PLAYLIST SHAPE: {playlist.shape}")
    print(f"PLAYLIST NUM: {playlist_num.shape}")
    print(f"PLAYLIST LAB: {playlist_labels.shape}")

    # A single scaler, fitted on the full catalogue, is used for both the
    # playlist and the candidates so their features share one scale.
    df_labels, df_num = b3.df_numcat(reference)
    scaler = StandardScaler()
    scaler.fit(df_num)
    print(f"DF50 SHAPE: {reference.shape}")
    print(f"DF50 NUM: {df_num.shape}")
    print(f"DF50 LAB: {df_labels.shape}")

    playlist_vectors = np.asarray(scaler.transform(playlist_num))

    ### GEOCORRECT
    temp_labels, temp_num = b3.df_numcat(temp)
    # NOTE(review): these three lines describe the candidate pool, not df50;
    # the "DF50" labels are kept so the printed output is unchanged.
    print(f"DF50 SHAPE: {temp.shape}")
    print(f"DF50 NUM: {temp_num.shape}")
    print(f"DF50 LAB: {temp_labels.shape}")

    candidate_vectors = np.asarray(scaler.transform(temp_num))

    # For 'global' the second-best match is taken - presumably because the best
    # match in the full catalogue is the playlist track itself (TODO confirm).
    rank = 1 if search_global else 0
    preview_cols = ['track_id', 'artist_name', 'track_name']

    # COSINE COMPARISON
    picks = []
    for vector in playlist_vectors:
        # Stop once the pool cannot supply a candidate at the required rank.
        if len(temp) <= rank:
            break

        similarity = cosine_similarity(vector.reshape(1, -1), candidate_vectors)[0]
        ranked = pd.Series(similarity).sort_values(ascending=False).index

        if search_global:
            # Top 20 matches for this track.
            print(temp.loc[ranked[:20], preview_cols])

        temp_index = ranked[rank]
        picks.append(temp.loc[[temp_index]])

        # Remove the pick from both views of the pool so it cannot be chosen
        # again; temp keeps a 0..n-1 index so labels and array rows stay aligned.
        temp = temp.drop(index=temp_index).reset_index(drop=True)
        candidate_vectors = np.delete(candidate_vectors, temp_index, axis=0)

    if not picks:
        return temp.iloc[0:0]
    return pd.concat(picks)

#DOCUMENTATION:
#- Takes a playlist DataFrame (raw, 17 columns), a recommendation algorithm (1, 2 or 3) and a location as arguments.
#- Returns a geo-corrected playlist (e.g. playlist_geocorrected_for_london = G3OCORRECT(user_raw_df, 1, 'london') )



In [11]:
# First five tracks of the raw user export, used as a small demo playlist.
# `.copy()` makes it an independent frame rather than a slice of user_raw_df,
# so code that assigns columns on it neither triggers pandas'
# SettingWithCopyWarning nor risks touching the source data.
raw_5 = user_raw_df.head().copy()
raw_5.head(1)

Unnamed: 0,sound_profile,track_id,artist_name,track_name,genre,key,mode,duration_ms,tempo,loudness,energy,valence,danceability,speechiness,instrumentalness,acousticness,liveness
223,london,5O2Ft5ENCfAh6JCgykJs76,Sugababes,Round Round,pop,F#/Gb,0,236427,126.607,-3.802,0.845,0.749,0.74,0.0338,6e-06,0.00287,0.115


In [12]:
# Demo: geo-correct the five-track sample with algorithm 1 for London.
G3OCORRECT(playlist=raw_5, algorithm=1, location='london')

PLAYLIST SHAPE: (5, 17)
PLAYLIST NUM: (5, 10)
PLAYLIST LAB: (5, 7)
DF50 SHAPE: (565838, 17)
DF50 NUM: (565838, 10)
DF50 LAB: (565838, 7)
DF50 SHAPE: (301, 17)
DF50 NUM: (301, 10)
DF50 LAB: (301, 7)




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,sound_profile,track_id,artist_name,track_name,genre,key,mode,duration_ms,tempo,loudness,energy,valence,danceability,speechiness,instrumentalness,acousticness,liveness
224,london,5O2Ft5ENCfAh6JCgykJs76,Sugababes,Round Round,pop,F#/Gb,Minor,236427,126.607,-3.802,0.845,0.749,0.74,0.0338,6e-06,0.00287,0.115
136,london,3G0EALIIp5DAeIERxXBHmo,The Kinks,Waterloo Sunset,rock,C#/Db,Minor,194520,107.704,-7.377,0.692,0.432,0.521,0.0251,0.0,0.0976,0.187
265,london,0Ja4hLKiUSw01E01pJ1yGr,The Prodigy,Breathe,trip-hop,A,Major,336280,130.041,-6.183,0.808,0.303,0.673,0.0491,0.878,0.0121,0.037
36,london,6L4fEE9awoUnSEy1bmQebb,Blur,Country House - 2012 Remaster,alt-rock,A,Major,236960,175.293,-6.606,0.885,0.589,0.331,0.0923,0.0,0.205,0.123
242,london,6WeEgi4zkGaHuFXZOWXX3m,Dirty Pretty Things,Deadwood,indie,C,Major,147560,109.285,-2.152,0.96,0.471,0.479,0.0816,0.0,0.00888,0.369


In [None]:
  # ACCOUNT FOR DIFFERENT MODE COL TYPES
    df50_modecat = df50.copy()
    value = df50_modecat['mode'].loc[0]
    if isinstance(value, numbers.Number):
        df50_modecat['mode'] = np.where(df50_modecat['mode'] == 1, 'Major', 'Minor')

    selected_tracks_df_modecat = selected_tracks_df.copy()
    value2 = selected_tracks_df_modecat['mode'].loc[0]
    if isinstance(value2, numbers.Number):
        selected_tracks_df_modecat['mode'] = np.where(selected_tracks_df_modecat['mode'] == 1, 'Major', 'Minor')

    g3ocorrected_modecat = g3ocorrected.copy()
    value3 = g3ocorrected_modecat['mode'].loc[0]
    if isinstance(value3, numbers.Number):
        g3ocorrected_modecat['mode'] = np.where(g3ocorrected_modecat['mode'] == 1, 'Major', 'Minor')

    
    # Get raw nums
    dflab, dfnum = b3.df_numcat(df50)
    user_dflab, user_dfnum = b3.df_numcat(selected_tracks_df)
    g3o_dflab, g3o_dfnum = b3.df_numcat(g3ocorrected)

    # Scale
    # 1. Import the scaler
    from sklearn.preprocessing import StandardScaler 
    # 2. make a scaler
    scaler = StandardScaler()
    # 3. fit the scaler
    scaler.fit(dfnum)
    # 4. transform the data. note we get back a numpy array even if we put in a dataframe
    user_dfnum_scl = scaler.transform(user_dfnum)
    g3o_dfnum_scl = scaler.transform(g3o_dfnum)

    # Add back column titles
    user_dfnum_scl = pd.DataFrame(columns=user_dfnum.columns, data=user_dfnum_scl)
    g3o_dfnum_scl = pd.DataFrame(columns=g3o_dfnum.columns, data=g3o_dfnum_scl)

    # Calculate archetypes
    user_arch_raw = user_dfnum_scl.mean()
    g3o_arch_raw = g3o_dfnum_scl.mean()

    import plotly.graph_objects as go

    # 10 metrics, 9 layers

    # Get the index values as categories
    categories = user_arch_raw.index.tolist()

    # Create traces for each dataset (9 layers)
    trace_user = go.Scatterpolar(r=user_arch_raw.values, theta=categories, fill='toself', name='user', line=dict(color='#28FCC8'))
    trace_g3o = go.Scatterpolar(r=g3o_arch_raw.values, theta=categories, fill='toself', name='g3ocorrected', line=dict(color='#28fc64'))

    # Create figure and add traces
    fig = go.Figure()
    fig.add_trace(trace_user)
    fig.add_trace(trace_g3o)

    # Update layout
    fig.update_layout(
        polar=dict(
            radialaxis=dict(
                visible=True,
                #range=[0, 1]  # Adjust the range based on your data
            )),
        showlegend=True,
        width=600,  # Set the width of the plot
        height=600,  # Set the height of the plot
        paper_bgcolor='rgba(0,0,0,0)',  # Set the background color to fully transparent
        polar_bgcolor='rgba(0,0,0,0)',  # Set the polar background color to fully transparent
        polar_radialaxis=dict(
            visible=True,
            tickfont=dict(color='lightgrey'),  # Set tick font color to light grey
            gridcolor='grey'  # Set grid color to light grey
        ),
        polar_angularaxis=dict(
            visible=True,
            gridcolor='grey',
            tickfont=dict(color='grey')  # Set tick font color to light grey
        ),
        legend=dict(
            orientation='h',  # Horizontal orientation
            x=0.5,  # Center horizontally
            y=-0.15,  # Place the legend below the plot
            bgcolor='rgba(0,0,0,0)',  # Set the legend background color to fully transparent
            yanchor="top",
            xanchor="center"
        )
    )

    # Plot!
    st.plotly_chart(fig, use_container_width=True)
