In [None]:
import pandas as pd
import plotly.graph_objects as go
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import numpy as np
# Silence every warning for the rest of the session (this also hides
# deprecation notices emitted by pandas, plotly, seaborn and scikit-learn).
warnings.filterwarnings("ignore")
# Use the 'ggplot' matplotlib style for all matplotlib/seaborn figures.
plt.style.use('ggplot')
# Never truncate columns when a DataFrame is displayed.
pd.set_option('display.max_columns', None)

In [None]:
pip install dash

In [None]:
from dash import Dash, html, dcc
from jupyter_dash import JupyterDash
from dash.dependencies import Output, Input
from dash.exceptions import PreventUpdate

Data Cleaning / Manipulation

In [None]:
# Load the raw CSV; the file is decoded as latin1 instead of the default utf-8.
data = pd.read_csv('spotify-2023.csv', encoding='latin1')

# Preview the first rows.
data.head()

In [None]:
# Column dtypes and non-null counts, used to decide which columns need cleaning.
data.info()

In [None]:
# Convert `streams` to integers.
# - astype(str) first makes the cell safe to re-run once the column is numeric.
# - Every non-digit character is stripped, so a value that contains text
#   collapses to whatever digits it holds.
# - An explicit int64 keeps the width identical on every platform.
data['streams'] = (
    data['streams']
    .astype(str)
    .str.replace(r'[^0-9]', '', regex=True)
    .astype('int64')
)

In [None]:
# Remove thousands separators and convert to integers.
# - astype(str) first makes the cell safe to re-run once the column is numeric.
# - regex=False treats ',' as a literal in every pandas version.
# - An explicit int64 keeps the width identical on every platform.
data['in_deezer_playlists'] = (
    data['in_deezer_playlists']
    .astype(str)
    .str.replace(',', '', regex=False)
    .astype('int64')
)

In [None]:
# Strip the thousands separators; the column stays text (missing values are
# handled by the following cell).
data['in_shazam_charts'] = data['in_shazam_charts'].str.replace(',', '')

In [None]:
# Fill missing chart counts with the rounded mean of the known values,
# then convert the whole column to integers.
shazam_known = data['in_shazam_charts'].dropna().astype(int)
shazam_fill = round(shazam_known.mean())
data['in_shazam_charts'] = data['in_shazam_charts'].fillna(shazam_fill).astype(int)

In [None]:
# Count the remaining missing values per column.
data.isna().sum()

In [None]:
# Replace missing musical keys with the placeholder label 'No Record'.
data['key']=data['key'].fillna('No Record')

In [None]:
# List the distinct key labels (including the 'No Record' placeholder).
data['key'].unique()

In [None]:
# Number of rows that are exact copies of an earlier row.
data.duplicated().sum()

In [None]:
# Number of rows whose track_name already appeared in an earlier row.
data.duplicated(subset='track_name').sum()

In [None]:
# Same check as the previous cell: rows whose track_name repeats an earlier row.
data.duplicated(subset='track_name').sum()

In [None]:
# Keep only the first row for each track_name (modifies `data` in place).
# NOTE(review): rows are compared on the title alone, so different songs that
# share a title are also removed — confirm this is intended.
data.drop_duplicates(subset='track_name', inplace=True)

In [None]:
# Renumber the rows 0..n-1 after the removal above; the old index is discarded.
data.reset_index(drop=True, inplace=True)

In [None]:
# Summary statistics of the numeric columns.
data.describe()

In [None]:
# Combine the three release-part columns into a single datetime column.
# pd.to_datetime assembles dates from columns named year/month/day, hence the rename.
part_names = {'released_year': 'year', 'released_month': 'month', 'released_day': 'day'}
date_parts = data[list(part_names)].rename(columns=part_names)
datetime_series = pd.to_datetime(date_parts)

data['release_date'] = datetime_series
# The separate part columns are no longer needed.
data.drop(columns=list(part_names), inplace=True)

In [None]:
# Release year as its own column, used for grouping and filtering below.
data['Year'] = data['release_date'].dt.year

EDA / General Visualization with plotly/dash

In [None]:
#Histograms

app = JupyterDash(__name__)

app.layout = html.Div([
    html.H3('Column Selection:'),
    # Offer every integer column regardless of its width: comparing the dtype
    # to 'int' only matches the default integer width of the running platform.
    dcc.Dropdown(id='dropdown',
                 options=[col for col in data.columns
                          if pd.api.types.is_integer_dtype(data[col])],
                 value='in_spotify_playlists'),
    dcc.Graph(id='visual')
])

@app.callback(Output('visual', 'figure'), Input('dropdown', 'value'))
def plot(column):
    """Return a 30-bin histogram of the selected column of ``data``.

    Parameters
    ----------
    column : str or None
        Column name chosen in the dropdown; None when the dropdown is cleared.

    Raises
    ------
    PreventUpdate
        When no column is selected, so the previous figure stays on screen.
    """
    if column is None:
        raise PreventUpdate

    return px.histogram(data, x=column, nbins=30).update_layout(width=800)

if __name__ == '__main__':
    app.run_server(mode='inline', debug=True, port=9012)

In [None]:
# Box plots of the integer columns
app = JupyterDash(__name__)

app.layout = html.Div([
    html.H3('Column Selection:'),
    # Offer every integer column regardless of its width: comparing the dtype
    # to 'int' only matches the default integer width of the running platform.
    dcc.Dropdown(id='dropdown',
                 options=[col for col in data.columns
                          if pd.api.types.is_integer_dtype(data[col])],
                 value='in_spotify_playlists'),
    dcc.Graph(id='visual')
])

@app.callback(Output('visual', 'figure'), Input('dropdown', 'value'))
def plot(column):
    """Return a box plot of the selected column of ``data``.

    Parameters
    ----------
    column : str or None
        Column name chosen in the dropdown; None when the dropdown is cleared.

    Raises
    ------
    PreventUpdate
        When no column is selected, so the previous figure stays on screen.
    """
    if column is None:
        raise PreventUpdate

    return px.box(data, y=column).update_layout(width=800)

if __name__ == '__main__':
    app.run_server(mode='inline', debug=True, port=9013)

In [None]:
#Artists(Top 20)/ mode/ key distribution

app = JupyterDash(__name__)

app.layout = html.Div([
    html.H3('Column Selection:'),
    dcc.Dropdown(id='dropdown',
                 options=[{'label': x.replace('_', ' ').title(), 'value':x} for x in ['artist(s)_name', 'mode', 'key', 'Year']],
                 value='artist(s)_name'),
    dcc.Graph(id='visual')
])

@app.callback(Output('visual', 'figure'), Input('dropdown', 'value'))
def plot(column):
    """Return a pie chart of the 20 most frequent values of ``data[column]``.

    Parameters
    ----------
    column : str or None
        Column name chosen in the dropdown; None when the dropdown is cleared.

    Raises
    ------
    PreventUpdate
        When no column is selected, so the previous figure stays on screen.
    """
    if column is None:
        raise PreventUpdate

    # value_counts().reset_index() names its columns differently in pandas 1.x
    # and 2.x; naming both columns explicitly works in either version.
    counts = (
        data[column]
        .value_counts()
        .head(20)
        .rename_axis('category')
        .reset_index(name='count')
    )

    fig = px.pie(
        counts,
        names='category',
        values='count',
        title=f"{column.replace('_', ' ').title()} in Data Distribution",
    ).update_traces(textposition='inside', textinfo='percent+label')

    return fig

if __name__ == '__main__':
    app.run_server(mode='inline', debug=True, port=9014)

In [None]:
# Number of songs for each of the 20 most frequent artist entries.
# value_counts().reset_index() names its columns differently in pandas 1.x
# and 2.x; naming both columns explicitly works in either version.
df = (
    data['artist(s)_name']
    .value_counts()
    .head(20)
    .rename_axis('artist')
    .reset_index(name='songs')
)

px.bar(
    df,
    x='artist',
    y='songs',
    color='artist',
    title='Top 20 Artist with Most Songs',
).update_layout(width=1000,height=500)\
.add_hline(
          # Dashed reference line at the mean song count of these 20 artists.
          y=df['songs'].mean(),
          line_dash='dash',
          line_color='grey')

In [None]:
#Top 20 Song from Spotify with highest {column}
app = JupyterDash(__name__)

app.layout = html.Div([
    html.H3('Column Selection:'),
    # Every column that is not stored as a Python-object (text) column.
    dcc.Dropdown(id='dropdown',
                 options=[x for x in data.columns if data[x].dtype != 'O'],
                 value='streams'),
    dcc.Graph(id='visual')
])

@app.callback(Output('visual', 'figure'), Input('dropdown', 'value'))
def plot(column):
    """Return a bar chart of the 20 songs with the largest value of ``column``.

    Parameters
    ----------
    column : str or None
        Column name chosen in the dropdown; None when the dropdown is cleared.

    Raises
    ------
    PreventUpdate
        When no column is selected; otherwise building the title from None
        would fail.
    """
    if column is None:
        raise PreventUpdate

    top_songs = data.sort_values(column, ascending=False).head(20)

    fig = px.bar(
        top_songs,
        y='track_name',
        x=column,
        title=f"Top 20 Songs from Spotify with Highest {column.replace('_', ' ').title()}",
        color='track_name',
        text_auto=True
    ).update_layout(width=950,height=650).update_traces(showlegend=False)

    return fig

if __name__ == '__main__':
    app.run_server(mode='inline', debug=True, port=9015)

In [None]:
# Inspect the row(s) for this track title.
data.loc[data['track_name'] == 'Love Grows (Where My Rosemary Goes)']

In [None]:
# Exclude the track inspected in the previous cell by its title instead of a
# hard-coded row label, so the filter still targets the same song if the row
# numbering changes.
# NOTE(review): assumes row label 571 referred to this track — confirm.
px.bar(
    data[data['track_name'] != 'Love Grows (Where My Rosemary Goes)']
        .sort_values('streams', ascending=False)
        .head(20),
    y='track_name',
    x='streams',
    title="Top 20 Song from Spotify with Highest Streams",
    color='track_name',
    text_auto=True
    ).update_layout(width=1000,height=700).update_traces(showlegend=False)

Data Analysis

In [None]:
# Columns offered as metrics in the dashboards below and used for the
# cross-platform correlation: playlist/chart counts per service plus `streams`.
platforms = ['in_spotify_playlists', 'in_spotify_charts', 'in_apple_playlists',
 'in_apple_charts', 'in_deezer_playlists', 'in_deezer_charts', 'in_shazam_charts', 'streams']

In [None]:
#Top 20 artists of highest {column}

app = JupyterDash(__name__)

app.layout = html.Div([
    html.H3('Column Selection:'),
    dcc.Dropdown(id='dropdown',
                 options=platforms,
                 value='in_spotify_playlists'),
    dcc.Graph(id='visual')
])

@app.callback(Output('visual', 'figure'), Input('dropdown', 'value'))
def plot(column):
    """Return a bar chart of the 20 artist entries with the largest total of ``column``.

    Parameters
    ----------
    column : str or None
        One of ``platforms``; None when the dropdown is cleared.

    Raises
    ------
    PreventUpdate
        When no column is selected, so the previous figure stays on screen.
    """
    if column is None:
        raise PreventUpdate

    # Sum only the platform metrics: summing every column would also try to
    # add up the text and datetime columns, which fails on recent pandas.
    totals = (
        data.groupby('artist(s)_name', as_index=False)[platforms]
        .sum()
        .sort_values(column, ascending=False)
        .head(20)
    )

    fig = px.bar(
        totals,
        x='artist(s)_name',
        y=column,
        color='artist(s)_name',
        labels={'artist(s)_name':'Artists'},
        title=f"Top 20 Artists with Highest Presence {column.replace('_', ' ').title()}",
    ).update_layout(width=1000, height=650, xaxis=dict(tickangle=45))

    return fig

if __name__ == '__main__':
    app.run_server(mode='inline', debug=True, port=9017)

In [None]:
# Mean of the numeric columns per (artist, release year, track), newest first.
# - Grouping on the existing 'Year' column avoids passing a second Series
#   named 'release_date' next to the column of the same name.
# - numeric_only=True keeps the text columns out of the mean, which
#   recent pandas refuses to average.
(
    data.groupby(['artist(s)_name', 'Year', 'track_name'], as_index=False)
    .mean(numeric_only=True)
    .sort_values('Year', ascending=False)
)

In [None]:
#Top 20 Song from Release Year of {year} with Most Streams

app  = JupyterDash(__name__)

app.layout = html.Div([
    html.H3('Year Selection:'),
    # Plain, sorted Python ints give an ordered dropdown.
    dcc.Dropdown(id='dropdown1',
                 options=sorted(data['Year'].unique().tolist()),
                 value=2023),
    html.Br(),
    dcc.Dropdown(id='dropdown2',
                 options=platforms,
                 value='streams'),
    dcc.Graph(id='visual')
])

@app.callback(Output('visual', 'figure'), Input('dropdown1', 'value'), Input('dropdown2', 'value'))
def plot(year, column):
    """Return a bar chart of the 20 songs released in ``year`` with the largest ``column``.

    Parameters
    ----------
    year : int or None
        Release year chosen in the first dropdown; None when cleared.
    column : str or None
        One of ``platforms`` chosen in the second dropdown; None when cleared.

    Raises
    ------
    PreventUpdate
        When either dropdown is empty, so the previous figure stays on screen.
    """
    if year is None or column is None:
        raise PreventUpdate

    # Group on the existing 'Year' column and average numeric columns only;
    # averaging the text columns fails on recent pandas.
    per_track = (
        data.groupby(['artist(s)_name', 'Year', 'track_name'], as_index=False)
        .mean(numeric_only=True)
    )

    top_tracks = (
        per_track.loc[per_track['Year'] == year]
        .sort_values(column, ascending=False)
        .head(20)
    )

    fig = px.bar(
          top_tracks,
          x='track_name',
          y=column,
          color='track_name',
          title=f"Top 20 Song from Release Year of {year} with Most Presence in {column.replace('_', ' ').title()}"
      ).update_layout(width=1000, height=650, xaxis=dict(tickangle=45))\
       .add_hline(
          # Dashed reference line at the mean of the displayed songs.
          y=top_tracks[column].mean(),
          line_dash='dash',
          line_color='grey')

    return fig

if __name__ == '__main__':
    app.run_server(mode='inline', debug=True, port=9016)

In [None]:
#Musical Audio Feature/Mode/Major Analysis

# Audio-feature columns offered in the mode/key dashboard.
feature = ['speechiness_%', 'liveness_%', 'instrumentalness_%',
           'acousticness_%', 'energy_%', 'valence_%', 'danceability_%', 'bpm']

# Mean of the numeric columns per (mode, key); numeric_only=True keeps the
# text columns out of the mean, which recent pandas refuses to average.
data.groupby(['mode','key'], as_index=False).mean(numeric_only=True)

In [None]:
# Average audio feature per key, for a chosen mode
app = JupyterDash(__name__)

app.layout = html.Div([
    html.H3('Mode/Key Selection:'),
    dcc.Dropdown(id='dropdown1',
                options=data['mode'].unique(),
                value='Major'),
    html.Br(),
    dcc.Dropdown(id='dropdown2',
                options= feature,
                value='danceability_%'),
    dcc.Graph(id='visual')
])

@app.callback(Output('visual', 'figure'),Input('dropdown1', 'value'),Input('dropdown2', 'value'))
def plot(mode, column):
    """Return a bar chart of the mean of ``column`` per key for songs in ``mode``.

    Parameters
    ----------
    mode : str or None
        Value of the ``mode`` column chosen in the first dropdown; None when cleared.
    column : str or None
        One of ``feature`` chosen in the second dropdown; None when cleared.

    Raises
    ------
    PreventUpdate
        When either dropdown is empty, so the previous figure stays on screen.
    """
    if mode is None or column is None:
        raise PreventUpdate

    # Average only the audio features; averaging text columns fails on
    # recent pandas.
    means = data.groupby(['mode','key'], as_index=False)[feature].mean()
    # A boolean mask avoids building a query string from the selected value.
    per_key = means[means['mode'] == mode].sort_values(column, ascending=False)

    fig = px.bar(
        per_key,
        x='key',
        y=column,
        color='key',
        # Underscores become spaces, matching the titles of the other charts.
        title=f"Avg {column.replace('_', ' ').title()} with Songs of {mode} Mode in Different Keys"
    )

    return fig

if __name__ == '__main__':
    app.run_server(mode='inline', debug=True, port=9023)

In [None]:
# Mean of the numeric columns per release year; numeric_only=True keeps the
# text columns out of the mean, which recent pandas refuses to average.
data.groupby('Year', as_index=False).mean(numeric_only=True)

In [None]:
# Yearly mean of an audio feature
app = JupyterDash(__name__)

app.layout = html.Div([
    html.H3('Audio Feature Selection:'),
    dcc.Dropdown(id='dropdown2',
                options=['danceability_%','valence_%','energy_%','acousticness_%',
                         'instrumentalness_%','liveness_%','speechiness_%'],
                value='danceability_%'),
    dcc.Graph(id='visual')
])

@app.callback(Output('visual', 'figure'), Input('dropdown2', 'value'))
def plot(column):
    """Return a line chart of the yearly mean of ``column``.

    Parameters
    ----------
    column : str or None
        Audio-feature column chosen in the dropdown; None when cleared.

    Raises
    ------
    PreventUpdate
        When no column is selected; otherwise building the title from None
        would fail.
    """
    if column is None:
        raise PreventUpdate

    # numeric_only=True keeps the text columns out of the mean, which
    # recent pandas refuses to average.
    yearly = data.groupby('Year', as_index=False).mean(numeric_only=True)

    fig = px.line(
        yearly,
        x='Year',
        y=column,
        title=f'{column.replace("_", " ").title()} over the Years'
    )

    return fig

if __name__ == '__main__':
    app.run_server(mode='inline', debug=True, port=9018)


In [None]:
# Yearly distribution of an audio feature
app = JupyterDash(__name__)

app.layout = html.Div([
    html.H3('Audio Feature Selection:'),
    dcc.Dropdown(id='dropdown2',
                options=['danceability_%','valence_%','energy_%','acousticness_%',
                         'instrumentalness_%','liveness_%','speechiness_%'],
                value='danceability_%'),
    dcc.Graph(id='visual')
])

@app.callback(Output('visual', 'figure'), Input('dropdown2', 'value'))
def plot(column):
    """Return one box plot of ``column`` per release year.

    Parameters
    ----------
    column : str or None
        Audio-feature column chosen in the dropdown; None when cleared.

    Raises
    ------
    PreventUpdate
        When no column is selected; otherwise building the title from None
        would fail.
    """
    if column is None:
        raise PreventUpdate

    return px.box(
        data,
        x='Year',
        y=column,
        title=f'{column.replace("_", " ").title()} over the Years'
    )

if __name__ == '__main__':
    app.run_server(mode='inline', debug=True, port=9021)

In [None]:
#Artists with highest Avg(audio feature)

#Find the artist with more than one song
# keep=False marks every row of a repeated artist. The default (keep='first')
# leaves out each artist's first song, so the averages below would be
# computed without it.
duplicates = data[data.duplicated(subset='artist(s)_name', keep=False)]

# numeric_only=True keeps the text columns out of the mean, which recent
# pandas refuses to average.
duplicates.groupby('artist(s)_name', as_index=False).mean(numeric_only=True)

In [None]:
# Top 15 multi-song artists by average audio feature
app = JupyterDash(__name__)

app.layout = html.Div([
    html.H3('Audio Feature Selection:'),
    dcc.Dropdown(id='dropdown2',
                options=['danceability_%','valence_%','energy_%','acousticness_%',
                         'instrumentalness_%','liveness_%','speechiness_%'],
                value='danceability_%'),
    dcc.Graph(id='visual')
])

@app.callback(Output('visual', 'figure'), Input('dropdown2', 'value'))
def plot(column):
    """Return a bar chart of the 15 artists in ``duplicates`` with the highest mean ``column``.

    Bars are coloured by each artist's mean ``streams``.

    Parameters
    ----------
    column : str or None
        Audio-feature column chosen in the dropdown; None when cleared.

    Raises
    ------
    PreventUpdate
        When no column is selected, so the previous figure stays on screen.
    """
    if column is None:
        raise PreventUpdate

    # numeric_only=True keeps the text columns out of the mean, which
    # recent pandas refuses to average.
    artist_means = (
        duplicates.groupby('artist(s)_name', as_index=False)
        .mean(numeric_only=True)
        .sort_values(column, ascending=False)
        .head(15)
    )

    fig = px.bar(
        artist_means,
        x='artist(s)_name',
        y=column,
        color='streams',
        text_auto=True,
        title = f'Top 15 Artists with Highest Avg {column.replace("_", " ").title()} with More than 1 Song in Data'
    )

    return fig

if __name__ == '__main__':
    app.run_server(mode='inline', debug=True, port=9022)

Cross Platform Matrix

In [None]:
# Same platform metric columns as defined earlier in the notebook.
platforms = ['in_spotify_playlists', 'in_spotify_charts', 'in_apple_playlists',
 'in_apple_charts', 'in_deezer_playlists', 'in_deezer_charts', 'in_shazam_charts', 'streams']

# Pairwise correlation between the platform metrics.
data[platforms].corr()

In [None]:
# Annotated heatmap of the platform correlation matrix.
sns.heatmap(data[platforms].corr(), annot=True, cmap='OrRd')

In [None]:
# Scatter plot of one platform metric against another
app = JupyterDash(__name__)
app.layout = html.Div([
    html.H3('Column Selection:'),
    # The options are simply the platform columns; no correlation matrix is
    # needed to list them.
    dcc.Dropdown(id='dropdown1',
                options=platforms,
                value='in_spotify_playlists'),
    html.Br(),
    dcc.Dropdown(id='dropdown2',
                options=platforms,
                value='in_apple_playlists'),
    html.Br(),
    dcc.RadioItems(id='items',
                  options=['Trendline On', 'Trendline Off'],
                  value='Trendline Off'),
    dcc.Graph(id='visual')
])

@app.callback(Output('visual', 'figure'), Input('dropdown1', 'value'), Input('dropdown2', 'value'),
             Input('items', 'value'))
def plot(x, y, switch):
    """Return a scatter plot of platform metric ``x`` against ``y``.

    Points are coloured by ``y`` and sized by ``x``.

    Parameters
    ----------
    x, y : str or None
        Columns of ``platforms`` chosen in the dropdowns; None when cleared.
    switch : str
        'Trendline Off' draws no trendline; any other value requests an
        OLS trendline.

    Raises
    ------
    PreventUpdate
        When either dropdown is empty, so the previous figure stays on screen.
    """
    if x is None or y is None:
        raise PreventUpdate

    fig = px.scatter(
        data[platforms],
        x=x,
        y=y,
        color=y,
        size=x,
        trendline=None if switch=='Trendline Off' else 'ols',
        title=f'{x.replace("_", " ").title()} vs. {y.replace("_", " ").title()}'
    )

    return fig

if __name__ == '__main__':
    app.run_server(mode='inline', debug=True, port=9019)

Audio Feature vs. Song Popularity

In [None]:
# Work on a copy of the data without the track inspected earlier. Filtering
# by title instead of a hard-coded row label keeps targeting the same song
# if the row numbering changes.
# NOTE(review): assumes row label 571 referred to this track — confirm.
df = data[data['track_name'] != 'Love Grows (Where My Rosemary Goes)']

In [None]:
# Audio-feature columns plus `streams`, used for the correlation and scatter plots below.
features = ['danceability_%','valence_%','energy_%','acousticness_%',
                         'instrumentalness_%','liveness_%','speechiness_%', 'streams']

# Pairwise correlation between the audio features and the stream count.
df[features].corr()

In [None]:
# Annotated heatmap of the audio-feature / streams correlation matrix.
sns.heatmap(df[features].corr(), annot=True, cmap='BuGn')

In [None]:
# Scatter plot of one audio feature (or streams) against another
app = JupyterDash(__name__)

# Snapshot the table now. The callback below runs lazily, and the global `df`
# is rebound further down the notebook, so reading `df` at call time could
# silently plot a different table.
feature_frame = df[features].copy()

app.layout = html.Div([
    html.H3('Column Selection:'),
    # The options are simply the feature columns; no correlation matrix is
    # needed to list them.
    dcc.Dropdown(id='dropdown1',
                options=features,
                value='danceability_%'),
    html.Br(),
    dcc.Dropdown(id='dropdown2',
                options=features,
                value='streams'),
    html.Br(),
    dcc.RadioItems(id='items',
                  options=['Trendline On', 'Trendline Off'],
                  value='Trendline Off'),
    dcc.Graph(id='visual')
])

@app.callback(Output('visual', 'figure'), Input('dropdown1', 'value'), Input('dropdown2', 'value'),
             Input('items', 'value'))
def plot(x, y, switch):
    """Return a scatter plot of ``x`` against ``y``, coloured by ``streams``.

    Parameters
    ----------
    x, y : str or None
        Columns of ``features`` chosen in the dropdowns; None when cleared.
    switch : str
        'Trendline Off' draws no trendline; any other value requests an
        OLS trendline.

    Raises
    ------
    PreventUpdate
        When either dropdown is empty, so the previous figure stays on screen.
    """
    if x is None or y is None:
        raise PreventUpdate

    fig = px.scatter(
        feature_frame,
        x=x,
        y=y,
        color='streams',
        trendline=None if switch=='Trendline Off' else 'ols',
        title=f'{x.replace("_", " ").title()} vs. {y.replace("_", " ").title()}'
    )

    return fig

if __name__ == '__main__':
    app.run_server(mode='inline', debug=True, port=9020)

In [None]:
#Audio Features vs. Number of Streams
# `features` also contains 'streams'; leave it off the x axis so the stream
# count is not plotted against itself.
px.scatter(
    df[features],
    x=[col for col in features if col != 'streams'],
    y='streams',
    labels={'variable':'Audio Feature'},
    title="Audio Features vs. Number of Streams"
).update_layout(width=1100)

In [None]:
# Independent copy of the cleaned data for the modelling steps, so the
# encoding below does not alter `data`.
dfm = data.copy()

In [None]:
# Preview the modelling copy.
dfm.head()

In [None]:
from sklearn import preprocessing
# Encoder used below to turn the text columns into integer codes.
label_encoder = preprocessing.LabelEncoder()

In [None]:
# Replace the text labels of 'key' and 'mode' with integer codes.
# The same encoder object is refit on each column, so after the loop it only
# holds the classes of the last column ('mode').
for x in ['key', 'mode']:
    dfm[x] = label_encoder.fit_transform(dfm[x])

In [None]:
# Drop the identifier and date columns to leave the modelling table.
# Note: this rebinds the global name `df`, which earlier cells used for a
# different table.
df= dfm.drop(['track_name', 'artist(s)_name', 'release_date'], axis=1)
df.head()

In [None]:
# Pairwise correlation between all columns of the modelling table.
df.corr()

In [None]:
def display_scores(scores):
    """Print the scores, their mean and their standard deviation.

    Each of the three values is printed on its own line, framed by rule lines.
    ``scores`` must provide ``mean()`` and ``std()``. Always returns None.
    """
    rule = '==============================================='

    print(rule)
    print(f'Scores: {scores}')
    print(rule)
    print(f'Mean Score: {scores.mean()}')
    print(rule)
    print(f'Standard Deviation of Scores: {scores.std()}')
    print(rule)

In [None]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score, precision_score, recall_score

In [None]:
from sklearn.model_selection import train_test_split

# Target: the stream count. Predictors: every other column of the table.
y = df['streams']
X = df.drop(columns='streams')

# Hold out 33% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
#Logistic Regression

In [None]:
# Model with default hyper-parameters.
# NOTE(review): LogisticRegression is a classifier, while the target used
# below is the `streams` column — confirm a classifier (rather than a
# regressor) is intended here.
logist = LogisticRegression()

In [None]:
# Train on the training split...
logist.fit(X_train, y_train)

# ...and predict the held-out split.
pred = logist.predict(X_test)

In [None]:
# Mean squared error on the held-out split. Arguments follow the documented
# (y_true, y_pred) order; the value is the same either way because the
# error is squared.
mean_squared_error(y_test, pred)

In [None]:
# Compare the distribution of predictions with the true test values.
# `fill` replaces the deprecated `shade` argument; labelling each curve
# directly ties the legend entries to the right curve.
sns.kdeplot(x=pred, fill=True, label='Prediction')
sns.kdeplot(x=y_test, fill=True, label='y_test')
plt.legend()

In [None]:
#Grid Search/Parameter tuning

In [None]:
# List the estimator's current hyper-parameters to see what can be tuned.
logist.get_params()

In [None]:
# Hyper-parameter grid for the search.
# The 'lbfgs' solver does not support the 'l1' penalty, so a single cross
# product of solvers and penalties contains combinations that can never be
# fitted. The grid is therefore split per solver (GridSearchCV accepts a list
# of grids) and each solver is only paired with penalties it supports.
_shared_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],
    'max_iter': [100, 1000, 10000],
}

param_grid = [
    {**_shared_grid, 'solver': ['liblinear'], 'penalty': ['l1', 'l2']},
    {**_shared_grid, 'solver': ['lbfgs'], 'penalty': ['l2']},
]

In [None]:
from sklearn.model_selection import GridSearchCV

# Exhaustive search over param_grid with 2-fold cross-validation, scored by
# negative mean squared error. Training scores are recorded as well, and
# refit=True asks for the best combination to be refitted after the search.
grid_search = GridSearchCV(logist,
                          param_grid,
                          cv=2,
                          scoring='neg_mean_squared_error',
                          return_train_score=True,
                          refit=True)

In [None]:
# Run the search on the training split only.
grid_search.fit(X_train, y_train)

In [None]:
# Show the estimator with the best cross-validated score.
grid_search.best_estimator_

In [None]:
# The search was configured with refit=True, so best_estimator_ has already
# been trained on the training split. It must not be fitted on
# (X_test, y_test): training on the same rows that are scored afterwards
# leaks the test labels and makes the evaluation meaningless.
model = grid_search.best_estimator_

model

In [None]:
# Predict the held-out split with the tuned model.
pred = model.predict(X_test)

In [None]:
# Mean squared error of the tuned model on the held-out split. Arguments
# follow the documented (y_true, y_pred) order; the value is the same either
# way because the error is squared.
mean_squared_error(y_test, pred)

In [None]:
# Compare the distribution of the tuned model's predictions with the true
# test values. `fill` replaces the deprecated `shade` argument; labelling each
# curve directly ties the legend entries to the right curve.
sns.kdeplot(x=pred, fill=True, label='Prediction')
sns.kdeplot(x=y_test, fill=True, label='y_test')
plt.legend()