In [1]:
# Import Statements
import pandas as pd
import dash
from dash import dcc, html, Input, Output
import plotly.express as px

In [2]:
# Read the raw song data from the CSV file in the working directory
source_csv = 'songs_normalize.csv'
dataframe_1 = pd.read_csv(source_csv)

In [3]:
# Keep only the first occurrence of each fully identical row (59 duplicates dropped in this run)
duplicate_mask = dataframe_1.duplicated()
dataframe_filtered = dataframe_1.loc[~duplicate_mask]

In [4]:
# Inspect the column dtypes of the raw frame; nothing needed correcting here
column_types = dataframe_1.dtypes
print(column_types)

artist               object
song                 object
duration_ms           int64
explicit               bool
year                  int64
popularity            int64
danceability        float64
energy              float64
key                   int64
loudness            float64
mode                  int64
speechiness         float64
acousticness        float64
instrumentalness    float64
liveness            float64
valence             float64
tempo               float64
genre                object
dtype: object


In [5]:
# Number of distinct non-null values in each column of the de-duplicated frame
dataframe_filtered.nunique(axis=0, dropna=True)

artist               835
song                1879
duration_ms         1793
explicit               2
year                  23
popularity            76
danceability         565
energy               580
key                   12
loudness            1671
mode                   2
speechiness          837
acousticness        1208
instrumentalness     772
liveness             783
valence              760
tempo               1831
genre                 59
dtype: int64

In [6]:
# Count the distinct raw genre strings (59 in this run)
genre_count = dataframe_filtered['genre'].nunique()
print(genre_count)

59


In [7]:
# Take an independent copy so the column assignments below do not touch the source frame
dataframe_filtered = dataframe_filtered.copy()

# Rows whose genre is exactly the string 'set()' are relabelled 'Unknown'
# (presumably an empty-set repr left by the data export - TODO confirm)
relabelled_genre = dataframe_filtered['genre'].replace('set()', 'Unknown')
dataframe_filtered.loc[:, 'genre'] = relabelled_genre


def _first_listed_genre(genre_text):
    """Return the text before the first comma, with surrounding whitespace removed."""
    return genre_text.split(',')[0].strip()


# new_genre is derived from the already-relabelled genre column, so order matters here
dataframe_filtered.loc[:, 'new_genre'] = dataframe_filtered['genre'].apply(_first_listed_genre)

In [8]:
# Show the raw genre string next to the derived primary genre as a spot check
genre_columns = ['genre', 'new_genre']
print(dataframe_filtered[genre_columns])

                 genre new_genre
0                  pop       pop
1            rock, pop      rock
2         pop, country       pop
3          rock, metal      rock
4                  pop       pop
...                ...       ...
1995               pop       pop
1996               pop       pop
1997  hip hop, country   hip hop
1998               pop       pop
1999           hip hop   hip hop

[1941 rows x 2 columns]


In [9]:
# Summarise the derived primary genre: number of distinct labels, then rows per label
primary_genre = dataframe_filtered['new_genre']
print(primary_genre.nunique())
print(primary_genre.value_counts())

12
new_genre
pop                  912
hip hop              749
rock                 155
Dance/Electronic      41
Unknown               22
latin                 15
R&B                   13
World/Traditional     10
country               10
metal                  9
Folk/Acoustic          4
easy listening         1
Name: count, dtype: int64


In [10]:
# Substrings searched for in song titles to flag edited versions
edited_keywords = [
    "Radio",
    "Version",
    "Remix",
    "Mix",
    "Remaster",
    "(Personal)",
    "(Interlude)",
    "Explicit",
    "Video",
    "Edit",
    "*",
    "from",
    "Official",
]

# Substrings searched for in song titles to flag featured-artist tracks
feature_keywords = [
    "feat",
    "with",
    "Featuring",
    "Feat",
    "&",
    "Vs",
    " X",
]

In [11]:
def _mentions_any(title, keywords):
    """Return True when any keyword occurs, ignoring case, as a substring of str(title)."""
    lowered_title = str(title).lower()
    return any(keyword.lower() in lowered_title for keyword in keywords)


# NOTE: this is plain substring containment, so a keyword also matches inside
# longer words (e.g. 'with' is found in 'without', 'mix' in 'remix').
dataframe_filtered['is_edited'] = dataframe_filtered['song'].apply(
    lambda title: _mentions_any(title, edited_keywords)
)
dataframe_filtered['is_featured'] = dataframe_filtered['song'].apply(
    lambda title: _mentions_any(title, feature_keywords)
)

In [12]:
# Normalise the explicit flag to Python booleans via its upper-cased text form
explicit_lookup = {'TRUE': True, '1': True, 'FALSE': False, '0': False}
explicit_text = dataframe_filtered['explicit'].astype(str).str.upper()
# Any value missing from the lookup comes back from .map as NaN
dataframe_filtered.loc[:, 'explicit'] = explicit_text.map(explicit_lookup)

In [13]:
# Discard rows with a popularity of zero, which this analysis treats as missing data
has_popularity = dataframe_filtered['popularity'].ne(0)
dataframe_filtered = dataframe_filtered.loc[has_popularity]

In [14]:
# Rows remaining after the cleaning steps (1815 in this run)
dataframe_filtered.shape[0]

1815

In [15]:
# Summing a boolean flag column counts the rows where the flag is True
count_featured = dataframe_filtered['is_featured'].sum()
print(f"Number of featured songs: {count_featured}")

count_edited = dataframe_filtered['is_edited'].sum()
# Label fixed: this value is the edited-song count, previously mislabelled as "featured"
print(f"Number of edited songs: {count_edited}")

Number of featured songs: 337
Number of featured songs: 165


In [16]:
# Save the cleaned frame as CSV without writing the row index
output_csv = 'filtered_data.csv'
dataframe_filtered.to_csv(output_csv, index=False)