# 3. One-hot encoding for the genres and cleaning the data.

## Importing the data and cleaning the `genres` and `release_date` columns.

In [3]:
import pandas as pd

# Location of the raw Spotify track export, relative to the notebook's working directory.
SPOTIFY_TRACKS_CSV = '../data/5_Bru_spotify_million_tracks.csv'

# NOTE(review): the first CSV column comes through as 'Unnamed: 0' - it looks like
# a previously saved row index; confirm before treating it as a feature.
df = pd.read_csv(SPOTIFY_TRACKS_CSV)

df.head()


Unnamed: 0,original_title,original_artist,spotify_title,spotify_artist,album,release_date,popularity,duration_ms,explicit,album_cover,genres
0,Je Sais Que La Terre Est Plate,Raphaël,Je sais que la Terre est plate,Raphaël,Je Sais Que La Terre Est Plate,2008-03-14,14,150040,False,https://i.scdn.co/image/ab67616d0000b2739e6b95...,"['chanson', 'french pop', 'french rock', 'nouv..."
1,On Efface,Julie Zenatti,On efface,Julie Zenatti,Comme vous...,2004-03-21,1,253000,False,https://i.scdn.co/image/ab67616d0000b27398d445...,"['chanson', 'french pop']"
2,Howells Delight,The Baltimore Consort,Howells Delight,Anonymous,The Best of the Baltimore Consort,2011-02-01,3,240400,False,https://i.scdn.co/image/ab67616d0000b27353a906...,['medieval']
3,Martha Served,I Hate Sally,Martha Served,I Hate Sally,Don't Worry Lady,2007-06-12,1,138760,True,https://i.scdn.co/image/ab67616d0000b273e6d949...,"['canadian metal', 'canadian post-hardcore', '..."
4,Zip-A-Dee-Doo-Dah,Orlando Pops Orchestra,"Zip-a-Dee-Doo-Dah (From ""Song of the South"")",Orlando Pops Orchestra,"Most Amazing Movie, Musical & TV Themes, Vol.6",2022-10-07,0,199986,False,https://i.scdn.co/image/ab67616d0000b27349ea4d...,['pops orchestra']


In [4]:
# Turn the stringified genre list into a plain comma-separated string by removing
# '[' , ']' and single quotes; missing values become the empty string.
# NOTE: a genre containing an apostrophe is stored with double quotes, so it keeps
# those double quotes and loses its apostrophe here (e.g. "childrens music").
df['genres'] = (
    df['genres']
    .fillna('')
    .astype(str)
    .str.replace(r"[\[\]']", '', regex=True)
)

# Reassign with errors='ignore' instead of an in-place drop so the cell can be
# re-run: a second in-place drop would raise KeyError because 'album_cover' is gone.
df = df.drop(columns=['album_cover'], errors='ignore')
df.head()

Unnamed: 0,original_title,original_artist,spotify_title,spotify_artist,album,release_date,popularity,duration_ms,explicit,genres
0,Je Sais Que La Terre Est Plate,Raphaël,Je sais que la Terre est plate,Raphaël,Je Sais Que La Terre Est Plate,2008-03-14,14,150040,False,"chanson, french pop, french rock, nouvelle cha..."
1,On Efface,Julie Zenatti,On efface,Julie Zenatti,Comme vous...,2004-03-21,1,253000,False,"chanson, french pop"
2,Howells Delight,The Baltimore Consort,Howells Delight,Anonymous,The Best of the Baltimore Consort,2011-02-01,3,240400,False,medieval
3,Martha Served,I Hate Sally,Martha Served,I Hate Sally,Don't Worry Lady,2007-06-12,1,138760,True,"canadian metal, canadian post-hardcore, kingst..."
4,Zip-A-Dee-Doo-Dah,Orlando Pops Orchestra,"Zip-a-Dee-Doo-Dah (From ""Song of the South"")",Orlando Pops Orchestra,"Most Amazing Movie, Musical & TV Themes, Vol.6",2022-10-07,0,199986,False,pops orchestra


In [5]:
# Keep only the release year: take the first 4 characters and convert to integer.
# Casting to str first makes the cell safe to re-run - once the column already
# holds integers, the `.str` accessor would raise AttributeError.
# Missing release dates still raise on the integer cast, as before.
df['release_date'] = df['release_date'].astype(str).str[:4].astype(int)

df.head()


Unnamed: 0,original_title,original_artist,spotify_title,spotify_artist,album,release_date,popularity,duration_ms,explicit,genres
0,Je Sais Que La Terre Est Plate,Raphaël,Je sais que la Terre est plate,Raphaël,Je Sais Que La Terre Est Plate,2008,14,150040,False,"chanson, french pop, french rock, nouvelle cha..."
1,On Efface,Julie Zenatti,On efface,Julie Zenatti,Comme vous...,2004,1,253000,False,"chanson, french pop"
2,Howells Delight,The Baltimore Consort,Howells Delight,Anonymous,The Best of the Baltimore Consort,2011,3,240400,False,medieval
3,Martha Served,I Hate Sally,Martha Served,I Hate Sally,Don't Worry Lady,2007,1,138760,True,"canadian metal, canadian post-hardcore, kingst..."
4,Zip-A-Dee-Doo-Dah,Orlando Pops Orchestra,"Zip-a-Dee-Doo-Dah (From ""Song of the South"")",Orlando Pops Orchestra,"Most Amazing Movie, Musical & TV Themes, Vol.6",2022,0,199986,False,pops orchestra


## One-hot encoding the genres column.

In [6]:
# Break every comma-separated genre string into individual, whitespace-trimmed
# labels: one row per (track, label), with the track's row index preserved.
genre_labels = df['genres'].str.split(',').explode().str.strip()

# One indicator column per distinct label, then collapse back to one row per track.
# max() (rather than sum()) keeps every cell binary even if a label is repeated
# within a single track's genre list.
genres_one_hot = genre_labels.str.get_dummies().groupby(level=0).max()

# Replace the raw 'genres' column with the indicator columns (aligned on the index).
df = pd.concat([df.drop(columns=['genres']), genres_one_hot], axis=1)

df.head()


Unnamed: 0,original_title,original_artist,spotify_title,spotify_artist,album,release_date,popularity,duration_ms,explicit,"""childrens music""",...,yacht rock,ye ye,yodeling,zarzuela,zilizopendwa,zolo,zouglou,zouk,zouk riddim,zydeco
0,Je Sais Que La Terre Est Plate,Raphaël,Je sais que la Terre est plate,Raphaël,Je Sais Que La Terre Est Plate,2008,14,150040,False,0,...,0,0,0,0,0,0,0,0,0,0
1,On Efface,Julie Zenatti,On efface,Julie Zenatti,Comme vous...,2004,1,253000,False,0,...,0,0,0,0,0,0,0,0,0,0
2,Howells Delight,The Baltimore Consort,Howells Delight,Anonymous,The Best of the Baltimore Consort,2011,3,240400,False,0,...,0,0,0,0,0,0,0,0,0,0
3,Martha Served,I Hate Sally,Martha Served,I Hate Sally,Don't Worry Lady,2007,1,138760,True,0,...,0,0,0,0,0,0,0,0,0,0
4,Zip-A-Dee-Doo-Dah,Orlando Pops Orchestra,"Zip-a-Dee-Doo-Dah (From ""Song of the South"")",Orlando Pops Orchestra,"Most Amazing Movie, Musical & TV Themes, Vol.6",2022,0,199986,False,0,...,0,0,0,0,0,0,0,0,0,0


## Checking the encoding and cleaning the remaining columns.

In [7]:
# Show the frame's dimensions (rows, columns) after the one-hot encoding step.
df.shape

(5060, 1541)

In [8]:
# Spot-check the encoding: list every song flagged with one specific genre.
# The column label still carries literal double quotes at this point.
genre_column = '"childrens music"'
genre_songs = df[df[genre_column] == 1]
# Build the message from the column actually queried so the label cannot drift
# away from the filter (it previously announced 'rock').
print(f"Songs with the genre {genre_column}:")
genre_songs

Songs with the genre 'rock':


Unnamed: 0,original_title,original_artist,spotify_title,spotify_artist,album,release_date,popularity,duration_ms,explicit,"""childrens music""",...,yacht rock,ye ye,yodeling,zarzuela,zilizopendwa,zolo,zouglou,zouk,zouk riddim,zydeco
1542,Boots The Monkey!,Dora The Explorer,Boots The Monkey!,Dora The Explorer,Dora The Explorer,2004,8,38226,False,1,...,0,0,0,0,0,0,0,0,0,0
1659,Ice Cream Cone,Laurie Berkner,Ice Cream Cone,The Laurie Berkner Band,Buzz Buzz,1998,9,172466,False,1,...,0,0,0,0,0,0,0,0,0,0
2825,Tenemos Amigos,Dora The Explorer,Tenemos Amigos,Dora The Explorer,Dora The Explorer,2004,12,57946,False,1,...,0,0,0,0,0,0,0,0,0,0
3631,Travel Song,Dora The Explorer,Travel Song Medley,Dora The Explorer,Dora The Explorer,2004,18,94493,False,1,...,0,0,0,0,0,0,0,0,0,0
4613,ABCD Medley,Laurie Berkner,ABCD Medley,The Laurie Berkner Band,Whaddaya Think Of That?,1997,45,185333,False,1,...,0,0,0,0,0,0,0,0,0,0
4852,Zodiac,Laurie Berkner,Zodiac,The Laurie Berkner Band,Laurie Berkner's Favorite Classic Kids' Songs,2015,8,61000,False,1,...,0,0,0,0,0,0,0,0,0,0


In [10]:
# Cast the 'explicit' flag to integers (True -> 1, False -> 0).
df = df.astype({'explicit': int})
df.head()

Unnamed: 0,original_title,original_artist,spotify_title,spotify_artist,album,release_date,popularity,duration_ms,explicit,"""childrens music""",...,yacht rock,ye ye,yodeling,zarzuela,zilizopendwa,zolo,zouglou,zouk,zouk riddim,zydeco
0,Je Sais Que La Terre Est Plate,Raphaël,Je sais que la Terre est plate,Raphaël,Je Sais Que La Terre Est Plate,2008,14,150040,0,0,...,0,0,0,0,0,0,0,0,0,0
1,On Efface,Julie Zenatti,On efface,Julie Zenatti,Comme vous...,2004,1,253000,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Howells Delight,The Baltimore Consort,Howells Delight,Anonymous,The Best of the Baltimore Consort,2011,3,240400,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Martha Served,I Hate Sally,Martha Served,I Hate Sally,Don't Worry Lady,2007,1,138760,1,0,...,0,0,0,0,0,0,0,0,0,0
4,Zip-A-Dee-Doo-Dah,Orlando Pops Orchestra,"Zip-a-Dee-Doo-Dah (From ""Song of the South"")",Orlando Pops Orchestra,"Most Amazing Movie, Musical & TV Themes, Vol.6",2022,0,199986,0,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
# Re-check the dimensions after converting 'explicit' to integers.
df.shape

(5060, 1541)

In [12]:
# Preview the first rows of the encoded frame.
df.head()

Unnamed: 0,original_title,original_artist,spotify_title,spotify_artist,album,release_date,popularity,duration_ms,explicit,"""childrens music""",...,yacht rock,ye ye,yodeling,zarzuela,zilizopendwa,zolo,zouglou,zouk,zouk riddim,zydeco
0,Je Sais Que La Terre Est Plate,Raphaël,Je sais que la Terre est plate,Raphaël,Je Sais Que La Terre Est Plate,2008,14,150040,0,0,...,0,0,0,0,0,0,0,0,0,0
1,On Efface,Julie Zenatti,On efface,Julie Zenatti,Comme vous...,2004,1,253000,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Howells Delight,The Baltimore Consort,Howells Delight,Anonymous,The Best of the Baltimore Consort,2011,3,240400,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Martha Served,I Hate Sally,Martha Served,I Hate Sally,Don't Worry Lady,2007,1,138760,1,0,...,0,0,0,0,0,0,0,0,0,0
4,Zip-A-Dee-Doo-Dah,Orlando Pops Orchestra,"Zip-a-Dee-Doo-Dah (From ""Song of the South"")",Orlando Pops Orchestra,"Most Amazing Movie, Musical & TV Themes, Vol.6",2022,0,199986,0,0,...,0,0,0,0,0,0,0,0,0,0


In [13]:
# Strip the literal double-quote characters from every column label.
df.columns = [label.replace('"', '') for label in df.columns]
df.head()

Unnamed: 0,original_title,original_artist,spotify_title,spotify_artist,album,release_date,popularity,duration_ms,explicit,childrens music,...,yacht rock,ye ye,yodeling,zarzuela,zilizopendwa,zolo,zouglou,zouk,zouk riddim,zydeco
0,Je Sais Que La Terre Est Plate,Raphaël,Je sais que la Terre est plate,Raphaël,Je Sais Que La Terre Est Plate,2008,14,150040,0,0,...,0,0,0,0,0,0,0,0,0,0
1,On Efface,Julie Zenatti,On efface,Julie Zenatti,Comme vous...,2004,1,253000,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Howells Delight,The Baltimore Consort,Howells Delight,Anonymous,The Best of the Baltimore Consort,2011,3,240400,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Martha Served,I Hate Sally,Martha Served,I Hate Sally,Don't Worry Lady,2007,1,138760,1,0,...,0,0,0,0,0,0,0,0,0,0
4,Zip-A-Dee-Doo-Dah,Orlando Pops Orchestra,"Zip-a-Dee-Doo-Dah (From ""Song of the South"")",Orlando Pops Orchestra,"Most Amazing Movie, Musical & TV Themes, Vol.6",2022,0,199986,0,0,...,0,0,0,0,0,0,0,0,0,0


In [42]:
# Persist the cleaned, one-hot encoded frame (text columns still present);
# the row index is not written to the file.
CLEANED_CSV = '../data/6_Bru_one_hot_encoded_is_hot_cleaned.csv'
df.to_csv(CLEANED_CSV, index=False)

In [None]:
# Drop the text (non-numerical) columns.
# Reassigning with errors='ignore' instead of an in-place drop keeps the cell
# re-runnable: a second in-place drop would raise KeyError for the missing labels.
text_columns = ['original_title', 'original_artist', 'spotify_title', 'spotify_artist', 'album']
df = df.drop(columns=text_columns, errors='ignore')

In [None]:
# Preview the frame after the text columns have been dropped.
df.head()

In [49]:
# Persist the frame left after the text columns were dropped;
# the row index is not written to the file.
NUMERICAL_CSV = '../data/7_Bru_one_hot_encoded_is_hot_cleaned_numerical.csv'
df.to_csv(NUMERICAL_CSV, index=False)