In [19]:
import pandas as pd
import numpy as np
import matplotlib as mlt

In [20]:
# Read the CSV into a DataFrame and report its size
rawData = pd.read_csv("apple_music.csv")
print("shape of the original data:", rawData.shape, "\n")

# Columns that will be used in the rest of the notebook
keep_cols = [
    'Title', 'Artist', 'Composer', 'Album', 'Genre',
    'Track Year', 'Track Number On Album', 'Track Count On Album',
    'Track Duration', 'Track Play Count', 'Date Added To Library',
    'Last Played Date', 'Skip Count',
]

# Work on an independent copy so rawData stays untouched
wrkData = rawData[keep_cols].copy()

# Quick look at the first two rows (labels 0 and 1)
preview_cols = ["Title", "Artist", "Last Played Date"]
print(wrkData.loc[0:1, preview_cols])

shape of the original data: (3119, 53) 

                     Title       Artist      Last Played Date
0  Never Gonna Give You Up  Rick Astley  2021-12-10T05:47:34Z
1              Lay Me Down    Sam Smith                   NaN


In [21]:
# Rename columns to short lowercase labels.
# Mapping is original header -> label used from here on.
column_labels = {
    "Title": "title",
    "Artist": "artist",
    "Composer": "composer",
    "Album": "album",
    "Genre": "genre",
    "Track Year": "year",
    "Track Number On Album": "track no. on album",
    "Track Count On Album": "album track num",
    "Track Duration": "duration",
    "Track Play Count": "play count",
    "Date Added To Library": "added date",
    "Last Played Date": "last played",
    "Skip Count": "skipped",
}
# rename() returns a new frame; rebind the name instead of mutating in place
wrkData = wrkData.rename(columns=column_labels)
print(wrkData.columns)

Index(['title', 'artist', 'composer', 'album', 'genre', 'year',
       'track no. on album', 'album track num', 'duration', 'play count',
       'added date', 'last played', 'skipped'],
      dtype='object')


In [22]:
# Keep only the rows whose artist is present (3119 -> 3061 rows)
wrkData = wrkData.dropna(subset=["artist"])
n_rows, n_cols = wrkData.shape
print((n_rows, n_cols))

(3061, 13)


In [23]:
# Removing inconsistencies
# Series.min() is used instead of the builtin min(): the builtin compares the
# values one by one, and every comparison against NaN is False, so its result
# depends on where the NaN values sit. Series.min() skips NaN by default.
min_year = wrkData["year"].min()
# placeholder 0 is being treated as actual year -> mark those rows as missing
wrkData.loc[wrkData["year"] == 0, "year"] = np.nan
# smallest year once the 0 placeholders are gone (NaN is ignored)
min_year = wrkData["year"].min()

In [24]:
# Show the dtype of every column
print(wrkData.dtypes)

# "added date" and "last played" hold timestamps as strings; convert them to
# datetimes. Only "last played" is parsed with errors="coerce", so values that
# cannot be parsed there become NaT instead of raising.
date_columns = (
    ("added date", {}),
    ("last played", {"errors": "coerce"}),
)
for col, parse_opts in date_columns:
    wrkData[col] = pd.to_datetime(wrkData[col], **parse_opts)

# Rows with index labels 0 through 5, date columns only
date_preview = wrkData.loc[0:5, ["added date", "last played"]]
print(date_preview)


title                  object
artist                 object
composer               object
album                  object
genre                  object
year                  float64
track no. on album      int64
album track num         int64
duration                int64
play count              int64
added date             object
last played            object
skipped                 int64
dtype: object
                 added date               last played
0 2016-10-01 01:30:21+00:00 2021-12-10 05:47:34+00:00
1 2016-10-01 01:31:08+00:00                       NaT
2 2016-10-01 01:31:28+00:00 2021-10-03 17:51:03+00:00
3 2016-10-01 01:32:13+00:00 2021-12-14 03:17:54+00:00
4 2016-10-01 01:32:28+00:00 2021-11-03 15:32:58+00:00
5 2016-10-01 01:32:48+00:00 2021-10-05 18:07:36+00:00


In [25]:
# Divide the duration by 1000 and round to the nearest whole number
# (presumably milliseconds -> seconds; confirm the unit of the export).
# NOTE: running this cell again divides the column by 1000 a second time.
wrkData["duration"] = wrkData["duration"].div(1000).round()

In [26]:
## practice: count number of composers for each song
# Each "," or "&" separates two names, so composers = separators + 1.
# A missing composer gives NaN for the count as well.
separators = wrkData["composer"].str.count(r"[,&]")
wrkData["composer count"] = separators + 1
print(wrkData[["composer", "composer count"]])

                                               composer  composer count
0                                 Stock Aitken Waterman             1.0
1                 Sam Smith, James Napier & Elvin Smith             3.0
2     Ian Stanley, Roland Orzabal & Christopher Merr...             3.0
3                                     Sylvester Stewart             1.0
4        Benny Andersson, Stig Anderson & Björn Ulvaeus             3.0
...                                                 ...             ...
3114                                  Michael Rosenberg             1.0
3115  Dua Lipa, Clarence Coffee Jr, Sarah Hudson & S...             4.0
3116  A Lasry/J Lawrence & C Trenet, Charles Trenet ...             4.0
3117  Supergrass, Daniel Goffey, Gareth Coombes & Mi...             4.0
3118                                                NaN             NaN

[3061 rows x 2 columns]


In [40]:
# np.where picks one value where the condition is True and another where it is False
is_harry_styles = wrkData["artist"].eq("Harry Styles")
on_tour = np.where(is_harry_styles, "yes", "no")
# List the "yes" entries. The numbers are positions in the on_tour array,
# not index labels of wrkData.
[(pos, flag) for pos, flag in enumerate(on_tour) if flag == "yes"]

[(1678, 'yes'), (1685, 'yes'), (2780, 'yes')]

In [74]:
# Rap genre are unsafe
# na=False matters: for a missing genre str.contains returns NaN, and np.where
# treats NaN as truthy, which would label those rows "unsafe" even though
# their genre does not contain "Rap".
is_rap = wrkData["genre"].str.contains("Rap", na=False)
wrkData["SFW"] = np.where(is_rap, "unsafe", "safe")

# Test by printing all songs by Black Eyed Peas
print(wrkData[wrkData["artist"] == "Black Eyed Peas"]
      .loc[:, ["artist", "genre", "SFW"]])


               artist        genre     SFW
1666  Black Eyed Peas          Pop    safe
1667  Black Eyed Peas  Hip-Hop/Rap  unsafe
1668  Black Eyed Peas          Pop    safe
1669  Black Eyed Peas          Pop    safe
1670  Black Eyed Peas          Pop    safe
3077  Black Eyed Peas          Pop    safe
