In [127]:
import sqlite3
import pandas as pd
# Open the Billboard 200 SQLite database; `conn` is the handle the query cell reads from.
conn = sqlite3.connect(database='data/billboard-200.db')

In [128]:
# Read every row of the `albums` table, then skip the first returned row.
# NOTE(review): the first row is presumably a blank/placeholder record — confirm.
albums_table = pd.read_sql_query(sql='select * from albums', con=conn).iloc[1:]

## Cleaning

Convert `rank` to an integer, parse `date` as a datetime, and add a column named `power` that scores each album's chart position as `1 / rank`. A higher chart position (a smaller rank number) means more power.

In [129]:
# Normalise dtypes and derive the per-appearance score in a single chained step.
# The entries are evaluated in order, so `power` sees the already-numeric `rank`:
#   rank  -> smallest signed integer dtype that holds the values
#   power -> reciprocal of rank (rank 1 scores 1.0, rank 2 scores 0.5, ...)
#   date  -> pandas datetime
albums_table = albums_table.assign(
    rank=lambda frame: pd.to_numeric(frame['rank'], downcast='signed'),
    power=lambda frame: 1 / frame['rank'],
    date=lambda frame: pd.to_datetime(frame['date']),
)

In [130]:
# Summarise column dtypes, non-null counts and memory usage of the cleaned table.
albums_table.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 573946 entries, 1 to 573946
Data columns (total 8 columns):
 #   Column        Non-Null Count   Dtype         
---  ------        --------------   -----         
 0   id            573946 non-null  int64         
 1   date          573946 non-null  datetime64[ns]
 2   artist        573946 non-null  object        
 3   album         573946 non-null  object        
 4   rank          573946 non-null  int16         
 5   length        492547 non-null  float64       
 6   track_length  468867 non-null  float64       
 7   power         573946 non-null  float64       
dtypes: datetime64[ns](1), float64(3), int16(1), int64(1), object(2)
memory usage: 31.7+ MB


In [131]:
# Preview the first rows of the cleaned table (head() defaults to five rows).
albums_table.head()

Unnamed: 0,id,date,artist,album,rank,length,track_length,power
1,2,2019-01-19,A Boogie Wit da Hoodie,Hoodie SZN,1,20.0,185233.8,1.0
2,3,2019-01-19,21 Savage,I Am > I Was,2,15.0,211050.733333,0.5
3,4,2019-01-19,Soundtrack,Spider-Man: Into The Spider-Verse,3,13.0,190866.384615,0.333333
4,5,2019-01-19,Meek Mill,Championships,4,19.0,219173.894737,0.25
5,6,2019-01-19,Post Malone,beerbongs & bentleys,5,18.0,214113.611111,0.2


In [138]:
# fill in missing artist names
# Rows of the albums listed below whose `artist` is an empty string get the
# mapped artist name. Only the `artist` cell of those rows is written (one
# .loc assignment per album), so no other column of the matching rows is
# rewritten and an empty string in any other column is left alone.
missing_artists = {
    'Silhouette': 'Kenny G',
    'Roots Of Country Music (1965)': 'Various Artists',
}
for album_title, artist_name in missing_artists.items():
    # Rows of this album that still have a blank artist.
    blank_artist = (albums_table['album'] == album_title) & (albums_table['artist'] == '')
    albums_table.loc[blank_artist, 'artist'] = artist_name

## Organizing and Refactoring

Create two subtables: one to represent every album that appears in the rankings, and one to represent each artist that appears.

In [139]:
# albums: one summary row per (album, artist) pair.
# Each keyword names an output column and gives the (source column, reducer)
# that produces it; the grouping keys are moved back into ordinary columns
# by the final reset_index().
albums = (
    albums_table
    .groupby(['album', 'artist'])
    .agg(
        power_rank=('power', 'sum'),
        num_appearances=('rank', 'count'),
        average_rank=('rank', 'mean'),
        first_appearance=('date', 'min'),
        last_appearance=('date', 'max'),
        length=('length', 'first'),
        track_length=('track_length', 'first'),
    )
    .reset_index()
)

In [142]:
# artists: one summary row per artist, indexed by artist name.
# Each keyword names an output column and gives the (source column, reducer)
# that produces it; `num_albums` counts distinct album titles for the artist.
artists = (
    albums_table
    .groupby('artist')
    .agg(
        power_rank=('power', 'sum'),
        num_appearances=('rank', 'count'),
        average_rank=('rank', 'mean'),
        num_albums=('album', 'nunique'),
        first_appearance=('date', 'min'),
        last_appearance=('date', 'max'),
    )
)

In [147]:
# Optional export of the two summary tables to CSV. Currently disabled:
# uncomment the lines below to write the files.
# artists.to_csv('data/artists.csv')
# albums.to_csv('data/albums.csv')

In [151]:
# Show the 30 albums with the largest summed power (power_rank), highest first.
albums.sort_values('power_rank', ascending=False).head(30)

Unnamed: 0,album,artist,power_rank,num_appearances,average_rank,first_appearance,last_appearance,length,track_length
32333,Thriller,Michael Jackson,53.968268,348,100.962644,1982-12-25,2019-01-19,30.0,282689.866667
405,21,Adele,49.533427,396,76.05303,2011-03-12,2019-01-19,11.0,261895.454545
24520,Rumours,Fleetwood Mac,41.450505,303,97.943894,1977-02-26,2019-01-19,58.0,
7259,Days Of Wine And Roses,Andy Williams,36.437628,122,39.491803,1963-01-05,1965-06-05,12.0,162825.333333
22496,Please Hammer Don't Hurt 'Em,M.C. Hammer,32.843873,108,43.62963,1990-03-10,1992-05-02,,
31296,The Sound Of Music,Soundtrack,31.898269,238,40.281513,1965-03-20,2015-04-04,16.0,169718.875
4491,Born In The U.S.A.,Bruce Springsteen,31.812714,143,44.797203,1984-06-23,2015-07-25,12.0,240010.75
15031,Jagged Little Pill,Alanis Morissette,31.546801,127,34.740157,1995-07-01,2016-05-28,13.0,237861.307692
23004,Purple Rain (Soundtrack),Prince And The Revolution,29.410815,126,83.666667,1984-07-14,2019-01-05,9.0,
7742,Dirty Dancing,Soundtrack,28.610932,96,41.697917,1987-09-19,1989-07-15,13.0,203054.0


In [152]:
# Show the 30 artists with the largest summed power (power_rank), highest first.
artists.sort_values('power_rank', ascending=False).head(30)

Unnamed: 0_level_0,power_rank,num_appearances,average_rank,num_albums,first_appearance,last_appearance
artist,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Soundtrack,1031.829957,25177,93.536799,1576,1963-01-05,2019-01-19
Various Artists,358.729772,14867,109.389386,1599,1963-01-05,2019-01-12
The Beatles,238.403536,2786,93.643575,58,1964-02-01,2019-01-19
The Rolling Stones,133.286723,1879,82.704098,54,1964-06-27,2019-01-19
Barbra Streisand,113.40384,1864,68.194206,61,1963-01-05,2018-11-24
Garth Brooks,112.018831,1323,71.811036,21,1990-05-12,2018-01-13
Michael Jackson,110.760227,1631,100.090129,29,1972-02-19,2019-01-19
Mariah Carey,92.423169,886,69.568849,21,1990-06-30,2019-01-12
Elton John,91.140103,1745,88.152436,47,1970-10-03,2019-01-19
Taylor Swift,90.10503,1165,76.998283,9,2006-11-11,2019-01-19
