# DSCI 521 Project
## Data Collection

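In this notebook, I collect the Billboard Hot 100 chart for each year from 2010 to 2020 and add audio features and artist information for every song from the Spotify API.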

In [None]:
from google.colab import drive
drive.mount('/content/gdrive')


!pip install billboard.py # installing billboard package
!pip install tekore #installing Spotify API

import os
from collections import Counter

import matplotlib.pyplot as plt
import pandas as pd
import tekore as tk


Mounted at /content/gdrive
Collecting billboard.py
  Downloading https://files.pythonhosted.org/packages/88/47/0620193719b369221550d7f676c5b5fe3c30649612b42675ee13826ba4fe/billboard.py-6.2.1-py2.py3-none-any.whl
Installing collected packages: billboard.py
Successfully installed billboard.py-6.2.1
Collecting tekore
[?25l  Downloading https://files.pythonhosted.org/packages/b5/95/99dd5ac061fadba1bf9ca8998ee6c80d06f4081b2e17c697971555f78411/tekore-3.7.0-py3-none-any.whl (73kB)
[K     |████████████████████████████████| 81kB 3.6MB/s 
[?25hCollecting httpx<0.18,>=0.11
[?25l  Downloading https://files.pythonhosted.org/packages/6f/87/241b482701ba3d58831a4af50bf2880259dc7c11a89504a3b0d70fa298cd/httpx-0.17.1-py3-none-any.whl (68kB)
[K     |████████████████████████████████| 71kB 3.8MB/s 
[?25hCollecting sniffio
  Downloading https://files.pythonhosted.org/packages/52/b0/7b2e028b63d092804b6794595871f936aafa5e9322dcaaad50ebf67445b3/sniffio-1.2.0-py3-none-any.whl
Collecting httpcore<0.13,>=0.1

In [None]:
#getting the hot 100 songs for each year using billboard package
import billboard
def getChart(year_):
  """Fetch the 'hot-100-songs' Billboard chart for one year as a DataFrame.

  Parameters
  ----------
  year_
      Chart year, passed straight to ``billboard.ChartData`` as ``year``.

  Returns
  -------
  pandas.DataFrame
      One row per chart entry, in the order ``chart.entries`` yields them,
      with columns ``Song`` (entry title) and ``Artist`` (entry artist).
  """
  chart = billboard.ChartData('hot-100-songs', year = year_)
  # Build both columns in one constructor call instead of growing helper lists.
  return pd.DataFrame({
      'Song': [entry.title for entry in chart.entries],
      'Artist': [entry.artist for entry in chart.entries],
  })


# Fetch the chart for every year from 2010 through 2020, tag each with its year,
# and concatenate once. Calling pd.concat inside the loop would re-copy all
# previously collected rows on every iteration, and 2010 needs no special case.
yearlyCharts = [getChart(year).assign(Year=year) for year in range(2010, 2021)]
mergedData = pd.concat(yearlyCharts, ignore_index=True)


In [None]:
## Now using Spotify API to get the features for the songs

# Credentials are read from the environment so they are never stored in the notebook.
# Set SPOTIFY_CLIENT_ID and SPOTIFY_CLIENT_SECRET before running this cell
# (or replace the '' fallbacks locally and do not share the edited notebook).
client_id = os.environ.get('SPOTIFY_CLIENT_ID', '')  # your Spotify app's client ID
client_secret = os.environ.get('SPOTIFY_CLIENT_SECRET', '')  # your Spotify app's client secret
app_token = tk.request_client_token(client_id, client_secret)
spotify = tk.Spotify(app_token)


In [None]:
## I created a spotify playlist for each of the years and the IDs are in the list below
## Yes, there probably was an easier way to get all the songs without manually creating the playlists, but I am my own worst enemy
playlistID = ['5JqDZ2S5cLj3OYL5buxvZQ','7o4icBSvSpL5ATAChnGCdo','1bJzdBdgiENBlst3buzAoT','5IL1MMhloP0ipW68tLoSBs','1DuL8zgvX5J3CN85rMP4Gl','6u4K2ZL09o2oQdxfN6i7Hm','7zMKIWSPN13I1mLZlZE5CB','5sop1gPF80py1YLBFLkeCk','3CFOuTVzEkLChzZxpfTq1k','1fqWYvTWga2MospXXNGa7e','3VojwtzoD9TLD1ojR9ZbxW']

# Audio-feature attributes copied from every track; each becomes a column of the
# same name in mergedData, in this order.
AUDIO_FEATURE_COLUMNS = [
  'acousticness', 'danceability', 'duration_ms', 'energy', 'instrumentalness', 'key',
  'liveness', 'loudness', 'speechiness', 'tempo', 'time_signature', 'valence',
]
ARTIST_BATCH_SIZE = 50  # only 50 artists can be requested at a time

ids = []
audio_features = {column: [] for column in AUDIO_FEATURE_COLUMNS}
popularity = []
genres = []

for playlist_id in playlistID:
  playlist = spotify.playlist(playlist_id)
  # Track IDs and first-listed-artist IDs, in playlist order.
  yearIDs = [item.track.id for item in playlist.tracks.items]
  artistIDs = [item.track.artists[0].id for item in playlist.tracks.items]
  ids.extend(yearIDs)

  for track_features in spotify.tracks_audio_features(yearIDs):
    for column in AUDIO_FEATURE_COLUMNS:
      audio_features[column].append(getattr(track_features, column))

  # Walk the artists in batches of any count; a playlist with 50 or fewer tracks
  # no longer triggers a request with an empty ID list.
  for start in range(0, len(artistIDs), ARTIST_BATCH_SIZE):
    for artist in spotify.artists(artistIDs[start:start + ARTIST_BATCH_SIZE]):
      popularity.append(artist.popularity)
      genres.append(",".join(artist.genres))

# The Spotify values are attached to the Billboard rows purely by position.
# NOTE(review): this assumes every playlist lists its tracks in the same order as
# the corresponding year's chart rows in mergedData -- confirm when editing playlists.
if len(ids) != len(mergedData):
  raise ValueError(
    'Collected %d playlist tracks but mergedData has %d rows; '
    'the playlists and the charts are out of sync.' % (len(ids), len(mergedData)))

mergedData['Year'] = mergedData['Year'].astype(int)
mergedData['SongID'] = ids
for column in AUDIO_FEATURE_COLUMNS:
  mergedData[column] = audio_features[column]
mergedData['artist_popularity'] = popularity
mergedData['artist_genres'] = genres


In [None]:
def _genre_key(label):
  """Return `label` lower-cased with all whitespace removed (the key used for counting)."""
  return "".join(label.lower().split())

# One key per genre occurrence across all rows, and how often each key occurs.
all_genres = [_genre_key(label)
              for row in mergedData['artist_genres']
              for label in row.split(',')]
d = Counter(all_genres)
genres = pd.DataFrame()
genres['allGenres'] = all_genres

# For every song keep the listed genre whose key is most frequent in the whole dataset.
# max() returns the first maximal item, so ties go to the genre listed first.
topgenre = [max(row.split(','), key=lambda label: d[_genre_key(label)])
            for row in mergedData['artist_genres']]


# Store the top genre as a categorical column and expose its integer category code.
mergedData['top_genre'] = topgenre
mergedData['top_genre'] = mergedData['top_genre'].astype('category')
mergedData['genre_code'] = mergedData['top_genre'].cat.codes

In [None]:
# Preview the collected dataset (the notebook renders a truncated view of the rows).
mergedData # the resulting dataset

Unnamed: 0,Song,Artist,Year,SongID,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,speechiness,tempo,time_signature,valence,artist_popularity,artist_genres,top_genre,genre_code
0,TiK ToK,Ke$ha,2010,0HPD5WQqrq7wPWR7P7Dw1i,0.09910,0.755,199693,0.837,0.000000,2,0.2890,-2.718,0.1420,120.028,4,0.714,82,"dance pop,electropop,pop,post-teen pop",pop,36
1,Need You Now,Lady Antebellum,2010,11EX5yhxr9Ihl3IN1asrfK,0.09270,0.587,277573,0.622,0.000636,4,0.2000,-5.535,0.0303,107.943,4,0.231,74,"contemporary country,country,country dawn,coun...",contemporary country,12
2,"Hey, Soul Sister",Train,2010,4HlFJV71xXKIGcU3kRyttv,0.18500,0.673,216773,0.886,0.000000,1,0.0826,-4.440,0.0431,97.012,4,0.795,78,"dance pop,neo mellow,pop,pop rock,post-teen pop",pop,36
3,California Gurls,Katy Perry Featuring Snoop Dogg,2010,6tS3XVuOyu10897O3ae7bi,0.00446,0.791,234653,0.754,0.000000,0,0.1630,-3.729,0.0569,125.014,4,0.425,88,"dance pop,pop,post-teen pop",pop,36
4,OMG,Usher Featuring will.i.am,2010,1bM50INir8voAkVoKuvEUI,0.19800,0.781,269493,0.745,0.000011,4,0.3600,-5.810,0.0332,129.998,4,0.326,84,"atl hip hop,dance pop,pop,r&b,south carolina h...",pop,36
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1093,More Than My Hometown,Morgan Wallen,2020,0eBXyY4SatzpE7opnzgXvz,0.60100,0.621,216573,0.882,0.000000,6,0.1320,-5.010,0.0459,126.014,4,0.574,88,contemporary country,contemporary country,12
1094,Lovin' On You,Luke Combs,2020,0nYvjcSlCgjcwogQAwIwNp,0.00165,0.572,194867,0.949,0.000195,4,0.1630,-4.865,0.0600,118.974,4,0.530,85,contemporary country,contemporary country,12
1095,Said Sum,Moneybagg Yo,2020,3sKz6Sd72K0ofPWcJPPk6H,0.01850,0.929,155168,0.667,0.000000,8,0.1000,-6.789,0.3530,126.998,4,0.274,84,"hip hop,memphis hip hop,rap,southern hip hop,t...",rap,42
1096,Slide,H.E.R. Featuring YG,2020,2rTnVB1bvwxHtaIl4uVu7f,0.08070,0.827,238321,0.469,0.000008,10,0.2070,-9.141,0.3410,97.028,4,0.196,83,"alternative r&b,dance pop,pop,r&b",pop,36


In [None]:
# Summarise the columns: non-null counts and dtypes.
mergedData.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1098 entries, 0 to 1097
Data columns (total 20 columns):
 #   Column             Non-Null Count  Dtype   
---  ------             --------------  -----   
 0   Song               1098 non-null   object  
 1   Artist             1098 non-null   object  
 2   Year               1098 non-null   int64   
 3   SongID             1098 non-null   object  
 4   acousticness       1098 non-null   float64 
 5   danceability       1098 non-null   float64 
 6   duration_ms        1098 non-null   int64   
 7   energy             1098 non-null   float64 
 8   instrumentalness   1098 non-null   float64 
 9   key                1098 non-null   int64   
 10  liveness           1098 non-null   float64 
 11  loudness           1098 non-null   float64 
 12  speechiness        1098 non-null   float64 
 13  tempo              1098 non-null   float64 
 14  time_signature     1098 non-null   int64   
 15  valence            1098 non-null   float64 
 16  artist

## Adding target data

In [None]:
# 0-based position of every song within its own year's chart.
# The row index cannot stand in for this (e.g. `index % 100`): the frame holds 1098
# rows for 11 years, so not every year contributes exactly 100 songs and the index
# drifts out of step with the real chart position in later years.
# NOTE(review): assumes rows within a year are still in chart order, i.e. the order
# in which they were appended when the charts were collected -- confirm if the
# frame is ever sorted or filtered before this cell.
rank_in_year = mergedData.groupby('Year').cumcount()

top10 = (rank_in_year < 10).astype(int)  # exactly the first ten songs of each year
mergedData['top10'] = top10

top5 = (rank_in_year < 5).astype(int)  ## adding a top 5 category
mergedData['top5'] = top5
mergedData

Unnamed: 0,Song,Artist,Year,SongID,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,speechiness,tempo,time_signature,valence,artist_popularity,artist_genres,top_genre,genre_code,top10,top5
0,TiK ToK,Ke$ha,2010,0HPD5WQqrq7wPWR7P7Dw1i,0.09910,0.755,199693,0.837,0.000000,2,0.2890,-2.718,0.1420,120.028,4,0.714,82,"dance pop,electropop,pop,post-teen pop",pop,36,1,1
1,Need You Now,Lady Antebellum,2010,11EX5yhxr9Ihl3IN1asrfK,0.09270,0.587,277573,0.622,0.000636,4,0.2000,-5.535,0.0303,107.943,4,0.231,74,"contemporary country,country,country dawn,coun...",contemporary country,12,1,1
2,"Hey, Soul Sister",Train,2010,4HlFJV71xXKIGcU3kRyttv,0.18500,0.673,216773,0.886,0.000000,1,0.0826,-4.440,0.0431,97.012,4,0.795,78,"dance pop,neo mellow,pop,pop rock,post-teen pop",pop,36,1,1
3,California Gurls,Katy Perry Featuring Snoop Dogg,2010,6tS3XVuOyu10897O3ae7bi,0.00446,0.791,234653,0.754,0.000000,0,0.1630,-3.729,0.0569,125.014,4,0.425,88,"dance pop,pop,post-teen pop",pop,36,1,1
4,OMG,Usher Featuring will.i.am,2010,1bM50INir8voAkVoKuvEUI,0.19800,0.781,269493,0.745,0.000011,4,0.3600,-5.810,0.0332,129.998,4,0.326,84,"atl hip hop,dance pop,pop,r&b,south carolina h...",pop,36,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1093,More Than My Hometown,Morgan Wallen,2020,0eBXyY4SatzpE7opnzgXvz,0.60100,0.621,216573,0.882,0.000000,6,0.1320,-5.010,0.0459,126.014,4,0.574,88,contemporary country,contemporary country,12,0,0
1094,Lovin' On You,Luke Combs,2020,0nYvjcSlCgjcwogQAwIwNp,0.00165,0.572,194867,0.949,0.000195,4,0.1630,-4.865,0.0600,118.974,4,0.530,85,contemporary country,contemporary country,12,0,0
1095,Said Sum,Moneybagg Yo,2020,3sKz6Sd72K0ofPWcJPPk6H,0.01850,0.929,155168,0.667,0.000000,8,0.1000,-6.789,0.3530,126.998,4,0.274,84,"hip hop,memphis hip hop,rap,southern hip hop,t...",rap,42,0,0
1096,Slide,H.E.R. Featuring YG,2020,2rTnVB1bvwxHtaIl4uVu7f,0.08070,0.827,238321,0.469,0.000008,10,0.2070,-9.141,0.3410,97.028,4,0.196,83,"alternative r&b,dance pop,pop,r&b",pop,36,0,0


In [None]:
# Project folder on the Google Drive mounted under /content/gdrive.
path = '/content/gdrive/My Drive/DSCI521/project-final'
csv_file = path + '/data/BillboardData2.csv'
mergedData.to_csv(csv_file)  # save dataframe to folder (the index is written as the first column)