In this notebook, we create and export the inputs for the regression, namely the *relative relevance scores* for each month and topic. 

In [2]:
import pandas as pd

# Import data

In [3]:
# Read the CSV that holds the topic relevance per document into a data frame
df = pd.read_csv(filepath_or_buffer='billboard_hot_100_2004_2024_with_cnet_topic_loadings.csv')

# Preview the first rows
df.head()

Unnamed: 0,year,ranking,title,artist,lyrics,song_url,producers,writers,label,released_on,...,USA,Electro-Pop,Electronic,lyrics_lower,lyrics_lemmatized,relative_relevance_health,relative_relevance_justice,relative_relevance_money,relative_relevance_police,relative_relevance_politics
0,2004,1,Yeah!,Usher featuring Lil Jon and Ludacris,"[Intro: USHER & Lil Jon]\nPeace up, A-Town dow...",https://genius.com/usher-yeah-lyrics,['Lil Jon'],"['Sean Garrett', 'Rob McDowell', 'Patrick “J. ...","['Arista Records', 'LaFace Records']",2004-01-27,...,1,0,0,usher lil jon peace town yeah yeah okay okay u...,usher lil jon peace town yeah yeah okay okay u...,0.0,0.0,0.0,0.0,0.0
1,2004,2,Burn,Usher,"[Intro]\nGirl, I understand why\nSee, it's bur...",https://genius.com/usher-burn-lyrics,"['Jermaine Dupri', 'Bryan-Michael Cox']","['Jermaine Dupri', 'Bryan-Michael Cox', 'USHER']","['LaFace Records', 'Arista Records']",2004-03-21,...,1,0,0,girl understand see burnin hold know somethin ...,girl understand see burnin hold know somethin ...,0.029409,0.0,0.0,0.0,0.0
2,2004,3,If I Ain't Got You,Alicia Keys,"[Intro]\nMm-mm\nMm-mm, oh\nMm-mm\n[Verse 1]\nS...",https://genius.com/alicia-keys-if-i-aint-got-y...,['Alicia Keys'],['Alicia Keys'],['J Records'],2004-02-17,...,1,0,0,mm mm mm mm oh mm mm people live fortune peopl...,mm mm mm mm oh mm mm people live fortune peopl...,0.098985,0.180572,0.0,0.03826,0.0
3,2004,5,The Way You Move,OutKast featuring Sleepy Brown,"[Intro: Big Boi]\nBoom, boom, boom\nHa-ha!\n[V...",https://genius.com/outkast-the-way-you-move-ly...,"['Big Boi', 'Carl Mo']","['Sleepy Brown', 'Big Boi', 'Carl Mo']","['Arista Records', 'LaFace Records']",2003-09-23,...,1,0,0,big boi boom boom boom ha ha big boi ready act...,big boi boom boom boom ha ha big boi ready act...,0.0,0.0,0.012878,0.0,0.0
4,2004,6,The Reason,Hoobastank,[Verse 1]\nI'm not a perfect person\nThere's m...,https://genius.com/hoobastank-the-reason-lyrics,['Howard Benson'],"['Dan Estrin', 'Douglas Robb']","['Def Jam Recordings', 'Island Records']",2003-12-09,...,1,0,0,perfect person many things wish continue learn...,perfect person many thing wish continue learni...,0.0,0.0,0.0,0.0,0.0


# Creation of inputs for the regression

Below, we create data frames where each row is a release month (year-month), and each column holds the mean relative relevance score for a certain topic, averaged over the songs released in that month. The per-song scores were obtained with the ConceptNet method.

In [4]:
# Columns to aggregate: every column whose name starts with the relative relevance prefix
relevance_cols = [name for name in df.columns if name.startswith('relative_relevance_')]

# Keys that identify one release month
cols_groupby = ['release_year', 'release_month', 'release_year_month']

# Keep the grouping keys plus the scores, then average the scores within each release month.
# as_index=False keeps the grouping keys as regular columns of the result.
df_subset = df.loc[:, cols_groupby + relevance_cols]
df_grouped = df_subset.groupby(cols_groupby, as_index=False).mean()

df_grouped.head()

Unnamed: 0,release_year,release_month,release_year_month,relative_relevance_health,relative_relevance_justice,relative_relevance_money,relative_relevance_police,relative_relevance_politics
0,2003,2,2003-02,0.040572,0.023197,0.0,0.004915,0.0
1,2003,3,2003-03,0.0,0.043855,0.0,0.040851,0.0
2,2003,6,2003-06,0.0,0.0,0.008137,0.0,0.0
3,2003,7,2003-07,0.009288,0.0,0.006643,0.0,0.0
4,2003,8,2003-08,0.015498,0.013493,0.033819,0.0,0.0


In [6]:
# Write the per-month mean scores to CSV, leaving out the row index
df_grouped.to_csv(path_or_buf='cnet_regression_inputs.csv', index=False)

Now, we only consider those songs tagged as "USA".

In [5]:
# Keep only the rows whose "USA" flag equals 1
df_usa = df.loc[df['USA'].eq(1)]

# Columns to aggregate: every column whose name starts with the relative relevance prefix
relevance_cols = [name for name in df.columns if name.startswith('relative_relevance_')]

# Keys that identify one release month
cols_groupby = ['release_year', 'release_month', 'release_year_month']

# Keep the grouping keys plus the scores of the USA rows, then average the scores
# within each release month. as_index=False keeps the keys as regular columns.
df_subset_usa = df_usa.loc[:, cols_groupby + relevance_cols]
df_grouped_usa = df_subset_usa.groupby(cols_groupby, as_index=False).mean()

df_grouped_usa.head()

Unnamed: 0,release_year,release_month,release_year_month,relative_relevance_health,relative_relevance_justice,relative_relevance_money,relative_relevance_police,relative_relevance_politics
0,2003,3,2003-03,0.0,0.043855,0.0,0.040851,0.0
1,2003,6,2003-06,0.0,0.0,0.008137,0.0,0.0
2,2003,8,2003-08,0.014228,0.015999,0.0,0.0,0.0
3,2003,9,2003-09,0.030631,0.0,0.002146,0.0,0.0
4,2003,10,2003-10,0.009923,0.049236,0.012928,0.002096,0.0


In [7]:
# Write the per-month mean scores of the USA-tagged songs to CSV, leaving out the row index
df_grouped_usa.to_csv(path_or_buf='cnet_regression_inputs_usa.csv', index=False)