Need to group all songs by genre, sort into three buckets, try to make them even, about 1000 each if possible


In [4]:
import pandas as pd
import numpy as np
from ast import literal_eval
import json

import nltk
from nltk.tokenize import wordpunct_tokenize

from textstat import flesch_reading_ease
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

import repetition_matrix

GENRES = ['pop', 'rap', 'rock']

In [7]:
df_chart_genres = pd.read_csv('data/chart_with_genres_14000_full.csv', usecols=['song', 'artist', 'genres'])

  df_chart_genres = pd.read_csv('data/chart_with_genres_14000_full.csv', usecols=['song', 'artist', 'genres'])


In [8]:
df_chart_genres

Unnamed: 0,song,artist,genres
0,Easy On Me,Adele,"['british soul', 'pop', 'pop soul', 'uk pop']"
1,Stay,The Kid LAROI & Justin Bieber,"['australian hip hop', 'pop']"
2,Industry Baby,Lil Nas X & Jack Harlow,"['lgbtq+ hip hop', 'pop']"
3,Fancy Like,Walker Hayes,"['contemporary country', 'country', 'country r..."
4,Bad Habits,Ed Sheeran,"['pop', 'uk pop']"
...,...,...,...
330082,Over And Over,Thurston Harris,
330083,I Believe In You,Robert & Johnny,
330084,Little Serenade,The Ames Brothers,
330085,I'll Get By (As Long As I Have You),Billy Williams,


In [9]:
def decide_genre(genre_list):
    genre_dict = dict(zip(GENRES, [False]*3))
    for genre in genre_list:
        for g in GENRES:
            if g in genre:
                genre_dict[g] = True
    
    present_genres = []
    for genre, present in genre_dict.items():
        if present:
            present_genres.append(genre)
    if len(present_genres) == 1:
        return present_genres[0]
    return None

In [83]:
seen_songs = set()
genre_dict = dict(zip(GENRES, [[],[],[]]))
for song, artist, genre_list in df_chart_genres.to_numpy():
    song = f"{song} by {artist}"
    if song not in seen_songs:
        seen_songs.add(song)
        try:
            genre_list = literal_eval(genre_list)
            decided_genre = decide_genre(genre_list)
            if decided_genre and len(genre_dict[decided_genre]) < 1000:
                genre_dict[decided_genre].append(song)
        except:
            continue

In [84]:
genre_dict

{'pop': ['Easy On Me by Adele',
  'Stay by The Kid LAROI & Justin Bieber',
  'Industry Baby by Lil Nas X & Jack Harlow',
  'Bad Habits by Ed Sheeran',
  'Shivers by Ed Sheeran',
  'Good 4 U by Olivia Rodrigo',
  'Need To Know by Doja Cat',
  'Levitating by Dua Lipa',
  'Essence by Wizkid Featuring Justin Bieber & Tems',
  'Kiss Me More by Doja Cat Featuring SZA',
  'Heat Waves by Glass Animals',
  'You Right by Doja Cat & The Weeknd',
  'Save Your Tears by The Weeknd & Ariana Grande',
  'Traitor by Olivia Rodrigo',
  'My Universe by Coldplay x BTS',
  'Meet Me At Our Spot by THE ANXIETY: WILLOW & Tyler Cole',
  'Montero (Call Me By Your Name) by Lil Nas X',
  'Chasing After You by Ryan Hurd With Maren Morris',
  'Moth To A Flame by Swedish House Mafia & The Weeknd',
  'Thats What I Want by Lil Nas X',
  'Happier Than Ever by Billie Eilish',
  'Better Days by NEIKED X Mae Muller X Polo G',
  'Ghost by Justin Bieber',
  'A-O-K by Tai Verdes',
  'Leave The Door Open by Silk Sonic (Bruno M

In [85]:
with open("data/songs_grouped_by_genre.json", "x") as f:
    json.dump(genre_dict, f)

FileExistsError: [Errno 17] File exists: 'data/songs_grouped_by_genre.json'

repetitiveness, duration, tempo, uniqueness (avg. tf-idf),  readability

In [86]:
df_all_songs = pd.read_json('data/all_songs.json', orient='index')
df_all_songs

Unnamed: 0,song,artist,chorus
0,Blinding Lights by The Weeknd,The Weeknd,"[I said, ooh, I'm blinded by the lights, No, I..."
1,Radioactive by Imagine Dragons,Imagine Dragons,"[Whoa-oh, whoa, I'm radioactive, radioactive, ..."
2,Sail by AWOLNATION,AWOLNATION,"[Sail, Sail, Sail, Sail, Sail]"
3,How Do I Live by LeAnn Rimes,LeAnn Rimes,"[How do I live without you? I want to know, Ho..."
4,Counting Stars by OneRepublic,OneRepublic,"[Lately, I've been, I've been losin' sleep, Dr..."
...,...,...,...
12396,Cold Beer Drinker by Luke Bryan,Luke Bryan,"[I'm just a cold beer drinker, check out the c..."
12397,Who Can I Count On by Patsy Cline,Patsy Cline,"[Who can I count on, if I can't count on you?,..."
12398,It's All Right by Sam Cooke,Sam Cooke,"[Whoa, it's all right (It's all right), It's a..."
12399,Broken Hearted by The Miracles,The Miracles,"[Broken hearted, ooh since we parted, Just say..."


In [87]:
song2chorus = {song: chorus for song, _, chorus in df_all_songs.to_numpy()}
song2chorus

{'Blinding Lights by The Weeknd': ["I said, ooh, I'm blinded by the lights",
  "No, I can't sleep until I feel your touch",
  "I said, ooh, I'm drowning in the night",
  "Oh, when I'm like this, you're the one I trust",
  'Hey, hey, hey'],
 'Radioactive by Imagine Dragons': ['Whoa-oh, whoa',
  "I'm radioactive, radioactive",
  'Whoa-oh, whoa',
  "I'm radioactive, radioactive"],
 'Sail by AWOLNATION': ['Sail', 'Sail', 'Sail', 'Sail', 'Sail'],
 'How Do I Live by LeAnn Rimes': ['How do I live without you? I want to know',
  'How do I breathe without you if you ever go?',
  'How do I ever, ever survive?',
  'How do I, how do I, oh, how do I live?'],
 'Counting Stars by OneRepublic': ["Lately, I've been, I've been losin' sleep",
  "Dreamin' about the things that we could be",
  "But baby, I've been, I've been prayin' hard",
  "Said no more countin' dollars, we'll be countin' stars",
  "Lately, I've been, I've been losin' sleep",
  "Dreamin' about the things that we could be",
  "But baby, I

In [88]:
def readability(chorus):
    score = flesch_reading_ease(chorus)
    if score > 100:
        score = 100
    elif score < 0:
        score = 0
    return score / 100

def flatten(nested_listed):
    return [item for sublist in nested_listed for item in sublist]

# Calculate tf–idf for all words
vectorizer1 = TfidfVectorizer()
choruses = list(filter(lambda chorus: chorus, [' '.join(song2chorus[song]) if song in song2chorus.keys() else None for song in flatten(list(genre_dict.values()))]))
vectors = vectorizer1.fit_transform(choruses)
feature_names = vectorizer1.get_feature_names()
dense = vectors.todense()
dense_list = dense.tolist()
df_tfidf = pd.DataFrame(dense_list, columns=feature_names)

# Counts
vectorizer2 = CountVectorizer()
vectors = vectorizer2.fit_transform(choruses)
df_counts = pd.DataFrame(vectors.toarray(), columns=feature_names)

def uniqueness(chorus):
    this_chorus_index = choruses.index(chorus) if chorus in choruses else 0
    # print(chorus in choruses)

    this_row_tfidf = df_tfidf.iloc[this_chorus_index].to_list()
    this_row_counts = df_counts.iloc[this_chorus_index].to_list()
    word_count = 0
    total_tfidf = 0
    for tfidf, count in zip(this_row_tfidf, this_row_counts):
        word_count += count
        total_tfidf += tfidf * count
    return total_tfidf / word_count

def create_dict_list_from_song_list(song_list):
    # choruses = [song2chorus[song] if song in song2chorus.keys() else None for song in flatten(list(genre_dict.values()))]
    print(choruses)
    dict_list = [{song: {'readability': readability(chorus), 'uniqueness': uniqueness(chorus)}} for song, chorus in zip(song_list, choruses)]
    return dict_list



In [89]:
genre_dict

{'pop': ['Easy On Me by Adele',
  'Stay by The Kid LAROI & Justin Bieber',
  'Industry Baby by Lil Nas X & Jack Harlow',
  'Bad Habits by Ed Sheeran',
  'Shivers by Ed Sheeran',
  'Good 4 U by Olivia Rodrigo',
  'Need To Know by Doja Cat',
  'Levitating by Dua Lipa',
  'Essence by Wizkid Featuring Justin Bieber & Tems',
  'Kiss Me More by Doja Cat Featuring SZA',
  'Heat Waves by Glass Animals',
  'You Right by Doja Cat & The Weeknd',
  'Save Your Tears by The Weeknd & Ariana Grande',
  'Traitor by Olivia Rodrigo',
  'My Universe by Coldplay x BTS',
  'Meet Me At Our Spot by THE ANXIETY: WILLOW & Tyler Cole',
  'Montero (Call Me By Your Name) by Lil Nas X',
  'Chasing After You by Ryan Hurd With Maren Morris',
  'Moth To A Flame by Swedish House Mafia & The Weeknd',
  'Thats What I Want by Lil Nas X',
  'Happier Than Ever by Billie Eilish',
  'Better Days by NEIKED X Mae Muller X Polo G',
  'Ghost by Justin Bieber',
  'A-O-K by Tai Verdes',
  'Leave The Door Open by Silk Sonic (Bruno M

In [90]:
df_counts

Unnamed: 0,100k,106you,12,14,1600,16th,1embed,20,2001,2006,...,일도,잃은,잡아,차가운,천천히,침묵하는,팔레트,하루,하루가,홀린
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
507,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
508,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
509,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
510,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [91]:
' '.join([
            "Lately, I've been, I've been losin' sleep",
            "Dreamin' about the things that we could be",
            "But baby, I've been, I've been prayin' hard",
            "Said no more countin' dollars, we'll be countin' stars",
            "Lately, I've been, I've been losin' sleep",
            "Dreamin' about the things that we could be",
            "But baby, I've been, I've been prayin' hard",
            "Said no more countin' dollars, we'll be, we'll be countin' stars"
        ])

"Lately, I've been, I've been losin' sleep Dreamin' about the things that we could be But baby, I've been, I've been prayin' hard Said no more countin' dollars, we'll be countin' stars Lately, I've been, I've been losin' sleep Dreamin' about the things that we could be But baby, I've been, I've been prayin' hard Said no more countin' dollars, we'll be, we'll be countin' stars"

In [92]:
uniqueness("Lately, I've been, I've been losin' sleep Dreamin' about the things that we could be But baby, I've been, I've been prayin' hard Said no more countin' dollars, we'll be countin' stars Lately, I've been, I've been losin' sleep Dreamin' about the things that we could be But baby, I've been, I've been prayin' hard Said no more countin' dollars, we'll be, we'll be countin' stars")

0.2010214856990363

In [93]:
genre_dict

{'pop': ['Easy On Me by Adele',
  'Stay by The Kid LAROI & Justin Bieber',
  'Industry Baby by Lil Nas X & Jack Harlow',
  'Bad Habits by Ed Sheeran',
  'Shivers by Ed Sheeran',
  'Good 4 U by Olivia Rodrigo',
  'Need To Know by Doja Cat',
  'Levitating by Dua Lipa',
  'Essence by Wizkid Featuring Justin Bieber & Tems',
  'Kiss Me More by Doja Cat Featuring SZA',
  'Heat Waves by Glass Animals',
  'You Right by Doja Cat & The Weeknd',
  'Save Your Tears by The Weeknd & Ariana Grande',
  'Traitor by Olivia Rodrigo',
  'My Universe by Coldplay x BTS',
  'Meet Me At Our Spot by THE ANXIETY: WILLOW & Tyler Cole',
  'Montero (Call Me By Your Name) by Lil Nas X',
  'Chasing After You by Ryan Hurd With Maren Morris',
  'Moth To A Flame by Swedish House Mafia & The Weeknd',
  'Thats What I Want by Lil Nas X',
  'Happier Than Ever by Billie Eilish',
  'Better Days by NEIKED X Mae Muller X Polo G',
  'Ghost by Justin Bieber',
  'A-O-K by Tai Verdes',
  'Leave The Door Open by Silk Sonic (Bruno M

In [94]:
for genre, genre_cluster in genre_dict.items():
    genre_dict[genre] = create_dict_list_from_song_list(genre_cluster)




In [95]:
genre_dict

{'pop': [{'Easy On Me by Adele': {'readability': 0.7724,
    'uniqueness': 0.2010214856990363}},
  {'Stay by The Kid LAROI & Justin Bieber': {'readability': 0.2107,
    'uniqueness': 0.1351845727632272}},
  {'Industry Baby by Lil Nas X & Jack Harlow': {'readability': 0.275,
    'uniqueness': 0.17456676959568806}},
  {'Bad Habits by Ed Sheeran': {'readability': 0.6573,
    'uniqueness': 0.24119685985945924}},
  {'Shivers by Ed Sheeran': {'readability': 0.5457,
    'uniqueness': 0.18953333703726105}},
  {'Good 4 U by Olivia Rodrigo': {'readability': 0.6133,
    'uniqueness': 0.24824557610547643}},
  {'Need To Know by Doja Cat': {'readability': 0.6506000000000001,
    'uniqueness': 0.23035170601783383}},
  {'Levitating by Dua Lipa': {'readability': 0.6303,
    'uniqueness': 0.24028604959604133}},
  {'Essence by Wizkid Featuring Justin Bieber & Tems': {'readability': 0.4238,
    'uniqueness': 0.5190890286534412}},
  {'Kiss Me More by Doja Cat Featuring SZA': {'readability': 1.0,
    'uniqu

In [96]:
with open('data/unique_read.json', 'x') as json_file:
    json.dump(genre_dict, json_file)

In [118]:
uniqueness_dict = dict(zip(['pop', 'rap', 'rock'], [0,0,0]))
readability_dict = dict(zip(['pop', 'rap', 'rock'], [0,0,0]))

In [119]:
pop_counter = rap_counter = rock_counter = 0

for genre in ['pop', 'rap', 'rock']:
    for song in genre_dict[genre]:
        # print(genre_dict[genre])
        this_genre_list = [{'song': list(song.keys())[0], 'readability': list(song.values())[0]['readability'], 'uniqueness': list(song.values())[0]['uniqueness']} for song in genre_dict[genre]]
        # print(this_genre_dict)
        for song_dict in this_genre_list:
            readability_dict[genre] += song_dict['readability']
            if genre == 'pop':
                pop_counter += 1
            elif genre == 'rap':
                rap_counter += 1
            elif genre == 'rock':
                rock_counter += 1

In [120]:
readability_dict['pop'] /= pop_counter
readability_dict['rap'] /= rap_counter
readability_dict['rock'] /= rock_counter

In [122]:
readability_dict

{'pop': 0.5259932773108245,
 'rap': 0.5337738281249558,
 'rock': 0.5850028985507277}

In [116]:
readability

<function __main__.readability(chorus)>