In [1]:
import os
import random

import gensim.downloader as api
import numpy as np
import pandas as pd


In [2]:
# Step 1: Fetch the current Billboard Hot 100 chart
import billboard

chart = billboard.ChartData('hot-100')
songs = chart.entries

# One record per chart entry; the rank is the 1-based position in the list.
data = [
    {'Rank': position, 'Song': entry.title, 'Artist': entry.artist}
    for position, entry in enumerate(songs, start=1)
]
df = pd.DataFrame(data)
df

In [None]:
# Loaded embedding models keyed by name, so the multi-gigabyte word2vec file is
# read once per kernel instead of once per song.
_WORD2VEC_CACHE = {}


def _get_word2vec_model(name='word2vec-google-news-300'):
    """Return the pre-trained embedding `name`, loading it only on first use."""
    if name not in _WORD2VEC_CACHE:
        _WORD2VEC_CACHE[name] = api.load(name)
    return _WORD2VEC_CACHE[name]


def _embed_lyrics(lyrics, model):
    """Average the word vectors of every in-vocabulary, lower-cased lyric token."""
    vectors = [model[word] for word in lyrics.lower().split() if word in model]
    if not vectors:
        # No known words: fall back to zeros so the feature row keeps its width.
        return np.zeros(model.vector_size)
    return np.mean(vectors, axis=0)


def _summarize_audio(y, sr):
    """Collapse librosa's per-frame descriptors into one mean value per feature."""
    summary = {
        'zcr': librosa.feature.zero_crossing_rate(y).mean(),
        'centroid': librosa.feature.spectral_centroid(y=y, sr=sr).mean(),
        'bandwidth': librosa.feature.spectral_bandwidth(y=y, sr=sr).mean(),
    }
    # Multi-row features are (n_components, n_frames): average over frames
    # (axis=1) and emit one column per component (row).
    contrast = librosa.feature.spectral_contrast(y=y, sr=sr)
    summary.update({f'contrast_{i}': v for i, v in enumerate(contrast.mean(axis=1))})
    summary['rolloff'] = librosa.feature.spectral_rolloff(y=y, sr=sr).mean()
    summary['flatness'] = librosa.feature.spectral_flatness(y=y).mean()
    summary['flux'] = librosa.onset.onset_strength(y=y, sr=sr).mean()
    mfccs = librosa.feature.mfcc(y=y, sr=sr)
    summary.update({f'mfcc_{i}': v for i, v in enumerate(mfccs.mean(axis=1))})
    chroma = librosa.feature.chroma_stft(y=y, sr=sr)
    summary.update({f'chroma_{i}': v for i, v in enumerate(chroma.mean(axis=1))})
    return summary


def process_song(row):
    """Run the lyrics + audio pipeline for one chart entry.

    Parameters
    ----------
    row : mapping with 'Rank', 'Song' and 'Artist' keys
        One row of the chart table.

    Returns
    -------
    pandas.DataFrame or None
        A single-row frame indexed by the song's rank, with columns
        `vectors_*` (mean word2vec embedding of the lyrics) followed by the
        time-averaged audio descriptors. None when the lyrics or the audio
        file could not be obtained, so the caller can skip the song.

    Notes
    -----
    The helpers called here are defined in later cells of the notebook, so
    those cells must be executed before this function is called.
    NOTE(review): `librosa` is not imported in any visible cell; add
    `import librosa` to the imports cell before running.
    """
    audio_path = f'Audio Files/{row["Song"]}.mp4'

    # Step 2: Obtain Lyrics
    genius_lyrics_link = get_genius_lyrics_link(row['Song'], row['Artist'])

    # Step 3: Obtain Youtube Link
    audio_link = get_youtube_audio_link(row['Song'], row['Artist'])

    # Step 4: Obtain downloaded audio file
    download_audio(audio_link, row['Rank'], row['Song'])

    try:
        # Step 6: For each link, obtain actual lyrics
        lyrics = get_lyrics(genius_lyrics_link)
        if lyrics is None or not os.path.exists(audio_path):
            print(f'Skipping rank {row["Rank"]}: lyrics or audio unavailable')
            return None

        # Step 7: Featurize the lyrics using a pre-trained word2vec model
        embedding = _embed_lyrics(lyrics, _get_word2vec_model())
        features = {f'vectors_{i}': value for i, value in enumerate(embedding)}

        # Step 8: Featurize the audio file
        y, sr = librosa.load(audio_path)
        features.update(_summarize_audio(y, sr))

        return pd.DataFrame([features], index=[row['Rank']])
    finally:
        # Always clean up the downloaded audio, even if featurization failed.
        if os.path.exists(audio_path):
            os.remove(audio_path)

# Run the per-song pipeline over every chart row, then stack the per-song
# results row-wise into one frame.
# NOTE(review): the helpers process_song relies on are defined in later cells;
# execute those cells before this one.
df_processed_songs_list = [process_song(row) for _, row in df.iterrows()]
df_processed_songs = pd.concat(df_processed_songs_list, axis=0)


In [3]:
# Step 2: Find the Genius lyrics page for a song
from googlesearch import search
import time
def get_genius_lyrics_link(song, artist):
    """Search the web for the song's Genius lyrics page.

    Parameters
    ----------
    song, artist : str
        Title and performer used to build the search query.

    Returns
    -------
    str or None
        The first search hit hosted on genius.com. If none of the hits is a
        Genius page, the top hit is returned as a fallback (the previous
        behaviour); None when the search yields nothing.
    """
    query = f'{song} {artist} genius lyrics'
    # Ask for a few hits: the top result is not always the Genius page.
    results = list(search(query, num_results=5))

    # Randomised pause so consecutive lookups do not hammer the search engine.
    time.sleep(random.uniform(.5, 2))

    for url in results:
        if 'genius.com' in url:
            return url
    return results[0] if results else None




In [4]:
#Step 3: Obtain Youtube Link
from duckduckgo_search import DDGS
from itertools import islice


def get_youtube_audio_link(song, artist):
    """Search DuckDuckGo for a YouTube watch URL for the song.

    Parameters
    ----------
    song, artist : str
        Title and performer used to build the search query.

    Returns
    -------
    str or None
        The first result whose link contains both 'youtube' and 'watch',
        or None when no result matches.
    """
    query = f'{song} {artist} youtube audio'

    # Throttle once per lookup; pausing between results that were already
    # returned only slowed the scan down.
    time.sleep(random.uniform(.5, 2))

    with DDGS() as ddgs:
        for result in ddgs.text(query, backend="lite"):
            # Some results may lack a link; treat those as non-matching.
            href = result.get('href') or ''
            if 'youtube' in href and 'watch' in href:
                return href
    return None
    


In [None]:
#Step 4: Obtain downloaded audio file
from pytube import YouTube
import random
import time
import os

from pytube import YouTube
from pydub import AudioSegment

def download_audio(url, rank, song):
    """Download a video's audio track and export an mp3 copy.

    On success two files exist: `Audio Files/<song>.mp4` (the downloaded
    stream) and `Audio Files/<song>.mp3` (the converted copy). Failures are
    reported on stdout and swallowed so that one bad song does not abort a
    batch run.

    Parameters
    ----------
    url : str
        YouTube watch URL.
    rank : int
        Chart position; only used in the error message.
    song : str
        Song title; used as the base file name.
    """
    try:
        yt = YouTube(url)
        stream = yt.streams.filter(only_audio=True).first()
        if stream is None:
            raise ValueError('no audio-only stream available')

        filename = f'{song}'
        mp4_path = f'Audio Files/{filename}.mp4'

        # pytube versions differ on whether an extension is appended to
        # `filename`, so trust the path it reports and normalise it to
        # `<song>.mp4`.
        downloaded_path = stream.download(output_path='Audio Files', filename=filename)
        if os.path.abspath(downloaded_path) != os.path.abspath(mp4_path):
            os.replace(downloaded_path, mp4_path)

        # Convert the downloaded file from mp4 to mp3
        audio = AudioSegment.from_file(mp4_path, format="mp4")
        audio.export(f'Audio Files/{filename}.mp3', format="mp3")
    except Exception as e:
        # Deliberate best-effort: report and carry on with the next song.
        print(f'Error downloading audio for rank {rank}: {e}')




In [5]:
# Step 6: For each Genius link, download the lyrics text and strip the bracketed section markers

import time
import requests
import re
from bs4 import BeautifulSoup

def get_lyrics(url):
    """Download a Genius lyrics page and return its cleaned lyrics text.

    Parameters
    ----------
    url : str or None
        Address of the lyrics page.

    Returns
    -------
    str or None
        The lyrics with bracketed section markers (e.g. "[Chorus]") replaced
        by a space, lines separated by newlines. None when no URL is given,
        the request does not return HTTP 200, or no lyrics container is found.
    """
    if not url:
        print('Failed to retrieve data: no lyrics URL given')
        return None

    # A timeout keeps one unresponsive page from stalling the whole run.
    response = requests.get(url, timeout=10)
    if response.status_code != 200:
        print(f'Failed to retrieve data: {response.status_code}')
        return None

    soup = BeautifulSoup(response.text, 'html.parser')

    # A page can hold several lyrics containers, and the hashed suffix of the
    # class name is not stable, so match on the class prefix and collect all.
    containers = soup.find_all('div', class_=re.compile(r'^Lyrics__Container'))
    if not containers:
        print(f'Failed to retrieve data: no lyrics container found at {url}')
        return None

    lyrics = '\n'.join(block.get_text(separator='\n') for block in containers)

    # Drop section markers such as "[Verse 1]".
    return re.sub(r'\[[^]]*\]', ' ', lyrics)



In [None]:
# Step 9: Train machine learning model
import tensorflow as tf
# Build a 3-D array of shape (1, n_rows, n_cols) from `df`.
# NOTE(review): in the visible notebook `df` is the chart table from step 1
# (Rank / Song / Artist columns), not a table of audio features - confirm which
# frame is intended here.
audio_features = df.to_numpy().reshape(-1, df.shape[0], df.shape[1])

# Stack n_rows copies of the word2vec matrix along a new leading axis.
# NOTE(review): `vectors` is only assigned inside process_song in the visible
# cells, so it is not defined at notebook scope; the indexing below requires a
# 2-D array - TODO confirm where it should come from.
vectors_expanded = np.repeat(vectors[np.newaxis, :, :], audio_features.shape[1], axis=0)

# Swap the first two axes: (1, n_rows, n_cols) -> (n_rows, 1, n_cols)
audio_features = np.transpose(audio_features, (1, 0, 2))

# Repeat along axis 1 so the second dimension matches vectors_expanded.shape[1]
audio_features = np.repeat(audio_features, vectors_expanded.shape[1], axis=1)

# Concatenate the word2vec vectors and audio features along the last axis
X = np.concatenate([vectors_expanded, audio_features], axis=2)

# Per-sample input shape for the LSTM: (X.shape[1], X.shape[2])
input_shape = (X.shape[1], X.shape[2])

# Create a sequential model
model = tf.keras.Sequential()

# Add an LSTM layer with 16 units
model.add(tf.keras.layers.LSTM(16, input_shape=input_shape))

# Add a dropout layer with a rate of 0.5
model.add(tf.keras.layers.Dropout(0.5))

# Add a dense output layer with sigmoid activation (single unit)
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

# Compile the model with binary cross-entropy loss and the Adam optimizer
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# NOTE(review): `labels` is not defined in any visible cell; it is assumed to be
# a numpy array with one label per row of X - TODO define it before this cell.

# Train for 10 epochs, holding out 20% of the samples for validation
history = model.fit(X, labels, epochs=10, validation_split=0.2)
