In [1]:
from hdfs import InsecureClient
import hdfs
import json

# Notebook-level HDFS client, addressed at http://localhost:9870 as user 'hdfs'.
# The one-argument read_json_from_hdfs()/write_json_to_hdfs() helpers defined
# further down read this global instead of taking a client parameter.
client = InsecureClient('http://localhost:9870', user='hdfs')

In [2]:
import librosa
import numpy as np
def get_audio_vector(file_path):
    """Build one combined feature vector for an audio file.

    The file is loaded with ``librosa.load`` using ``sr=None``. Four librosa
    features are computed from the signal, each one is averaged with
    ``np.mean(..., axis=0)`` over its transpose, and the averages are
    concatenated with ``np.hstack``.

    Args:
        file_path: Path of the audio file to load.

    Returns:
        The array produced by ``np.hstack``, holding the averaged features in
        the order: mel spectrogram, chroma, spectral contrast, tonnetz.
    """
    signal, sample_rate = librosa.load(file_path, sr=None)

    # Order here defines the layout of the returned vector.
    extractors = (
        librosa.feature.melspectrogram,
        librosa.feature.chroma_stft,
        librosa.feature.spectral_contrast,
        librosa.feature.tonnetz,
    )
    averaged = [
        np.mean(extract(y=signal, sr=sample_rate).T, axis=0)
        for extract in extractors
    ]
    return np.hstack(averaged)

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Every distinct lyrics text seen so far; the TF-IDF model is refit on this list.
corpus = []


def get_lyrics_vector(lyrics_file_path):
    """Return the TF-IDF vector of one lyrics file.

    The file is read as UTF-8, its text is added to the module-level
    ``corpus`` (once), a fresh ``TfidfVectorizer`` is fitted on the whole
    corpus, and the text is then transformed with that vectorizer.

    NOTE(review): because the vectorizer is refit on a growing corpus, the
    vocabulary (and therefore the vector length and the IDF weights) changes
    from call to call. Vectors returned by different calls are not guaranteed
    to share the same dimensions; vectorising all lyrics in one batch would be
    needed to make them comparable.

    Args:
        lyrics_file_path: Path of a UTF-8 encoded text file with the lyrics.

    Returns:
        The dense TF-IDF row for this lyrics text (first row of
        ``transform([lyrics]).toarray()``).
    """
    with open(lyrics_file_path, 'r', encoding='utf-8') as f:
        lyrics = f.read()

    # Processing the same lyrics again (e.g. re-running a cell or re-ingesting
    # a folder) must not add a duplicate document: duplicates inflate document
    # frequencies and distort the IDF weights of every later vector.
    if lyrics not in corpus:
        corpus.append(lyrics)

    vectorizer = TfidfVectorizer()
    vectorizer.fit(corpus)
    lyrics_vector = vectorizer.transform([lyrics]).toarray()[0]
    return lyrics_vector

In [4]:
# Accumulated results, keyed by track name; process_new_audio_lyrics() adds to them.
lyrics_vectors = {}
audio_vectors = {}


def _track_name(path, extensions):
    """Return the file name of ``path`` without directory and known extension."""
    # Treat both "\\" and "/" as separators so the key never keeps a directory
    # prefix, whichever path style the caller used.
    file_name = path.replace("\\", "/").rsplit("/", 1)[-1]
    lowered = file_name.lower()
    for extension in extensions:
        # Only a trailing, known extension is removed; the rest of the name is kept.
        if lowered.endswith(extension):
            return file_name[:-len(extension)]
    return file_name


def process_new_audio_lyrics(audio_path, lyrics_path):
    """Vectorise one song and its lyrics and record both under the track name.

    The audio vector comes from ``get_audio_vector`` and the lyrics vector
    from ``get_lyrics_vector``. Each is stored in the module-level
    ``audio_vectors`` / ``lyrics_vectors`` dictionary under the file name with
    its directory and extension removed (``.mp3``/``.wav`` for audio,
    ``.txt`` for lyrics).

    Args:
        audio_path: Path of the audio file.
        lyrics_path: Path of the lyrics text file.

    Returns:
        The tuple ``(audio_vectors, lyrics_vectors)``: the two module-level
        dictionaries, which also contain entries from earlier calls.
    """
    audio_vector = get_audio_vector(audio_path)
    lyrics_vector = get_lyrics_vector(lyrics_path)
    audio_name = _track_name(audio_path, (".mp3", ".wav"))
    lyrics_name = _track_name(lyrics_path, (".txt",))
    lyrics_vectors[lyrics_name] = lyrics_vector
    audio_vectors[audio_name] = audio_vector
    return audio_vectors, lyrics_vectors

In [5]:
# Vectorise a single song together with its lyrics file.
# NOTE(review): both arguments are absolute paths on a local D: drive, so this
# cell only runs on a machine that has exactly this folder layout.
audio_vectors, lyrics_vectors = process_new_audio_lyrics(r"D:\FSD_MINI_PROJECT\queen\Bohemian Rhapsody.mp3", r"D:\FSD_MINI_PROJECT\queen\lyrics\Bohemian Rhapsody.txt")

In [6]:
# Show both result dictionaries as the cell output (the arrays are printed as-is).
audio_vectors, lyrics_vectors

({'Bohemian Rhapsody': array([ 6.09590113e-03,  1.03906039e-02,  3.04836053e-02,  1.19622551e-01,
          2.17081904e-01,  3.19116950e-01,  1.82764828e+00,  3.08535290e+00,
          4.36979580e+00,  4.70543671e+00,  4.22294331e+00,  8.90342236e+00,
          8.12477589e+00,  3.20686007e+00,  5.70249939e+00,  2.39161015e+00,
          1.20780551e+00,  1.80049551e+00,  3.44644022e+00,  3.50871110e+00,
          1.69209445e+00,  4.97614288e+00,  3.17243671e+00,  6.29165053e-01,
          2.39625001e+00,  7.25278795e-01,  7.40950704e-01,  2.04781628e+00,
          2.79257941e+00,  3.64835596e+00,  4.59038734e-01,  4.53052163e-01,
          2.20671225e+00,  1.89988875e+00,  4.95699137e-01,  1.17468953e+00,
          3.57594919e+00,  1.22280836e+00,  5.21957517e-01,  1.32475853e-01,
          1.69203684e-01,  8.90788078e-01,  9.64829743e-01,  3.30497205e-01,
          3.05466652e-01,  1.13907003e+00,  4.51401383e-01,  1.42448768e-01,
          5.43589234e-01,  5.74684381e-01,  4.01491046e

In [7]:
def convert_ndarray_to_list(data):
    """Recursively replace NumPy values in ``data`` with plain Python ones.

    Dictionaries and lists are rebuilt with their values converted, NumPy
    arrays become (nested) lists via ``tolist()``, and NumPy scalars become
    the matching Python scalar via ``item()``. Anything else is returned
    unchanged. The result is meant to be passed to ``json.dump``.

    Args:
        data: A dictionary, list, NumPy array, NumPy scalar, or any other value.

    Returns:
        A new structure of the same layout without NumPy arrays or scalars,
        or ``data`` itself when no conversion applies.
    """
    if isinstance(data, dict):
        return {key: convert_ndarray_to_list(value) for key, value in data.items()}
    if isinstance(data, list):
        # Arrays nested inside lists would otherwise reach json.dump unconverted.
        return [convert_ndarray_to_list(item) for item in data]
    if isinstance(data, np.ndarray):
        return data.tolist()
    if isinstance(data, np.generic):
        # NumPy scalars such as np.float32 are not JSON serialisable as-is.
        return data.item()
    return data


In [8]:
# Turn the ndarray values into plain lists so the dictionaries can be JSON-encoded.
# save_to_json() applies the same conversion again before writing.
audio_vectors = convert_ndarray_to_list(audio_vectors)
lyrics_vectors = convert_ndarray_to_list(lyrics_vectors)

In [9]:
def save_to_json(data, file_name):
    """Serialise ``data`` as JSON into the local file ``<file_name>.json``.

    The data is first passed through ``convert_ndarray_to_list`` so that NumPy
    arrays are written as lists. An existing file is overwritten.

    Args:
        data: The structure to store.
        file_name: Target path without the ``.json`` suffix.
    """
    target_path = f'{file_name}.json'
    serialisable = convert_ndarray_to_list(data)
    with open(target_path, 'w') as handle:
        json.dump(serialisable, handle)

In [10]:
# Write the new vectors to new_audio_vectors.json and new_lyrics_vectors.json
# (save_to_json appends the ".json" suffix).
save_to_json(audio_vectors, 'new_audio_vectors')
save_to_json(lyrics_vectors, 'new_lyrics_vectors')

In [11]:
def read_json_from_hdfs(hdfs_path):
    """Read a JSON document from HDFS and return the parsed value.

    The file is opened through the notebook-level ``client`` with UTF-8
    decoding.

    Args:
        hdfs_path: Path of the JSON file in HDFS.

    Returns:
        The value produced by ``json.load`` for the file's content.
    """
    with client.read(hdfs_path, encoding='utf-8') as reader:
        parsed = json.load(reader)
    return parsed

In [12]:
# HDFS paths of the JSON files holding the stored audio and lyrics vectors.
audio_vectors_path = '/bdt_vectors/audio_vectors.json'
lyrics_vectors_path = '/bdt_vectors/lyrics_vectors.json'

In [13]:
# Fetch the previously stored vectors from HDFS.
# NOTE: this rebinds audio_vectors / lyrics_vectors, so the freshly computed
# results are no longer reachable under these names; they were saved to the
# new_*.json files beforehand and are re-loaded from there.
audio_vectors = read_json_from_hdfs(audio_vectors_path)
lyrics_vectors = read_json_from_hdfs(lyrics_vectors_path)

In [14]:
# Load the newly computed vectors back from the local files written by save_to_json().
with open('new_audio_vectors.json','r') as f:
    data_audio = json.load(f)
with open('new_lyrics_vectors.json') as f:
    data_lyrics = json.load(f)

In [15]:
# Merge the new entries into the data loaded from HDFS; update() replaces the
# stored value when a key is already present (assumes both are JSON objects).
audio_vectors.update(data_audio)
lyrics_vectors.update(data_lyrics)

In [16]:
# Persist the merged dictionaries locally; write_json_to_hdfs() is later given
# these file names as the upload source.
with open('audio_vectors.json', 'w') as f:
    json.dump(audio_vectors, f)
with open('lyrics_vectors.json', 'w') as f:
    json.dump(lyrics_vectors, f)

In [17]:
def write_json_to_hdfs(path, combined):
    """Upload a local JSON file to HDFS, replacing any file already at ``path``.

    The local file is parsed and re-serialised before the upload, so a file
    that is not valid JSON raises here instead of overwriting the HDFS copy.
    The upload goes through the notebook-level ``client``.

    Args:
        path: Destination path in HDFS.
        combined: Path of the local JSON file to upload.

    Raises:
        json.JSONDecodeError: If the local file does not contain valid JSON.
    """
    with open(combined, 'r') as f:
        payload = json.load(f)

    serialized = json.dumps(payload)
    # The writer is handed text, so the encoding is stated explicitly instead
    # of relying on how the HTTP layer happens to treat str chunks.
    with client.write(path, overwrite=True, encoding='utf-8') as writer:
        writer.write(serialized)

In [18]:
# Upload the merged audio vectors from audio_vectors.json, overwriting the HDFS copy.
write_json_to_hdfs(audio_vectors_path, 'audio_vectors.json')

In [19]:
# Upload the merged lyrics vectors from lyrics_vectors.json, overwriting the HDFS copy.
write_json_to_hdfs(lyrics_vectors_path, 'lyrics_vectors.json')

In [42]:
def read_json_from_hdfs(client, hdfs_path):
    """Read a JSON document from HDFS through ``client`` and parse it.

    NOTE: this definition replaces the earlier one-argument
    ``read_json_from_hdfs(hdfs_path)``; once this cell has run, cells that
    call the one-argument form no longer work.

    Args:
        client: Object whose ``read(hdfs_path, encoding=...)`` returns a
            context manager yielding a text reader.
        hdfs_path: Path of the JSON file in HDFS.

    Returns:
        The parsed JSON value.
    """
    with client.read(hdfs_path, encoding='utf-8') as reader:
        text = reader.read()
    return json.loads(text)

In [43]:
def write_json_to_hdfs(client, path, combined):
    """Upload a local JSON file to HDFS, replacing any file already at ``path``.

    The local file is parsed and re-serialised before the upload, so a file
    that is not valid JSON raises here instead of overwriting the HDFS copy.

    NOTE: this definition replaces the earlier two-argument
    ``write_json_to_hdfs(path, combined)``; once this cell has run, cells that
    call the two-argument form no longer work.

    Args:
        client: HDFS client whose ``write(path, overwrite=..., encoding=...)``
            returns a context manager yielding a writer.
        path: Destination path in HDFS.
        combined: Path of the local JSON file to upload.

    Raises:
        json.JSONDecodeError: If the local file does not contain valid JSON.
    """
    with open(combined, 'r') as f:
        payload = json.load(f)

    serialized = json.dumps(payload)
    # The writer is handed text, so the encoding is stated explicitly instead
    # of relying on how the HTTP layer happens to treat str chunks.
    with client.write(path, overwrite=True, encoding='utf-8') as writer:
        writer.write(serialized)

In [46]:
import os
# Reset the module-level accumulators that process_new_audio_lyrics() fills.
audio_vectors = {}
lyrics_vectors = {}


def _pair_songs_with_lyrics(artist_path):
    """Return ``(song_path, lyrics_path)`` pairs matched by file name."""
    songs = []
    lyrics_by_title = {}
    # os.listdir() returns entries in arbitrary order; sorting keeps the
    # processing order stable between runs.
    for entry in sorted(os.listdir(artist_path)):
        entry_path = os.path.join(artist_path, entry)
        if entry.endswith(('.mp3', '.wav')):
            songs.append((os.path.splitext(entry)[0], entry_path))
        elif os.path.isdir(entry_path):
            # Only sub-directories are searched for lyrics; a stray non-audio
            # file in the artist folder is ignored instead of being listed.
            for file in sorted(os.listdir(entry_path)):
                if file.endswith(".txt"):
                    title = os.path.splitext(file)[0]
                    lyrics_by_title[title] = os.path.join(entry_path, file)

    pairs = []
    for title, song_path in songs:
        lyrics_path = lyrics_by_title.get(title)
        if lyrics_path is None:
            # Pairing by position would silently attach the wrong lyrics to
            # every following song, so unmatched songs are skipped instead.
            print(f"Skipping {song_path}: no lyrics file named {title}.txt")
            continue
        pairs.append((song_path, lyrics_path))
    return pairs


def ingest_data(artist_path):
    """Vectorise every song of an artist folder and merge the results into HDFS.

    ``artist_path`` is expected to contain ``.mp3``/``.wav`` files plus one or
    more sub-directories holding ``.txt`` lyrics. A song is paired with the
    lyrics file that has the same name without its extension; songs without
    such a file are reported and skipped.

    For each pair the new vectors are computed, written to
    ``new_audio_vectors.json`` / ``new_lyrics_vectors.json``, merged into the
    dictionaries currently stored in HDFS, saved locally as
    ``audio_vectors.json`` / ``lyrics_vectors.json`` and uploaded again, after
    which ``Written!`` is printed.

    Args:
        artist_path: Directory of the artist to ingest.
    """
    client = InsecureClient('http://localhost:9870', user='hdfs')
    audio_vectors_path = '/bdt_vectors/audio_vectors.json'
    lyrics_vectors_path = '/bdt_vectors/lyrics_vectors.json'

    for song_path, lyrics_path in _pair_songs_with_lyrics(artist_path):
        # process_new_audio_lyrics() returns the accumulated module-level
        # dictionaries, so these also hold the songs handled in earlier rounds.
        new_audio, new_lyrics = process_new_audio_lyrics(song_path, lyrics_path)
        new_audio = convert_ndarray_to_list(new_audio)
        new_lyrics = convert_ndarray_to_list(new_lyrics)
        save_to_json(new_audio, 'new_audio_vectors')
        save_to_json(new_lyrics, 'new_lyrics_vectors')

        stored_audio = read_json_from_hdfs(client, audio_vectors_path)
        stored_lyrics = read_json_from_hdfs(client, lyrics_vectors_path)
        # The converted dictionaries are merged directly; re-reading the
        # new_*.json files just written would yield the same data.
        stored_audio.update(new_audio)
        stored_lyrics.update(new_lyrics)

        # write_json_to_hdfs() uploads from a local file, so write it first.
        with open('audio_vectors.json', 'w') as f:
            json.dump(stored_audio, f)
        with open('lyrics_vectors.json', 'w') as f:
            json.dump(stored_lyrics, f)
        write_json_to_hdfs(client, audio_vectors_path, 'audio_vectors.json')
        write_json_to_hdfs(client, lyrics_vectors_path, 'lyrics_vectors.json')
        print("Written!")


        

In [47]:
# Raw string: "\F" and "\q" are not valid escape sequences, so the plain
# literal triggers an invalid-escape warning on current Python versions.
ingest_data(r"D:\FSD_MINI_PROJECT\queen")

Written!
Written!
Written!
Written!
Written!
