In [None]:
import os
import librosa
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def get_audio_vector(file_path):
    """Summarise one audio file as its time-averaged MFCC vector.

    Parameters
    ----------
    file_path : str
        Path of the audio file handed to ``librosa.load``.

    Returns
    -------
    numpy.ndarray
        The mean over axis 1 of the MFCC matrix computed with ``n_mfcc=13``.
    """
    # sr=None is forwarded to librosa.load instead of a fixed sampling rate.
    signal, sample_rate = librosa.load(file_path, sr=None)
    coefficients = librosa.feature.mfcc(y=signal, sr=sample_rate, n_mfcc=13)
    return coefficients.mean(axis=1)

def process_artist_folders(base_directory):
    """Collect an audio feature vector for every song under ``base_directory``.

    Each sub-directory of ``base_directory`` is treated as one artist folder.
    Every file inside it whose name ends in ``.mp3`` or ``.wav`` is passed to
    ``get_audio_vector``.

    Parameters
    ----------
    base_directory : str
        Folder whose immediate sub-folders hold the audio files.

    Returns
    -------
    dict
        Maps the bare file name (extension included, artist folder excluded)
        to its feature vector. A later file with the same name replaces an
        earlier one.
    """
    audio_suffixes = ('.mp3', '.wav')
    vectors_by_file = {}

    for entry in os.listdir(base_directory):
        folder = os.path.join(base_directory, entry)
        if not os.path.isdir(folder):
            # Plain files next to the artist folders are ignored.
            continue
        for name in os.listdir(folder):
            if not name.endswith(audio_suffixes):
                continue
            vectors_by_file[name] = get_audio_vector(os.path.join(folder, name))

    return vectors_by_file

def recommend_songs(target_song, audio_vectors):
    """Rank every other song by cosine similarity to ``target_song``.

    Parameters
    ----------
    target_song : str
        Key of the query song in ``audio_vectors``.
    audio_vectors : dict
        Maps song key to feature vector.

    Returns
    -------
    list of (str, float)
        ``(song, similarity)`` pairs for all songs except the target,
        ordered from most to least similar.

    Raises
    ------
    KeyError
        If ``target_song`` is not a key of ``audio_vectors``.
    """
    reference = audio_vectors[target_song]

    ranked = [
        (name, cosine_similarity([reference], [candidate])[0][0])
        for name, candidate in audio_vectors.items()
        if name != target_song
    ]
    # Highest similarity first.
    ranked.sort(key=lambda pair: pair[1], reverse=True)

    return ranked

# Root folder scanned by process_artist_folders; hard-coded, machine-specific absolute path.
base_directory = r'D:\FSD_MINI_PROJECT\artists'
# Extract one feature vector per audio file found under the artist folders.
audio_vectors = process_artist_folders(base_directory)

# Query song: must be a key of audio_vectors, otherwise recommend_songs raises KeyError.
target_song = 'Attention_trimmed.mp3'
recommendations = recommend_songs(target_song, audio_vectors)

# Print every other song together with its similarity score, in the order returned.
for song, similarity in recommendations:
    print(f"Recommended: {song} with similarity {similarity:.4f}")

In [None]:
import os
import librosa
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

def get_audio_vector(file_path):
    """Return the per-coefficient mean of a file's MFCCs.

    Parameters
    ----------
    file_path : str
        Audio file to load with ``librosa.load`` (called with ``sr=None``).

    Returns
    -------
    numpy.ndarray
        ``np.mean`` over axis 1 of the ``n_mfcc=13`` MFCC matrix.
    """
    waveform, rate = librosa.load(file_path, sr=None)
    mfcc_matrix = librosa.feature.mfcc(y=waveform, sr=rate, n_mfcc=13)
    averaged = np.mean(mfcc_matrix, axis=1)
    return averaged

def get_lyrics_vector(lyrics_file_path, vectorizer):
    """Vectorise the text of one lyrics file.

    Parameters
    ----------
    lyrics_file_path : str
        UTF-8 text file containing the lyrics.
    vectorizer : object
        Anything whose ``transform([text])`` result has a ``toarray()``
        method; the caller is responsible for fitting it beforehand.

    Returns
    -------
    The first row of ``vectorizer.transform([text]).toarray()``.
    """
    with open(lyrics_file_path, 'r', encoding='utf-8') as handle:
        text = handle.read()

    # One document in, so exactly one row comes back.
    document_matrix = vectorizer.transform([text])
    dense_rows = document_matrix.toarray()
    return dense_rows[0]

def process_artist_folders(base_directory, vectorizer):
    """Build audio and lyrics feature dictionaries for every artist folder.

    Each sub-directory of ``base_directory`` is treated as one artist.
    Audio files (``.mp3`` / ``.wav``) are read from the artist folder itself.
    Lyrics files (``.txt``) are read from the artist folder *and* from its
    ``lyrics`` sub-folder, which is where the TF-IDF corpus in this notebook
    is collected from. Looking only in the artist folder left
    ``lyrics_vectors`` empty for that layout.

    Parameters
    ----------
    base_directory : str
        Folder containing one sub-folder per artist.
    vectorizer : object
        Fitted vectorizer forwarded to ``get_lyrics_vector``.

    Returns
    -------
    tuple of (dict, dict)
        ``(audio_vectors, lyrics_vectors)``, both keyed by bare file name
        (extension included). If the same lyrics file name exists in both
        locations for one artist, the file in the artist folder wins.
    """
    audio_vectors = {}
    lyrics_vectors = {}

    for artist in os.listdir(base_directory):
        artist_folder = os.path.join(base_directory, artist)
        if not os.path.isdir(artist_folder):
            continue

        # Lyrics found directly in the artist folder take precedence below.
        direct_lyrics = set()
        for file in os.listdir(artist_folder):
            file_path = os.path.join(artist_folder, file)
            if file.endswith(('.mp3', '.wav')):
                audio_vectors[file] = get_audio_vector(file_path)
            elif file.endswith('.txt') and os.path.isfile(file_path):
                lyrics_vectors[file] = get_lyrics_vector(file_path, vectorizer)
                direct_lyrics.add(file)

        lyrics_subfolder = os.path.join(artist_folder, 'lyrics')
        if os.path.isdir(lyrics_subfolder):
            for file in os.listdir(lyrics_subfolder):
                if file.endswith('.txt') and file not in direct_lyrics:
                    lyrics_file_path = os.path.join(lyrics_subfolder, file)
                    lyrics_vectors[file] = get_lyrics_vector(lyrics_file_path, vectorizer)

    return audio_vectors, lyrics_vectors

def combine_vectors(audio_vector, lyrics_vector, alpha=0.5):
    """Blend an audio and a lyrics vector as a weighted average.

    Computes ``alpha * audio_vector + (1 - alpha) * lyrics_vector``.

    Parameters
    ----------
    audio_vector, lyrics_vector : array-like
        Numeric vectors with broadcast-compatible shapes. Plain lists are
        accepted and converted to float arrays.
    alpha : float, default 0.5
        Weight of the audio vector; the lyrics vector gets ``1 - alpha``.

    Returns
    -------
    numpy.ndarray
        The weighted average.

    Raises
    ------
    ValueError
        If the two shapes cannot be combined. The message names both shapes
        so the caller can see which side needs resizing.
    """
    audio = np.asarray(audio_vector, dtype=float)
    lyrics = np.asarray(lyrics_vector, dtype=float)
    try:
        return alpha * audio + (1 - alpha) * lyrics
    except ValueError as exc:
        # Replace NumPy's generic broadcast error with one naming both inputs.
        raise ValueError(
            "Cannot combine vectors with incompatible shapes: "
            f"audio {audio.shape} vs lyrics {lyrics.shape}"
        ) from exc

def recommend_songs(target_song, audio_vectors, lyrics_vectors, alpha=0.5):
    """Rank songs by cosine similarity of their combined audio+lyrics vectors.

    Songs are matched across the two dictionaries by file stem: audio keys
    end in ``.mp3`` or ``.wav`` and lyrics keys end in ``.txt``.

    Parameters
    ----------
    target_song : str
        Query song. Usually the lyrics file name (``'Song.txt'``); the audio
        file name or the bare stem are accepted as well.
    audio_vectors : dict
        Maps audio file name to audio feature vector.
    lyrics_vectors : dict
        Maps lyrics file name to lyrics feature vector.
    alpha : float, default 0.5
        Audio weight forwarded to ``combine_vectors``.

    Returns
    -------
    list of (str, float)
        ``(audio file name, similarity)`` for every other song that has both
        an audio and a lyrics vector, most similar first. The target song is
        never part of the result.

    Raises
    ------
    KeyError
        If the target song has no audio vector or no lyrics vector.
    """
    known_extensions = ('.txt', '.mp3', '.wav')

    # Only strip extensions this pipeline produces, so a title that merely
    # contains a dot is not truncated.
    target_stem, target_ext = os.path.splitext(target_song)
    if target_ext.lower() not in known_extensions:
        target_stem = target_song

    target_audio_key = next(
        (key for key in (f"{target_stem}.mp3", f"{target_stem}.wav") if key in audio_vectors),
        None,
    )
    if target_audio_key is None:
        raise KeyError(f"No audio vector found for target song: {target_song}")

    target_vector = combine_vectors(
        audio_vectors[target_audio_key],
        lyrics_vectors[f"{target_stem}.txt"],
        alpha,
    )

    similarities = {}
    for song, audio_vector in audio_vectors.items():
        stem = os.path.splitext(song)[0]
        # Compare stems: the audio key and the '.txt' target name never match
        # as whole strings, which used to let the target rank itself first.
        if stem == target_stem:
            continue
        lyrics_vector = lyrics_vectors.get(f"{stem}.txt")
        if lyrics_vector is None:
            continue
        combined_vector = combine_vectors(audio_vector, lyrics_vector, alpha)
        similarities[song] = cosine_similarity([target_vector], [combined_vector])[0][0]

    return sorted(similarities.items(), key=lambda item: item[1], reverse=True)

# Root folder holding one sub-folder per artist; hard-coded, machine-specific absolute path.
lyrics_folder = r'D:\FSD_MINI_PROJECT\artists'
# Raw lyrics texts used to fit the TF-IDF vocabulary.
corpus = []

# Read every .txt file found in each artist's 'lyrics' sub-folder.
for artist in os.listdir(lyrics_folder):
    artist_folder = os.path.join(lyrics_folder, artist)
    lyrics_subfolder = os.path.join(artist_folder, 'lyrics')
    
    if os.path.isdir(lyrics_subfolder):
        for file in os.listdir(lyrics_subfolder):
            if file.endswith('.txt'):
                file_path = os.path.join(lyrics_subfolder, file)
                with open(file_path, 'r', encoding='utf-8') as f:
                    corpus.append(f.read())

# Fit the vectorizer once on the whole corpus; it is only used via transform() afterwards.
vectorizer = TfidfVectorizer()
vectorizer.fit(corpus)

# Same root folder as lyrics_folder above.
base_directory = r'D:\FSD_MINI_PROJECT\artists'
audio_vectors, lyrics_vectors = process_artist_folders(base_directory, vectorizer)

# The query is given as the lyrics file name.
target_song = 'Attention.txt'
recommendations = recommend_songs(target_song, audio_vectors, lyrics_vectors, alpha=0.5)

# Print the ranked results in the order returned.
for song, similarity in recommendations:
    print(f"Recommended: {song} with similarity {similarity:.4f}")


In [None]:
import os
import librosa
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
import warnings
warnings.filterwarnings('ignore')

def get_audio_vector(file_path):
    """Describe one audio file by several time-averaged spectral features.

    Four librosa features are computed from the loaded signal: chroma STFT,
    spectral contrast, tonnetz and mel spectrogram. Each one is transposed
    and averaged over axis 0, and the results are concatenated.

    Parameters
    ----------
    file_path : str
        Audio file to load with ``librosa.load`` (called with ``sr=None``).

    Returns
    -------
    numpy.ndarray
        1-D concatenation in the order mel, chroma, contrast, tonnetz.
    """
    signal, sample_rate = librosa.load(file_path, sr=None)

    def time_average(extractor):
        # Transpose, then average over axis 0.
        return np.mean(extractor(y=signal, sr=sample_rate).T, axis=0)

    chroma_avg = time_average(librosa.feature.chroma_stft)
    contrast_avg = time_average(librosa.feature.spectral_contrast)
    tonnetz_avg = time_average(librosa.feature.tonnetz)
    mel_avg = time_average(librosa.feature.melspectrogram)

    return np.hstack([mel_avg, chroma_avg, contrast_avg, tonnetz_avg])

def get_lyrics_vector(lyrics_file_path, vectorizer):
    """Read a UTF-8 lyrics file and return its vector under ``vectorizer``.

    Parameters
    ----------
    lyrics_file_path : str
        Path of the lyrics text file.
    vectorizer : object
        Object exposing ``transform`` whose result exposes ``toarray``.

    Returns
    -------
    Row 0 of ``vectorizer.transform([text]).toarray()``.
    """
    with open(lyrics_file_path, 'r', encoding='utf-8') as source:
        text = source.read()
    # The single-document batch yields a single row.
    return vectorizer.transform([text]).toarray()[0]

def process_artist_folders(base_directory, vectorizer):
    """Collect audio and lyrics vectors for every artist under a root folder.

    For each entry of ``base_directory``: audio files (``.mp3`` / ``.wav``)
    are read from the entry itself when it is a directory, and lyrics files
    (``.txt``) are read from its ``lyrics`` sub-folder when that exists.

    Parameters
    ----------
    base_directory : str
        Folder containing one sub-folder per artist.
    vectorizer : object
        Fitted vectorizer forwarded to ``get_lyrics_vector``.

    Returns
    -------
    tuple of (dict, dict)
        ``(audio_vectors, lyrics_vectors)``, both keyed by file name without
        its extension, so the two dictionaries share keys for the same song.
    """
    audio_by_song = {}
    lyrics_by_song = {}

    for entry in os.listdir(base_directory):
        artist_dir = os.path.join(base_directory, entry)
        lyrics_dir = os.path.join(artist_dir, 'lyrics')

        if os.path.isdir(artist_dir):
            for name in os.listdir(artist_dir):
                if not name.endswith(('.mp3', '.wav')):
                    continue
                song = os.path.splitext(name)[0]
                audio_by_song[song] = get_audio_vector(os.path.join(artist_dir, name))

        if os.path.isdir(lyrics_dir):
            for name in os.listdir(lyrics_dir):
                if not name.endswith('.txt'):
                    continue
                song = os.path.splitext(name)[0]
                lyrics_by_song[song] = get_lyrics_vector(
                    os.path.join(lyrics_dir, name), vectorizer
                )

    return audio_by_song, lyrics_by_song

from sklearn.decomposition import TruncatedSVD

def reduce_vector_size(vector, target_size):
    """Shrink a 1-D vector to ``target_size`` entries by index folding.

    Entry ``i`` of the input is added to bucket ``i % target_size`` of the
    output. The projection is fixed and data-independent, so every vector is
    mapped the same way and repeated calls give the same result.

    The earlier approach fitted a ``TruncatedSVD`` on this single vector. A
    one-row matrix has rank 1, so that decomposition can only encode the
    vector's norm and discards everything that tells two lyrics apart. A
    learned projection has to be fitted on the whole corpus matrix instead,
    which this per-vector signature cannot do.

    Parameters
    ----------
    vector : array-like
        1-D numeric vector.
    target_size : int
        Desired maximum length; must be at least 1.

    Returns
    -------
    numpy.ndarray or the input object
        A float array of exactly ``target_size`` entries when the input is
        longer than ``target_size``; otherwise ``vector`` itself, unchanged.

    Raises
    ------
    ValueError
        If ``target_size`` is smaller than 1.
    """
    if target_size < 1:
        raise ValueError(f"target_size must be at least 1, got {target_size}")

    if len(vector) > target_size:
        values = np.asarray(vector, dtype=float)
        buckets = np.arange(values.shape[0]) % target_size
        # minlength guarantees the full output length even if a bucket stays empty.
        vector = np.bincount(buckets, weights=values, minlength=target_size)
    return vector

def combine_vectors(audio_vector, lyrics_vector, alpha):
    """Blend an audio vector with a size-reduced lyrics vector.

    The lyrics vector is first passed through ``reduce_vector_size`` with
    the audio vector's length as the target, then the two are mixed as
    ``alpha * audio + (1 - alpha) * lyrics``.

    Parameters
    ----------
    audio_vector : numpy.ndarray
        Audio feature vector; its length sets the reduction target.
    lyrics_vector : numpy.ndarray
        Lyrics feature vector.
    alpha : float
        Weight of the audio part; the lyrics part gets ``1 - alpha``.

    Returns
    -------
    numpy.ndarray
        The weighted sum of the two parts.
    """
    shrunk_lyrics = reduce_vector_size(lyrics_vector, len(audio_vector))
    audio_part = alpha * audio_vector
    lyrics_part = (1 - alpha) * shrunk_lyrics
    return audio_part + lyrics_part

def recommend_songs(target_song, audio_vectors, lyrics_vectors, alpha):
    """Rank songs by cosine similarity of their combined audio+lyrics vectors.

    Parameters
    ----------
    target_song : str
        Query song, given either as a dictionary key (bare song name) or as
        a file name whose extension is stripped to obtain the key.
    audio_vectors : dict
        Maps song name (no extension) to audio feature vector.
    lyrics_vectors : dict
        Maps song name (no extension) to lyrics feature vector.
    alpha : float
        Audio weight forwarded to ``combine_vectors``.

    Returns
    -------
    list of (str, float)
        ``(song name, similarity)`` for every other song present in both
        dictionaries, most similar first.

    Raises
    ------
    KeyError
        If the target song is missing from either dictionary.
    """
    # A name that is already a key is used verbatim. Running splitext on an
    # extension-less title containing a dot (e.g. 'Mr. Brightside') would cut
    # it at the last dot and report the song as missing.
    if target_song in audio_vectors or target_song in lyrics_vectors:
        base_name = target_song
    else:
        base_name = os.path.splitext(target_song)[0]

    target_audio_vector = audio_vectors.get(base_name)
    target_lyrics_vector = lyrics_vectors.get(base_name)

    if target_audio_vector is None or target_lyrics_vector is None:
        raise KeyError(f"Missing data for target song: {target_song}")

    target_vector = combine_vectors(target_audio_vector, target_lyrics_vector, alpha)

    similarities = {}

    for song, audio_vector in audio_vectors.items():
        if song == base_name:
            continue
        lyrics_vector = lyrics_vectors.get(song)
        if lyrics_vector is None:
            # Songs without lyrics cannot be combined and are skipped.
            continue
        combined_vector = combine_vectors(audio_vector, lyrics_vector, alpha)
        similarities[song] = cosine_similarity([target_vector], [combined_vector])[0][0]

    return sorted(similarities.items(), key=lambda item: item[1], reverse=True)
# Root folder holding one sub-folder per artist; hard-coded, machine-specific absolute path.
lyrics_folder = r'D:\FSD_MINI_PROJECT\artists'
# Raw lyrics texts used to fit the TF-IDF vocabulary.
corpus = []

# Read every .txt file found in each artist's 'lyrics' sub-folder.
for artist in os.listdir(lyrics_folder):
    artist_folder = os.path.join(lyrics_folder, artist)
    lyrics_subfolder = os.path.join(artist_folder, 'lyrics')
    
    if os.path.isdir(lyrics_subfolder):
        for file in os.listdir(lyrics_subfolder):
            if file.endswith('.txt'):
                file_path = os.path.join(lyrics_subfolder, file)
                with open(file_path, 'r', encoding='utf-8') as f:
                    corpus.append(f.read())

# Fit the vectorizer once on the whole corpus; it is only used via transform() afterwards.
vectorizer = TfidfVectorizer()
vectorizer.fit(corpus)


# Same root folder as lyrics_folder above.
base_directory = r'D:\FSD_MINI_PROJECT\artists'
audio_vectors, lyrics_vectors = process_artist_folders(base_directory, vectorizer)

# recommend_songs strips the extension from this name before looking it up.
target_song = 'Attention.txt'
recommendations = recommend_songs(target_song, audio_vectors, lyrics_vectors, alpha=0.5)

# Keys carry no extension, so '.mp3' is appended for display only; this label
# is printed even for songs that were loaded from a .wav file.
for song, similarity in recommendations:
    print(f"Recommended: {song}.mp3 with similarity {similarity:.4f}")


In [None]:
# Display the whole audio feature dictionary as the cell output (can be very long).
audio_vectors

In [None]:
# Display the whole lyrics feature dictionary as the cell output (can be very long).
lyrics_vectors

In [None]:
import json
import os
from hdfs import InsecureClient

# HDFS client pointed at http://localhost:9870 and acting as user 'adity'; both values are hard-coded.
# NOTE(review): the only visible use of `client` is a commented-out upload call — confirm it is still needed.
client = InsecureClient('http://localhost:9870', user='adity')

def convert_ndarray_to_list(data):
    """Recursively replace NumPy values by plain Python ones for JSON output.

    Parameters
    ----------
    data : object
        Arbitrarily nested dicts, lists and tuples whose leaves may be NumPy
        arrays or NumPy scalars.

    Returns
    -------
    object
        The same structure with every ``numpy.ndarray`` converted via
        ``tolist()`` and every NumPy scalar converted via ``item()``. Dicts,
        lists and tuples are rebuilt (tuples stay tuples); any other value is
        returned unchanged.
    """
    if isinstance(data, dict):
        return {key: convert_ndarray_to_list(value) for key, value in data.items()}
    if isinstance(data, np.ndarray):
        return data.tolist()
    if isinstance(data, np.generic):
        # NumPy scalars such as float32 / int64 are not JSON serialisable.
        return data.item()
    if isinstance(data, list):
        return [convert_ndarray_to_list(value) for value in data]
    if isinstance(data, tuple):
        return tuple(convert_ndarray_to_list(value) for value in data)
    return data

def upload_to_hdfs(data, hdfs_path, file_name):
    """Write ``data`` as JSON to ``<file_name>.json`` in the working directory.

    Despite the name, nothing is sent to HDFS: the upload step is commented
    out, so ``hdfs_path`` is currently unused.

    Parameters
    ----------
    data : object
        Structure passed through ``convert_ndarray_to_list`` before dumping.
    hdfs_path : str
        Intended HDFS target directory (unused while the upload is disabled).
    file_name : str
        Base name of the local JSON file, without the ``.json`` suffix.
    """
    local_json_path = f'{file_name}.json'
    json_ready = convert_ndarray_to_list(data)
    with open(local_json_path, 'w') as out_file:
        json.dump(json_ready, out_file)
    # Disabled upload step, kept for reference:
    # hdfs_file_path = os.path.join(hdfs_path, f'{file_name}.json')
    # client.upload(hdfs_file_path, local_json_path)
# Persist both feature dictionaries; these calls write audio_vectors.json and
# lyrics_vectors.json into the current working directory.
upload_to_hdfs(audio_vectors, '/song_features', 'audio_vectors')
upload_to_hdfs(lyrics_vectors, '/song_features', 'lyrics_vectors')


In [None]:
import json

def load_vectors_from_json(audio_json_path, lyrics_json_path):
    """Load the two feature dictionaries from their JSON files.

    Parameters
    ----------
    audio_json_path : str
        Path of the JSON file holding the audio vectors.
    lyrics_json_path : str
        Path of the JSON file holding the lyrics vectors.

    Returns
    -------
    tuple
        ``(audio_vectors, lyrics_vectors)`` exactly as decoded by
        ``json.load``; no conversion to NumPy arrays is done here.
    """
    decoded = []
    # Audio first, then lyrics, each file opened and closed in turn.
    for path in (audio_json_path, lyrics_json_path):
        with open(path, 'r') as handle:
            decoded.append(json.load(handle))

    audio_vectors, lyrics_vectors = decoded
    return audio_vectors, lyrics_vectors

def convert_list_to_ndarray(data):
    """Recursively turn the lists inside a (nested) dict into NumPy arrays.

    Parameters
    ----------
    data : object
        A dict (possibly nested), a list, or any other value.

    Returns
    -------
    object
        A list becomes ``np.array(list)``; a dict is rebuilt with every value
        converted the same way; anything else is returned unchanged.
    """
    if isinstance(data, list):
        return np.array(data)
    if not isinstance(data, dict):
        return data
    return {key: convert_list_to_ndarray(value) for key, value in data.items()}


In [None]:
# Reload the feature dictionaries written earlier and convert the JSON lists
# back into NumPy arrays.
audio_json_path = 'audio_vectors.json'
lyrics_json_path = 'lyrics_vectors.json'
with open(lyrics_json_path, 'r') as f:
    data_lyrics = json.load(f)
lyrics_vectors = convert_list_to_ndarray(data_lyrics)

with open(audio_json_path, 'r') as f:
    data_audio = json.load(f)
audio_vectors = convert_list_to_ndarray(data_audio)

# Query by bare song name (the dictionary keys carry no extension).
target_song = 'Blinding Lights'
# Audio weight of the blend; passed to the call below so that editing this
# value actually changes the result.
alpha = 0.5
try:
    recommendations = recommend_songs(target_song, audio_vectors, lyrics_vectors, alpha=alpha)
    print("Recommended Songs:")
    for song, score in recommendations:
        print(f"{song}: {score}")
except KeyError as e:
    # recommend_songs raises KeyError when the target song has no vectors.
    print(e)
