In [None]:
# import libraries, load env variables

In [68]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from annoy import AnnoyIndex
import os
from dotenv import load_dotenv
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
import musicbrainzngs
import requests

# Load environment variables
load_dotenv()

LASTFM_API_KEY = os.getenv('LASTFM_API_KEY')
SPOTIPY_CLIENT_ID = os.getenv('SPOTIPY_CLIENT_ID')
SPOTIPY_CLIENT_SECRET = os.getenv('SPOTIPY_CLIENT_SECRET')
MUSICBRAINZ_CLIENT_ID = os.getenv('MUSICBRAINZ_CLIENT_ID')
MUSICBRAINZ_CLIENT_SECRET = os.getenv('MUSICBRAINZ_CLIENT_SECRET')

# Initialize Spotipy with OAuth
sp = spotipy.Spotify(auth_manager=SpotifyClientCredentials(client_id=SPOTIPY_CLIENT_ID,
                                                           client_secret=SPOTIPY_CLIENT_SECRET))

# MusicBrainz API setup
musicbrainzngs.set_useragent("Minerva", "0.1", "daniel.marc.steinberg@gmail.com")


In [69]:
# def data collection funcs

In [70]:
def collect_lastfm_data(artist_name):
    url = f"http://ws.audioscrobbler.com/2.0/?method=artist.getinfo&artist={artist_name}&api_key={LASTFM_API_KEY}&format=json"
    response = requests.get(url)
    data = response.json()
    if 'artist' not in data:
        print(f"No data found for {artist_name} on Last.fm.")
        return None
    artist_data = {
        'name': data['artist']['name'],
        'listeners': int(data['artist']['stats']['listeners']),
        'playcount': int(data['artist']['stats']['playcount']),
        'bio': data['artist']['bio']['summary']
    }
    return artist_data

def collect_musicbrainz_data(artist_name):
    result = musicbrainzngs.search_artists(artist=artist_name)
    if not result['artist-list']:
        print(f"No data found for {artist_name} on MusicBrainz.")
        return []
    artist_id = result['artist-list'][0]['id']
    
    # Get artist's releases
    releases = musicbrainzngs.browse_releases(artist=artist_id, includes=["recordings"], limit=5)
    release_data = []
    for release in releases['release-list']:
        release_data.append({
            'title': release['title'],
            'date': release['date'],
            'country': release.get('country', 'Unknown')
        })
    return release_data

def collect_spotify_data(artist_name):
    results = sp.search(q='artist:' + artist_name, type='artist')
    if not results['artists']['items']:
        print(f"No data found for {artist_name} on Spotify.")
        return []
    artist = results['artists']['items'][0]
    artist_id = artist['id']
    
    # Get artist's albums
    albums = sp.artist_albums(artist_id, album_type='album')
    album_data = []
    for album in albums['items']:
        album_id = album['id']
        tracks = sp.album_tracks(album_id)['items']
        track_ids = [track['id'] for track in tracks]
        
        # Get audio features for all tracks in the album
        if track_ids:
            album_features = sp.audio_features(track_ids)
            danceability = sum(track['danceability'] for track in album_features if track and 'danceability' in track) / len(album_features)
            energy = sum(track['energy'] for track in album_features if track and 'energy' in track) / len(album_features)
            valence = sum(track['valence'] for track in album_features if track and 'valence' in track) / len(album_features)
            tempo = sum(track['tempo'] for track in album_features if track and 'tempo' in track) / len(album_features)
            acousticness = sum(track['acousticness'] for track in album_features if track and 'acousticness' in track) / len(album_features)
            instrumentalness = sum(track['instrumentalness'] for track in album_features if track and 'instrumentalness' in track) / len(album_features)
            liveness = sum(track['liveness'] for track in album_features if track and 'liveness' in track) / len(album_features)
            speechiness = sum(track['speechiness'] for track in album_features if track and 'speechiness' in track) / len(album_features)
        else:
            danceability = 0
            energy = 0
            valence = 0
            tempo = 0
            acousticness = 0
            instrumentalness = 0
            liveness = 0
            speechiness = 0
        
        # Collect popularity of each track and compute the average
        track_popularities = [track['popularity'] for track in tracks if 'popularity' in track]
        if track_popularities:
            popularity = sum(track_popularities) / len(track_popularities)
        else:
            popularity = 0
        
        album_data.append({
            'album_name': album['name'],
            'release_date': album['release_date'],
            'total_tracks': album['total_tracks'],
            'spotify_url': album['external_urls']['spotify'],
            'danceability': danceability,
            'energy': energy,
            'valence': valence,
            'tempo': tempo,
            'acousticness': acousticness,
            'instrumentalness': instrumentalness,
            'liveness': liveness,
            'speechiness': speechiness,
            'popularity': popularity,
            'genre': ", ".join(artist.get('genres', []))
        })
    return album_data


In [76]:
def collect_data(artist_name):
    print(f"Collecting data for {artist_name}...")
    
    # Create necessary directories
    os.makedirs('data/raw', exist_ok=True)
    
    lastfm_data = collect_lastfm_data(artist_name)
    musicbrainz_data = collect_musicbrainz_data(artist_name)
    spotify_data = collect_spotify_data(artist_name)
    
    # Save data to CSV files only if data is not None or empty
    if lastfm_data:
        lastfm_df = pd.DataFrame([lastfm_data])
        lastfm_df.to_csv('data/raw/lastfm_data.csv', index=False)
    
    if musicbrainz_data:
        musicbrainz_df = pd.DataFrame(musicbrainz_data)
        musicbrainz_df.to_csv('data/raw/musicbrainz_data.csv', index=False)
    
    if spotify_data:
        spotify_df = pd.DataFrame(spotify_data)
        spotify_df.to_csv('data/raw/spotify_data.csv', index=False)
        
# Collect data for an example artist
collect_data("Elis Regina")

Collecting data for Elis Regina...


In [77]:
def process_data():
    print("Processing collected data...")
    
    # Create necessary directories
    os.makedirs('data/processed', exist_ok=True)
    
    # Load raw data
    lastfm_df = pd.read_csv('data/raw/lastfm_data.csv')
    musicbrainz_df = pd.read_csv('data/raw/musicbrainz_data.csv')
    spotify_df = pd.read_csv('data/raw/spotify_data.csv')
    
    # Normalize artist names
    lastfm_df['name'] = lastfm_df['name'].str.lower().str.strip()
    musicbrainz_df['name'] = musicbrainz_df['title'].str.lower().str.strip()
    spotify_df['name'] = spotify_df['album_name'].str.lower().str.strip()
    
    # Merge datasets
    combined_df = pd.concat([lastfm_df, musicbrainz_df, spotify_df], axis=1)
    
    # Handle missing values
    combined_df.fillna(0, inplace=True)
    
    # Ensure columns exist
    feature_columns = ['listeners', 'playcount', 'danceability', 'energy', 'valence', 'popularity', 'tempo', 'acousticness', 'instrumentalness', 'liveness', 'speechiness']
    for col in feature_columns:
        if col not in combined_df:
            combined_df[col] = 0
    
    # Extract and scale features
    scaler = StandardScaler()
    combined_df[feature_columns] = scaler.fit_transform(combined_df[feature_columns])
    
    # Save processed data
    combined_df.to_csv('data/processed/processed_data.csv', index=False)
    
    # Define the number of features globally
    global num_features
    num_features = len(feature_columns)

# Process the collected data
process_data()



Processing collected data...


In [80]:
def build_annoy_index():
    print("Building Annoy index...")

    # Ensure the models directory exists
    os.makedirs('models', exist_ok=True)

    # Load processed data
    combined_df = pd.read_csv('data/processed/processed_data.csv')

    # Extract features for Annoy
    feature_columns = ['listeners', 'playcount', 'danceability', 'energy', 'valence', 'popularity', 'tempo', 'acousticness', 'instrumentalness', 'liveness', 'speechiness']
    features = combined_df[feature_columns].fillna(0).values

    # Initialize Annoy index
    index = AnnoyIndex(num_features, 'angular')

    # Add items to the index
    for i, feature_vector in enumerate(features):
        index.add_item(i, feature_vector)

    # Build the index
    index.build(10)  # Number of trees
    index.save('models/annoy_index.ann')

# Build the Annoy index
build_annoy_index()



Building Annoy index...


In [79]:
def generate_annoy_recommendations(item_id, num_recommendations=10):
    # Load the Annoy index
    index = AnnoyIndex(num_features, 'angular')
    index.load('models/annoy_index.ann')
    
    # Get nearest neighbors
    nearest_neighbors = index.get_nns_by_item(item_id, num_recommendations + 1)
    
    # Load processed data
    combined_df = pd.read_csv('data/processed/processed_data.csv')
    
    # Get recommendations
    recommendations = combined_df.iloc[nearest_neighbors]
    
    return recommendations

def get_recommendations(artist_name, num_recommendations=10):
    # Load processed data
    combined_df = pd.read_csv('data/processed/processed_data.csv')

    # Find the index of the artist
    artist_index = combined_df.index[combined_df['name'] == artist_name.lower().strip()].tolist()
    if not artist_index:
        print(f"Artist {artist_name} not found in the dataset.")
        return

    # Generate recommendations
    recommendations = generate_annoy_recommendations(artist_index[0], num_recommendations)
    print("Recommendations:\n", recommendations)

# Example usage
get_recommendations("Elis Regina", 5)


Recommendations:
            name  listeners  playcount  \
0   elis regina   4.358899   4.358899   
2             0  -0.229416  -0.229416   
19            0  -0.229416  -0.229416   
4             0  -0.229416  -0.229416   
15            0  -0.229416  -0.229416   
11            0  -0.229416  -0.229416   

                                                  bio                  title  \
0   Elis Regina Carvalho Costa (March 17, 1945, Po...     Viva a Brotolândia   
2                                                   0           Ellis Regina   
19                                                  0                      0   
4                                                   0  Samba, eu canto assim   
15                                                  0                      0   
11                                                  0                      0   

      date country                 name.1                         album_name  \
0   1961.0      BR     viva a brotolândia            

In [57]:
'''
heres the raw output of our first test. undefined. 
'''

'\n\nheres the raw output of our first test. undefined. \n\n'

In [58]:
collect_data("Elis Regina")

Collecting data for Elis Regina...


In [45]:
process_data()

Processing collected data...


In [46]:
build_annoy_index()

Building Annoy index...


In [50]:
get_recommendations("Elis Regina", 5)

Recommendations:
            name  listeners  playcount  \
0   elis regina   4.358899   4.358899   
2             0  -0.229416  -0.229416   
19            0  -0.229416  -0.229416   
4             0  -0.229416  -0.229416   
15            0  -0.229416  -0.229416   
11            0  -0.229416  -0.229416   

                                                  bio                  title  \
0   Elis Regina Carvalho Costa (March 17, 1945, Po...     Viva a Brotolândia   
2                                                   0           Ellis Regina   
19                                                  0                      0   
4                                                   0  Samba, eu canto assim   
15                                                  0                      0   
11                                                  0                      0   

      date country                 name.1                         album_name  \
0   1961.0      BR     viva a brotolândia            