In [1]:
import os
import pathlib

import django
import dotenv
import pandas
from django.conf import settings

In [2]:
# Project root used to locate .env, the SQLite database and the media folder.
# Resolved to an absolute path once, so later path lookups keep pointing at
# the same directory even if the kernel's working directory changes.
BASE_DIR = pathlib.Path('.').resolve()

In [3]:
# Read the .env file that sits in BASE_DIR; the value returned by
# load_dotenv is displayed as the cell output.
env_file = BASE_DIR / '.env'
dotenv.load_dotenv(dotenv_path=env_file)

True

In [4]:
# Configure a minimal standalone Django settings object for this notebook.
# settings.configure() may only be called once per process, so re-running
# this cell is guarded with settings.configured rather than by swallowing
# every exception (which would also hide real mistakes such as a bad
# setting name).
if not settings.configured:
    settings.configure(
        DEBUG=True,
        BASE_DIR=BASE_DIR,
        INSTALLED_APPS=[
            'songs',
        ],
        DATABASES={
            'default': {
                'ENGINE': 'django.db.backends.sqlite3',
                'NAME': BASE_DIR / 'db.sqlite3',
            },
        },
        # Custom (non-Django) setting: folder holding the CSV files read below.
        MEDIA_PATH=BASE_DIR / 'media',
    )

In [5]:
# Allow synchronous Django ORM calls from this notebook: Django raises
# SynchronousOnlyOperation when it detects a running event loop unless this
# variable is set. setdefault keeps any value already present in the
# environment; the effective value is shown as the cell output.
os.environ.setdefault('DJANGO_ALLOW_ASYNC_UNSAFE', 'true')

'true'

In [6]:
# Load the configured settings and populate the app registry so the models
# can be imported. django.setup() can safely be called again when the cell is
# re-run, so no exception is swallowed here: a failure (e.g. the 'songs' app
# not being importable) should surface now instead of as a confusing error
# in a later cell.
django.setup()

## No Wikipedia

Clean up the Wikipedia URL column: artists whose value is missing, empty, or the literal string "nan" get it reset to NULL.

In [8]:
from songs.models import Song, Artist
from django.db.models import Q

In [11]:
# Artists whose Wikipedia URL is NULL, an empty string, or the literal
# text 'nan'.
missing_wiki = (
    Q(wikipedia_page__isnull=True)
    | Q(wikipedia_page='')
    | Q(wikipedia_page='nan')
)
qs_no_wiki = Artist.objects.filter(missing_wiki)

In [12]:
# Number of artists matched by the missing-Wikipedia filter.
qs_no_wiki.count()

61

In [13]:
# Materialise the matching artists as a DataFrame. The columns are named
# explicitly so the frame has the expected columns even when the queryset
# returns no rows.
no_wiki_df = pandas.DataFrame(
    list(qs_no_wiki.values('name', 'wikipedia_page')),
    columns=['name', 'wikipedia_page'],
)

In [14]:
# Preview the first rows of the artists without a usable Wikipedia URL.
no_wiki_df.head()

Unnamed: 0,name,wikipedia_page
0,1T1,
1,Alexander,
2,Bali Baby,
3,BazBaz,
4,Bhad Bhabie,


In [15]:
# Reset the placeholder Wikipedia values to NULL with a single UPDATE.
# This replaces a per-row get()/save() loop, which issued two queries per
# artist and raised if a name was duplicated or had disappeared.
# NOTE(review): QuerySet.update() does not call Model.save() or send
# pre/post-save signals — confirm Artist does not rely on either.
updated_count = qs_no_wiki.update(wikipedia_page=None)
updated_count

In [8]:
# Load the base artists CSV from the configured media folder.
base_artists_csv = settings.MEDIA_PATH / 'base_artists_v2.csv'
df = pandas.read_csv(base_artists_csv)

In [9]:
# Sort by artist name with a stable algorithm: rows sharing a name keep
# their original CSV order, so the row retained by the following
# drop_duplicates is well-defined rather than depending on the sort's
# internal ordering of ties.
df = df.sort_values('name', kind='mergesort')

In [10]:
# Keep only the first row for each artist name.
no_duplicates_df = df.drop_duplicates(subset='name', keep='first')

In [12]:
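# Left disabled: running this line overwrites the source CSV read above with the de-duplicated rows.
# no_duplicates_df.to_csv(settings.MEDIA_PATH / 'base_artists_v2.csv', index=False)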

## Completing Wikipedia pages

In [43]:
# Tuples of (name, is_group, date_of_birth, wikipedia_page) for artists
# whose Wikipedia URL is the literal text 'nan'.
qs_artists = (
    Artist.objects
    .filter(wikipedia_page='nan')
    .values_list('name', 'is_group', 'date_of_birth', 'wikipedia_page')
)

In [44]:
# Build a DataFrame from the queryset tuples, labelling the columns in the
# same order as the values_list() fields.
artist_columns = ['name', 'is_group', 'date_of_birth', 'wikipedia_page']
df_artists = pandas.DataFrame(list(qs_artists), columns=artist_columns)

In [45]:
# Per-column summary of the artists frame.
df_artists.describe()

Unnamed: 0,name,is_group,date_of_birth,wikipedia_page
count,382,382,180,382.0
unique,382,1,177,1.0
top,1T1,False,2003-03-26,
freq,1,382,2,382.0


In [52]:
# Export the first 150 artists, without the index column, to a CSV in the
# current working directory.
first_artists = df_artists.head(150)
first_artists.to_csv('artists_no_dob_wiki.csv', index=False)

## Completing Spotify metadata

In [16]:
import time
from blindtest.rapidapi.client import Spotify

In [None]:
# Distinct artists of disco songs that still have no Spotify ID.
disco_missing_spotify = Song.objects.filter(
    genre__icontains='disco',
    spotify_id__isnull=True,
)
qs = disco_missing_spotify.order_by('artist').values_list('artist', flat=True)

unique_artists = {artist for artist in qs}
len(unique_artists)

# Song.objects.filter(spotify_id__isnull=True).values_list('genre', flat=True)

3

In [44]:
def spotify_data(genre, delay=5):
    """Fill in Spotify metadata for songs of a genre that have no Spotify ID.

    Collects the distinct ``artist`` values of ``Song`` rows whose ``genre``
    contains ``genre`` (case-insensitive) and whose ``spotify_id`` is NULL,
    queries the ``Spotify`` client once per artist, and writes the resulting
    ``spotify_id`` and ``spotify_avatar`` to every ``Song`` of that artist.

    An artist whose lookup fails is reported and skipped; the loop then
    carries on with the next artist. Only ``Exception`` subclasses are
    handled, so interrupting the kernel (``KeyboardInterrupt``) stops the
    loop instead of being swallowed.

    Args:
        genre: Text matched against ``Song.genre`` with ``icontains``.
        delay: Seconds to sleep after each artist, whether or not the lookup
            succeeded. Defaults to 5.
            NOTE(review): presumably a rate-limit courtesy for the remote
            API — confirm the required pause.

    Returns:
        None. Progress is printed and the database is updated in place.
    """
    qs = Song.objects.filter(
        genre__icontains=genre,
        spotify_id__isnull=True,
    ).values_list('artist', flat=True)
    # The set removes duplicate artist names; ordering the query beforehand
    # would be discarded by the set, so none is requested.
    unique_artists = set(qs)

    print(f'Searching for {len(unique_artists)}')

    for name in unique_artists:
        instance = Spotify(name)

        try:
            instance.send()
        except Exception as exc:
            print(f'Failed to get {name}: {exc!r}')
            time.sleep(delay)
            continue

        print(f'Got: {name}')

        try:
            # The first item of the response carries the artist payload; the
            # Spotify ID is the last segment of its colon-separated URI.
            data = instance[0]['data']
            spotify_id = data['uri'].split(':')[-1]
        except Exception as exc:
            print(f'Failed to get {name}: {exc!r}')
            time.sleep(delay)
            continue

        try:
            avatar = data['visuals']['avatarImage']['sources'][0]['url']
        except Exception:
            # The avatar is optional: store NULL when the payload has none.
            avatar = None

        Song.objects.filter(artist=name).update(
            spotify_id=spotify_id,
            spotify_avatar=avatar,
        )

        time.sleep(delay)

In [55]:
# Complete the Spotify metadata for every disco song still missing it.
spotify_data(genre='disco')

Searching for 3
Got: Los Del Rio
Got: Daft Punk
Got: Village People


## Creating Windows

In [10]:
from django.db.models.functions.window import Rank
from django.db.models import Window, F, Count

In [None]:
# Queryset over every Song row.
# NOTE(review): this rebinds `qs`, and no later cell visible here reads it —
# confirm whether this cell is still needed.
qs = Song.objects.all()

In [185]:
# Number of songs per genre, most frequent genre first.
genre_count = (
    Song.objects
    .values('genre')
    .annotate(count=Count('genre'))
    .order_by('-count')
)

In [186]:
# Evaluate the grouped queryset into a list of dicts, load it into a
# DataFrame, and preview the first rows.
genre_rows = list(genre_count)
genre_df = pandas.DataFrame(genre_rows)
genre_df.head()

Unnamed: 0,genre,count
0,Pop rock,32
1,Rhythm and blues,5
2,Rap music,3
3,Electropop,2
4,Techno,1


In [191]:
# Rank genres by their song count (highest count gets rank 1) using a SQL
# window function, and return the rows from most to least frequent.
window = Window(expression=Rank(), order_by=F('count').desc())
window_genre = (
    Song.objects
    .values('genre')
    .annotate(count=Count('genre'))
    .annotate(rank=window)
    .order_by('-count')
)

In [192]:
# Load the ranked genre rows into a DataFrame.
ranked_genre_rows = list(window_genre)
df = pandas.DataFrame(ranked_genre_rows)

In [195]:
# Order the genres from best rank (1) downwards.
df = df.sort_values('rank')

In [196]:
# Display the genre table ordered by rank.
df

Unnamed: 0,genre,count,rank
0,Pop rock,32,1
1,Rhythm and blues,5,2
2,Rap music,3,3
3,Electropop,2,4
4,Techno,1,5
5,Soft rock,1,5
6,Ragga,1,5
7,Pop soul,1,5
8,Indie pop,1,5
9,Gothic rock,1,5
