In [1]:
import os
import pathlib

import django
import datetime
import dotenv
import pandas
from django.conf import settings

In [2]:
BASE_DIR = pathlib.Path('.')

In [3]:
dotenv.load_dotenv(BASE_DIR / '.env')

True

In [4]:
try:
    settings.configure(**{
        'DEBUG': True,
        'BASE_DIR': BASE_DIR,
        'INSTALLED_APPS': [
            'songs'
        ],
        'DATABASES': {
            'default': {
                'ENGINE': 'django.db.backends.sqlite3',
                'NAME': BASE_DIR / 'db.sqlite3',
            }
        },
        'MEDIA_PATH': BASE_DIR / 'media'
    })
except:
    pass

In [5]:
os.environ.setdefault('DJANGO_ALLOW_ASYNC_UNSAFE', 'true')

'true'

In [6]:
try:
    django.setup()
except:
    pass

In [10]:
import mwparserfromhell
import requests

In [8]:
from songs.models import Artist

In [9]:
artist = Artist.objects.get(id=123)

In [13]:
api_url = "https://fr.wikipedia.org/w/api.php"
params = {"action": "query", "prop": "revisions", "rvprop": "content", "format": "json", "titles": artist.name}
headers = {"User-Agent": "My BlindTest/1.0 (https://myblindtest.com; +https://myblindtest.com/api)"}
response = requests.get(api_url, params=params, headers=headers, timeout=15)

In [None]:
def try_extract_from_infobox(wikitext: str):
    """Parse wikitext with mwparserfromhell to extract birth_date or formation data."""
    try:
        mw = mwparserfromhell.parse(wikitext)
    except Exception:
        return None, None

    # Try to find template named 'Infobox' or 'Infobox musical artist' etc.
    for tpl in mw.filter_templates():
        name = tpl.name.strip().lower()
        print(name)
        if 'infobox' in name or 'biographie' in name or 'infobox chanteur' in name or 'artiste' in name:
            # Try birth_date / naissance / date_de_naissance
            for key in ['birth_date', 'date_de_naissance', 'naissance', 'date de naissance', 'née']:
                if tpl.has(key):
                    val = str(tpl.get(key).value).strip()
                    return val, None
                
            # formation/created/years_active
            for key in ['formation', 'fondation', 'fondée', 'years_active', 'années_actives', 'année_de_sortie']:
                if tpl.has(key):
                    val = str(tpl.get(key).value).strip()
                    return None, val

    # If we reach here, no clear infobox fields found
    return None, None

In [23]:
try_extract_from_infobox(response.content)

infobox biographie2\n
date de naissance
lien web
date de naissance
lien web
lien web
,
lien web
,
lien web
lien web
1er
lien web
lien web
lien web
date-
lien web
lien web
lien web
date-
lien web
date-
lien web
date-
lien web
!
lien web
date-
lien web
date-
lien web
date-
article
!
date-
date-
nobr
lien web
lien web
3e
citation
lien web
date-
9e
date-
date-
lien web
date-
lien web
lien web
r\u00e9f\u00e9rences
autres projets
liens
palette
portail
defaultsort:zenatti, julie


(None, None)

In [76]:
qs_artists = Artist.objects.filter(wikipedia_page='nan').values_list(
    'name', 'is_group', 'date_of_birth', 'wikipedia_page')

In [77]:
df_artists = pandas.DataFrame(list(qs_artists), columns=[
                              'name', 'is_group', 'date_of_birth', 'wikipedia_page'])

In [78]:
df_artists.describe()

Unnamed: 0,name,is_group,date_of_birth,wikipedia_page
count,62,62,24,62.0
unique,62,2,24,1.0
top,1T1,False,1997-06-15,
freq,1,56,1,62.0


In [79]:
df_artists.to_csv('artists_no_dob_wiki.csv', index=False)

In [70]:
ENRICHMENT_FILE = '/Users/johnpendenque/Downloads/enrich.csv'

In [58]:
df_enrichment = pandas.read_csv(ENRICHMENT_FILE)

In [59]:
df_enrichment.head()

Unnamed: 0,name,is_group,date_of_birth,wikipedia_page
0,1T1,False,,
1,Alexander,False,,
2,Bach,False,,
3,Bali Baby,False,1997-06-15,
4,BazBaz,False,,


In [60]:
from pandas import isna


for item in df_enrichment.itertuples():
    try:
        artist = Artist.objects.get(name=item.name)
    except:
        continue
    else:
        if isinstance(item.date_of_birth, str):
            try:
                d = datetime.datetime.strptime(item.date_of_birth, '%d-%m-%Y').date()
            except:
                d = datetime.datetime.strptime(item.date_of_birth, '%Y-%m-%d').date()

            artist.date_of_birth = d

        artist.wikipedia_page = item.wikipedia_page
        artist.is_group = item.is_group
        artist.save()