In [13]:
import os
import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm.auto import tqdm

In [22]:
def _extract_list_items(soup, list_id):
    """Return the non-empty ``<li>`` texts of the ``influences-list`` with id ``list_id``.

    Returns an empty list when the page has no such ``<ul>``.
    """
    container = soup.find('ul', attrs={'class': 'influences-list', 'id': list_id})
    if not container:
        return []
    # The last <li> is deliberately dropped, as the original scraper did --
    # presumably a trailing non-artist entry; TODO confirm against the page markup.
    return [li.text for li in container.find_all('li')[:-1] if li.text != ""]


def _save_checkpoint(checkpoint_file, artist_relations, searched_artist_names, next_index):
    """Persist scraping progress; ``next_index`` is the first position not yet processed."""
    checkpoint = {
        'artist_relations': artist_relations,
        'searched_artist_names': searched_artist_names,
        'index': next_index,
    }
    # Write to a sibling file and swap it in, so an interrupted write cannot
    # leave a truncated checkpoint behind.
    tmp_file = checkpoint_file + '.tmp'
    pd.to_pickle(checkpoint, tmp_file)
    os.replace(tmp_file, checkpoint_file)


def search(artist_list, checkpoint_interval=1000, timeout=30):
    """Scrape followers and influencers for each artist from inflooenz.com.

    Progress is checkpointed to ``22-3-checkpoints/checkpoint.pkl`` (relative to
    the current working directory). If that file exists, the run resumes from
    the position stored in it instead of starting over.

    Args:
        artist_list: Sequence of artist names to look up, in order.
        checkpoint_interval: Save a checkpoint after every this many processed
            artists. Values <= 0 disable the periodic saves (a final checkpoint
            is still written).
        timeout: Seconds to wait for each HTTP request before treating it as
            failed.

    Returns:
        A tuple ``(artist_relations, searched_artist_names)``:
        ``artist_relations`` maps each input name to
        ``{'followers': [...], 'influencers': [...]}`` and
        ``searched_artist_names`` maps each input name to the artist name shown
        on the result page (or the input name when the page shows none).
        Artists whose request failed are reported on stdout and are absent from
        both mappings.
    """
    artist_relations = {}
    searched_artist_names = {}
    checkpoint_dir = '22-3-checkpoints'
    os.makedirs(checkpoint_dir, exist_ok=True)
    checkpoint_file = os.path.join(checkpoint_dir, 'checkpoint.pkl')
    start_index = 0

    if os.path.exists(checkpoint_file):
        # NOTE: unpickling can execute arbitrary code -- only resume from
        # checkpoint files that this notebook wrote itself.
        checkpoint = pd.read_pickle(checkpoint_file)
        artist_relations = checkpoint['artist_relations']
        searched_artist_names = checkpoint['searched_artist_names']
        start_index = checkpoint['index']
        print(f"Resuming from checkpoint: {start_index}")

    remaining = artist_list[start_index:]
    if not remaining:
        # Empty input, or the checkpoint already covers the whole list.
        return artist_relations, searched_artist_names

    progress = tqdm(
        enumerate(remaining, start=start_index),
        total=len(remaining),
        position=0,
        desc="Processing artists",
    )
    for index, a in progress:
        try:
            # Let requests build the query string so that characters such as
            # '&', '#' or '+' in an artist name are percent-encoded correctly.
            page = requests.get(
                'https://inflooenz.com/',
                params={'artist': a, 'submit': 'Search'},
                timeout=timeout,
            )
            page.raise_for_status()
        except requests.exceptions.RequestException as e:
            print(f"Error: {a} not found ({e})")
        else:
            soup = BeautifulSoup(page.content, 'html.parser')

            # Extract the artist name as displayed by the site, if present.
            searched_artist_name = None
            heading = soup.find('h1')
            if heading:
                name_tag = heading.find('span', {'class': 'artist', 'itemprop': 'name'})
                if name_tag:
                    searched_artist_name = name_tag.text

            artist_relations[a] = {
                'followers': _extract_list_items(soup, 'followers-list'),
                'influencers': _extract_list_items(soup, 'influencers-list'),
            }
            # Fall back to the queried name when the page shows none.
            searched_artist_names[a] = searched_artist_name or a

        # Checkpoint even after a failed request, and record the NEXT position
        # so that a resumed run does not fetch this artist a second time.
        processed = index + 1
        if checkpoint_interval > 0 and processed % checkpoint_interval == 0:
            _save_checkpoint(checkpoint_file, artist_relations, searched_artist_names, processed)

    _save_checkpoint(checkpoint_file, artist_relations, searched_artist_names, len(artist_list))

    return artist_relations, searched_artist_names

In [23]:
# Load the filtered artist table and collect the names to look up.
artists_df = pd.read_parquet('filtered_artists_with_infos_with_id.parquet')
artist_list = artists_df['Spotify Name'].tolist()

# Scrape the relations for every artist, checkpointing every 10 artists.
artist_relations, searched_artist_names = search(artist_list, checkpoint_interval=10)

# Attach the scraped data as new columns. Artists missing from the results
# get empty lists and keep their own name as the searched name.
spotify_names = artists_df['Spotify Name']
artists_df['followers'] = spotify_names.apply(
    lambda name: artist_relations.get(name, {'followers': []})['followers']
)
artists_df['influencers'] = spotify_names.apply(
    lambda name: artist_relations.get(name, {'influencers': []})['influencers']
)
artists_df['searched_artist_name'] = spotify_names.apply(
    lambda name: searched_artist_names.get(name, name)
)

# Persist the enriched table.
artists_df.to_parquet('artists_with_relations.parquet')


Processing artists:   2%|▏         | 198/10116 [05:49<19:47:25,  7.18s/it]

Error: Jessie Hill not found


Processing artists:  40%|████      | 4083/10116 [2:06:53<12:23:52,  7.40s/it]

Error: Terry Riley not found


Processing artists: 100%|█████████▉| 10105/10116 [5:34:26<01:20,  7.32s/it]  

Error: Alan Jackson not found


Processing artists: 100%|██████████| 10116/10116 [5:34:45<00:00,  1.99s/it]


In [24]:
# Display the artist table, which now also carries the followers,
# influencers and searched_artist_name columns.
artists_df

Artist,Spotify Name,Genres,Popularity,Spotify ID,followers,influencers,searched_artist_name
Loading... (need help?),,,,,,,
