# Adding photos to our data set - Extracting from Wikidata

In [101]:
pip install beautifulsoup4

Note: you may need to restart the kernel to use updated packages.


In [26]:
import pandas as pd
# Load the 2020 track list; the relative path is resolved against the
# notebook's working directory.
df = pd.read_csv('2020_MUSIC.csv')

In [27]:
# Display the frame as loaded, before any image column is added.
df

Unnamed: 0,track_name,artist,album
0,Blinding Lights,The Weeknd,After Hours
1,Dance Monkey,Tones And I,Dance Monkey
2,The Box,Roddy Ricch,Please Excuse Me For Being Antisocial
3,Roses,SAINt JHN,Roses
4,Don't Start Now,Dua Lipa,Future Nostalgia
5,ROCKSTAR,DaBaby,BLAME IT ON BABY
6,Watermelon Sugar,Harry Styles,Fine Line
7,death bed,Powfu,death bed
8,Falling,Trevor Daniel,Nicotine
9,Someone You Loved,Lewis Capaldi,Divinely Uninspired To A Hellish Extent


Getting the Wikidata URL of each artist

In [28]:
import requests
def get_url(artist, timeout=10):
    """Return the Wikidata page URL of the first search hit for ``artist``.

    Sends a ``wbsearchentities`` query to the Wikidata API and returns the
    ``url`` field of the first result, exactly as the API reports it (the
    calling cell prepends ``'https:'`` to it).

    Args:
        artist: Name to search for; it is converted with ``str()``.
        timeout: Seconds to wait for the API before giving up, so that one
            stalled request cannot hang the whole notebook.

    Returns:
        The ``url`` value of the first search result.

    Raises:
        requests.HTTPError: The API answered with an error status.
        KeyError: The response has no ``'search'`` entry.
        IndexError: The search returned no results for ``artist``.
    """
    API_ENDPOINT = 'https://www.wikidata.org/w/api.php'
    params = {
        'action': 'wbsearchentities',
        'format': 'json',
        'language': 'en',
        'search': str(artist),
    }
    r = requests.get(API_ENDPOINT, params=params, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of trying to parse an error page as JSON.
    r.raise_for_status()
    results = r.json()['search']
    if not results:
        # Same exception type as indexing an empty list, but with a message
        # that names the artist that could not be found.
        raise IndexError('No Wikidata entity found for {!r}'.format(artist))
    return results[0]['url']

Getting the image URL for each artist

In [29]:
import requests
from bs4 import BeautifulSoup
import urllib.request

# Build one image link per artist, in the same order as df['artist'].
# Rows without a usable picture get the placeholder string 'NaN'.
final_links = []
for artist in df['artist']:
    # Reset for every artist so that a page without any <img> can never
    # inherit the link found for the previous artist.
    link_photo = None
    try:
        url = get_url(artist)
    except LookupError:
        # get_url found no match (IndexError) or got an unexpected payload
        # (KeyError): record "no picture" instead of aborting the whole run.
        url = None

    if url is not None:
        # Close the HTTP response as soon as the page has been parsed.
        with urllib.request.urlopen('https:' + url) as page:
            # Name the parser explicitly so the result does not depend on
            # which optional parsers happen to be installed.
            soup = BeautifulSoup(page, 'html.parser')
        # Only the first image on the page is ever used.
        first_image = soup.find('img', src=True)
        if first_image is not None:
            link_photo = first_image['src']

    if link_photo is None or 'upload' not in link_photo:
        final_links.append('NaN')  # No picture available for this artist
    elif 'logo' in link_photo:
        final_links.append('NaN')  # The picture is a logo, so it is skipped
    else:
        # NOTE(review): src values appear to be protocol-relative; the
        # 'http:' scheme is kept so the links match the recorded outputs.
        final_links.append('http:' + link_photo)
    

Adding the picture links to the data frame

In [30]:
# Store the collected links as the 'image' column; final_links holds one
# entry per artist, in the order the artists were iterated.
df['image'] = final_links
# Keep a separate reference to the new column.
image =df['image']

We also tried to download the pictures, in case we need local copies later.

In [31]:
#downloading image
# import requests
# import shutil

# for person in df['artist']:
#     try:
#         link_photo = get_photo(person)
#         filename = link_photo.split("/")[-1]
#         resp = requests.get('https:'+link_photo, stream = True)
#         local_file = open(filename, 'wb')
#         resp.raw.decode_content = True
#         shutil.copyfileobj(resp.raw, local_file)
#         del resp
#         print(filename)
#     except:
#         print('Error downloading')

We convert the 'NaN' placeholder strings in the image column into real missing values (`np.nan`).

In [32]:
import numpy as np
# Replace the 'NaN' placeholder strings with real missing values.
# A single .loc[row_mask, column] call writes into df itself; the chained
# form df['image'].loc[...] = ... assigns through an intermediate Series,
# which triggers chained-assignment warnings and leaves df unchanged when
# pandas copy-on-write is active.
df.loc[df['image'] == 'NaN', 'image'] = np.nan
df

Unnamed: 0,track_name,artist,album,image
0,Blinding Lights,The Weeknd,After Hours,http://upload.wikimedia.org/wikipedia/commons/...
1,Dance Monkey,Tones And I,Dance Monkey,http://upload.wikimedia.org/wikipedia/commons/...
2,The Box,Roddy Ricch,Please Excuse Me For Being Antisocial,http://upload.wikimedia.org/wikipedia/commons/...
3,Roses,SAINt JHN,Roses,http://upload.wikimedia.org/wikipedia/commons/...
4,Don't Start Now,Dua Lipa,Future Nostalgia,
5,ROCKSTAR,DaBaby,BLAME IT ON BABY,http://upload.wikimedia.org/wikipedia/commons/...
6,Watermelon Sugar,Harry Styles,Fine Line,http://upload.wikimedia.org/wikipedia/commons/...
7,death bed,Powfu,death bed,
8,Falling,Trevor Daniel,Nicotine,
9,Someone You Loved,Lewis Capaldi,Divinely Uninspired To A Hellish Extent,http://upload.wikimedia.org/wikipedia/commons/...


To get an idea of the pictures, we render the data set as an HTML table with the pictures included.

In [33]:
def path_to_image_html(path):
    """Return an HTML ``<img>`` tag, 60 pixels wide, that shows ``path``.

    Args:
        path: Image URL as a string. Any non-string value (for example the
            ``NaN`` float or ``None`` used for a missing picture) is treated
            as "no image".

    Returns:
        The ``<img ...>`` markup, or an empty string when there is no image.
    """
    # Function-scope import keeps the helper usable on its own.
    from html import escape

    if not isinstance(path, str):
        # Concatenating a float NaN or None to a string would raise TypeError.
        return ''
    # The link is scraped text, so escape it before placing it inside the
    # attribute; ordinary URLs pass through unchanged.
    return '<img src="' + escape(path, quote=True) + '" width="60" >'

In [34]:
# Produce the raw HTML string; escaping is turned off so that the <img> tags
# returned by the 'image' formatter are kept as markup.
df.to_html(formatters={'image': path_to_image_html}, escape=False)

'<table border="1" class="dataframe">\n  <thead>\n    <tr style="text-align: right;">\n      <th></th>\n      <th>track_name</th>\n      <th>artist</th>\n      <th>album</th>\n      <th>image</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>Blinding Lights</td>\n      <td>The Weeknd</td>\n      <td>After Hours</td>\n      <td><img src="http://upload.wikimedia.org/wikipedia/commons/thumb/b/b8/FEQ_July_2018_The_Weeknd_%2844778856382%29_%28cropped%29.jpg/220px-FEQ_July_2018_The_Weeknd_%2844778856382%29_%28cropped%29.jpg" width="60" ></td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>Dance Monkey</td>\n      <td>Tones And I</td>\n      <td>Dance Monkey</td>\n      <td><img src="http://upload.wikimedia.org/wikipedia/commons/thumb/d/d4/Tones_And_I_%2850118162967%29_%28cropped%29.jpg/220px-Tones_And_I_%2850118162967%29_%28cropped%29.jpg" width="60" ></td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>The Box</td>\n      <td>Roddy Ricch</td>\n      <td>Ple

In [35]:
# IPython.display is the public import location for the rich display classes.
from IPython.display import HTML
# Wrap the generated markup in HTML so the notebook renders the table, with
# the pictures, instead of printing the string.
HTML(df.to_html(escape=False,formatters=dict(image=path_to_image_html)))

Unnamed: 0,track_name,artist,album,image
0,Blinding Lights,The Weeknd,After Hours,
1,Dance Monkey,Tones And I,Dance Monkey,
2,The Box,Roddy Ricch,Please Excuse Me For Being Antisocial,
3,Roses,SAINt JHN,Roses,
4,Don't Start Now,Dua Lipa,Future Nostalgia,
5,ROCKSTAR,DaBaby,BLAME IT ON BABY,
6,Watermelon Sugar,Harry Styles,Fine Line,
7,death bed,Powfu,death bed,
8,Falling,Trevor Daniel,Nicotine,
9,Someone You Loved,Lewis Capaldi,Divinely Uninspired To A Hellish Extent,


Some pictures are still logos rather than photos of the singer, so we manually empty those cells so that they do not appear in the final data set.

In [36]:
# Row positions (0-based) whose picture was identified as a logo by manually
# inspecting the rendered table.
LOGO_ROW_POSITIONS = [11, 14, 19, 30, 32, 41]
# Look the column up by name instead of hard-coding position 3, so the right
# column is cleared even if columns are added or reordered.
df.iloc[LOGO_ROW_POSITIONS, df.columns.get_loc('image')] = np.nan

In [37]:
# Display the frame again after the manual clean-up of the image column.
df

Unnamed: 0,track_name,artist,album,image
0,Blinding Lights,The Weeknd,After Hours,http://upload.wikimedia.org/wikipedia/commons/...
1,Dance Monkey,Tones And I,Dance Monkey,http://upload.wikimedia.org/wikipedia/commons/...
2,The Box,Roddy Ricch,Please Excuse Me For Being Antisocial,http://upload.wikimedia.org/wikipedia/commons/...
3,Roses,SAINt JHN,Roses,http://upload.wikimedia.org/wikipedia/commons/...
4,Don't Start Now,Dua Lipa,Future Nostalgia,
5,ROCKSTAR,DaBaby,BLAME IT ON BABY,http://upload.wikimedia.org/wikipedia/commons/...
6,Watermelon Sugar,Harry Styles,Fine Line,http://upload.wikimedia.org/wikipedia/commons/...
7,death bed,Powfu,death bed,
8,Falling,Trevor Daniel,Nicotine,
9,Someone You Loved,Lewis Capaldi,Divinely Uninspired To A Hellish Extent,http://upload.wikimedia.org/wikipedia/commons/...


Saving the data frame to a final CSV file that will be used to create the quiz.

In [38]:
# Write the final table without the row index, so the file contains only the
# data columns.
df.to_csv(path_or_buf='final_2020.csv', index=False)