# Spotify Web Scraping with Python

### Scraping from Wikipedia

In [139]:
# Import libraries
import requests
from bs4 import BeautifulSoup
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [140]:
# Access HTML
url = 'https://en.wikipedia.org/wiki/List_of_most-streamed_songs_on_Spotify'
headers = {'User-Agent': 'MyWebScrapper/1.0'}
# Bounded wait so a stalled connection cannot hang the notebook indefinitely
response = requests.get(url, headers=headers, timeout=30)
# Fail here on a 4xx/5xx answer instead of parsing an error page and
# hitting a confusing failure further down when no table is found
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')

In [141]:
# Keep the first "wikitable" on the page: the most-streamed songs ranking
tables = soup.find_all('table', attrs={'class': 'wikitable'})
most_streamed_songs = tables[0]
# Peek at the first five header cells to confirm the right table was picked
most_streamed_songs.find_all('th', limit=5)

[<th>Rank
 </th>,
 <th>Song
 </th>,
 <th>Artist(s)
 </th>,
 <th>Streams<br/>(billions)
 </th>,
 <th>Release date
 </th>]

In [142]:
# Extract table headers into table_headers
# The first <tr> of the table is the header row.
header_cells = most_streamed_songs.find('tr')
# Select the row's own cells explicitly. Iterating the row's raw children also
# yields the whitespace text nodes between cells, and filtering those by an
# exact '\n' comparison can let stray whitespace through as an empty header
# (or drop a genuinely empty cell), shifting every column label.
table_headers = [
    cell.get_text(strip=True)
    for cell in header_cells.find_all(['th', 'td'], recursive=False)
]
table_headers

['Rank', 'Song', 'Artist(s)', 'Streams(billions)', 'Release date', 'Ref.']

In [143]:
# Extract table rows into table_data
# Everything after the header row is a data row (plus the trailing footer row).
rows = most_streamed_songs.find_all('tr')[1:]

table_data = []
for row in rows:
    # The song title is the row's <th> cell; all other values are <td> cells.
    title_cell = row.find('th')
    if title_cell is None:
        # Without a <th> there is no title to read; skip the row rather than
        # crash with an AttributeError on None.
        continue
    song_name = title_cell.get_text(strip=True).replace('"', '')

    # NOTE: get_text(strip=True) joins adjacent text nodes with no separator,
    # so multi-artist cells come out glued (e.g. 'The WeekndandDaft Punk').
    # The artist-name cleaning step further down restores the spacing.
    row_values = [cell.get_text(strip=True) for cell in row.find_all('td')]
    # Put the title back in second position (rank, song, artist, ...).
    # For a row with no <td> cells this simply appends, leaving a one-element
    # row that pandas pads with missing values.
    row_values.insert(1, song_name)

    table_data.append(row_values)

table_data[:5]

[['1', 'Blinding Lights', 'The Weeknd', '5.000', '29 November 2019', '[1]'],
 ['2', 'Shape of You', 'Ed Sheeran', '4.528', '6 January 2017', '[2]'],
 ['3',
  'Starboy',
  'The WeekndandDaft Punk',
  '4.063',
  '21 September 2016',
  '[3]'],
 ['4',
  'Someone You Loved',
  'Lewis Capaldi',
  '4.032',
  '8 November 2018',
  '[4]'],
 ['5', 'As It Was', 'Harry Styles', '4.021', '1 April 2022', '[5]']]

In [144]:
# Assemble the scraped rows into a frame labelled with the scraped headers;
# rows shorter than the header list are padded with missing values.
df_raw = pd.DataFrame(table_data, columns=table_headers)
df_raw.head()

Unnamed: 0,Rank,Song,Artist(s),Streams(billions),Release date,Ref.
0,1,Blinding Lights,The Weeknd,5.0,29 November 2019,[1]
1,2,Shape of You,Ed Sheeran,4.528,6 January 2017,[2]
2,3,Starboy,The WeekndandDaft Punk,4.063,21 September 2016,[3]
3,4,Someone You Loved,Lewis Capaldi,4.032,8 November 2018,[4]
4,5,As It Was,Harry Styles,4.021,1 April 2022,[5]


### Data preprocessing and analysis

In [145]:
# Work on an independent copy so df_raw stays available as the untouched scrape
df_baking = df_raw.copy(deep=True)
# Column dtypes and non-null counts before any cleaning
df_baking.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 101 entries, 0 to 100
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Rank               101 non-null    object
 1   Song               100 non-null    object
 2   Artist(s)          100 non-null    object
 3   Streams(billions)  100 non-null    object
 4   Release date       100 non-null    object
 5   Ref.               100 non-null    object
dtypes: object(6)
memory usage: 4.9+ KB


In [146]:
# The only incomplete row is the last line of the Wikipedia table
# (the "As of ..." note), so it is shown once and then dropped.
has_missing = df_baking.isna().any(axis=1)
display(df_baking.loc[has_missing])
df_baking = df_baking.dropna()

Unnamed: 0,Rank,Song,Artist(s),Streams(billions),Release date,Ref.
100,As of 31 August 2025,,,,,


In [147]:
# Normalise column labels to lower case for easier access
df_baking = df_baking.rename(columns=str.lower)

### Cleaning and dtypes

- The Rank column can be dropped, since the rank is implicit once the rows are sorted by number of streams
- Song $\rightarrow$ str
- Artist(s) $\rightarrow$ category (only a finite number of artists across the different songs)
- Release date $\rightarrow$ datetime (extract the year)
- The Ref. column can be dropped, since it only refers to reference links inside Wikipedia, which are not useful for analysis

In [148]:
# Drop unnecessary columns: the rank is implied by stream order and the
# reference markers carry no analytical value
df_baking = df_baking.drop(['rank', 'ref.'], axis='columns')

In [149]:
# Converting str columns
df_baking['song'] = df_baking['song'].astype('str')

# Cleaning artist names
# The scrape glues collaborators together ('The WeekndandDaft Punk').
# Only re-space a joiner word when it is stuck to the previous name AND
# directly followed by the start of the next one (upper-case letter or digit).
# A blanket replace of "and"/"with" would also split names that merely contain
# those letters ('Ariana Grande' -> 'Ariana Gr and e') and double the spaces of
# names that are already correct ('Tones and I').
# Limitation: a glued collaborator whose name starts with a lower-case letter
# is not separated by this pattern.
glued_joiner = r'(?<=\S)(and|with)(?=[A-Z0-9])'
df_baking['artist(s)'] = (
    df_baking['artist(s)']
    .str.replace(glued_joiner, r' \1 ', regex=True)
    # Collapse any repeated whitespace so every name has a single spelling
    .str.replace(r'\s+', ' ', regex=True)
    .str.strip()
    .astype('category')
)

In [150]:
# Converting release date into its year (the column keeps its name but holds the year)
release = df_baking['release date']
if not pd.api.types.is_integer_dtype(release):
    # The guard keeps this cell safe to re-run: feeding the already-converted
    # integer years back into pd.to_datetime would interpret them as
    # nanoseconds since the epoch and silently turn every year into 1970.
    # The explicit format matches the scraped values, e.g. '29 November 2019'.
    release = pd.to_datetime(release, format='%d %B %Y').dt.year
df_baking['release date'] = release
df_baking['release date']

0     2019
1     2017
2     2016
3     2018
4     2022
      ... 
95    2020
96    2003
97    2017
98    2017
99    1988
Name: release date, Length: 100, dtype: int32

In [151]:
# Final cleaned frame, kept separate from the intermediate df_baking
df = df_baking.copy()
# Fixed seed so the preview shows the same ten rows on every run
df.sample(10, random_state=42)

Unnamed: 0,song,artist(s),streams(billions),release date
15,Dance Monkey,Tones and I,3.314,2019
52,Mr. Brightside,The Killers,2.761,2003
74,Smells Like Teen Spirit,Nirvana,2.544,1991
28,Cruel Summer,Taylor Swift,3.048,2019
35,Goosebumps,Travis Scott and Kendrick Lamar,2.924,2016
84,Happier,Marshmello and Bastille,2.465,2018
39,Just the Way You Are,Bruno Mars,2.882,2010
11,Heat Waves,Glass Animals,3.509,2020
55,Without Me,Eminem,2.752,2002
21,The Night We Met,Lord Huron,3.235,2015


### Visualization