### 1. Web scraping for Billboard Hot 100 songs

In [72]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

def scrape_billboard_hot_100():
    url = "https://www.billboard.com/charts/hot-100/"
    
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    
    try:
        response = requests.get(url, headers=headers)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        
        songs = []
        artists = []
        
        # Find all list items containing song information
        chart_items = soup.select('ul.o-chart-results-list-row')
        
        for item in chart_items:
            try:
                # Extract song title
                title_element = item.select_one('h3#title-of-a-story')
                if title_element:
                    song = title_element.text.strip()
                    songs.append(song)
                
                # Extract artist name
                artist_element = item.select_one('span.c-label.a-font-primary-s')
                if artist_element:
                    artist = artist_element.text.strip()
                    artists.append(artist)
            except Exception as e:
                print(f"Error processing item: {e}")
                continue
        
        # Create DataFrame
        df = pd.DataFrame({
            'Song': songs,
            'Artist': artists
        })
        
        # Save to CSV
        df.to_csv('billboard_hot_100.csv', index=False)
        print(f"Successfully scraped {len(df)} songs")
        
        return df
    
    except Exception as e:
        print(f"An error occurred: {e}")
        return None

# Execute the scraping
df = scrape_billboard_hot_100()

# Display first few rows if successful
if df is not None:
    display(df.head())

Successfully scraped 100 songs


Unnamed: 0,Song,Artist
0,Die With A Smile,Lady Gaga & Bruno Mars
1,A Bar Song (Tipsy),Shaboozey
2,Birds Of A Feather,Billie Eilish
3,Lose Control,Teddy Swims
4,APT.,ROSE & Bruno Mars


### 2. Reading the CSV file for the million song subset.

In [73]:
url = "../data/2_million_song_subset_raw.csv"
df2 = pd.read_csv(url, delimiter=';')

display(df2.head())

Unnamed: 0,title,artist
0,b'Je Sais Que La Terre Est Plate',b'Rapha\xc3\xabl'
1,b'On Efface',b'Julie Zenatti'
2,b'Howells Delight',b'The Baltimore Consort'
3,b'Martha Served',b'I Hate Sally'
4,b'Zip-A-Dee-Doo-Dah (Song of the South)',b'Orlando Pops Orchestra'


### 3. Cleaning the million song subset.

In [74]:
df2 = df2.apply(lambda col: col.str.replace('b', '', regex=False))
df2 = df2.apply(lambda col: col.str.strip("'"))
df2.columns = ['Song', 'Artist']
print(df2)

                                       Song                  Artist
0            Je Sais Que La Terre Est Plate           Rapha\xc3\xal
1                                 On Efface           Julie Zenatti
2                           Howells Delight   The Baltimore Consort
3                             Martha Served            I Hate Sally
4     Zip-A-Dee-Doo-Dah (Song of the South)  Orlando Pops Orchestra
...                                     ...                     ...
9995                        One Aout Heaven               Brent Lam
9996                                 Octoer                      U2
9997                          "Comin' Home"                     ZO2
9998                         Pode Me Chamar                   Eddie
9999                              Souffle 2          Vincent Bruley

[10000 rows x 2 columns]


### 4. Combining the two datasets with a boolean column to indicate if the song is in the Hot 100.

In [75]:
df['Hot_100'] = True  # Rows in df are Hot_100
df2['Hot_100'] = False  # Rows in df2 are not Hot_100

# Combine both DataFrames into one
combined_df = pd.concat([df, df2], ignore_index=True)

print(combined_df)

                     Song                  Artist  Hot_100
0        Die With A Smile  Lady Gaga & Bruno Mars     True
1      A Bar Song (Tipsy)               Shaboozey     True
2      Birds Of A Feather           Billie Eilish     True
3            Lose Control             Teddy Swims     True
4                    APT.       ROSE & Bruno Mars     True
...                   ...                     ...      ...
10095     One Aout Heaven               Brent Lam    False
10096              Octoer                      U2    False
10097       "Comin' Home"                     ZO2    False
10098      Pode Me Chamar                   Eddie    False
10099           Souffle 2          Vincent Bruley    False

[10100 rows x 3 columns]


In [76]:
combined_df.to_csv('3_combined_clean_dataset.csv', index=False)

In [77]:
combined_df.isnull().sum()

Song       0
Artist     0
Hot_100    0
dtype: int64

In [78]:
combined_df.duplicated().sum()

np.int64(58)

In [79]:
combined_df.drop_duplicates(inplace=True)
combined_df.to_csv('3_combined_clean_dataset.csv', index=False)
