In [5]:
from bs4 import BeautifulSoup
import os
import sqlite3
import pandas as pd
import progressbar

In [12]:
def treat_ratings_and_reviews(string):
    """Convert a scraped rating/review count such as ``"1,234"`` to an int.

    Args:
        string: Raw count text. It may contain thousands separators and
            surrounding whitespace, or be ``None`` when no text was found.

    Returns:
        The count as an ``int``. ``None``, empty and whitespace-only input
        all yield ``0``.

    Raises:
        ValueError: If the cleaned text is not a valid integer.
    """
    # str(None) would become the literal "None" and make int() fail, so a
    # missing value is mapped to zero explicitly.
    if string is None:
        return 0
    # Remove separators before the emptiness fallback so "," also maps to 0.
    subject = str(string).replace(",", "").strip() or "0"
    return int(subject)


def convert_month_to_number(word_month):
    """Return the 1-based month number for a capitalised English month name.

    Args:
        word_month: Month name, e.g. ``"March"``. The comparison is exact
            (case-sensitive, no abbreviations).

    Returns:
        An ``int`` from 1 (January) to 12 (December).

    Raises:
        Exception: With the message ``"Invalid Month Error"`` when the name
            matches none of the twelve months.
    """
    month_names = (
        "January",
        "February",
        "March",
        "April",
        "May",
        "June",
        "July",
        "August",
        "September",
        "October",
        "November",
        "December",
    )
    for number, name in enumerate(month_names, start=1):
        if word_month == name:
            return number

    raise Exception("Invalid Month Error")


def convert_date_to_numeric(date):
    """Convert a textual date to a ``"month/day/year"`` string.

    The text is read from the right: the last token is the year, the one
    before it the month name and the one before that the day. Accepted
    shapes are therefore ``"12 March 2014"``, ``"March 2014"`` and
    ``"2014"``. Missing parts default to day 1, month 1 and year 2000.
    Any tokens before the day are ignored.

    Args:
        date: Date text. Leading, trailing and repeated whitespace
            (including newlines) is tolerated.

    Returns:
        A string ``f"{month}/{day}/{year}"``. The month is numeric; day and
        year are inserted as they appear in the input, without validation.

    Raises:
        Exception: Propagated from ``convert_month_to_number`` when the
            month token is not a recognised month name.
    """
    # split() with no argument collapses any run of whitespace, so padded
    # text does not yield empty tokens that would shift the date parts.
    parts = date.split()
    year = 2000
    month = 1
    day = 1

    if parts:
        year = parts[-1]
    if len(parts) >= 2:
        month = convert_month_to_number(parts[-2])
    if len(parts) >= 3:
        day = parts[-3]

    return f"{month}/{day}/{year}"


def get_text_from_links(div, class_name):
    """Collect the text of every ``<a>`` element in ``div`` with a given class.

    Args:
        div: Element searched with ``find_all``.
        class_name: Value passed as the ``class_`` filter.

    Returns:
        A list with ``get_text()`` of each matching link, in the order
        ``find_all`` returns them. The list is empty when nothing matches.
    """
    return [anchor.get_text() for anchor in div.find_all("a", class_=class_name)]


def get_text_from_nested_span(parent, class_name):
    """Return the text of the ``span.full`` nested in a span of ``parent``.

    Args:
        parent: Element searched with ``find`` for the outer span.
        class_name: Class of the outer ``<span>``.

    Returns:
        ``get_text()`` of the inner ``<span class="full">``, or ``None`` when
        either the outer or the inner span is not found.
    """
    outer = parent.find("span", class_=class_name)
    if not outer:
        return None

    inner = outer.find("span", class_="full")
    if not inner:
        return None

    return inner.get_text()


def get_descriptors(descriptor_div):
    """Return the descriptor texts found inside ``descriptor_div``.

    Args:
        descriptor_div: Element searched with ``find_all``, or a falsy value
            (e.g. ``None``) when there is no descriptor section.

    Returns:
        A list with the ``.text`` of every ``<span class="comma_separated">``
        in the element, or ``None`` when ``descriptor_div`` is falsy.
    """
    if descriptor_div:
        return [
            span.text
            for span in descriptor_div.find_all("span", class_="comma_separated")
        ]
    return None


def get_genres(genre_div):
    """Return the genre names linked inside ``genre_div``.

    Args:
        genre_div: Element holding the genre links, or a falsy value (e.g.
            ``None``) when there is no genre section.

    Returns:
        The list produced by ``get_text_from_links`` for links with class
        ``"genre comma_separated"``, or ``None`` when ``genre_div`` is falsy.
    """
    if genre_div:
        return get_text_from_links(genre_div, "genre comma_separated")
    return None


class AlbumScraper:
    """Extract one album's data from its chart-item element."""

    def __init__(self, album_div):
        """Store the chart-item element to scrape.

        Args:
            album_div: Parsed HTML element for a single chart entry; it is
                searched with ``find``.
        """
        self.album = album_div

    @staticmethod
    def _find_name_span(element):
        """Return the original-name span of ``element``, else the localized one."""
        return element.find("span", class_="ui_name_locale_original") or element.find(
            "span", class_="ui_name_locale"
        )

    def get_album_data(self):
        """Collect title, artist, genres, descriptors, scores and date.

        Returns:
            A dict with the keys ``Album_Name``, ``Artist_Name``, ``Genres``,
            ``Secondary_Genres``, ``Descriptors``, ``Average_Rating``
            (float), ``Rating_Count`` (int), ``Review_Count`` (int) and
            ``Release_Date`` (``"month/day/year"`` string). The genre and
            descriptor entries are ``None`` when their section is absent.

        Raises:
            AttributeError: If the title or date element is missing.
        """
        album_title_div = self.album.find(
            "div", class_="page_charts_section_charts_item_title"
        )
        # Use the same original -> localized fallback as the artist name; if
        # neither span exists, fall back to the title element's own text.
        title_span = self._find_name_span(album_title_div)
        album_title = title_span.text if title_span else album_title_div.text.strip()

        artist_name_a = self.album.find("a", class_="artist")
        artist_name_span = (
            self._find_name_span(artist_name_a) if artist_name_a else None
        )
        artist_name = artist_name_span.text if artist_name_span else "Various Artists"

        genre_div = self.album.find(
            "div", class_="page_charts_section_charts_item_genres_primary"
        )
        genre2_div = self.album.find(
            "div", class_="page_charts_section_charts_item_genres_secondary"
        )
        descriptor_div = self.album.find(
            "div", class_="page_charts_section_charts_item_genre_descriptors"
        )

        # A missing, empty or whitespace-only average counts as 0.
        average_span = self.album.find(
            "span", class_="page_charts_section_charts_item_details_average_num"
        )
        average_score = average_span.text.strip() if average_span else ""
        if not average_score:
            average_score = "0"

        date_div = self.album.find("div", class_="page_charts_section_charts_item_date")
        date_span = date_div.find("span")
        date_numeric = convert_date_to_numeric(date_span.text)

        ratings_text = get_text_from_nested_span(
            self.album, "page_charts_section_charts_item_details_ratings"
        )
        reviews_text = get_text_from_nested_span(
            self.album, "page_charts_section_charts_item_details_reviews"
        )
        # The helper returns None when a span is absent; pass "" instead so
        # the count becomes 0 rather than failing to parse.
        ratings = treat_ratings_and_reviews(ratings_text or "")
        reviews = treat_ratings_and_reviews(reviews_text or "")

        return {
            "Album_Name": album_title,
            "Artist_Name": artist_name,
            "Genres": get_genres(genre_div),
            "Secondary_Genres": get_genres(genre2_div),
            "Descriptors": get_descriptors(descriptor_div),
            "Average_Rating": float(average_score),
            "Rating_Count": ratings,
            "Review_Count": reviews,
            "Release_Date": date_numeric,
        }


In [7]:
def scrape_html_file(filepath):
    """Parse one saved chart page and return the data of every album on it.

    Args:
        filepath: Path of the HTML file to read.

    Returns:
        A list of dicts, one per element matching
        ``div.page_section_charts_item_wrapper.anchor``, each produced by
        ``AlbumScraper.get_album_data``.

    Raises:
        OSError: If the file cannot be opened; ``open`` itself raises
            ``FileNotFoundError`` for a missing path.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's locale encoding.
    with open(filepath, "r", encoding="utf-8") as file:
        html_soup = BeautifulSoup(file.read(), "html.parser")

    album_divs = html_soup.select("div.page_section_charts_item_wrapper.anchor")
    return [AlbumScraper(album_div).get_album_data() for album_div in album_divs]

def walk_directory(directory):
    """Scrape every ``.html`` file under ``directory`` into one DataFrame.

    The tree is traversed with ``os.walk``. A progress bar is shown for each
    visited directory and counts all of its files, including non-HTML ones.

    Args:
        directory: Root directory to search.

    Returns:
        A ``pandas.DataFrame`` with one row per scraped album and a fresh
        0..n-1 index, or an empty DataFrame when no albums were found.
    """
    page_frames = []

    for dirpath, _dirnames, filenames in os.walk(directory):
        bar = progressbar.ProgressBar(max_value=len(filenames))
        for done, filename in enumerate(filenames, start=1):
            if filename.endswith(".html"):
                filepath = os.path.join(dirpath, filename)
                page_albums_df = pd.DataFrame(scrape_html_file(filepath))
                if not page_albums_df.empty:
                    page_frames.append(page_albums_df)
            # Report progress after the file has been handled.
            bar.update(done)
        bar.finish()

    # Concatenate once at the end: concatenating inside the loop copies all
    # previously collected rows again for every file.
    if not page_frames:
        return pd.DataFrame()
    return pd.concat(page_frames, ignore_index=True)



[38;2;0;255;0m100%[39m [38;2;0;255;0m(26 of 26)[39m |########################| Elapsed Time: 0:00:02 Time:  0:00:020000


          Album-Name    Artist                            Genres  \
0    Little Machines    Lights            [Electropop, Synthpop]   
1            Extance    Aenaon  [Black Metal, Avant-Garde Metal]   
2  Black Moon Rising  Falconer                     [Power Metal]   
3         Belomancie  Sun Araw     [Electronic, Neo-Psychedelia]   
4           The Oath  The Oath                     [Heavy Metal]   

                                    Secondary-Genres  \
0                 [Indietronica, New Wave, Pop Rock]   
1                                [Progressive Metal]   
2                         [Folk Metal, Thrash Metal]   
3  [Free Improvisation, Ambient Dub, Neo-Psychede...   
4              [Traditional Doom Metal, Heavy Psych]   

                                         Descriptors  Score  Ratings  Reviews  \
0  [energetic, longing, anthemic, love, passionat...   3.33      347        5   
1             [melodic, heavy, avant-garde, ominous]   3.34      345        9   
2          

# Clean and Read Data

In [17]:
def clean_data(df):
    """Fill gaps, normalise types and drop duplicate albums, in place.

    Args:
        df: DataFrame with the columns ``Album_Name``, ``Artist_Name``,
            ``Release_Date``, ``Genres``, ``Descriptors``,
            ``Average_Rating``, ``Rating_Count`` and ``Review_Count``.

    Returns:
        The same ``df`` object, modified in place:

        * missing album/artist names become ``"Unknown"``;
        * ``Release_Date`` is parsed and reformatted as ``YYYY-MM-DD``;
        * missing ``Average_Rating`` values become the column mean, and the
          column is cast to float;
        * missing rating/review counts become 0, and both columns are cast
          to int;
        * ``Genres`` and ``Descriptors`` hold lists: existing lists are kept,
          strings are split on ``","``, missing values become ``["None"]``;
        * rows duplicated on name, artist, date, rating and counts are
          dropped.
    """

    def _as_list(value):
        """Keep lists; split non-empty strings on commas; map empty to []."""
        if isinstance(value, list):
            return value
        return value.split(",") if value else []

    # Assign the filled column back instead of calling fillna(inplace=True)
    # on df[col]: that chained form operates on a temporary and does not
    # update df when pandas Copy-on-Write is active.
    df["Album_Name"] = df["Album_Name"].fillna("Unknown")
    df["Artist_Name"] = df["Artist_Name"].fillna("Unknown")
    df["Release_Date"] = pd.to_datetime(df["Release_Date"]).dt.strftime("%Y-%m-%d")
    df["Genres"] = df["Genres"].fillna("None")
    df["Descriptors"] = df["Descriptors"].fillna("None")
    df["Average_Rating"] = df["Average_Rating"].fillna(df["Average_Rating"].mean())
    df["Rating_Count"] = df["Rating_Count"].fillna(0)
    df["Review_Count"] = df["Review_Count"].fillna(0)

    df["Average_Rating"] = df["Average_Rating"].astype(float)
    df["Rating_Count"] = df["Rating_Count"].astype(int)
    df["Review_Count"] = df["Review_Count"].astype(int)

    df["Genres"] = df["Genres"].apply(_as_list)
    df["Descriptors"] = df["Descriptors"].apply(_as_list)

    # The list-valued columns are unhashable, so duplicates are detected on
    # the scalar columns only.
    subset_columns = [
        "Album_Name",
        "Artist_Name",
        "Release_Date",
        "Average_Rating",
        "Rating_Count",
        "Review_Count",
    ]
    df.drop_duplicates(subset=subset_columns, inplace=True)

    return df
    
# Scrape every HTML page under the test directory into a single DataFrame.
df = walk_directory('../../HtmlData/TestAlbums')
# Preview the raw scraped rows.
print(df.head())
# clean_data returns the same df object it was given, so its return value is
# not captured here.
clean_data(df)
# Preview the rows again after cleaning.
print(df.head())

[38;2;0;255;0m100%[39m [38;2;0;255;0m(26 of 26)[39m |########################| Elapsed Time: 0:00:02 Time:  0:00:020000


          Album_Name Artist_Name                            Genres  \
0    Little Machines      Lights            [Electropop, Synthpop]   
1            Extance      Aenaon  [Black Metal, Avant-Garde Metal]   
2  Black Moon Rising    Falconer                     [Power Metal]   
3         Belomancie    Sun Araw     [Electronic, Neo-Psychedelia]   
4           The Oath    The Oath                     [Heavy Metal]   

                                    Secondary_Genres  \
0                 [Indietronica, New Wave, Pop Rock]   
1                                [Progressive Metal]   
2                         [Folk Metal, Thrash Metal]   
3  [Free Improvisation, Ambient Dub, Neo-Psychede...   
4              [Traditional Doom Metal, Heavy Psych]   

                                         Descriptors  Average_Rating  \
0  [energetic, longing, anthemic, love, passionat...            3.33   
1             [melodic, heavy, avant-garde, ominous]            3.34   
2                         