### Data Cleaning

# LEGO Set Metadata

This table outlines the metadata for LEGO sets, including various attributes of each set.

| Field           | Description                                                 |
|-----------------|-------------------------------------------------------------|
| `set_id`        | Official LEGO item number                                   |
| `name`          | Name of the LEGO set                                        |
| `year`          | Release year                                                |
| `theme`         | LEGO theme the set belongs to                               |
| `subtheme`      | Subtheme within the theme                                   |
| `themeGroup`    | Overall group the theme belongs to                          |
| `category`      | Type of set                                                 |
| `pieces`        | Number of pieces in the set                                 |
| `minifigs`      | Number of mini figures included in the set                  |
| `agerange_min`  | Minimum age recommended                                     |
| `US_retailPrice`| US retail price at launch                                   |
| `bricksetURL`   | URL for the set on brickset.com                              |
| `thumbnailURL`  | Small image of the set                                      |
| `imageURL`      | Full size image of the set                                  |


In [87]:
import pandas as pd

In [88]:
import sys
import os

sys.path.append(os.path.abspath(os.path.join('..', 'utils')))

import data_processing_functions as dpf

In [89]:
# Load the raw LEGO set table. The path is relative, so this assumes the
# notebook is executed from its own directory.
lego_sets = pd.read_csv('../data/raw/lego_sets.csv')

### Basic Information

In [None]:
# Overview of the freshly loaded frame. The helper comes from the
# data_processing_functions module (imported as dpf); presumably it prints
# shape/summary information — confirm against the helper.
dpf.show_basic_info(lego_sets)

In [None]:
# Presumably lists each column's dtype (helper body not visible here).
# Runs before the integer conversion performed further down.
dpf.show_data_types(lego_sets)

In [None]:
# Presumably reports missing values per column; this is the baseline taken
# before any rows are dropped or values are filled.
dpf.show_missing_values(lego_sets)

In [None]:
# Presumably the share of nulls per column.
# NOTE(review): confirm whether the helper reports a percentage or a fraction.
dpf.show_null_percentage(lego_sets)

#### Check for Duplicates

In [None]:
# Duplicate check only: the return value is not assigned, so rows are removed
# here only if the helper mutates its argument (TODO confirm).
dpf.check_for_duplicates(lego_sets)

## 1. Standardize and format columns

#### Rename Columns

In [95]:
# Maps every raw CSV column name (keyword) to the snake_case name used in the
# rest of the notebook (value). Insertion order follows the raw column list.
column_rename_dict = dict(
    set_id='set_id',                     # official LEGO item number (unchanged)
    name='set_name',                     # name of the LEGO set
    year='release_year',                 # release year
    theme='theme_name',                  # LEGO theme the set belongs to
    subtheme='subtheme_name',            # subtheme within the theme
    themeGroup='theme_group',            # overall group the theme belongs to
    category='set_category',             # type of set
    pieces='num_pieces',                 # number of pieces in the set
    minifigs='num_minifigs',             # number of mini figures included
    agerange_min='min_age_recommended',  # minimum age recommended
    US_retailPrice='us_retail_price',    # US retail price at launch
    bricksetURL='brickset_url',          # URL for the set on brickset.com
    thumbnailURL='thumbnail_url',        # small image of the set
    imageURL='image_url',                # full size image of the set
)

# Apply the mapping; the renamed frame is taken from the helper's return value.
lego_sets = dpf.rename_columns(lego_sets, column_rename_dict)

In [None]:
# Preview the first rows to check the new column names.
lego_sets.head()

#### Unique values in string columns

In [None]:
# Value counts for each text column, inspected before the normalization step
# below so that variant spellings can be spotted.
# NOTE(review): kept as separate calls on purpose — the last expression of a
# notebook cell is echoed if it returns a value, which a loop would suppress.
dpf.show_column_value_counts(lego_sets, 'set_name')
dpf.show_column_value_counts(lego_sets, 'set_category')
dpf.show_column_value_counts(lego_sets, 'theme_name')
dpf.show_column_value_counts(lego_sets, 'subtheme_name')
dpf.show_column_value_counts(lego_sets, 'theme_group')

#### Convert string columns to lowercase
- Standardize small differences (`&` → `and`, ` / ` → `/`, `vs.` → `vs`)

In [None]:
# Text columns to normalize: lowercase first, then unify a few spelling variants.
columns_to_lowercase = [
    'set_name',
    'set_category',
    'theme_name',
    'subtheme_name',
    'theme_group',
]
for column in columns_to_lowercase:
    # The return value is not captured, so this relies on the helper
    # lowercasing the column in place (TODO confirm).
    dpf.convert_strings_to_lowercase(lego_sets, column)
    # NOTE(review): only the column *name* is passed and the result is
    # discarded, so this call cannot change lego_sets — confirm the intended
    # usage of clean_text.
    dpf.clean_text(column)
    # Literal (non-regex) substitutions, applied in this order.
    lego_sets[column] = (
        lego_sets[column]
        .str.replace('&', 'and', regex=False)
        .str.replace(' / ', '/', regex=False)
        .str.replace('vs.', 'vs', regex=False)
    )
    print(f"\n Unique values in '{column}':\n{lego_sets[column].unique()}\n")

#### Clean numerical columns

In [None]:
# Cast the count-like columns to integers one at a time and echo the values
# that result for each.
columns_to_int = ['min_age_recommended', 'num_minifigs', 'num_pieces', 'release_year']
for column in columns_to_int:
    # The helper receives a one-element list of column names and its return
    # value is ignored, so the cast presumably happens in place (TODO confirm).
    # NOTE(review): missing values in these columns are only filled later in
    # the notebook — verify the helper tolerates NaN.
    dpf.convert_columns_to_int(lego_sets, [column])
    print(f"\n Unique values in {column} column: {lego_sets[column].unique()}\n")

## 2. Handle missing values

#### Drop empty rows in specific columns
- price
- pieces
- thumbnail URL

In [None]:
# Columns that must be populated; the helper is applied to each in turn and
# the frame is rebound to its return value every time.
for required_column in ('us_retail_price', 'num_pieces', 'thumbnail_url'):
    lego_sets = dpf.drop_empty_rows_from_column(lego_sets, required_column)


#### Replace empty values
- with 0 in the numerical columns (pieces, minimum age, minifigs)
- with 'unknown' in the subtheme column

In [101]:
# Replacement value for missing entries, per column (applied in this order).
# NOTE(review): rows with an empty 'num_pieces' were dropped in the previous
# step, so that entry is presumably a no-op — confirm before removing it.
fill_defaults = {
    'num_pieces': 0,
    'min_age_recommended': 0,
    'num_minifigs': 0,
    'subtheme_name': 'unknown',
}
for column_name, default_value in fill_defaults.items():
    lego_sets[column_name] = lego_sets[column_name].fillna(default_value)

In [None]:
# Re-check missing values now that rows have been dropped and gaps filled.
dpf.show_missing_values(lego_sets)

## 3. Derived columns
- Add a 'release_decade' column that groups release years into decades
- Flag licensed sets in an 'is_licensed' column

Create 'release_decade' column

In [None]:
# Floor every release year to the first year of its decade (e.g. 1987 -> 1980).
lego_sets['release_decade'] = lego_sets['release_year'].floordiv(10).mul(10)
display(lego_sets)

Mark licensed sets

In [104]:
# Licensed themes, written in lowercase; check_licensed compares against these
# entries verbatim.
licensed_themes = [
    'star wars', 'spongebob squarepants', 'batman', 'harry potter', 'indiana jones',
    'pirates of the caribbean', 'ben 10: alien force', 'prince of persia', 'toy story',
    'dc comics super heroes', 'marvel super heroes', 'the lord of the rings', 'the hobbit',
    'minecraft', 'teenage mutant ninja turtles', 'the lone ranger', 'disney', 'the lego movie',
    'the lego batman movie', 'dc super hero girls', 'scooby-doo', 'jurassic world', 'nexo knights',
    'the angry birds movie', 'ghostbusters', 'minions: the rise of gru', 'the lego ninjago movie',
    'super mario', 'trolls world tour', 'brick sketches', 'stranger things', 'overwatch',
    'the powerpuff girls', 'the lego movie 2', 'hidden side', 'avatar',
]

# Frozen copy used for lookups: the function below is called once per row, and
# a set answers membership in constant time instead of scanning the list.
# It is rebuilt whenever this cell is re-run; the list itself is left untouched.
_LICENSED_THEME_SET = frozenset(licensed_themes)


def check_licensed(theme: object) -> bool:
    """Return True when *theme* is one of the known licensed themes.

    The comparison is exact (case- and whitespace-sensitive), so the theme
    must already be normalized the same way as the entries in
    ``licensed_themes``. Missing values (``None``/NaN) and any other
    non-matching input yield False.
    """
    try:
        return theme in _LICENSED_THEME_SET
    except TypeError:
        # Unhashable input (e.g. a list) can never equal a theme name; answer
        # False, exactly as a list membership test would.
        return False

# Flag each row by running its theme name through check_licensed, element by element.
lego_sets['is_licensed'] = lego_sets['theme_name'].map(check_licensed)

#### Reorder columns

In [105]:
# Final left-to-right column layout of the cleaned table.
column_order = [
    # identification and classification
    'set_id',
    'set_name',
    'theme_name',
    'subtheme_name',
    'theme_group',
    'set_category',
    'is_licensed',
    # release timing
    'release_year',
    'release_decade',
    # size, audience and price
    'num_pieces',
    'num_minifigs',
    'min_age_recommended',
    'us_retail_price',
    # links and images
    'brickset_url',
    'thumbnail_url',
    'image_url',
]

# Select the columns in the order given by column_order; any column not named
# there is left out of the result, and a name missing from the frame raises KeyError.
lego_sets = lego_sets[column_order]

In [None]:
# Render the reordered frame in the notebook output.
display(lego_sets)

In [None]:
# Same overview helper as at the start of the notebook, now on the cleaned frame.
dpf.show_basic_info(lego_sets)

### Export to CSV

In [108]:
# Renumber the index from 0 in place, discarding the old index values
# (rows were dropped earlier in the notebook).
lego_sets.reset_index(drop=True, inplace=True)
# NOTE(review): the export is currently disabled; uncomment the line below to
# write the cleaned table to disk.
# lego_sets.to_csv('../data/processed/lego_sets_clean.csv', index=False)