### Data Cleaning

# LEGO Set Metadata

The table below describes each column of the raw LEGO sets dataset (`lego_sets.csv`) that this notebook cleans.

| Field           | Description                                                 |
|-----------------|-------------------------------------------------------------|
| `set_id`        | Official LEGO item number                                   |
| `name`          | Name of the LEGO set                                        |
| `year`          | Release year                                                |
| `theme`         | LEGO theme the set belongs to                               |
| `subtheme`      | Subtheme within the theme                                   |
| `themeGroup`    | Overall group the theme belongs to                          |
| `category`      | Type of set                                                 |
| `pieces`        | Number of pieces in the set                                 |
| `minifigs`      | Number of mini figures included in the set                  |
| `agerange_min`  | Minimum age recommended                                     |
| `US_retailPrice`| US retail price at launch                                   |
| `bricksetURL`   | URL for the set on brickset.com                             |
| `thumbnailURL`  | Small image of the set                                      |
| `imageURL`      | Full size image of the set                                  |


In [149]:
import pandas as pd

In [150]:
import sys
import os

# Add ../utils to the module search path so the project helper module imported
# below can be found. os.path.abspath resolves the relative path against the
# current working directory.
# NOTE(review): assumes the notebook is run from a folder that has utils/ as a
# sibling directory — confirm against the repository layout.
sys.path.append(os.path.abspath(os.path.join('..', 'utils')))

# Project-local helpers; this import relies on the sys.path entry added above.
import data_processing_functions as dpf

In [151]:
# Load the raw set metadata. The relative path is resolved against the current
# working directory, like the utils path used for the helper import.
lego_sets = pd.read_csv('../data/raw/lego_sets.csv')

### Basic Information

In [None]:
# First look at the freshly loaded table, before any cleaning step has run.
# What exactly is reported is defined in dpf.show_basic_info (not visible here).
dpf.show_basic_info(lego_sets)

In [None]:
# Presumably lists the dtype pandas inferred for each column — confirm in dpf.
dpf.show_data_types(lego_sets)

In [None]:
# Presumably reports missing values per column (judging by the helper's name;
# see dpf for the exact output).
dpf.show_missing_values(lego_sets)

In [None]:
# Presumably the same missing-value information expressed as a percentage of
# rows per column — confirm in dpf.
dpf.show_null_percentage(lego_sets)

#### Check for Duplicates

In [None]:
# Duplicate check on the raw rows.
# NOTE(review): the return value is not captured and lego_sets is not
# reassigned, so this cell looks report-only (nothing is dropped here) —
# confirm in dpf.check_for_duplicates.
dpf.check_for_duplicates(lego_sets)

#### Rename Columns

In [157]:
# Mapping from the raw CSV headers (a mix of camelCase and snake_case) to
# consistent snake_case names. Keys are the original headers, values are the
# names used for the rest of the notebook.
column_rename_dict = {
    # Identification
    'set_id': 'set_id',  # official LEGO item number (identity mapping)
    'name': 'set_name',
    'year': 'release_year',

    # Theme hierarchy and set type
    'theme': 'theme_name',
    'subtheme': 'subtheme_name',
    'themeGroup': 'theme_group',
    'category': 'set_category',

    # Size and audience
    'pieces': 'num_pieces',
    'minifigs': 'num_minifigs',
    'agerange_min': 'min_age_recommended',

    # Pricing (US retail price at launch)
    'US_retailPrice': 'us_retail_price',

    # Links and images
    'bricksetURL': 'brickset_url',
    'thumbnailURL': 'thumbnail_url',
    'imageURL': 'image_url',
}

# rename_columns hands back the table, so rebind lego_sets to the result.
lego_sets = dpf.rename_columns(lego_sets, column_rename_dict)


In [None]:
# Preview the first rows (head() defaults to five) to check the renamed columns.
# Kept as the last expression of the cell so the notebook renders the result.
lego_sets.head()

### Unique Values

In [None]:
# Per-column summary of the renamed table; the exact contents are defined in
# dpf.show_column_summary (not visible here).
dpf.show_column_summary(lego_sets)

#### Unique values in string columns

In [None]:
# Inspect the distinct values of each string column, using the post-rename
# column names. Presumably prints value counts per column — confirm in dpf.
# These are the same five columns that the normalisation cell further down
# lower-cases and cleans.
dpf.show_column_value_counts(lego_sets, 'set_name')
dpf.show_column_value_counts(lego_sets, 'set_category')
dpf.show_column_value_counts(lego_sets, 'theme_name')
dpf.show_column_value_counts(lego_sets, 'subtheme_name')
dpf.show_column_value_counts(lego_sets, 'theme_group')

#### Convert string columns to lowercase
- Standardize minor spelling variations so equivalent values match (e.g. `&` → `and`, ` / ` → `/`, `vs.` → `vs`)

In [None]:
# String columns to normalise: each is lower-cased through the project helper
# and then has a few spelling variants collapsed, so that values differing only
# in e.g. '&' versus 'and' end up identical.
columns_to_lowercase = [
    'set_name',
    'set_category',
    'theme_name',
    'subtheme_name',
    'theme_group',
]

# (old, new) literal substitutions, applied in this order to every column.
text_substitutions = (
    ('&', 'and'),
    (' / ', '/'),
    ('vs.', 'vs'),
)

for column in columns_to_lowercase:
    # Return value is not captured — presumably the helper modifies lego_sets
    # in place; confirm in dpf.convert_strings_to_lowercase.
    dpf.convert_strings_to_lowercase(lego_sets, column)

    # NOTE(review): this passes the column *name* (a str), not the column's
    # data, and discards whatever is returned, so it does not appear to change
    # lego_sets. Likely meant to clean the values of lego_sets[column] — check
    # dpf.clean_text's signature before changing it.
    dpf.clean_text(column)

    for old_text, new_text in text_substitutions:
        # regex=False keeps every pattern literal (the '.' in 'vs.' is a dot).
        lego_sets[column] = lego_sets[column].str.replace(old_text, new_text, regex=False)

    # Show the distinct values left after normalisation for a visual check.
    print(f"\n Unique values in '{column}' column: {lego_sets[column].unique()}\n")
    

#### Clean numerical columns
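- Convert the recommended minimum age, minifigure count, piece count and release year columns to integers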

In [None]:
# Columns that hold counts or years and should therefore be integers.
columns_to_int = [
    'min_age_recommended',
    'num_minifigs',
    'num_pieces',
    'release_year',
]

for column in columns_to_int:
    # The helper is handed a one-element list so that each conversion is
    # followed immediately by its own check below. Its return value is not
    # captured — presumably it converts lego_sets in place; confirm in dpf.
    dpf.convert_columns_to_int(lego_sets, [column])

    # Distinct values after conversion, printed for a visual sanity check.
    unique_values = lego_sets[column].unique()
    print(f"\n Unique values in {column} column: {unique_values}\n")
