# Import the libraries that are necesaries

In [7]:
import pandas as pd
import numpy as np

# Read the data from the first dataset

In [8]:
# Read the entire CSV file into memory
data = pd.read_csv('clean_lego.csv')

# Separate data into different DataFrames based on tables

### Create a csv for the themes

In [9]:
# Extract unique themes from the 'theme_name' column
themes = data['theme_name'].unique()

# Create a DataFrame for the Themes table with 'id' and 'name' columns
themes_df = pd.DataFrame({'theme_id': range(1, len(themes) + 1), 'name': themes})

# Save the Themes table data to a new CSV file
themes_df.to_csv('themes.csv', index=False)
themes_df

Unnamed: 0,theme_id,name
0,1,Skylines
1,2,Friends
2,3,Angry Birds
3,4,Creator 3-in-1
4,5,Trains
...,...,...
88,89,Service Packs
89,90,Dimensions
90,91,Ultimate Collector Series
91,92,Marvel Series 2


### Create a csv for the sets

In [10]:
# Extracting required columns for the Sets table
sets_data = data[['sets_name', 'year', 'theme_name', 'sets_url']]

# Assume you have a CSV with Themes table data containing columns: id and name
themes_data = pd.read_csv('themes.csv')

# Merging themes_data with sets_data to get theme_id
merged_data = sets_data.merge(themes_data, left_on='theme_name', right_on='name', how='left')

# Selecting relevant columns for the Sets table
sets_table_data = merged_data[['sets_name', 'year', 'theme_id', 'sets_url']]
sets_table_data.columns = ['name', 'year', 'theme_id', 'img_url']
sets_table_data = sets_table_data.rename_axis('set_num').reset_index()

# Save the Sets table data to a new CSV file
sets_table_data.to_csv('sets.csv', index=False)
sets_table_data

Unnamed: 0,set_num,name,year,theme_id,img_url
0,0,London,2016,1,https://cdn.rebrickable.com/media/sets/21034-1...
1,1,London,2016,1,https://cdn.rebrickable.com/media/sets/21034-1...
2,2,London,2016,1,https://cdn.rebrickable.com/media/sets/21034-1...
3,3,London,2016,1,https://cdn.rebrickable.com/media/sets/21034-1...
4,4,London,2016,1,https://cdn.rebrickable.com/media/sets/21034-1...
...,...,...,...,...,...
2329503,2329503,High-speed Chase,2017,32,https://cdn.rebrickable.com/media/sets/60138-1...
2329504,2329504,High-speed Chase,2017,32,https://cdn.rebrickable.com/media/sets/60138-1...
2329505,2329505,High-speed Chase,2017,32,https://cdn.rebrickable.com/media/sets/60138-1...
2329506,2329506,High-speed Chase,2017,32,https://cdn.rebrickable.com/media/sets/60138-1...


### Create a csv for the part categories

In [11]:
# Extract unique part categories from the 'part_category' column
part_categories = data['part_category'].unique()

# Create a DataFrame for the PartCategories table
part_categories_df = pd.DataFrame({'name': part_categories})

# Add an 'id' column for PartCategories with auto-incremented values
part_categories_df['id'] = range(1, len(part_categories_df) + 1)

# Reorder the columns to match the schema
part_categories_df = part_categories_df[['id', 'name']]

# Save the PartCategories table data to a new CSV file
part_categories_df.to_csv('part_categories.csv', index=False)
part_categories_df

Unnamed: 0,id,name
0,1,Plates
1,2,Tiles
2,3,Plates Special
3,4,Bricks Curved
4,5,Tiles Special
...,...,...
56,57,HO Scale
57,58,Rock
58,59,Mechanical
59,60,Minidoll Lower Body


### Create a csv for the parts

In [12]:
# Extracting required columns for the Parts table
parts_data = data[['part_name', 'part_category', 'part_material', 'rgb']]

# Assume you have a CSV with PartCategories table data containing columns: id and name
part_categories_data = pd.read_csv('part_categories.csv')

# Merging part_categories_data with parts_data to get part_cat_id
merged_data = parts_data.merge(part_categories_data, left_on='part_category', right_on='name', how='left')

# Selecting relevant columns for the Parts table
parts_table_data = merged_data[['part_name', 'id', 'part_material']]
parts_table_data.columns = ['name', 'part_cat_id', 'part_material']

# Generate part_num based on index
parts_table_data['part_num'] = parts_table_data.index.astype(str)

# Reordering columns to match the schema
parts_table_data = parts_table_data[['part_num', 'name', 'part_cat_id', 'part_material']] 

# Extract unique themes from the 'theme_name' column
themes = data['theme_name'].unique()

# Create a DataFrame for the Themes table with 'id' and 'name' columns
themes_df = pd.DataFrame({'theme_id': range(1, len(themes) + 1), 'name': themes})

# Merge themes_df with parts_table_data based on 'name' (theme_name) to add theme_id column
parts_table_data = parts_table_data.merge(themes_df, left_on='name', right_on='name', how='left')

# # Save the Parts table data to a new CSV file
parts_table_data.to_csv('parts.csv', index=False)
parts_table_data


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  parts_table_data['part_num'] = parts_table_data.index.astype(str)


Unnamed: 0,part_num,name,part_cat_id,part_material,theme_id
0,0,Plate 4 x 8,1,Plastic,
1,1,Plate 4 x 8,1,Plastic,
2,2,Plate 4 x 8,1,Plastic,
3,3,Plate 4 x 8,1,Plastic,
4,4,Plate 4 x 8,1,Plastic,
...,...,...,...,...,...
2329503,2329503,Headwear Accessory Visor For Standard Helmet,16,Plastic,
2329504,2329504,Headwear Accessory Visor For Standard Helmet,16,Plastic,
2329505,2329505,Headwear Accessory Visor For Standard Helmet,16,Plastic,
2329506,2329506,Headwear Accessory Visor For Standard Helmet,16,Plastic,


### Create a csv for the colors

In [13]:
# Extracting required columns for the Colors table
colors_data = data[['part_color', 'rgb', 'is_transparent?', 'theme_name']]

# Assume you have a CSV with Themes table data containing columns: id and name
themes_data = pd.read_csv('themes.csv')

# Merging themes_data with colors_data to get theme_id
merged_data = colors_data.merge(themes_data, left_on='theme_name', right_on='name', how='left')

# Add an 'id' column for Colors with unique identifiers
merged_data['id'] = range(1, len(merged_data) + 1)

# Selecting relevant columns for the Colors table
colors_table_data = merged_data[['id', 'part_color', 'rgb', 'is_transparent?', 'theme_id']]
colors_table_data.columns = ['id', 'name', 'rgb', 'is_transparent', 'theme_id']

# Save the updated Colors table data to a new CSV file
colors_table_data.to_csv('colors.csv', index=False)
colors_table_data

Unnamed: 0,id,name,rgb,is_transparent,theme_id
0,1,Dark Blue,0A3463,False,1
1,2,Trans-Dark Blue,0020A0,True,1
2,3,Bright Green,4B9F4A,False,1
3,4,Bright Light Blue,9FC3E9,False,1
4,5,Dark Brown,352100,False,1
...,...,...,...,...,...
2329503,2329504,Tan,E4CD9E,False,32
2329504,2329505,White,FFFFFF,False,32
2329505,2329506,Trans-Orange,F08F1C,True,32
2329506,2329507,Brown,583927,False,32


### Create a csv for the sets details

In [14]:
import csv

# File paths
original_file_path = 'clean_lego.csv'
sets_table_path = 'sets.csv'
set_details_output_path = 'set_details.csv'

# Store unique set_num values
unique_set_nums = set()

# Open Sets table data for merging
with open(sets_table_path, 'r') as sets_file:
    sets_reader = csv.DictReader(sets_file)
    sets_data = {row['name']: row['set_num'] for row in sets_reader}

# Open original file for reading and SetDetails output file for writing
with open(original_file_path, 'r') as original_file, open(set_details_output_path, 'w', newline='') as set_details_output_file:
    original_reader = csv.DictReader(original_file)
    set_details_writer = csv.writer(set_details_output_file)

    # Write header for SetDetails table
    set_details_writer.writerow(['set_num', 'set_price', 'number_of_reviews', 'star_rating'])

    # Process rows from the original file
    for row in original_reader:
        set_name = row['sets_name']
        set_num = sets_data.get(set_name, '')  # Get corresponding set_num from Sets data

        # Ensure set_num is unique before writing to the file
        if set_num and set_num not in unique_set_nums:
            unique_set_nums.add(set_num)

            # Write SetDetails data to output file
            set_details_writer.writerow([set_num, row['set_price'], row['number_of_reviews'], row['star_rating']])
        
sets = pd.read_csv('set_details.csv')
sets

Unnamed: 0,set_num,set_price,number_of_reviews,star_rating
0,1974807,53.36,23.000000,4.700000
1,2116846,14.51,1.000000,4.000000
2,2009200,30.59,2.000000,4.500000
3,2083275,12.74,6.000000,4.500000
4,1854186,224.40,35.380952,4.400000
...,...,...,...,...
522,1646252,7.89,24.000000,3.600000
523,2235233,3.55,8.000000,3.400000
524,1835018,29.99,1.000000,5.000000
525,1852933,41.92,16.826238,4.514134


# Read the data from the second dataset

In [15]:
# Read the entire CSV file into memory
df = pd.read_csv('lego_set_clean.csv')
df.columns

Index(['ages', 'list_price', 'num_reviews', 'piece_count', 'play_star_rating',
       'prod_desc', 'prod_id', 'prod_long_desc', 'review_difficulty',
       'set_name', 'star_rating', 'theme_name', 'val_star_rating', 'country',
       'collaboration'],
      dtype='object')

# Separate data into different DataFrames based on tables

### Create a csv for the lego themes

In [16]:
# Extract unique theme names
themes_data = df['theme_name'].unique()

# Create a mapping of theme_name to theme_id
theme_id_mapping = {theme_name: idx + 1 for idx, theme_name in enumerate(themes_data)}

# Add a new 'theme_id' column using the mapping
df['theme_id'] = df['theme_name'].map(theme_id_mapping)

# Extract 'theme_id' and 'theme_name' columns
themes_data = df[['theme_id', 'theme_name']].copy()

# Drop duplicate rows to ensure uniqueness in the extracted data
themes_data.drop_duplicates(inplace=True)

# Save the Themes DataFrame to a CSV file
themes_data.to_csv('LegoThemes.csv', index=False)
themes_data

Unnamed: 0,theme_id,theme_name
0,1,Angry Birds™
3,2,Architecture
13,3,BOOST
14,4,BrickHeadz
46,5,City
111,6,Classic
126,7,Creator 3-in-1
152,8,Creator Expert
168,9,THE LEGO® BATMAN MOVIE
169,10,DC Comics™ Super Heroes


### Create a csv for the lego sets

In [17]:
# Assuming list_price is a column in the original DataFrame 'df'
sets_data = df[['set_name', 'theme_name', 'piece_count', 'play_star_rating', 'val_star_rating', 'list_price']]

# Clean theme_name column in sets_data
sets_data['theme_name'] = sets_data['theme_name'].str.strip()
themes_data['theme_name'] = themes_data['theme_name'].str.strip()

# Recreate mapping dictionary after cleaning data
theme_mapping = dict(zip(themes_data['theme_name'], themes_data['theme_id']))

# Map theme_id based on cleaned theme_name in sets_data
sets_data['theme_id'] = sets_data['theme_name'].map(theme_mapping)

# Drop theme_name column
sets_data.drop(columns=['theme_name'], inplace=True)

# Create set_id column based on index
sets_data['set_id'] = sets_data.index + 1

# Reorder columns
sets_data = sets_data[['set_id', 'set_name', 'theme_id', 'piece_count', 'play_star_rating', 'val_star_rating', 'list_price']]

# Assuming list_price is a column in the sets_data DataFrame
sets_data['list_price'] = sets_data['list_price'].round(2)

sets_data.to_csv('LegoSets.csv', index=False)
sets_data

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sets_data['theme_name'] = sets_data['theme_name'].str.strip()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sets_data['theme_id'] = sets_data['theme_name'].map(theme_mapping)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sets_data.drop(columns=['theme_name'], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .lo

Unnamed: 0,set_id,set_name,theme_id,piece_count,play_star_rating,val_star_rating,list_price
0,1,Bird Island Egg Heist,1,277.0,4.0,4.0,29.99
1,2,Piggy Plane Attack,1,168.0,4.0,4.0,19.99
2,3,Piggy Car Escape,1,74.0,4.3,4.1,12.99
3,4,United States Capitol Building,2,1032.0,3.6,4.3,99.99
4,5,Solomon R. Guggenheim Museum®,2,744.0,3.2,4.1,79.99
...,...,...,...,...,...,...,...
8832,8833,The Riddler™ Riddle Racer,9,254.0,4.2,4.4,42.69
8833,8834,Scarecrow™ Special Delivery,9,204.0,4.1,4.1,30.49
8834,8835,Mr. Freeze™ Ice Attack,9,201.0,4.1,4.4,30.49
8835,8836,Scarecrow™ Fearful Face-off,9,141.0,4.0,4.4,18.29


### Create a csv for the lego products

In [18]:
products_data = df[['prod_id', 'prod_desc', 'prod_long_desc', 'collaboration', 'country']]
products_data.to_csv('LegoProducts.csv', index=False)
products_data

Unnamed: 0,prod_id,prod_desc,prod_long_desc,collaboration,country
0,75823.0,Catapult into action and take back the eggs fr...,Use the staircase catapult to launch Red into ...,Collaboration,US
1,75822.0,Launch a flying attack and rescue the eggs fro...,Pilot Pig has taken off from Bird Island with ...,Collaboration,US
2,75821.0,Chase the piggy with lightning-fast Chuck and ...,Pitch speedy bird Chuck against the Piggy Car....,Collaboration,US
3,21030.0,Explore the architecture of the United States ...,Discover the architectural secrets of the icon...,Regular Theme,US
4,21035.0,Recreate the Solomon R. Guggenheim Museum® wit...,Discover the architectural secrets of Frank Ll...,Regular Theme,US
...,...,...,...,...,...
8832,70903.0,Battle with Batman™ against The Riddler™ and h...,Take on four Super-Villains at once in THE LEG...,Collaboration,PT
8833,70910.0,Help Batman™ to stop Scarecrow™ delivering a s...,Save Gotham City Energy Facility from a fear g...,Collaboration,PT
8834,70901.0,Protect the power plant from Mr. Freeze’s ice ...,Help Batman™ defend the Gotham City Energy Fac...,Collaboration,PT
8835,70913.0,Power into the sky to stop Scarecrow’s fear ga...,Team up with Batman™ to stop Scarecrow™ spread...,Collaboration,PT


### Create a csv for the lego reviews

In [19]:
reviews_data = df[['set_name', 'prod_id', 'num_reviews', 'star_rating', 'review_difficulty', 'prod_desc', 'prod_long_desc']]

# Select columns for LegoReview table
lego_reviews_data = reviews_data[[
    'prod_id', 'set_name', 'num_reviews', 'star_rating',
    'review_difficulty', 'prod_desc', 'prod_long_desc'
]].copy()

# Create review_id column based on index
lego_reviews_data['review_id'] = lego_reviews_data.index + 1

# Reorder columns
lego_reviews_data = lego_reviews_data[[
    'review_id', 'set_name', 'prod_id', 'num_reviews',
    'star_rating', 'review_difficulty', 'prod_desc', 'prod_long_desc'
]]

# Merge lego_reviews_data with sets_data to get set_id based on set_name
lego_reviews_data = pd.merge(lego_reviews_data, sets_data[['set_id', 'set_name']], on='set_name', how='left')

# Drop the original set_name column
lego_reviews_data.drop(columns=['set_name'], inplace=True)

# Reorder columns
lego_reviews_data = lego_reviews_data[[
    'review_id', 'set_id', 'prod_id', 'num_reviews',
    'star_rating', 'review_difficulty', 'prod_desc', 'prod_long_desc'
]]

# Keep only the first occurrence of each unique review_id
lego_reviews_data.drop_duplicates(subset='review_id', keep='first', inplace=True)

# Save lego_reviews_data to a new CSV file
lego_reviews_data.to_csv('LegoReviews.csv', index=False)
lego_reviews_data

Unnamed: 0,review_id,set_id,prod_id,num_reviews,star_rating,review_difficulty,prod_desc,prod_long_desc
0,1,1,75823.0,2.0,4.5,Average,Catapult into action and take back the eggs fr...,Use the staircase catapult to launch Red into ...
2,2,2,75822.0,2.0,5.0,Easy,Launch a flying attack and rescue the eggs fro...,Pilot Pig has taken off from Bird Island with ...
4,3,3,75821.0,11.0,4.3,Easy,Chase the piggy with lightning-fast Chuck and ...,Pitch speedy bird Chuck against the Piggy Car....
6,4,4,21030.0,23.0,4.6,Average,Explore the architecture of the United States ...,Discover the architectural secrets of the icon...
27,5,5,21035.0,14.0,4.6,Challenging,Recreate the Solomon R. Guggenheim Museum® wit...,Discover the architectural secrets of Frank Ll...
...,...,...,...,...,...,...,...,...
176859,8833,589,70903.0,18.0,4.2,Easy,Battle with Batman™ against The Riddler™ and h...,Take on four Super-Villains at once in THE LEG...
176879,8834,590,70910.0,8.0,4.4,Average,Help Batman™ to stop Scarecrow™ delivering a s...,Save Gotham City Energy Facility from a fear g...
176900,8835,591,70901.0,19.0,4.6,Easy,Protect the power plant from Mr. Freeze’s ice ...,Help Batman™ defend the Gotham City Energy Fac...
176921,8836,592,70913.0,7.0,4.1,Easy,Power into the sky to stop Scarecrow’s fear ga...,Team up with Batman™ to stop Scarecrow™ spread...


### Create a csv for the lego ages

In [20]:
# Select columns for LegoAge table
ages_data = df[['prod_id', 'ages']].copy()

# Extract unique age ranges
unique_age_ranges = ages_data['ages'].unique()

# Generate unique age_id values
age_id_mapping = {age_range: idx + 1 for idx, age_range in enumerate(unique_age_ranges)}

# Map age_range to unique age_id
ages_data['age_id'] = ages_data['ages'].map(age_id_mapping)

# Reorder columns and select necessary columns for LegoAge table
ages_data = ages_data[['age_id', 'prod_id', 'ages']].copy()

# Rename 'ages' column to 'age_range'
ages_data.rename(columns={'ages': 'age_range'}, inplace=True)

# Drop duplicates to get unique age_id entries
ages_data = ages_data.drop_duplicates(subset=['age_id'])

ages_data.to_csv('LegoAges.csv', index=False)
ages_data

Unnamed: 0,age_id,prod_id,age_range
0,1,75823.0,6-12
3,2,21030.0,12+
13,3,17101.0,7-12
14,4,41597.0,10+
48,5,60162.0,8-12
55,6,60140.0,5-12
111,7,10698.0,4-99
123,8,630.0,4+
127,9,31070.0,9-12
152,10,10256.0,16+
