In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy

In [3]:
packs_raw = pd.read_csv(r'data/packs.tsv', sep='\t', header=None, names=['pack_name', 'brand_all', 'piece_count_all', 'difficulty_all'])

In [4]:
packs_raw.head()

Unnamed: 0,pack_name,brand_all,piece_count_all,difficulty_all
0,Anne Belle Thompson The Mikado Anne Belle Thom...,Other-Hand-cut,387242,"Average,Average"
1,Anthology Puzzles Alphonse Mucha La Plume Anth...,,320160,"Average,Average"
2,Anthology Puzzles Colorful Cat Anthology Puzzl...,,150170,"Average,Average"
3,Anthology Puzzles Framed American Gothic Antho...,,320300,"Average,Average"
4,Anthology Puzzles Over The Moon Anthology Puzz...,,278177,"Hard,A-Easy"


In [5]:
packs_raw.dtypes

pack_name          object
brand_all          object
piece_count_all    object
difficulty_all     object
dtype: object

In [6]:
packs_raw[packs_raw['pack_name'].isna()]

Unnamed: 0,pack_name,brand_all,piece_count_all,difficulty_all


In [7]:
packs_raw['piece_count_all'].str.split(',', n=1, expand=True)

Unnamed: 0,0,1
0,387,242
1,320,160
2,150,170
3,320,300
4,278,177
...,...,...
904,364,207
905,442,170
906,262,314
907,158,205


Separate rows into

{
    pack_name,
    brand_0,
    brand_1,
    brand_2,
    piece_count_0,
    piece_count_1,
    piece_count_2,
    difficulty_0,
    difficulty_1,
    difficulty_2
}

In [8]:
# Split brands
brands_split = packs_raw['brand_all'].str.split(',', n=1, expand=True).rename(columns={0:'brand_0', 1:'brand_1'})
brands_split.head()

Unnamed: 0,brand_0,brand_1
0,Other-Hand-cut,
1,,
2,,
3,,
4,,


In [9]:
# Split pieces
pieces_split = packs_raw['piece_count_all'].str.split(',', n=1, expand=True).rename(columns={0:'pieces_0', 1:'pieces_1'})
pieces_split.head()

Unnamed: 0,pieces_0,pieces_1
0,387,242
1,320,160
2,150,170
3,320,300
4,278,177


In [10]:
# Split difficulty
diff_split = packs_raw['difficulty_all'].str.split(',', n=1, expand=True).rename(columns={0:'diff_0', 1:'diff_1'})
diff_split.head()

Unnamed: 0,diff_0,diff_1
0,Average,Average
1,Average,Average
2,Average,Average
3,Average,Average
4,Hard,A-Easy


In [11]:
#test = packs_raw['pack_name'].map(lambda n: 1 if (n[-1] == ' ') else 2, na_action='ignore')


In [12]:
packs_split = packs_raw.join([brands_split, pieces_split, diff_split])
packs_split['num_puzzles'] = packs_raw['pack_name'].map(lambda n: 1 if (n[-1] == ' ') else 2, na_action='ignore')
packs_split.head()

Unnamed: 0,pack_name,brand_all,piece_count_all,difficulty_all,brand_0,brand_1,pieces_0,pieces_1,diff_0,diff_1,num_puzzles
0,Anne Belle Thompson The Mikado Anne Belle Thom...,Other-Hand-cut,387242,"Average,Average",Other-Hand-cut,,387,242,Average,Average,2
1,Anthology Puzzles Alphonse Mucha La Plume Anth...,,320160,"Average,Average",,,320,160,Average,Average,2
2,Anthology Puzzles Colorful Cat Anthology Puzzl...,,150170,"Average,Average",,,150,170,Average,Average,2
3,Anthology Puzzles Framed American Gothic Antho...,,320300,"Average,Average",,,320,300,Average,Average,2
4,Anthology Puzzles Over The Moon Anthology Puzz...,,278177,"Hard,A-Easy",,,278,177,Hard,A-Easy,2


In [13]:
packs_split = packs_split.drop(['brand_all', 'piece_count_all', 'difficulty_all'], axis=1)
packs_split

Unnamed: 0,pack_name,brand_0,brand_1,pieces_0,pieces_1,diff_0,diff_1,num_puzzles
0,Anne Belle Thompson The Mikado Anne Belle Thom...,Other-Hand-cut,,387,242,Average,Average,2
1,Anthology Puzzles Alphonse Mucha La Plume Anth...,,,320,160,Average,Average,2
2,Anthology Puzzles Colorful Cat Anthology Puzzl...,,,150,170,Average,Average,2
3,Anthology Puzzles Framed American Gothic Antho...,,,320,300,Average,Average,2
4,Anthology Puzzles Over The Moon Anthology Puzz...,,,278,177,Hard,A-Easy,2
...,...,...,...,...,...,...,...,...
904,Artifact Puzzles Randal Spangler Fireside Fair...,Artifact,,364,207,A-Easy,Average,2
905,Smyth Puzzles Mola Madness TurtleTeasers Peaco...,Other-Hand-cut,,442,170,Really-Hard,A-Easy,2
906,Artifact Puzzles Haeckel Hummingbirds Artifact...,Artifact,,262,314,Average,Hard,2
907,Ecru Puzzles Allen Gilbert Cram Fishermans Cot...,Artifact,Ecru,158,205,Average,Hard,2


In [14]:
pack_puzzles = packs_raw['pack_name']

In [15]:
packs_split[packs_split.pack_name.duplicated(keep=False)]

Unnamed: 0,pack_name,brand_0,brand_1,pieces_0,pieces_1,diff_0,diff_1,num_puzzles


In [16]:
packs_split = packs_split.astype({'pieces_0': 'int64', 'pieces_1': 'int64'}, errors='ignore')

In [17]:
packs_split.head()

Unnamed: 0,pack_name,brand_0,brand_1,pieces_0,pieces_1,diff_0,diff_1,num_puzzles
0,Anne Belle Thompson The Mikado Anne Belle Thom...,Other-Hand-cut,,387,242,Average,Average,2
1,Anthology Puzzles Alphonse Mucha La Plume Anth...,,,320,160,Average,Average,2
2,Anthology Puzzles Colorful Cat Anthology Puzzl...,,,150,170,Average,Average,2
3,Anthology Puzzles Framed American Gothic Antho...,,,320,300,Average,Average,2
4,Anthology Puzzles Over The Moon Anthology Puzz...,,,278,177,Hard,A-Easy,2


In [18]:
packs_split.to_csv(r'data/packs_clean.csv')

In [45]:
# Some rows straight up have nothing, drop these as there is nothing to be done
packs_filtered = packs_split.dropna(subset = ['brand_0', 'brand_1', 'pieces_0', 'pieces_1', 'diff_0', 'diff_1'], how='all')
packs_filtered.head()

Unnamed: 0,pack_name,brand_0,brand_1,pieces_0,pieces_1,diff_0,diff_1,num_puzzles
0,Anne Belle Thompson The Mikado Anne Belle Thom...,Other-Hand-cut,,387,242,Average,Average,2
1,Anthology Puzzles Alphonse Mucha La Plume Anth...,,,320,160,Average,Average,2
2,Anthology Puzzles Colorful Cat Anthology Puzzl...,,,150,170,Average,Average,2
3,Anthology Puzzles Framed American Gothic Antho...,,,320,300,Average,Average,2
4,Anthology Puzzles Over The Moon Anthology Puzz...,,,278,177,Hard,A-Easy,2


In [46]:
packs_filtered[(packs_filtered.num_puzzles == 1) & (packs_filtered.brand_1.notna() | packs_filtered.pieces_1.notna())]

Unnamed: 0,pack_name,brand_0,brand_1,pieces_0,pieces_1,diff_0,diff_1,num_puzzles
45,Artifact Puzzles Roch Urbaniak Floating Town A...,Artifact,,228,216.0,A-Easy,A-Easy,1
77,DaVici Puzzles Awakening DaVici Puzzles,DaVici,,450,280.0,Average,Average,1
99,Davici Puzzles Photo Album Ecru Puzzles Sunny ...,DaVici,Ecru,600,369.0,Average,Average,1
127,Inside Adventure Odilon Redon Butterflies Peac...,Other-Laser-cut,,604,260.0,Average,Average,1
203,Liberty Puzzles Jane Tattersfield Peacock And ...,Liberty,Other-Laser-cut,508,400.0,Average,Average,1
272,Liberty Puzzles Sue Coccia Great Horned Owl Li...,Liberty,,372,510.0,Average,Average,1
376,Palmaris Puzzles Susan Schroder Portal Swift O...,,,470,200.0,Average,Average,1
486,Wentworth Puzzles Fractal Geometry Wentworth P...,Wentworth,,229,504.0,Really-Hard,Average,1
566,Artifact Puzzles Sorolla Valencia Artifact Puz...,Artifact,,150,156.0,A-Easy,Hard,1
606,Nautilus Puzzles Liberty Eugene Delacroix Libe...,Liberty,Nautilus,466,,Hard,Average,1


In [49]:
# Some 1 puzzle packs have a second number for pieces, this seems to be legit, updating to be 2 puzzle packs
packs_filtered.loc[((packs_filtered.num_puzzles == 1) & (packs_filtered.pieces_1.notna())), 'num_puzzles'] = 2

pack_name      Artifact Puzzles Tyukanov Purgatory Artifact P...
brand_0                                                 Artifact
brand_1                                                     None
pieces_0                                                     659
pieces_1                                                      80
diff_0                                                      Hard
diff_1                                                      Hard
num_puzzles                                                    2
Name: 50, dtype: object

In [51]:
packs_filtered[(packs_filtered.pieces_1.isna()) & (packs_filtered.num_puzzles == 2)]

Unnamed: 0,pack_name,brand_0,brand_1,pieces_0,pieces_1,diff_0,diff_1,num_puzzles
9,Artifact Puzzles Bee Eater Artifact Puzzles Ke...,Artifact,,439.0,,Hard,A-Easy,2
33,Artifact Puzzles Kristian Adams Snoozies Artif...,Artifact,,413.0,,Average,Average,2
133,Liberty Puzzle Konstatin Gorbatov A Winters Da...,Ecru,Liberty,527.0,,Hard,Average,2
142,Liberty Puzzles Alphonse Mucha La Danse Libert...,Liberty,,258.0,,Average,Hard,2
243,Liberty Puzzles Muktair Oladoja Lovers Ecru Pu...,Ecru,Liberty,504.0,,Average,Average,2
248,Liberty Puzzles Paris Air France Ecru Puzzles ...,Ecru,Liberty,483.0,,Hard,A-Easy,2
389,Snowflake Puzzles Colorful Twirly-ques Ecru Pu...,Ecru,Other-Hand-cut,500.0,,Average,A-Easy,2
427,Stave Puzzles My Special Tea Wentworth Puzzles...,Stave,Wentworth,75.0,,Really-Hard,A-Easy,2
440,Turtle Teasers The Paradise of the Medicine Bu...,,,158.0,,Average,Average,2
456,Wentworth Puzzles Beautiful Victorian Day Whim...,Wentworth,Other-Laser-cut,450.0,,Average,Average,2


In [59]:
packs_filtered.loc[672]['pack_name']

'Artifact Puzzles Allie Sullberg Lets Dance Artifact Puzzles Kevin Sloan Welcome to the Wilderness'

In [60]:
# Could just drop the 2 puzzle rows that don't have full piece info?
packs_filtered_2 = packs_filtered[~((packs_filtered.num_puzzles == 2) & (packs_filtered.pieces_1.isna()))]

In [64]:
packs_filtered.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 821 entries, 0 to 908
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   pack_name    821 non-null    object
 1   brand_0      792 non-null    object
 2   brand_1      112 non-null    object
 3   pieces_0     817 non-null    object
 4   pieces_1     654 non-null    object
 5   diff_0       821 non-null    object
 6   diff_1       821 non-null    object
 7   num_puzzles  821 non-null    int64 
dtypes: int64(1), object(7)
memory usage: 90.0+ KB


In [63]:
packs_filtered_2

Unnamed: 0,pack_name,brand_0,brand_1,pieces_0,pieces_1,diff_0,diff_1,num_puzzles
0,Anne Belle Thompson The Mikado Anne Belle Thom...,Other-Hand-cut,,387,242,Average,Average,2
1,Anthology Puzzles Alphonse Mucha La Plume Anth...,,,320,160,Average,Average,2
2,Anthology Puzzles Colorful Cat Anthology Puzzl...,,,150,170,Average,Average,2
3,Anthology Puzzles Framed American Gothic Antho...,,,320,300,Average,Average,2
4,Anthology Puzzles Over The Moon Anthology Puzz...,,,278,177,Hard,A-Easy,2
...,...,...,...,...,...,...,...,...
904,Artifact Puzzles Randal Spangler Fireside Fair...,Artifact,,364,207,A-Easy,Average,2
905,Smyth Puzzles Mola Madness TurtleTeasers Peaco...,Other-Hand-cut,,442,170,Really-Hard,A-Easy,2
906,Artifact Puzzles Haeckel Hummingbirds Artifact...,Artifact,,262,314,Average,Hard,2
907,Ecru Puzzles Allen Gilbert Cram Fishermans Cot...,Artifact,Ecru,158,205,Average,Hard,2


In [68]:
packs_filtered_2.to_csv(r'data/packs_cleaned_dropna.csv')