In [68]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [69]:
# Read cleaned data
packs_team = pd.read_csv(r'puzzle-scheduling/EDA/packs_df.csv', index_col=0)
packs_mine = pd.read_csv(r'data/packs_clean.csv', index_col=0)
members = pd.read_csv(r'puzzle-scheduling/EDA/member_df.csv', index_col=0)

In [70]:
packs_team.head()

Unnamed: 0,pack_name,brand,piece_count,difficulty_rating
0,Anne Belle Thompson The Mikado Anne Belle Thom...,Other-Hand-cut,387.0,Average
1,Anthology Puzzles Alphonse Mucha La Plume Anth...,,320.0,Average
2,Anthology Puzzles Colorful Cat Anthology Puzzl...,,150.0,Average
3,Anthology Puzzles Framed American Gothic Antho...,,320.0,Average
4,Anthology Puzzles Over The Moon Anthology Puzz...,,278.0,Hard


In [71]:
packs_mine.head()

Unnamed: 0,pack_name,brand_0,brand_1,pieces_0,pieces_1,diff_0,diff_1,num_puzzles
0,Anne Belle Thompson The Mikado Anne Belle Thom...,Other-Hand-cut,,387.0,242.0,Average,Average,2
1,Anthology Puzzles Alphonse Mucha La Plume Anth...,,,320.0,160.0,Average,Average,2
2,Anthology Puzzles Colorful Cat Anthology Puzzl...,,,150.0,170.0,Average,Average,2
3,Anthology Puzzles Framed American Gothic Antho...,,,320.0,300.0,Average,Average,2
4,Anthology Puzzles Over The Moon Anthology Puzz...,,,278.0,177.0,Hard,A-Easy,2


In [72]:
members.head()

Unnamed: 0,memberID,holdtime,puzzlepack,num_puzzles,brand
0,member1,2.939411,Artifact Puzzles Justin Hillgrove Word Travels...,2,Artifact
1,member1,0.998885,DaVici Puzzles Full Moon Feast DaVici Puzzles ...,2,DaVici
2,member1,10.865032,DaVici Puzzles Flying Frigate DaVici Puzzles H...,2,DaVici
3,member1,22.083971,Liberty Puzzles Haeckel Hummingbirds Nautilus ...,2,Liberty
4,member1,5.077603,DaVici Puzzles Diana Zimens City Of Cats,1,DaVici


## Pack Features

Can packs be split?

    1. If packs cannot be split then no point considering the two puzzles as separate, create synthetic features for the pack
    2. Alternatively could break things down to 'puzzle' level, but increases search space possibly needlessly if packs cannot be split
    
    
Per pack
    1. Cummulative pieces (maybe map pieces to difficulty, or have synth weight)
    2. Cummulative difficulty 
    3. Number of puzzles in pack
    4. Brand?

In [73]:
packs_mine[packs_mine['diff_0'] == 'nan']

Unnamed: 0,pack_name,brand_0,brand_1,pieces_0,pieces_1,diff_0,diff_1,num_puzzles


In [74]:
# Create synthetic pack level features combining information about the two possible puzzles in the pack

difficulty_mapping = {'A-Easy': 1, 'Average': 2, 'Hard': 3, 'Really-Hard': 4}

packs_mine['diff_0'] = packs_mine['diff_0'].map(lambda x: difficulty_mapping[x], na_action='ignore')
packs_mine['diff_1'] = packs_mine['diff_1'].map(lambda x: difficulty_mapping[x], na_action='ignore')

In [75]:
packs_mine.head()

Unnamed: 0,pack_name,brand_0,brand_1,pieces_0,pieces_1,diff_0,diff_1,num_puzzles
0,Anne Belle Thompson The Mikado Anne Belle Thom...,Other-Hand-cut,,387.0,242.0,2.0,2.0,2
1,Anthology Puzzles Alphonse Mucha La Plume Anth...,,,320.0,160.0,2.0,2.0,2
2,Anthology Puzzles Colorful Cat Anthology Puzzl...,,,150.0,170.0,2.0,2.0,2
3,Anthology Puzzles Framed American Gothic Antho...,,,320.0,300.0,2.0,2.0,2
4,Anthology Puzzles Over The Moon Anthology Puzz...,,,278.0,177.0,3.0,1.0,2


## These packs have more information than there are puzzles, this is weird

In [76]:
# No piece information
packs_mine[(packs_mine.pieces_1.notna() | packs_mine.diff_1.notna()) & (packs_mine.num_puzzles == 1)]

Unnamed: 0,pack_name,brand_0,brand_1,pieces_0,pieces_1,diff_0,diff_1,num_puzzles
10,Artifact Puzzles Bek Cruddace Europe Map,Artifact,,900.0,,2.0,2.0,1
45,Artifact Puzzles Roch Urbaniak Floating Town A...,Artifact,,228.0,216.0,1.0,1.0,1
46,Artifact Puzzles Sandi Rigby Botanical Dreaming,Artifact,,273.0,,3.0,2.0,1
77,DaVici Puzzles Awakening DaVici Puzzles,DaVici,,450.0,280.0,2.0,2.0,1
90,DaVici Puzzles Under the Red Umbrella,DaVici,,600.0,,3.0,2.0,1
...,...,...,...,...,...,...,...,...
846,Artifact Puzzles Seurat Grande Jatte Artifact ...,Artifact,,428.0,338.0,4.0,3.0,1
864,DaVici Puzzles Diana Zimens City Of Cats,DaVici,,700.0,,2.0,2.0,1
868,Artifact Puzzles Jethro Buck Wild Things,Artifact,,539.0,,3.0,2.0,1
874,Ecru Puzzles William Penhallow Henderson Lucer...,Artifact,Ecru,385.0,411.0,1.0,1.0,1


In [77]:
packs_raw = pd.read_csv('data/packs.tsv', sep='\t', header=None)
packs_raw.head()

Unnamed: 0,0,1,2,3
0,Anne Belle Thompson The Mikado Anne Belle Thom...,Other-Hand-cut,387242,"Average,Average"
1,Anthology Puzzles Alphonse Mucha La Plume Anth...,,320160,"Average,Average"
2,Anthology Puzzles Colorful Cat Anthology Puzzl...,,150170,"Average,Average"
3,Anthology Puzzles Framed American Gothic Antho...,,320300,"Average,Average"
4,Anthology Puzzles Over The Moon Anthology Puzz...,,278177,"Hard,A-Easy"


In [78]:
weirdos = packs_raw[(packs_mine.pieces_0.notna()) & (packs_mine.pieces_1.isna()) & (packs_mine.num_puzzles == 2)]
weirdos.head()
weirdos.to_csv(r'data/two_puzzles_no_second_piece_count.csv', index=False)

In [79]:
# Packs that are missing only one of piece/difficulty for the second puzzle
packs_mine[((packs_mine.pieces_1.notna() & packs_mine.diff_1.isna()) | (packs_mine.pieces_1.isna() & packs_mine.diff_1.notna())) & (packs_mine.num_puzzles == 2)]

Unnamed: 0,pack_name,brand_0,brand_1,pieces_0,pieces_1,diff_0,diff_1,num_puzzles
9,Artifact Puzzles Bee Eater Artifact Puzzles Ke...,Artifact,,439.0,,3.0,1.0,2
33,Artifact Puzzles Kristian Adams Snoozies Artif...,Artifact,,413.0,,2.0,2.0,2
133,Liberty Puzzle Konstatin Gorbatov A Winters Da...,Ecru,Liberty,527.0,,3.0,2.0,2
142,Liberty Puzzles Alphonse Mucha La Danse Libert...,Liberty,,258.0,,2.0,3.0,2
243,Liberty Puzzles Muktair Oladoja Lovers Ecru Pu...,Ecru,Liberty,504.0,,2.0,2.0,2
248,Liberty Puzzles Paris Air France Ecru Puzzles ...,Ecru,Liberty,483.0,,3.0,1.0,2
389,Snowflake Puzzles Colorful Twirly-ques Ecru Pu...,Ecru,Other-Hand-cut,500.0,,2.0,1.0,2
427,Stave Puzzles My Special Tea Wentworth Puzzles...,Stave,Wentworth,75.0,,4.0,1.0,2
440,Turtle Teasers The Paradise of the Medicine Bu...,,,158.0,,2.0,2.0,2
456,Wentworth Puzzles Beautiful Victorian Day Whim...,Wentworth,Other-Laser-cut,450.0,,2.0,2.0,2


In [80]:
# Packs that claim 2 puzzles but have no info on second, but do have info on the first
packs_mine[(packs_mine.pieces_1.isna() & packs_mine.diff_1.isna()) & (packs_mine.num_puzzles == 2) & (packs_mine.pieces_0.notna() | packs_mine.diff_0.notna())]

Unnamed: 0,pack_name,brand_0,brand_1,pieces_0,pieces_1,diff_0,diff_1,num_puzzles


In [81]:
first_puzzle = packs_mine[['pieces_0', 'diff_0']]
second_puzzle = packs_mine[['pieces_1', 'diff_1']].rename({'pieces_1': 'pieces_0', 'diff_1': 'diff_0'}, axis=1)
puzzles_stacked = pd.concat([first_puzzle, second_puzzle], ignore_index=True).dropna()
puzzles_stacked.head()

Unnamed: 0,pieces_0,diff_0
0,387.0,2.0
1,320.0,2.0
2,150.0,2.0
3,320.0,2.0
4,278.0,3.0


In [82]:
single_puzzle_packs = packs_mine[packs_mine.num_puzzles == 1].dropna(subset=['pieces_0', 'diff_0'])
two_puzzle_packs = packs_mine[packs_mine.num_puzzles == 2].dropna(subset=['pieces_0', 'diff_0', 'pieces_1', 'diff_1'])

In [83]:
valid_packs = pd.concat([single_puzzle_packs, two_puzzle_packs])
valid_packs.head()

Unnamed: 0,pack_name,brand_0,brand_1,pieces_0,pieces_1,diff_0,diff_1,num_puzzles
10,Artifact Puzzles Bek Cruddace Europe Map,Artifact,,900.0,,2.0,2.0,1
45,Artifact Puzzles Roch Urbaniak Floating Town A...,Artifact,,228.0,216.0,1.0,1.0,1
46,Artifact Puzzles Sandi Rigby Botanical Dreaming,Artifact,,273.0,,3.0,2.0,1
77,DaVici Puzzles Awakening DaVici Puzzles,DaVici,,450.0,280.0,2.0,2.0,1
90,DaVici Puzzles Under the Red Umbrella,DaVici,,600.0,,3.0,2.0,1


In [84]:
pieces_by_difficulty = []

for i, row in valid_packs.iterrows():
    out = {
        'name': row['pack_name'],
        'pieces_d1': 0,
        'pieces_d2': 0,
        'pieces_d3': 0,
        'pieces_d4': 0,
        'num_puzzles': row['num_puzzles']
    }
    
    out[f'pieces_d{str(int(row["diff_0"]))}'] = row['pieces_0']
    
    if(row['num_puzzles'] == 2):
        out[f'pieces_d{str(int(row["diff_1"]))}'] += row['pieces_1']
        
    pieces_by_difficulty.append(out)


In [85]:
pbyd = pd.DataFrame(pieces_by_difficulty)
pbyd.head()

Unnamed: 0,name,pieces_d1,pieces_d2,pieces_d3,pieces_d4,num_puzzles
0,Artifact Puzzles Bek Cruddace Europe Map,0.0,900.0,0.0,0.0,1
1,Artifact Puzzles Roch Urbaniak Floating Town A...,228.0,0.0,0.0,0.0,1
2,Artifact Puzzles Sandi Rigby Botanical Dreaming,0.0,0.0,273.0,0.0,1
3,DaVici Puzzles Awakening DaVici Puzzles,0.0,450.0,0.0,0.0,1
4,DaVici Puzzles Under the Red Umbrella,0.0,0.0,600.0,0.0,1


In [86]:
valid_packs[valid_packs.brand_1.notna()]

Unnamed: 0,pack_name,brand_0,brand_1,pieces_0,pieces_1,diff_0,diff_1,num_puzzles
99,Davici Puzzles Photo Album Ecru Puzzles Sunny ...,DaVici,Ecru,600.0,369.0,2.0,2.0,1
203,Liberty Puzzles Jane Tattersfield Peacock And ...,Liberty,Other-Laser-cut,508.0,400.0,2.0,2.0,1
606,Nautilus Puzzles Liberty Eugene Delacroix Libe...,Liberty,Nautilus,466.0,,3.0,2.0,1
651,Artifact Puzzles Iwona Lifsches Santa Claus De...,Artifact,Wentworth,158.0,250.0,1.0,3.0,1
874,Ecru Puzzles William Penhallow Henderson Lucer...,Artifact,Ecru,385.0,411.0,1.0,1.0,1
...,...,...,...,...,...,...,...,...
863,Ecru Puzzles William Penhallow Henderson Lucer...,Artifact,Ecru,385.0,482.0,3.0,2.0,2
865,Nikolyaaa Red Boats Wheelgiant Balancing Bould...,Other-Hand-cut,Other-Laser-cut,249.0,425.0,2.0,2.0,2
871,Ecru Puzzles Rachell Sumpter Wild Moor Artifac...,Artifact,Ecru,260.0,338.0,3.0,2.0,2
884,Ecru Puzzles Jonik Sunny November Artifact Puz...,Artifact,Ecru,369.0,311.0,2.0,2.0,2


In [87]:
packs_dropped = pd.read_csv(r'data/packs_cleaned_dropna.csv', index_col=0)
packs_dropped.head()

Unnamed: 0,pack_name,brand_0,brand_1,pieces_0,pieces_1,diff_0,diff_1,num_puzzles
0,Anne Belle Thompson The Mikado Anne Belle Thom...,Other-Hand-cut,,387.0,242.0,Average,Average,2
1,Anthology Puzzles Alphonse Mucha La Plume Anth...,,,320.0,160.0,Average,Average,2
2,Anthology Puzzles Colorful Cat Anthology Puzzl...,,,150.0,170.0,Average,Average,2
3,Anthology Puzzles Framed American Gothic Antho...,,,320.0,300.0,Average,Average,2
4,Anthology Puzzles Over The Moon Anthology Puzz...,,,278.0,177.0,Hard,A-Easy,2


In [88]:
# Create synthetic pack level features combining information about the two possible puzzles in the pack

difficulty_mapping = {'A-Easy': 1, 'Average': 2, 'Hard': 3, 'Really-Hard': 4}

packs_dropped['diff_0'] = packs_dropped['diff_0'].map(lambda x: difficulty_mapping[x], na_action='ignore')
packs_dropped['diff_1'] = packs_dropped['diff_1'].map(lambda x: difficulty_mapping[x], na_action='ignore')

In [89]:
pieces_by_difficulty = []

for i, row in packs_dropped.iterrows():
    out = {
        'name': row['pack_name'],
        'pieces_d1': 0,
        'pieces_d2': 0,
        'pieces_d3': 0,
        'pieces_d4': 0,
        'num_puzzles': row['num_puzzles']
    }
    
    if(row['pieces_0'].notna()]:
        out[f'pieces_d{str(int(row["diff_0"]))}'] += row['pieces_0']
    
    if(row['num_puzzles'] == 2 and row['pieces_1'].notna()):
        out[f'pieces_d{str(int(row["diff_1"]))}'] += row['pieces_1']
        
    pieces_by_difficulty.append(out)

SyntaxError: closing parenthesis ']' does not match opening parenthesis '(' (2778832469.py, line 13)

In [90]:
pieces_df = pd.DataFrame(pieces_by_difficulty)
pieces_df.head()

Unnamed: 0,name,pieces_d1,pieces_d2,pieces_d3,pieces_d4,num_puzzles
0,Artifact Puzzles Bek Cruddace Europe Map,0.0,900.0,0.0,0.0,1
1,Artifact Puzzles Roch Urbaniak Floating Town A...,228.0,0.0,0.0,0.0,1
2,Artifact Puzzles Sandi Rigby Botanical Dreaming,0.0,0.0,273.0,0.0,1
3,DaVici Puzzles Awakening DaVici Puzzles,0.0,450.0,0.0,0.0,1
4,DaVici Puzzles Under the Red Umbrella,0.0,0.0,600.0,0.0,1


In [91]:
pieces_df.to_csv(r'data/pieces_by_puzzle_and_difficulty.csv')

In [92]:
packs_dropped[packs_dropped.pieces_1.isna() | packs_dropped.pieces_0.isna()]

Unnamed: 0,pack_name,brand_0,brand_1,pieces_0,pieces_1,diff_0,diff_1,num_puzzles
10,Artifact Puzzles Bek Cruddace Europe Map,Artifact,,900.0,,2,2,1
46,Artifact Puzzles Sandi Rigby Botanical Dreaming,Artifact,,273.0,,3,2,1
90,DaVici Puzzles Under the Red Umbrella,DaVici,,600.0,,3,2,1
92,DaVici Red Book,DaVici,,700.0,,3,2,1
95,Davici Puzzles Claude Monet Woman in the Garden,DaVici,,552.0,,1,2,1
...,...,...,...,...,...,...,...,...
789,Artifact Puzzles Bokuyo Forest,Artifact,,689.0,,4,2,1
808,Artifact Puzzles Hieronymus Bosch Garden Of Ea...,Artifact,,529.0,,2,2,1
864,DaVici Puzzles Diana Zimens City Of Cats,DaVici,,700.0,,2,2,1
868,Artifact Puzzles Jethro Buck Wild Things,Artifact,,539.0,,3,2,1


In [93]:
pieces_df[pieces_df.isna().any(axis=1)]

Unnamed: 0,name,pieces_d1,pieces_d2,pieces_d3,pieces_d4,num_puzzles
