In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy

## Puzzles Data Cleaning


> Part 1. Member Info Data Clean

**1. member_hold_times_and_packs.tsv**
* `memberID` - Unique identifiers for each person
* `holdtime` - number of days the person had the pack
* `puzzlepack` - name of the pack (may have more than 1 puzzle per pack)

In [2]:
# Read the member hold-time records; the TSV has no header row, so names are supplied here
member_columns = ["member", 'hold_time', 'pack_name']
member_df = pd.read_csv(
    "data/member_hold_times_and_packs.tsv",
    sep='\t',
    header=None,
    names=member_columns,
)
member_df.head()

Unnamed: 0,member,hold_time,pack_name
0,member1,2.939411,Artifact Puzzles Justin Hillgrove Word Travels...
1,member1,0.998885,DaVici Puzzles Full Moon Feast DaVici Puzzles ...
2,member1,10.865032,DaVici Puzzles Flying Frigate DaVici Puzzles H...
3,member1,22.083971,Liberty Puzzles Haeckel Hummingbirds Nautilus ...
4,member1,5.077603,DaVici Puzzles Diana Zimens City Of Cats


In [3]:
# Rows without a pack name (kept for inspection; only the last expression of a cell is displayed)
rows_missing_pack_name = member_df[member_df['pack_name'].isna()]

# Missing-value count for every column
member_df.isna().sum()

member       0
hold_time    0
pack_name    0
dtype: int64

- No missing values in the data
- We have information about 675 members and 910 unique puzzle packs

#### Remove outliers

- Hold times $< 0.1$ days are an artifact of an incorrect definition in their DB
- Hold times $> 200$ days could be real but are probably noise

In [4]:
# Boolean masks for the two outlier groups; summing a mask counts its True entries
too_short = member_df['hold_time'] < 0.1
too_long = member_df['hold_time'] > 200

print("Hold times < 0.1 |", int(too_short.sum()))
print("Hold times > 200 |", int(too_long.sum()))
print("Hold times Total |", len(member_df))

Hold times < 0.1 | 100
Hold times > 200 | 114
Hold times Total | 18141


In [5]:
# Keep only hold times inside [0.1, 200] days (both bounds included)
hold_time_in_range = member_df['hold_time'].between(0.1, 200)
member = member_df[hold_time_in_range]
member.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 17927 entries, 0 to 18140
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   member     17927 non-null  object 
 1   hold_time  17927 non-null  float64
 2   pack_name  17927 non-null  object 
dtypes: float64(1), object(2)
memory usage: 560.2+ KB


In [6]:
# Write the filtered member records to disk, leaving the row index out of the file
member_output_path = "data/member_cleaned.csv"
member.to_csv(member_output_path, index=False)

#### Thoughts:

- Without the dates the members got the packs, we can only rely on the overall distribution of user behavior
- Signals such as seasonality and trends in user behavior are lost
- We can't forecast "number of packs in $N$ days" with this info
- From this info we can at best predict "expected hold time for user $X$ and pack $Y$"

----------------------------------------------------

## Packs Data Cleaning

> Part 2. Initial Packs Data Clean


**2. packs.tsv**
* `pack_name` - name of the pack (may have more than 1 puzzle per pack)
* `brand` - brand name of puzzle pack
* `piece_count` - number of pieces in puzzle
* `difficulty_rating` - difficulty rating for puzzle

In [7]:
# Read the packs table; the TSV has no header row, so the column names are supplied here
col_names = [
    'pack_name',
    'brand_all',
    'piece_count_all',
    'difficulty_all',
]
packs_df = pd.read_csv(
    "data/packs_Jan14_better.tsv",
    sep="\t",
    header=None,
    names=col_names,
)
packs_df.head()

Unnamed: 0,pack_name,brand_all,piece_count_all,difficulty_all
0,Anne Belle Thompson The Mikado Anne Belle Thom...,Other-Hand-cut,387242,"Average,Average"
1,Anthology Puzzles Alphonse Mucha La Plume Anth...,,320160,"Average,Average"
2,Anthology Puzzles Colorful Cat Anthology Puzzl...,,150170,"Average,Average"
3,Anthology Puzzles Framed American Gothic Antho...,,320300,"Average,Average"
4,Anthology Puzzles Over The Moon Anthology Puzz...,,278177,"Hard,A-Easy"


In [8]:
# Missing-value count for every column of the packs table
packs_df.isna().sum()

pack_name            0
brand_all          108
piece_count_all     82
difficulty_all      80
dtype: int64

Separate columns into 
- `pack_name`, `brand_0`, `brand_1`, `piece_count_0`, `piece_count_1`, `diff_0`, `diff_1`

In [9]:
def _split_in_two(column, prefix):
    """Split a comma-separated `packs_df` column at its first comma.

    Returns a frame whose columns 0 and 1 are renamed to `<prefix>_0` and `<prefix>_1`.
    """
    halves = packs_df[column].str.split(',', n=1, expand=True)
    return halves.rename(columns={0: prefix + '_0', 1: prefix + '_1'})


# Brands
brands_split = _split_in_two('brand_all', 'brand')

# Piece counts
piece_count_split = _split_in_two('piece_count_all', 'piece_count')

# Difficulty ratings
diff_split = _split_in_two('difficulty_all', 'diff')

Add separated columns to dataframe

In [10]:
def _puzzles_in_pack(name):
    """Return 1 when the pack name ends with a space, otherwise 2."""
    if name[-1] == ' ':
        return 1
    return 2


# Attach the split columns to the packs table and discard the combined originals
packs_split = (
    packs_df
    .join([brands_split, piece_count_split, diff_split])
    .drop(columns=['brand_all', 'piece_count_all', 'difficulty_all'])
)

# Number of puzzles per pack, derived from the pack name; missing names are left as NA
packs_split['num_puzzles'] = packs_df['pack_name'].map(_puzzles_in_pack, na_action='ignore')
packs_split.head()

Unnamed: 0,pack_name,brand_0,brand_1,piece_count_0,piece_count_1,diff_0,diff_1,num_puzzles
0,Anne Belle Thompson The Mikado Anne Belle Thom...,Other-Hand-cut,,387,242,Average,Average,2
1,Anthology Puzzles Alphonse Mucha La Plume Anth...,,,320,160,Average,Average,2
2,Anthology Puzzles Colorful Cat Anthology Puzzl...,,,150,170,Average,Average,2
3,Anthology Puzzles Framed American Gothic Antho...,,,320,300,Average,Average,2
4,Anthology Puzzles Over The Moon Anthology Puzz...,,,278,177,Hard,A-Easy,2


In [11]:
# Rows whose pack name occurs more than once (kept for inspection; not displayed by this cell)
duplicated_pack_rows = packs_split[packs_split['pack_name'].duplicated(keep=False)]

# Write the split packs table to disk; the row index is written too
packs_split.to_csv("data/packs_cleaned.csv")

> Part 3. Remove NA Values

In [12]:
def _rows_missing(column):
    """Return the rows of `packs_split` whose `column` is NA."""
    is_missing = packs_split[column].isna()
    return packs_split[is_missing]


# One frame of incomplete rows per column
na_names = _rows_missing('pack_name')
na_brand_0 = _rows_missing('brand_0')
na_brand_1 = _rows_missing('brand_1')
na_piece_0 = _rows_missing('piece_count_0')
na_piece_1 = _rows_missing('piece_count_1')
na_diff_0 = _rows_missing('diff_0')
na_diff_1 = _rows_missing('diff_1')

# NOTE: both difficulty counts are printed under the same "NA difficulty:" label;
# the first one is diff_0 and the second one is diff_1.
print(
    "NA pack name:", len(na_names),
    "\nNA brand 0:", len(na_brand_0),
    "| NA brand 1:", len(na_brand_1),
    "\nNA piece count 0:", len(na_piece_0),
    "| NA piece count 1:", len(na_piece_1),
    "\nNA difficulty:", len(na_diff_0),
    "| NA difficulty:", len(na_diff_1),
)

NA pack name: 0 
NA brand 0: 108 | NA brand 1: 797 
NA piece count 0: 82 | NA piece count 1: 241 
NA difficulty: 80 | NA difficulty: 82


In [13]:
# Convert piece_count values to integers.
# Both columns contain missing values, which a plain 'int64' cannot hold; with
# errors='ignore' that failure was swallowed and the columns silently stayed as strings.
# The nullable 'Int64' dtype stores whole numbers and keeps the gaps as <NA>.
piece_counts_as_int = {}
for piece_col in ['piece_count_0', 'piece_count_1']:
    parsed = pd.to_numeric(packs_split[piece_col], errors='coerce')
    # Entries that are present but are not whole numbers would be lost or rejected by the cast
    not_numeric = packs_split[piece_col].notna() & parsed.isna()
    not_whole = parsed.notna() & (parsed % 1 != 0)
    if not_numeric.any() or not_whole.any():
        # Best effort, as before: leave the column untouched, but report it instead of hiding it
        print(piece_col + ": not converted,", int((not_numeric | not_whole).sum()),
              "value(s) are not whole numbers")
        continue
    piece_counts_as_int[piece_col] = parsed.astype('Int64')

# assign() builds a new frame holding the converted columns (unconverted ones are kept as they were)
packs_split = packs_split.assign(**piece_counts_as_int)

In [14]:
# Drop the rows in which every one of the detail columns is missing
detail_columns = [
    'brand_0', 'brand_1',
    'piece_count_0', 'piece_count_1',
    'diff_0', 'diff_1',
]
packs_filtered = packs_split.dropna(how='all', subset=detail_columns)

Some single-puzzle packs have two piece counts -- update them to 2-puzzle packs


In [15]:
# Packs counted as 1 puzzle that nevertheless carry a second piece count are treated as
# genuine 2-puzzle packs, so their puzzle count is corrected in place.
counted_as_single = packs_filtered['num_puzzles'] == 1
has_second_piece_count = packs_filtered['piece_count_1'].notna()
packs_filtered.loc[counted_as_single & has_second_piece_count, 'num_puzzles'] = 2

In [16]:
# Drop the 2-puzzle packs that have no second piece count.
# The two conditions are built separately because `&` binds more tightly than `==`:
# written as `num_puzzles == 2 & mask`, the expression compared num_puzzles against
# `2 & mask` instead of combining two boolean masks, so the intended rows were never selected.
is_two_puzzle_pack = packs_filtered['num_puzzles'] == 2
missing_second_piece_count = packs_filtered['piece_count_1'].isna()
packs_filtered_2 = packs_filtered[~(is_two_puzzle_pack & missing_second_piece_count)]


In [17]:
# Write the filtered packs table to disk, leaving the row index out of the file
packs_dropna_path = "data/packs_cleaned_dropna.csv"
packs_filtered_2.to_csv(packs_dropna_path, index=False)