# Bigger Dataset - EDA 3: NextDayKoi & SacramentoKoi

In [1]:
# imports

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# import data

nextdaykoi = pd.read_csv('../data/nextdaykoi_data.csv')
sacramentokoi = pd.read_csv('../data/sacramentokoi_data.csv')

# Data from NextDayKoi - 1948 pictures

In [3]:
# check the info

nextdaykoi.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1948 entries, 0 to 1947
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  1948 non-null   object
 1   link        1948 non-null   object
 2   title       1948 non-null   object
 3   category    1948 non-null   object
 4   price       1948 non-null   object
 5   sku         1910 non-null   object
 6   image       1948 non-null   object
 7   image_url   1948 non-null   object
dtypes: object(8)
memory usage: 121.9+ KB


In [4]:
# check the data

nextdaykoi.head()

Unnamed: 0.1,Unnamed: 0,link,title,category,price,sku,image,image_url
0,02f31ccb,https://nextdaykoi.com/product/3-5-imported-sh...,3.5” Imported Shusui,"['Asagi and Shusui', 'Imported Koi', 'Koi', 'S...",$30.00,HS0505G01515C14,nextdaykoi_data/02f31ccb_3.5”_Imported_Shusui.jpg,https://d2e07cbkdk0gwy.cloudfront.net/wp-conte...
1,bec4495e,https://nextdaykoi.com/product/3-5-imported-ko...,3.5” Imported Kohaku,"['Gosanke', 'Imported Koi', 'Kohaku', 'Koi', '...",$30.00,HS0505G01515C13,nextdaykoi_data/bec4495e_3.5”_Imported_Kohaku.jpg,https://d2e07cbkdk0gwy.cloudfront.net/wp-conte...
2,62543bb1,https://nextdaykoi.com/product/3-5-imported-ar...,3.5” Imported Armor Scaled Hariwake,"['All Hikari Koi', 'Hariwake', 'Hikarimoyo', '...",$30.00,HS0505G01515C11,nextdaykoi_data/62543bb1_3.5”_Imported_Armor_S...,https://d2e07cbkdk0gwy.cloudfront.net/wp-conte...
3,ea7ff9e9,https://nextdaykoi.com/product/3-5-imported-be...,3.5” Imported Doitsu Aka Bekko,"['Aka Bekko', 'Bekko', 'Imported Koi', 'Koi', ...",$30.00,HS0505G01515C10,nextdaykoi_data/ea7ff9e9_3.5”_Imported_Doitsu_...,https://d2e07cbkdk0gwy.cloudfront.net/wp-conte...
4,f6ec77bc,https://nextdaykoi.com/product/3-5-imported-gi...,3.5” Imported Gin Rin Hariwake,"['All Hikari Koi', 'Hariwake', 'Hikarimoyo', '...",$30.00,HS0505G01515C08,nextdaykoi_data/f6ec77bc_3.5”_Imported_Gin_Rin...,https://d2e07cbkdk0gwy.cloudfront.net/wp-conte...


In [7]:
# check the titles vs the categories

for index, row in nextdaykoi.head(30).iterrows():
  print(index)
  print(row['title'])
  print(row['category'])

0
3.5” Imported Shusui
['Asagi and Shusui', 'Imported Koi', 'Koi', 'Shusui', 'Single Fish', 'Single Koi']
1
3.5” Imported Kohaku
['Gosanke', 'Imported Koi', 'Kohaku', 'Koi', 'Single Fish', 'Single Koi']
2
3.5” Imported Armor Scaled Hariwake
['All Hikari Koi', 'Hariwake', 'Hikarimoyo', 'Imported Koi', 'Koi', 'Single Fish', 'Single Koi']
3
3.5” Imported Doitsu Aka Bekko
['Aka Bekko', 'Bekko', 'Imported Koi', 'Koi', 'Single Fish', 'Single Koi']
4
3.5” Imported Gin Rin Hariwake
['All Hikari Koi', 'Hariwake', 'Hikarimoyo', 'Imported Koi', 'Koi', 'Single Fish', 'Single Koi']
5
3” Imported Gin Rin Hariwake
['All Hikari Koi', 'Hariwake', 'Hikarimoyo', 'Imported Koi', 'Koi', 'Single Fish', 'Single Koi']
6
3” Imported Doitsu Gin Shiro Bekko
['Bekko', 'Imported Koi', 'Koi', 'Shiro Bekko', 'Single Fish', 'Single Koi']
7
3” Imported Kikusui
['All Hikari Koi', 'Hikarimoyo', 'Imported Koi', 'Kikusui', 'Koi', 'Single Fish', 'Single Koi']
8
3” Imported Gin Rin Kujaku
['Hikarimoyo', 'Imported Koi', 'Koi

- get the length_inches from the title
- get the gender from the title, if present
- the title has information that is not in the category, so get all words other than 'Imported' and 'Koi'

In [19]:
# create a function to check if the first word is a number, if so, return it in new feature length_inches

def get_length(string):
  if string.split()[0][0].isdigit():
    return string.split()[0]

In [20]:
# apply the function to the whole dataset

nextdaykoi['length_inches'] = nextdaykoi['title'].apply(get_length)
nextdaykoi['length_inches']

0       3.5”
1       3.5”
2       3.5”
3       3.5”
4       3.5”
        ... 
1943    5-6”
1944    None
1945    5-6”
1946    2-3″
1947    2-3″
Name: length_inches, Length: 1948, dtype: object

In [27]:
# check how many are nulls

nextdaykoi['length_inches'].isna().sum()

np.int64(45)

In [31]:
# extract the gender if present

for word in nextdaykoi['title'][20].split():
  if word.lower() in ['male', 'female']:
    print(word)

Male


In [32]:
# create a function to extract the gender into a new column

def get_gender(string):
  for word in string.split():
    if word.lower() in ['male', 'female']:
      return word

In [33]:
# apply the function to the dataset

nextdaykoi['gender'] = nextdaykoi['title'].apply(get_gender)
nextdaykoi['gender']

0       None
1       None
2       None
3       None
4       None
        ... 
1943    None
1944    None
1945    None
1946    None
1947    None
Name: gender, Length: 1948, dtype: object

In [34]:
# check value_counts

nextdaykoi['gender'].value_counts()

gender
Male      133
Female     16
Name: count, dtype: int64

- there is not much data, this feature may need to be dropped later

In [47]:
nextdaykoi['title'][100]

'4.5” Imported Doitsu Blue Kujaku'

In [52]:
# extract other words as tags (words that are not gender-related, "imported", "lot", "of" or "koi" and starts with a alphabet letter)

words_to_ignore = ['imported', 'lot', 'of', 'koi', 'assorted', 'and']

tags_list = []
for word in nextdaykoi['title'][100].split():
  if word.lower() not in words_to_ignore and word[0].isalpha():
    tags_list.append(word)

tags_list

['Doitsu', 'Blue', 'Kujaku']

In [53]:
# create a function to extract the tags

def get_tags_from_title(string):
  words_to_ignore = ['imported', 'lot', 'of', 'koi', 'assorted', 'and']
  tags_list = []
  for word in string.split():
    if word.lower() not in words_to_ignore and word[0].isalpha():
      tags_list.append(word)
  return tags_list

In [54]:
# apply the function to the dataset

nextdaykoi['tags_from_title'] = nextdaykoi['title'].apply(get_tags_from_title)
nextdaykoi['tags_from_title']

0                         [Shusui]
1                         [Kohaku]
2        [Armor, Scaled, Hariwake]
3             [Doitsu, Aka, Bekko]
4             [Gin, Rin, Hariwake]
                   ...            
1943               [Goldfish, Mix]
1944           [AA, Grade, Grade)]
1945    [Apricot, Comet, Goldfish]
1946     [Mixed, Oranda, Goldfish]
1947    [Mixed, Fantail, Goldfish]
Name: tags_from_title, Length: 1948, dtype: object

In [57]:
# join the category feature with the tags_from_title to create the final tags feature
# check the type of the data

type(nextdaykoi['category'][0])

str

In [58]:
# the category is a string that is a list of strings, turn it back into a list of strings

nextdaykoi['category'][0]

"['Asagi and Shusui', 'Imported Koi', 'Koi', 'Shusui', 'Single Fish', 'Single Koi']"

In [62]:
# the category is a string that is a list of strings, turn it back into a list of strings
# strip the double quotes

print(nextdaykoi['category'][0].strip('"'))

['Asagi and Shusui', 'Imported Koi', 'Koi', 'Shusui', 'Single Fish', 'Single Koi']


In [63]:
# strip the brackets

print(nextdaykoi['category'][0].strip('"').strip('[]'))

'Asagi and Shusui', 'Imported Koi', 'Koi', 'Shusui', 'Single Fish', 'Single Koi'


In [64]:
# split by ", "

print(nextdaykoi['category'][0].strip('"').strip('[]').split(", "))

["'Asagi and Shusui'", "'Imported Koi'", "'Koi'", "'Shusui'", "'Single Fish'", "'Single Koi'"]


In [66]:
# strip the single quotes

print([item.strip("'") for item in nextdaykoi['category'][0].strip('"').strip('[]').split(", ")])

['Asagi and Shusui', 'Imported Koi', 'Koi', 'Shusui', 'Single Fish', 'Single Koi']


In [67]:
# create a function to turn the string of list of strings into a list of strings

def string_to_list(string):
  return [item.strip("'") for item in string.strip('"').strip('[]').split(", ")]

In [68]:
# apply the function to the dataset

nextdaykoi['tags_from_category'] = nextdaykoi['category'].apply(string_to_list)
nextdaykoi['tags_from_category']

0       [Asagi and Shusui, Imported Koi, Koi, Shusui, ...
1       [Gosanke, Imported Koi, Kohaku, Koi, Single Fi...
2       [All Hikari Koi, Hariwake, Hikarimoyo, Importe...
3       [Aka Bekko, Bekko, Imported Koi, Koi, Single F...
4       [All Hikari Koi, Hariwake, Hikarimoyo, Importe...
                              ...                        
1943    [Goldfish Pond Packs, Pond Pack Koi, Butterfly...
1944    [Koi Pond Packs, Pond Pack Koi, Butterfly Koi ...
1945    [Goldfish Pond Packs, Pond Pack Koi, Butterfly...
1946    [Goldfish Pond Packs, Pond Pack Koi, Butterfly...
1947    [Goldfish Pond Packs, Pond Pack Koi, Butterfly...
Name: tags_from_category, Length: 1948, dtype: object

In [69]:
# double check the type

type(nextdaykoi['tags_from_category'][0])

list

In [70]:
# combine the two lists of tags into a final tags column

nextdaykoi['tags'] = nextdaykoi['tags_from_title'] = nextdaykoi['tags_from_category']
nextdaykoi['tags']

0       [Asagi and Shusui, Imported Koi, Koi, Shusui, ...
1       [Gosanke, Imported Koi, Kohaku, Koi, Single Fi...
2       [All Hikari Koi, Hariwake, Hikarimoyo, Importe...
3       [Aka Bekko, Bekko, Imported Koi, Koi, Single F...
4       [All Hikari Koi, Hariwake, Hikarimoyo, Importe...
                              ...                        
1943    [Goldfish Pond Packs, Pond Pack Koi, Butterfly...
1944    [Koi Pond Packs, Pond Pack Koi, Butterfly Koi ...
1945    [Goldfish Pond Packs, Pond Pack Koi, Butterfly...
1946    [Goldfish Pond Packs, Pond Pack Koi, Butterfly...
1947    [Goldfish Pond Packs, Pond Pack Koi, Butterfly...
Name: tags, Length: 1948, dtype: object

In [72]:
for _, row in nextdaykoi.iterrows():
  print(row['tags'])

['Asagi and Shusui', 'Imported Koi', 'Koi', 'Shusui', 'Single Fish', 'Single Koi']
['Gosanke', 'Imported Koi', 'Kohaku', 'Koi', 'Single Fish', 'Single Koi']
['All Hikari Koi', 'Hariwake', 'Hikarimoyo', 'Imported Koi', 'Koi', 'Single Fish', 'Single Koi']
['Aka Bekko', 'Bekko', 'Imported Koi', 'Koi', 'Single Fish', 'Single Koi']
['All Hikari Koi', 'Hariwake', 'Hikarimoyo', 'Imported Koi', 'Koi', 'Single Fish', 'Single Koi']
['All Hikari Koi', 'Hariwake', 'Hikarimoyo', 'Imported Koi', 'Koi', 'Single Fish', 'Single Koi']
['Bekko', 'Imported Koi', 'Koi', 'Shiro Bekko', 'Single Fish', 'Single Koi']
['All Hikari Koi', 'Hikarimoyo', 'Imported Koi', 'Kikusui', 'Koi', 'Single Fish', 'Single Koi']
['Hikarimoyo', 'Imported Koi', 'Koi', 'Kujaku', 'Single Fish', 'Single Koi']
['EFP Goldfish Lots', 'Goldfish For Sale', 'Imported Goldfish', 'Oranda']
['EFP Goldfish Lots', 'Goldfish For Sale', 'Imported Goldfish', 'Oranda']
['EFP Goldfish Lots', 'Goldfish For Sale', 'Imported Goldfish', 'Oranda']
['EFP

In [75]:
# rename 'Unnamed: 0' column to 'uuid

nextdaykoi = nextdaykoi.rename(columns={'Unnamed: 0': 'uuid'})
nextdaykoi.head()

Unnamed: 0,uuid,link,title,category,price,sku,image,image_url,length_inches,gender,tags_from_title,tags_from_category,tags
0,02f31ccb,https://nextdaykoi.com/product/3-5-imported-sh...,3.5” Imported Shusui,"['Asagi and Shusui', 'Imported Koi', 'Koi', 'S...",$30.00,HS0505G01515C14,nextdaykoi_data/02f31ccb_3.5”_Imported_Shusui.jpg,https://d2e07cbkdk0gwy.cloudfront.net/wp-conte...,3.5”,,"[Asagi and Shusui, Imported Koi, Koi, Shusui, ...","[Asagi and Shusui, Imported Koi, Koi, Shusui, ...","[Asagi and Shusui, Imported Koi, Koi, Shusui, ..."
1,bec4495e,https://nextdaykoi.com/product/3-5-imported-ko...,3.5” Imported Kohaku,"['Gosanke', 'Imported Koi', 'Kohaku', 'Koi', '...",$30.00,HS0505G01515C13,nextdaykoi_data/bec4495e_3.5”_Imported_Kohaku.jpg,https://d2e07cbkdk0gwy.cloudfront.net/wp-conte...,3.5”,,"[Gosanke, Imported Koi, Kohaku, Koi, Single Fi...","[Gosanke, Imported Koi, Kohaku, Koi, Single Fi...","[Gosanke, Imported Koi, Kohaku, Koi, Single Fi..."
2,62543bb1,https://nextdaykoi.com/product/3-5-imported-ar...,3.5” Imported Armor Scaled Hariwake,"['All Hikari Koi', 'Hariwake', 'Hikarimoyo', '...",$30.00,HS0505G01515C11,nextdaykoi_data/62543bb1_3.5”_Imported_Armor_S...,https://d2e07cbkdk0gwy.cloudfront.net/wp-conte...,3.5”,,"[All Hikari Koi, Hariwake, Hikarimoyo, Importe...","[All Hikari Koi, Hariwake, Hikarimoyo, Importe...","[All Hikari Koi, Hariwake, Hikarimoyo, Importe..."
3,ea7ff9e9,https://nextdaykoi.com/product/3-5-imported-be...,3.5” Imported Doitsu Aka Bekko,"['Aka Bekko', 'Bekko', 'Imported Koi', 'Koi', ...",$30.00,HS0505G01515C10,nextdaykoi_data/ea7ff9e9_3.5”_Imported_Doitsu_...,https://d2e07cbkdk0gwy.cloudfront.net/wp-conte...,3.5”,,"[Aka Bekko, Bekko, Imported Koi, Koi, Single F...","[Aka Bekko, Bekko, Imported Koi, Koi, Single F...","[Aka Bekko, Bekko, Imported Koi, Koi, Single F..."
4,f6ec77bc,https://nextdaykoi.com/product/3-5-imported-gi...,3.5” Imported Gin Rin Hariwake,"['All Hikari Koi', 'Hariwake', 'Hikarimoyo', '...",$30.00,HS0505G01515C08,nextdaykoi_data/f6ec77bc_3.5”_Imported_Gin_Rin...,https://d2e07cbkdk0gwy.cloudfront.net/wp-conte...,3.5”,,"[All Hikari Koi, Hariwake, Hikarimoyo, Importe...","[All Hikari Koi, Hariwake, Hikarimoyo, Importe...","[All Hikari Koi, Hariwake, Hikarimoyo, Importe..."


In [76]:
# export the dataframe to csv

# nextdaykoi.to_csv('nextdaykoi_df.csv', index=False)

# Data from Sacramentokoi - 423 pictures

In [5]:
# check the info

sacramentokoi.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 423 entries, 0 to 422
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Unnamed: 0    423 non-null    object 
 1   link          423 non-null    object 
 2   title         423 non-null    object 
 3   category      423 non-null    object 
 4   out_of_stock  423 non-null    bool   
 5   price         423 non-null    object 
 6   sku           422 non-null    float64
 7   image         423 non-null    object 
 8   image_url     423 non-null    object 
dtypes: bool(1), float64(1), object(7)
memory usage: 27.0+ KB


In [6]:
# check the data

sacramentokoi.head()

Unnamed: 0.1,Unnamed: 0,link,title,category,out_of_stock,price,sku,image,image_url
0,cd3cf5,https://sacramentokoi.com/ai-goromo-10-240054222/,Ai Goromo 10″ – 240054222,Goromo,True,$150.00,240054222.0,sacramentokoi_data/cd3cf5_Ai_Goromo_10″_–_2400...,https://sacramentokoi.com/wp-content/uploads/2...
1,617c1b,https://sacramentokoi.com/ai-goromo-20-240389171/,Ai Goromo 20″ – 240389171,Female,True,"$1,200.00",240389171.0,sacramentokoi_data/617c1b_Ai_Goromo_20″_–_2403...,https://sacramentokoi.com/wp-content/uploads/2...
2,5027b5,https://sacramentokoi.com/ai-goromo-23-240396257/,Ai Goromo 23″ – 240396257,Female,True,"$1,200.00",240396257.0,sacramentokoi_data/5027b5_Ai_Goromo_23″_–_2403...,https://sacramentokoi.com/wp-content/uploads/2...
3,ebda82,https://sacramentokoi.com/ai-goromo-24-231616208/,Ai Goromo 27″ – 231616208,Female,True,"$4,200.00",231616208.0,sacramentokoi_data/ebda82_Ai_Goromo_27″_–_2316...,https://sacramentokoi.com/wp-content/uploads/2...
4,10edc1,https://sacramentokoi.com/aka-matsuba-11-24005...,Aka Matsuba 11″ – 240054184,Koi,True,$175.00,240054184.0,sacramentokoi_data/10edc1_Aka_Matsuba_11″_–_24...,https://sacramentokoi.com/wp-content/uploads/2...


In [77]:
# check the category types

sacramentokoi['category'].value_counts()

category
Female              156
Koi                  65
Doitsu               45
Gin Rin              44
Asagi                21
Aokiya               20
Chagoi               11
Benigoi              11
Aragoke              10
Goshiki               7
Ikarashi Ozumi        6
Kohaku                5
Ginga                 4
Kaneko                3
Kikokuryu             3
Ikarashi Toshi        2
Kigoi                 2
Fukasawa              2
Karasu                1
Goromo                1
Gin Rin Metallic      1
Doitsu Metallic       1
Aragoke Metallic      1
Bekko                 1
Name: count, dtype: int64

- there is Female label but no Male label

In [78]:
# check more rows of the title

sacramentokoi['title']

0          Ai Goromo 10″ – 240054222
1          Ai Goromo 20″ – 240389171
2          Ai Goromo 23″ – 240396257
3          Ai Goromo 27″ – 231616208
4        Aka Matsuba 11″ – 240054184
                   ...              
418    Yellow Dragon 17″ – 240154405
419     Yellow Ginga 14″ – 240054090
420     Yellow Ginga 19″ – 240489285
421     Yellow Ginga 20″ – 240290281
422                    Koi Pre-Order
Name: title, Length: 423, dtype: object