In [5]:
import pdb
import re

import pandas as pd
from fuzzywuzzy import fuzz

In [4]:
# Load the pepper records from CSV and peek at the last few rows.
peppers_csv_path = "../data/peppers_20180102.csv"
raw_data = pd.read_csv(peppers_csv_path)
raw_data.tail()

Unnamed: 0,name,species,heat,region,origin,min_shu,max_shu,min_jrp,max_jrp,link,source_name
975,Aji Chuncho,,,,,,0,,,https://www.cayennediane.com/the-scoville-scale/,Cayenne Diane
976,Aladdin,,,,,,0,,,https://www.cayennediane.com/the-scoville-scale/,Cayenne Diane
977,Bell Pepper,,,,,,0,,,https://www.cayennediane.com/peppers/sweet-bel...,Cayenne Diane
978,Italian Sweet Pepper,,,,,,0,,,https://www.cayennediane.com/peppers/italian-s...,Cayenne Diane
979,Melrose Pepper,,,,,,0,,,https://www.cayennediane.com/peppers/melrose-p...,Cayenne Diane


# Deduplicating records
- If two records' SHU values match and their names match with a score above 70, they are considered duplicates
- Among duplicates, keep the record with the fewest NaN values

### Standardize names

In [96]:
# Clean a separate (deep) copy so the raw frame is left unmodified.
sanitized_data = raw_data.copy(deep=True)

In [100]:
# remove "Pepper" reference and strip whitespace
def remove_pepper_ref(name):
    """Strip the generic word "Pepper"/"Peppers" from a pepper name.

    Only whole words are removed, so a name that merely starts with those
    letters (e.g. "Pepperoncini") is kept intact. Whitespace left behind by a
    removal is collapsed to single spaces and the result is trimmed.

    Parameters
    ----------
    name : str
        Raw pepper name. A null value (None/NaN, i.e. a missing name) is
        returned unchanged instead of raising.

    Returns
    -------
    str
        The name without the "Pepper"/"Peppers" word and without leading,
        trailing, or repeated whitespace.
    """
    if pd.isnull(name):
        return name
    # \b keeps the match to whole words; a plain substring replace would turn
    # "Pepperoncini Peppers" into "oncini".
    without_ref = re.sub(r"\bPeppers?\b", "", name)
    # split()/join trims the ends and collapses the gap a mid-name removal leaves.
    return " ".join(without_ref.split())

# Run the normalizer over every name on the working copy.
sanitized_data["name"] = sanitized_data["name"].map(remove_pepper_ref)

### Low-hanging fruit: exact name matches

In [102]:
# Count how often each name occurs, ignoring case.
lowercase_names = sanitized_data["name"].str.lower()
pepper_names = lowercase_names.value_counts()

# Fraction of distinct names that appear on more than one record.
repeated_name_count = int((pepper_names > 1).sum())
repeated_name_count / float(len(pepper_names))

0.5084459459459459

In [103]:
# Lowercased names that occur on more than one record.
dup_names = pepper_names[pepper_names > 1].index

# Every record whose lowercased name is one of the repeated names, sorted by name.
has_dup_name = sanitized_data["name"].str.lower().isin(dup_names)
dup_name_records = sanitized_data[has_dup_name].sort_values("name")

# Preview only; the complete frame stays available as `dup_name_records`.
dup_name_records.head(10)

Unnamed: 0,name,species,heat,region,origin,min_shu,max_shu,min_jrp,max_jrp,link,source_name
555,7 Pot Barrackpore,,,,,,1300000,,,https://www.cayennediane.com/peppers/7-pot-bar...,Cayenne Diane
124,7 Pot Barrackpore,chinense,super hot,Central America and the Caribbean,Trinidad,1000000.0,1300000,125.0,520.0,https://www.pepperscale.com/7-pot-barrackpore/,PepperScale
554,7 Pot Brain Strain,,,,,,1350000,,,https://www.cayennediane.com/peppers/7-pot-bra...,Cayenne Diane
123,7 Pot Brain Strain,chinense,super hot,Central America and the Caribbean,Trinidad,1000000.0,1350000,125.0,540.0,https://www.pepperscale.com/7-pot-brain-strain/,PepperScale
565,7 Pot Bubblegum,,,,,,1000000,,,https://www.cayennediane.com/the-scoville-scale/,Cayenne Diane
116,7 Pot Bubblegum,chinense,super hot,Europe,United Kingdom,850000.0,1000000,106.0,400.0,https://www.pepperscale.com/7-pot-bubblegum/,PepperScale
549,7 Pot Douglah,,,,,,1853396,,,https://www.cayennediane.com/peppers/7-pot-dou...,Cayenne Diane
119,7 Pot Douglah,chinense,super hot,Central America and the Caribbean,Trinidad,923889.0,1853986,109.0,742.0,https://www.pepperscale.com/7-pot-douglah/,PepperScale
556,7 Pot Jonah,,,,,,1200000,,,https://www.cayennediane.com/peppers/7-pot-jonah/,Cayenne Diane
115,7 Pot Jonah,chinense,super hot,Central America and the Caribbean,Trinidad,800000.0,1200000,100.0,480.0,https://www.pepperscale.com/7-pot-jonah/,PepperScale


In [105]:
def choose_duplicate(duplicates):
    """Pick the most complete record from a group of duplicate records.

    Parameters
    ----------
    duplicates : pandas.DataFrame
        Rows regarded as the same pepper.

    Returns
    -------
    pandas.Series
        The row with the fewest null values. On a tie, the row that appears
        first in ``duplicates`` wins.

    Raises
    ------
    IndexError
        If ``duplicates`` has no rows.
    """
    if len(duplicates) == 0:
        raise IndexError("choose_duplicate() received no rows to choose from")
    # Vectorized null count per row (replaces a Python-level sum per row).
    nan_count = duplicates.isnull().sum(axis=1)
    # argmin returns the first minimum, so ties resolve deterministically;
    # selecting by position also works when index labels repeat.
    fullest_position = int(nan_count.values.argmin())
    return duplicates.iloc[fullest_position]

# Lowercase the names once, rather than once per duplicated name inside the loop.
lowercase_names = sanitized_data["name"].str.lower()

# For every repeated name, keep the single record selected by choose_duplicate.
prevailing_peppers = pd.DataFrame(
    [choose_duplicate(sanitized_data[lowercase_names == pepper]) for pepper in dup_names]
)

# Preview only; the complete frame stays available as `prevailing_peppers`.
prevailing_peppers.head(10)

Unnamed: 0,name,species,heat,region,origin,min_shu,max_shu,min_jrp,max_jrp,link,source_name
68,Manzano,pubescens,medium,South America,South America,12000.0,30000,2.0,12.0,http://www.pepperscale.com/manzano-pepper,PepperScale
146,Bahamian,,,,,95000.0,110000,,,https://www.chilliworld.com/factfile/scoville-...,ChiliWorld
90,Thai,annuum,extra hot,Asia,Thailand,50000.0,100000,6.0,40.0,http://www.pepperscale.com/thai-peppers,PepperScale
128,Carolina Reaper,chinense,super hot,North America,United States,1400000.0,2200000,175.0,880.0,http://www.pepperscale.com/carolina-reaper,PepperScale
23,Anaheim,annuum,mild,North America,Mexico,500.0,2500,-16.0,0.0,http://www.pepperscale.com/anaheim-pepper,PepperScale
32,Pasilla,annuum,mild,North America,Mexico,1000.0,2500,-8.0,0.0,http://www.pepperscale.com/pasilla-pepper,PepperScale
29,Poblano,annuum,mild,North America,Mexico,1000.0,1500,-8.0,-2.0,http://www.pepperscale.com/poblano-peppers,PepperScale
17,Cubanelle,annuum,mild,Multi-Region,"Italy, Cuba",100.0,1000,-80.0,-3.0,http://www.pepperscale.com/cubanelle-pepper,PepperScale
28,Ancho,annuum,mild,North America,Mexico,1000.0,1500,-8.0,-2.0,http://www.pepperscale.com/ancho-pepper,PepperScale
78,Tabasco,frutescens,medium,Central America and the Caribbean,Costa Rica,30000.0,50000,4.0,20.0,http://www.pepperscale.com/tabasco-pepper,PepperScale


In [109]:
# Records whose lowercased name is not among the repeated names pass through as they are.
is_name_dup = sanitized_data["name"].str.lower().isin(dup_names)
non_name_dups = sanitized_data[~is_name_dup]

# Combine them with the one-per-name survivors, order by name, renumber the rows.
combined_peppers = pd.concat([non_name_dups, prevailing_peppers])
first_pass_peppers = combined_peppers.sort_values("name").reset_index(drop=True)

# Shapes before and after the exact-name pass.
(raw_data.shape, first_pass_peppers.shape)

((980, 11), (592, 11))

### Near-name dups

In [112]:
# Preview the name-sorted frame instead of dumping every row;
# raise the row count to inspect more neighbouring names.
first_pass_peppers.head(10)

Unnamed: 0,name,species,heat,region,origin,min_shu,max_shu,min_jrp,max_jrp,link,source_name
0,7 Pot Barrackpore,chinense,super hot,Central America and the Caribbean,Trinidad,1000000.0,1300000,125.0,520.0,https://www.pepperscale.com/7-pot-barrackpore/,PepperScale
1,7 Pot Brain Strain,chinense,super hot,Central America and the Caribbean,Trinidad,1000000.0,1350000,125.0,540.0,https://www.pepperscale.com/7-pot-brain-strain/,PepperScale
2,7 Pot Bubblegum,chinense,super hot,Europe,United Kingdom,850000.0,1000000,106.0,400.0,https://www.pepperscale.com/7-pot-bubblegum/,PepperScale
3,7 Pot Chaguanas,,,,,,1000000,,,https://www.cayennediane.com/the-scoville-scale/,Cayenne Diane
4,7 Pot Douglah,chinense,super hot,Central America and the Caribbean,Trinidad,923889.0,1853986,109.0,742.0,https://www.pepperscale.com/7-pot-douglah/,PepperScale
5,7 Pot Infinity,,,,,,1176182,,,https://www.cayennediane.com/peppers/7-pot-inf...,Cayenne Diane
6,7 Pot Jonah,chinense,super hot,Central America and the Caribbean,Trinidad,800000.0,1200000,100.0,480.0,https://www.pepperscale.com/7-pot-jonah/,PepperScale
7,7 Pot Madballz,,,,,,1000000,,,https://www.cayennediane.com/the-scoville-scale/,Cayenne Diane
8,7 Pot Primo,chinense,super hot,North America,United States,800000.0,1268250,100.0,507.0,https://www.pepperscale.com/7-pot-primo/,PepperScale
9,7 Pot Rennie,,,,,,1000000,,,https://www.cayennediane.com/the-scoville-scale/,Cayenne Diane


In [None]:
# 200,000 - 350,000 SHU