In [15]:
import pandas as pd
import pdb
from fuzzywuzzy import fuzz
import difflib

In [2]:
raw_data = pd.read_csv("../data/peppers_20180105.csv")
raw_data.tail()

Unnamed: 0,name,species,heat,region,origin,min_shu,max_shu,min_jrp,max_jrp,detail_link,source_link,source_name
1438,Common Pepper Spray,,,,,2000000.0,3000000,,,,http://ushotstuff.com/Heat.Scale.htm,Uncle Steve's Hot Stuff
1439,Police Grade Spray,,,,,,5300000,,,,http://ushotstuff.com/Heat.Scale.htm,Uncle Steve's Hot Stuff
1440,Homodihydrocapsaicin,,,,,,8600000,,,,http://ushotstuff.com/Heat.Scale.htm,Uncle Steve's Hot Stuff
1441,Nordihydrocapsaicin,,,,,,9100000,,,,http://ushotstuff.com/Heat.Scale.htm,Uncle Steve's Hot Stuff
1442,Pure Capsaicin,,,,,15000000.0,16000000,,,,http://ushotstuff.com/Heat.Scale.htm,Uncle Steve's Hot Stuff


# Deduplicating records
- If SHU values and name matches >70, considered duplicate
- Choose records with fewer NaN values

### Standardize names

In [3]:
sanitized_data = raw_data.copy()

In [4]:
# remove "Pepper" reference and strip whitespace
def remove_pepper_ref(name):
    if name != "Pepperoncini":
        name = name.replace("Peppers", "").replace("Pepper", "")
    return name.strip()

sanitized_data["name"] = sanitized_data["name"].apply(remove_pepper_ref)

### Low hanging fruit: pure name matches

In [5]:
pepper_names = sanitized_data["name"].str.lower().value_counts()
len(pepper_names[pepper_names > 1])/float(len(pepper_names))

0.7095238095238096

In [6]:
dup_names = pepper_names[pepper_names > 1].index
dup_name_records = sanitized_data[sanitized_data["name"].str.lower().isin(dup_names)].sort_values("name")
dup_name_records

Unnamed: 0,name,species,heat,region,origin,min_shu,max_shu,min_jrp,max_jrp,detail_link,source_link,source_name
147,7 Pot Barrackpore,,,,,,1300000,,,http://pepperheadsforlife.com/7-pot-barrackpore/,https://pepperheadsforlife.com/the-scoville-sc...,Pepperheads
124,7 Pot Barrackpore,chinense,super hot,Central America and the Caribbean,Trinidad,1000000.0,1300000,125.0,520.0,https://www.pepperscale.com/7-pot-barrackpore/,https://www.pepperscale.com/hot-pepper-list/,PepperScale
605,7 Pot Barrackpore,,,,,,1300000,,,https://www.cayennediane.com/peppers/7-pot-bar...,https://www.cayennediane.com/the-scoville-scale/,Cayenne Diane
123,7 Pot Brain Strain,chinense,super hot,Central America and the Caribbean,Trinidad,1000000.0,1350000,125.0,540.0,https://www.pepperscale.com/7-pot-brain-strain/,https://www.pepperscale.com/hot-pepper-list/,PepperScale
604,7 Pot Brain Strain,,,,,,1350000,,,https://www.cayennediane.com/peppers/7-pot-bra...,https://www.cayennediane.com/the-scoville-scale/,Cayenne Diane
160,7 Pot Brain Strain,,,,,,1000000,,,http://pepperheadsforlife.com/the-scoville-sca...,https://pepperheadsforlife.com/the-scoville-sc...,Pepperheads
615,7 Pot Bubblegum,,,,,,1000000,,,,https://www.cayennediane.com/the-scoville-scale/,Cayenne Diane
116,7 Pot Bubblegum,chinense,super hot,Europe,United Kingdom,850000.0,1000000,106.0,400.0,https://www.pepperscale.com/7-pot-bubblegum/,https://www.pepperscale.com/hot-pepper-list/,PepperScale
144,7 Pot Douglah,,,,,,1853396,,,http://pepperheadsforlife.com/the-scoville-sca...,https://pepperheadsforlife.com/the-scoville-sc...,Pepperheads
119,7 Pot Douglah,chinense,super hot,Central America and the Caribbean,Trinidad,923889.0,1853986,109.0,742.0,https://www.pepperscale.com/7-pot-douglah/,https://www.pepperscale.com/hot-pepper-list/,PepperScale


In [7]:
def choose_duplicate(duplicates):
    nan_count = duplicates.apply(lambda row: sum(pd.isnull(row)), axis=1)
    fuller_row_index = nan_count.sort_values().index[0]
    return duplicates.loc[fuller_row_index]

prevailing_peppers = pd.DataFrame([choose_duplicate(sanitized_data[sanitized_data["name"].str.lower() == pepper])\
                                   for pepper in dup_names])
prevailing_peppers

Unnamed: 0,name,species,heat,region,origin,min_shu,max_shu,min_jrp,max_jrp,detail_link,source_link,source_name
68,Manzano,pubescens,medium,South America,South America,12000.0,30000,2.0,12.0,http://www.pepperscale.com/manzano-pepper,https://www.pepperscale.com/hot-pepper-list/,PepperScale
212,Bahamian,,,,,,110000,,,http://pepperheadsforlife.com/scoville-units-b...,https://pepperheadsforlife.com/the-scoville-sc...,Pepperheads
90,Thai,annuum,extra hot,Asia,Thailand,50000.0,100000,6.0,40.0,http://www.pepperscale.com/thai-peppers,https://www.pepperscale.com/hot-pepper-list/,PepperScale
23,Anaheim,annuum,mild,North America,Mexico,500.0,2500,-16.0,0.0,http://www.pepperscale.com/anaheim-pepper,https://www.pepperscale.com/hot-pepper-list/,PepperScale
128,Carolina Reaper,chinense,super hot,North America,United States,1400000.0,2200000,175.0,880.0,http://www.pepperscale.com/carolina-reaper,https://www.pepperscale.com/hot-pepper-list/,PepperScale
24,Santa Fe Grande,annuum,mild,North America,United States,500.0,700,-3.0,-16.0,https://www.pepperscale.com/santa-fe-grande-pe...,https://www.pepperscale.com/hot-pepper-list/,PepperScale
17,Cubanelle,annuum,mild,Multi-Region,"Italy, Cuba",100.0,1000,-80.0,-3.0,http://www.pepperscale.com/cubanelle-pepper,https://www.pepperscale.com/hot-pepper-list/,PepperScale
44,Jalapeño,annuum,medium,North America,Mexico,2500.0,8000,0.0,0.0,http://www.pepperscale.com/jalapeno-peppers,https://www.pepperscale.com/hot-pepper-list/,PepperScale
100,Scotch Bonnet,chinense,extra hot,Central America and the Caribbean,Caribbean,100000.0,350000,12.0,140.0,http://www.pepperscale.com/scotch-bonnet-pepper,https://www.pepperscale.com/hot-pepper-list/,PepperScale
28,Ancho,annuum,mild,North America,Mexico,1000.0,1500,-8.0,-2.0,http://www.pepperscale.com/ancho-pepper,https://www.pepperscale.com/hot-pepper-list/,PepperScale


In [8]:
non_name_dups = sanitized_data[~sanitized_data["name"].str.lower().isin(dup_names)]
first_pass_peppers = pd.concat([non_name_dups, prevailing_peppers]).sort_values("name").reset_index(drop=True)
raw_data.shape, first_pass_peppers.shape

((1443, 12), (630, 12))

### Near-name dups

Use fuzzywuzzy and data points to determine close matches

In [10]:
first_pass_peppers[first_pass_peppers["name"].str.contains("7 Pot")]

Unnamed: 0,name,species,heat,region,origin,min_shu,max_shu,min_jrp,max_jrp,detail_link,source_link,source_name
0,7 Pot Barrackpore,chinense,super hot,Central America and the Caribbean,Trinidad,1000000.0,1300000,125.0,520.0,https://www.pepperscale.com/7-pot-barrackpore/,https://www.pepperscale.com/hot-pepper-list/,PepperScale
1,7 Pot Brain Strain,chinense,super hot,Central America and the Caribbean,Trinidad,1000000.0,1350000,125.0,540.0,https://www.pepperscale.com/7-pot-brain-strain/,https://www.pepperscale.com/hot-pepper-list/,PepperScale
2,7 Pot Bubblegum,chinense,super hot,Europe,United Kingdom,850000.0,1000000,106.0,400.0,https://www.pepperscale.com/7-pot-bubblegum/,https://www.pepperscale.com/hot-pepper-list/,PepperScale
3,7 Pot Chaguanas,,,,,,1000000,,,,https://www.cayennediane.com/the-scoville-scale/,Cayenne Diane
4,7 Pot Douglah,chinense,super hot,Central America and the Caribbean,Trinidad,923889.0,1853986,109.0,742.0,https://www.pepperscale.com/7-pot-douglah/,https://www.pepperscale.com/hot-pepper-list/,PepperScale
5,7 Pot Infinity,,,,,,1176182,,,https://www.cayennediane.com/peppers/7-pot-inf...,https://www.cayennediane.com/the-scoville-scale/,Cayenne Diane
6,7 Pot Jonah,chinense,super hot,Central America and the Caribbean,Trinidad,800000.0,1200000,100.0,480.0,https://www.pepperscale.com/7-pot-jonah/,https://www.pepperscale.com/hot-pepper-list/,PepperScale
7,7 Pot Madballz,,,,,,1066882,,,http://pepperheadsforlife.com/the-scoville-sca...,https://pepperheadsforlife.com/the-scoville-sc...,Pepperheads
8,7 Pot Primo,chinense,super hot,North America,United States,800000.0,1268250,100.0,507.0,https://www.pepperscale.com/7-pot-primo/,https://www.pepperscale.com/hot-pepper-list/,PepperScale
9,7 Pot Rennie,,,,,,1000000,,,,https://www.cayennediane.com/the-scoville-scale/,Cayenne Diane


In [14]:
fuzz.ratio("Trinidad 7 Pot", "Trinidad 7 Pot/Pod")

88

In [28]:
similarities = set()

for pepper_name in first_pass_peppers["name"].unique():
    pepper_similarities = difflib.get_close_matches(pepper_name, first_pass_peppers["name"].unique(), cutoff=.8)
    if len(pepper_similarities) > 1:
        similarities.add(tuple(sorted(pepper_similarities))) # to ensure that only one copy gets added

In [29]:
similarities

{('African Birds Eye', 'African Birds Eye Chili'),
 ('Aji Benito', 'Aji Bento'),
 ('Aji Charapa', 'Aji Charapita'),
 ('Aji Dulce', 'Ají Dulce'),
 ('Aji Escabeche', 'Escabeche'),
 ('Aji Omnicolor', 'Ají Omnicolor'),
 ('Ají Habanero', 'Habanero'),
 ('Ají Lemon Drop', 'Ají Limo (Lemon Drop)'),
 ('Ají Lemon Drop', 'Ají Limo (Lemon Drop)', 'Lemon Drop'),
 ('Ají Lemon Drop', 'Lemon Drop'),
 ('All Sweet Bells', 'Sweet Bell'),
 ('Alma Paprika', 'Alma Spicy Paprika'),
 ('Alma Paprika', 'Alma Spicy Paprika', 'Almapaprika'),
 ('Alma Paprika', 'Almapaprika'),
 ("Barker's Hot", 'Barker’s Hot'),
 ("Barker's Hot", 'Barker’s Hot', "NuMex Barker's Hot"),
 ("Barker's Hot", "NuMex Barker's Hot"),
 ('Bedfordshire Super Naga', 'Bedfordshire Super Naga Chili'),
 ('Bhut Jolokia', 'Naga Bhut Jolokia'),
 ('Bhut Jolokia', 'Naga Bhut Jolokia', 'Naga Jolokia'),
 ('Bishop&#039;s Crown', 'Bishops Crown'),
 ("Bishop's Crown", 'Bishops Crown'),
 ("Bishop's Crown", 'Bishops Crown', 'Bishops Crown Chili'),
 ('Bishops C

In [36]:
source_ranking = {
    "PepperScale": 1, "Pepperheads": 2, "ChiliWorld": 3, "Cayenne Diane": 4, "Uncle Steve's Hot Stuff": 5
}

In [47]:
def determine_fuzzy_duplicate(peppers):
    if len(peppers["max_shu"].unique()) == 1:
        sorting_data = tuple(zip(peppers.index, peppers["source_name"].apply(lambda source: source_ranking[source])))
        winner = sorted(sorting_data, key=lambda x: x[1])[0][0]
        return peppers.loc[winner]
    else:
        pdb.set_trace()
        
second_pass = []
for group in list(similarities):
    group_records = first_pass_peppers[first_pass_peppers["name"].isin(group)]
    second_pass.append(determine_fuzzy_duplicate(group_records))

--Return--
> <ipython-input-47-b09901525a0c>(7)determine_fuzzy_duplicate()->None
-> pdb.set_trace()
(Pdb) peppers
                   name species heat region origin  min_shu  max_shu  min_jrp  \
82         Barker's Hot     NaN  NaN    NaN    NaN    500.0     7000      NaN   
443  NuMex Barker's Hot     NaN  NaN    NaN    NaN  15000.0    30000      NaN   

     max_jrp detail_link                           source_link  \
82       NaN         NaN  http://ushotstuff.com/Heat.Scale.htm   
443      NaN         NaN  http://ushotstuff.com/Heat.Scale.htm   

                 source_name  
82   Uncle Steve's Hot Stuff  
443  Uncle Steve's Hot Stuff  
(Pdb) c
--Return--
> <ipython-input-47-b09901525a0c>(7)determine_fuzzy_duplicate()->None
-> pdb.set_trace()
(Pdb) peppers
              name species heat region origin  min_shu  max_shu  min_jrp  \
39   Aji Escabeche     NaN  NaN    NaN    NaN  12000.0    17000      NaN   
254      Escabeche     NaN  NaN    NaN    NaN    500.0     1000      NaN   


BdbQuit: 

In [34]:
first_pass_peppers[first_pass_peppers["name"].str.contains("Bell")]

Unnamed: 0,name,species,heat,region,origin,min_shu,max_shu,min_jrp,max_jrp,detail_link,source_link,source_name
57,All Sweet Bells,,,,,,0,,,,http://ushotstuff.com/Heat.Scale.htm,Uncle Steve's Hot Stuff
88,Bell,annuum,mild,Multi-Region,"Mexico, South America",0.0,0,-8000.0,-2500.0,http://www.pepperscale.com/bell-pepper,https://www.pepperscale.com/hot-pepper-list/,PepperScale
131,Cajun Belle,annuum,medium,North America,United States,500.0,4000,-5.0,0.0,https://www.pepperscale.com/cajun-belle-pepper/,https://www.pepperscale.com/hot-pepper-list/,PepperScale
369,Jingle Bells,,,,,,0,,,,https://pepperheadsforlife.com/the-scoville-sc...,Pepperheads
414,Mexi Bell,,,,,,500,,,,https://pepperheadsforlife.com/the-scoville-sc...,Pepperheads
578,Sweet Bell,,,,,,0,,,,https://www.chilliworld.com/factfile/scoville-...,ChiliWorld
