In [1]:
import pandas as pd

In [2]:
# Cultivar group -> list of common names that belong to that group.
# Each common name should appear exactly once across all lists, because the
# mapping is later inverted into a common-name -> cultivar lookup.
#
# NOTE(review): several names look misspelled (e.g. 'Japenese apricot',
# 'persommons', 'pomegranets', 'goosberries', 'passfloria', 'pedans'). They are
# left untouched here because they are matched verbatim against the dataset's
# 'common name' values -- confirm against the data before correcting them.
cultivar_mappings = {
    'apple': [
        'apples', 'star apple', 'custard apple', 'sweetsops', 'pawpaws',
        'crab apple', 'New Zealand nightshade', 'tree tomato', 'eggplants',
        'mammee apple', 'white sapote',
    ],
    'apricot': ['peaches', 'Japenese apricot', 'bael', 'dates', 'glossy privet'],
    'berry': [
        'persommons', 'strawberries', 'blackthorn', 'pomegranets', 'avocados',
        'Chinese hawthorn', 'hawthorn', 'blueberries', 'American cranberry',
        'goosberries', 'roselle', 'figs', 'mulberries', 'blackberries',
        'red raspberries', 'brambles', 'purple raspberries',
        'black raspberries', 'baobab', 'cloudberry', 'wineberry', 'raspberries',
    ],
    'cherry': ['cherries', 'chokecherry', 'passfloria', 'Surinam cherry'],
    'citrus': [
        'grapefruits', 'oranges', 'mandarins', 'tangerines', 'tangelos',
        'satsumas', 'citrus fruits', 'citron', 'tangors', 'limes', 'lemons',
        'tabog', 'limeberry', 'pumelos', 'nectarines', 'kumquats',
        'Ogeechee tupelo', 'citranges', 'orangequat',
    ],
    'grape': ['wampee', 'grapes', 'olives'],
    'nuts': [
        'peanuts', 'hazelnuts', 'red mombins', 'tropical almond', 'walnuts',
        'nutmeg', 'hickory', 'pekea nut', 'chestnuts', "St John's bread",
        'cashew nuts', 'Italian stone pine', 'Japanese walnut', 'black walnut',
        'English walnut', 'Eastern red cedar', 'Macadamia', 'pedans',
    ],
    'peach': ['loquats'],
    'pear': ['prickly pear', 'chayote', 'Asian pear', 'Callery pear', 'Eastern shadbush'],
    'melon': ['watermelons', 'melons'],
    'quince': ['cherimoya', 'quinces'],
    'plum and prune': [
        'plums', 'Japanese plum', 'apricot plum', 'Kaffir plum', 'cherry plum',
        'Chickasaw plum', 'natal plum',
    ],
    'tropical fruit': [
        'pineapples', 'guavas', 'Indian tamarind', 'mangoes', 'sapodillas',
        'mangosteens', 'doum palm', 'bananas', 'Acca', 'kiwifruit',
        'yellow sapote', 'canistel', 'oak leaved papaya', 'papayas',
    ],
}

# Invert the cultivar mappings (common name → cultivar) so each common name can be looked up directly instead of searching every list

In [3]:
def invert_dict(d):
    """Return a reverse lookup from each list member to the key that holds it.

    ``d`` maps each key to an iterable of items; the result maps every item
    back to its key. The mapping is expected to contain no item under more
    than one key. If an item does occur more than once, the first key
    encountered in iteration order is kept and later ones are ignored.
    """
    # Flatten to (item, key) pairs in the same order the nested loops would visit.
    pairs = ((member, group) for group in d for member in d[group])
    inverse = {}
    for member, group in pairs:
        # setdefault only stores the pair when the item has not been seen yet.
        inverse.setdefault(member, group)
    return inverse
# Build the common-name -> cultivar lookup from the grouped mappings.
inverted_cultivar_mappings = invert_dict(cultivar_mappings)
# Bare expression so the notebook displays the resulting dict.
inverted_cultivar_mappings

{'apples': 'apple',
 'star apple': 'apple',
 'custard apple': 'apple',
 'sweetsops': 'apple',
 'pawpaws': 'apple',
 'crab apple': 'apple',
 'New Zealand nightshade': 'apple',
 'tree tomato': 'apple',
 'eggplants': 'apple',
 'mammee apple': 'apple',
 'white sapote': 'apple',
 'peaches': 'apricot',
 'Japenese apricot': 'apricot',
 'bael': 'apricot',
 'dates': 'apricot',
 'glossy privet': 'apricot',
 'persommons': 'berry',
 'strawberries': 'berry',
 'blackthorn': 'berry',
 'pomegranets': 'berry',
 'avocados': 'berry',
 'Chinese hawthorn': 'berry',
 'hawthorn': 'berry',
 'blueberries': 'berry',
 'American cranberry': 'berry',
 'goosberries': 'berry',
 'roselle': 'berry',
 'figs': 'berry',
 'mulberries': 'berry',
 'blackberries': 'berry',
 'red raspberries': 'berry',
 'brambles': 'berry',
 'purple raspberries': 'berry',
 'black raspberries': 'berry',
 'baobab': 'berry',
 'cloudberry': 'berry',
 'wineberry': 'berry',
 'raspberries': 'berry',
 'cherries': 'cherry',
 'chokecherry': 'cherry',
 

In [4]:
# Load the pomological records from a path relative to the notebook.
# NOTE(review): the displayed frame has an 'Unnamed: 0' column, presumably an
# index that was saved into the source CSV -- confirm whether it is needed.
df = pd.read_csv('data/usda_pomological.csv')
# Preview the first rows.
df.head()

Unnamed: 0,pomid,artist,scientific name,common name,geographic origin,physical description,specimen,year,notes on original,date created,variety,nal note
0,POM00006406,"Passmore, Deborah Griscom, 1840-1911",Citrus sinensis,oranges,"Duarte, Los Angeles County, California, United...",1 art original : col. ; 17 x 25 cm.,19473,,,,Navelencia,
1,POM00006407,"Passmore, Deborah Griscom, 1840-1911",Citrus sinensis,oranges,"Riverside, Riverside County, California, Unite...",1 art original : col. ; 17 x 25 cm.,40440,1908.0,,1908,Navelencia,
2,POM00006463,"Newton, Amanda Almira, ca. 1860-1943",Citrus sinensis,oranges,"Honcut, Butte County, California, United States",1 art original : col. ; 17 x 25 cm.,70748,1914.0,,1914-03-13,New,
3,POM00006465,"Newton, Amanda Almira, ca. 1860-1943",Citrus sinensis,oranges,"Honcut, Butte County, California, United States",1 art original : col. ; 17 x 25 cm.,70748a,1914.0,Peter Bisset,1914-03-16,New,
4,POM00006446,"Schutt, Ellen Isham, 1873-1955",Citrus sinensis,oranges,,1 art original : col. ; 17 x 26 cm.,37438,1906.0,,1906-11-19,No. 779,Watercolor includes mock up for the Yearbook o...


# Create a `cultivar` column by mapping each row's `common name` to its cultivar

In [5]:
# Assign with a single column label: the list-key form
# `df[['cultivar']] = <Series>` raises "Columns must be same length as key"
# on recent pandas releases.
# Reuse the lookup built earlier instead of inverting the mappings again.
# Common names that are missing from the lookup become NaN in the new column.
df['cultivar'] = df['common name'].map(inverted_cultivar_mappings)
df

Unnamed: 0,pomid,artist,scientific name,common name,geographic origin,physical description,specimen,year,notes on original,date created,variety,nal note,cultivar
0,POM00006406,"Passmore, Deborah Griscom, 1840-1911",Citrus sinensis,oranges,"Duarte, Los Angeles County, California, United...",1 art original : col. ; 17 x 25 cm.,19473,,,,Navelencia,,citrus
1,POM00006407,"Passmore, Deborah Griscom, 1840-1911",Citrus sinensis,oranges,"Riverside, Riverside County, California, Unite...",1 art original : col. ; 17 x 25 cm.,40440,1908.0,,1908,Navelencia,,citrus
2,POM00006463,"Newton, Amanda Almira, ca. 1860-1943",Citrus sinensis,oranges,"Honcut, Butte County, California, United States",1 art original : col. ; 17 x 25 cm.,70748,1914.0,,1914-03-13,New,,citrus
3,POM00006465,"Newton, Amanda Almira, ca. 1860-1943",Citrus sinensis,oranges,"Honcut, Butte County, California, United States",1 art original : col. ; 17 x 25 cm.,70748a,1914.0,Peter Bisset,1914-03-16,New,,citrus
4,POM00006446,"Schutt, Ellen Isham, 1873-1955",Citrus sinensis,oranges,,1 art original : col. ; 17 x 26 cm.,37438,1906.0,,1906-11-19,No. 779,Watercolor includes mock up for the Yearbook o...,citrus
...,...,...,...,...,...,...,...,...,...,...,...,...,...
7579,POM00001050,"Newton, Amanda Almira, ca. 1860-1943",Malus domestica,apples,"Storm Lake, Buena Vista County, Iowa, United S...",1 art original : col. ; 16 x 25 cm.,40064,1908.0,,1908-04-01,Anisim,,apple
7580,POM00001051,"Passmore, Deborah Griscom, 1840-1911",Malus domestica,apples,"Rosslyn, Arlington County, Virginia, United St...",1 art original : col. ; 17 x 25 cm.,109640,1928.0,,1928-03-08,Annette,,apple
7581,POM00001052,"Heiges, Bertha",Malus domestica,apples,"Wilna, Harford County, Maryland, United States",1 art original : col. ; 17 x 25 cm.,33232,1905.0,,1905-01-25,Annie Frank,,apple
7582,POM00001053,"Arnold, Mary Daisy, ca. 1873-1955",Malus domestica,apples,"Rosslyn, Arlington County, Virginia, United St...",1 art original : col. ; 17 x 26 cm.,105989,1925.0,"Section F, Row 1-2, Tree 3",1925-01-21,Annurco,,apple


In [6]:
# Write the frame, including the new cultivar column, to a CSV file.
# index=False omits the DataFrame index; the 'Unnamed: 0' column that came
# from the input file is still written as a regular column.
df.to_csv('data/cultivar-pomo.csv', index=False)