# For crops with a geographic origin of the United States, is the polity (as a geographic origin) with the most specimens of a taxon also the polity with the highest number exports for that taxon in 2019?


### Let's grab the polity with the highest number of exports per taxon first.

In [1]:
import pandas as pd

In [2]:
exports_df = pd.read_csv('data/USA_2019_Fruit_Totals.csv', index_col=0)
exports_df.head()

Unnamed: 0,Oranges,Grapefruit,Lemons,mandarins,citrus fruit,Apples,Apricots,Avocados,Bananas,Blackberries,...,Strawberries,noncitrus fruit,Almonds,Hazelnuts,Macadamia,Pecans,Pistachios,Walnuts,tree nuts,fruit nuts
Alabama,0,0,0,0,0,0,0,0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
Arizona,0,0,42680,0,42680,0,0,0,0.0,0.0,...,0,86279,0,0,0,68229,0,0,68229,197188
Arkansas,0,0,0,0,0,0,0,0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
California,699458,56552,688163,679638,2123811,72911,40444,373185,0.0,0.0,...,2221320,9990724,6094440,0,0,0,1938800,1286410,9319650,21434185
Colorado,0,0,0,0,0,0,0,0,0.0,0.0,...,0,30647,0,0,0,0,0,0,0,30647


In [3]:
exports_df.columns

Index(['Oranges', 'Grapefruit ', 'Lemons', 'mandarins ', 'citrus fruit',
       'Apples', 'Apricots', 'Avocados', 'Bananas', 'Blackberries',
       'Blueberries', 'Boysenberries', 'Raspberries', 'cherries', 'cherries.1',
       'Cranberries', 'Dates', 'Figs', 'Grapes', 'Kiwifruit', 'Nectarines',
       'Olives', 'Papayas', 'Peaches', 'Pears', 'Plums', 'Strawberries',
       'noncitrus fruit', 'Almonds', 'Hazelnuts', 'Macadamia', 'Pecans',
       'Pistachios', 'Walnuts', 'tree nuts', 'fruit nuts'],
      dtype='object')

# Remove cherries.1 for simplicity

In [4]:
del exports_df['cherries.1']

# The values look like they are strings. I'll need to convert them.

In [5]:
for column in exports_df.columns:
    if exports_df[column].dtype != float:
        exports_df[column] = exports_df[column].str.replace(',', '').astype(float)
all(exports_df.dtypes.values == float)

True

In [6]:
exports_df.head()

Unnamed: 0,Oranges,Grapefruit,Lemons,mandarins,citrus fruit,Apples,Apricots,Avocados,Bananas,Blackberries,...,Strawberries,noncitrus fruit,Almonds,Hazelnuts,Macadamia,Pecans,Pistachios,Walnuts,tree nuts,fruit nuts
Alabama,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Arizona,0.0,0.0,42680.0,0.0,42680.0,0.0,0.0,0.0,0.0,0.0,...,0.0,86279.0,0.0,0.0,0.0,68229.0,0.0,0.0,68229.0,197188.0
Arkansas,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
California,699458.0,56552.0,688163.0,679638.0,2123811.0,72911.0,40444.0,373185.0,0.0,0.0,...,2221320.0,9990724.0,6094440.0,0.0,0.0,0.0,1938800.0,1286410.0,9319650.0,21434185.0
Colorado,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,30647.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,30647.0


# We should make case the same for column names

In [7]:
exports_df.rename(columns=dict(zip(exports_df.columns, [column.lower() for column in exports_df.columns])), inplace=True)
exports_df.head()

Unnamed: 0,oranges,grapefruit,lemons,mandarins,citrus fruit,apples,apricots,avocados,bananas,blackberries,...,strawberries,noncitrus fruit,almonds,hazelnuts,macadamia,pecans,pistachios,walnuts,tree nuts,fruit nuts
Alabama,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Arizona,0.0,0.0,42680.0,0.0,42680.0,0.0,0.0,0.0,0.0,0.0,...,0.0,86279.0,0.0,0.0,0.0,68229.0,0.0,0.0,68229.0,197188.0
Arkansas,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
California,699458.0,56552.0,688163.0,679638.0,2123811.0,72911.0,40444.0,373185.0,0.0,0.0,...,2221320.0,9990724.0,6094440.0,0.0,0.0,0.0,1938800.0,1286410.0,9319650.0,21434185.0
Colorado,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,30647.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,30647.0


In [8]:
taxon_highest_polity_mapping = dict(exports_df.idxmax())
taxon_highest_polity_mapping

{'oranges': ' Florida ',
 'grapefruit ': ' Florida ',
 'lemons': ' California ',
 'mandarins ': ' California ',
 'citrus fruit': ' California ',
 'apples': ' Washington ',
 'apricots': ' California ',
 'avocados': ' California ',
 'bananas': ' Alabama ',
 'blackberries': ' Alabama ',
 'blueberries': ' California ',
 'boysenberries': ' Alabama ',
 'raspberries': ' California ',
 'cherries': ' Washington ',
 'cranberries': ' Wisconsin ',
 'dates': ' California ',
 'figs': ' Alabama ',
 'grapes': ' California ',
 'kiwifruit': ' California ',
 'nectarines': ' California ',
 'olives': ' California ',
 'papayas': ' Hawaii ',
 'peaches': ' California ',
 'pears': ' Washington ',
 'plums': ' California ',
 'strawberries': ' California ',
 'noncitrus fruit': ' California ',
 'almonds': ' California ',
 'hazelnuts': ' Oregon ',
 'macadamia': ' Hawaii ',
 'pecans': ' New Mexico ',
 'pistachios': ' California ',
 'walnuts': ' California ',
 'tree nuts': ' California ',
 'fruit nuts': ' California 

# An inversion would be useful to have.

In [9]:
# Based on https://stackoverflow.com/a/35491326
def invert_dict(d): 
    inverse = dict() 
    for key in d: 
            # Check if in the inverted dict the key exists
        if d[key] not in inverse: 
                # If not create a new list
            inverse[d[key]] = [key] 
        else: 
            inverse[d[key]].append(key) 
    return inverse
polity_highest_taxon_mapping = invert_dict(taxon_highest_polity_mapping)
polity_highest_taxon_mapping

{' Florida ': ['oranges', 'grapefruit '],
 ' California ': ['lemons',
  'mandarins ',
  'citrus fruit',
  'apricots',
  'avocados',
  'blueberries',
  'raspberries',
  'dates',
  'grapes',
  'kiwifruit',
  'nectarines',
  'olives',
  'peaches',
  'plums',
  'strawberries',
  'noncitrus fruit',
  'almonds',
  'pistachios',
  'walnuts',
  'tree nuts',
  'fruit nuts'],
 ' Washington ': ['apples', 'cherries', 'pears'],
 ' Alabama ': ['bananas', 'blackberries', 'boysenberries', 'figs'],
 ' Wisconsin ': ['cranberries'],
 ' Hawaii ': ['papayas', 'macadamia'],
 ' Oregon ': ['hazelnuts'],
 ' New Mexico ': ['pecans']}

# Remove whitespace from state names

In [10]:
taxon_highest_polity_mapping = dict(zip(taxon_highest_polity_mapping.keys(),[value.strip(' ') for value in taxon_highest_polity_mapping.values()]))

# Finally, we are ready for the pomological dataset.

In [11]:
pomo_df = pd.read_csv('data/cultivar-pomo-usda.csv')
pomo_df.head()

Unnamed: 0,pomid,artist,scientific name,common name,geographic origin,physical description,specimen,year,notes on original,date created,variety,nal note,cultivar,is_usda_recognized
0,POM00006406,"Passmore, Deborah Griscom, 1840-1911",citrus sinensis,oranges,"Duarte, Los Angeles County, California, United...",1 art original : col. ; 17 x 25 cm.,19473,,,,Navelencia,,citrus,True
1,POM00006407,"Passmore, Deborah Griscom, 1840-1911",citrus sinensis,oranges,"Riverside, Riverside County, California, Unite...",1 art original : col. ; 17 x 25 cm.,40440,1908.0,,1908,Navelencia,,citrus,True
2,POM00006463,"Newton, Amanda Almira, ca. 1860-1943",citrus sinensis,oranges,"Honcut, Butte County, California, United States",1 art original : col. ; 17 x 25 cm.,70748,1914.0,,1914-03-13,New,,citrus,True
3,POM00006465,"Newton, Amanda Almira, ca. 1860-1943",citrus sinensis,oranges,"Honcut, Butte County, California, United States",1 art original : col. ; 17 x 25 cm.,70748a,1914.0,Peter Bisset,1914-03-16,New,,citrus,True
4,POM00006446,"Schutt, Ellen Isham, 1873-1955",citrus sinensis,oranges,,1 art original : col. ; 17 x 26 cm.,37438,1906.0,,1906-11-19,No. 779,Watercolor includes mock up for the Yearbook o...,citrus,True


In [12]:
print(len(pomo_df))
pomo_df[pomo_df['geographic origin'].str.contains('United States', na=False)]

7584


Unnamed: 0,pomid,artist,scientific name,common name,geographic origin,physical description,specimen,year,notes on original,date created,variety,nal note,cultivar,is_usda_recognized
0,POM00006406,"Passmore, Deborah Griscom, 1840-1911",citrus sinensis,oranges,"Duarte, Los Angeles County, California, United...",1 art original : col. ; 17 x 25 cm.,19473,,,,Navelencia,,citrus,True
1,POM00006407,"Passmore, Deborah Griscom, 1840-1911",citrus sinensis,oranges,"Riverside, Riverside County, California, Unite...",1 art original : col. ; 17 x 25 cm.,40440,1908.0,,1908,Navelencia,,citrus,True
2,POM00006463,"Newton, Amanda Almira, ca. 1860-1943",citrus sinensis,oranges,"Honcut, Butte County, California, United States",1 art original : col. ; 17 x 25 cm.,70748,1914.0,,1914-03-13,New,,citrus,True
3,POM00006465,"Newton, Amanda Almira, ca. 1860-1943",citrus sinensis,oranges,"Honcut, Butte County, California, United States",1 art original : col. ; 17 x 25 cm.,70748a,1914.0,Peter Bisset,1914-03-16,New,,citrus,True
5,POM00006467,"Passmore, Deborah Griscom, 1840-1911",citrus sinensis,oranges,"Conant, Lake County, Florida, United States",1 art original : col. ; 17 x 24 cm.,6429,1894.0,,1894,Nonpareil,,citrus,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7579,POM00001050,"Newton, Amanda Almira, ca. 1860-1943",malus domestica,apples,"Storm Lake, Buena Vista County, Iowa, United S...",1 art original : col. ; 16 x 25 cm.,40064,1908.0,,1908-04-01,Anisim,,apple,True
7580,POM00001051,"Passmore, Deborah Griscom, 1840-1911",malus domestica,apples,"Rosslyn, Arlington County, Virginia, United St...",1 art original : col. ; 17 x 25 cm.,109640,1928.0,,1928-03-08,Annette,,apple,True
7581,POM00001052,"Heiges, Bertha",malus domestica,apples,"Wilna, Harford County, Maryland, United States",1 art original : col. ; 17 x 25 cm.,33232,1905.0,,1905-01-25,Annie Frank,,apple,True
7582,POM00001053,"Arnold, Mary Daisy, ca. 1873-1955",malus domestica,apples,"Rosslyn, Arlington County, Virginia, United St...",1 art original : col. ; 17 x 26 cm.,105989,1925.0,"Section F, Row 1-2, Tree 3",1925-01-21,Annurco,,apple,True


In [13]:
pomo_df['geographic_origin'] = pomo_df['geographic origin']
pomo_df['common_name'] = pomo_df['common name']

# Of the 7584 rows, 6775 of them contain a pomological artifact with a geographic origin within the United States

In [14]:
def is_polity_largest_taxon_produer(row,mapping):
    for key in mapping:
        if isinstance(row.geographic_origin, str) and key in row.geographic_origin:
            if row.common_name in mapping[key]:
                return True
    return False
pomo_df['is_originpolity_largest_producer'] = pomo_df.apply(lambda x: is_polity_largest_taxon_produer(x, polity_highest_taxon_mapping), axis=1)
pomo_df[pomo_df['is_originpolity_largest_producer'] == True]

Unnamed: 0,pomid,artist,scientific name,common name,geographic origin,physical description,specimen,year,notes on original,date created,variety,nal note,cultivar,is_usda_recognized,geographic_origin,common_name,is_originpolity_largest_producer
78,POM00004277,"Arnold, Mary Daisy, ca. 1873-1955",malus domestica,apples,"Hancock, Washington County, Maryland, United S...",1 art original : col. ; 17 x 25 cm.,[00052],,,,Pedro,Assigned specimen number,apple,True,"Hancock, Washington County, Maryland, United S...",apples,True
80,POM00001717,"Schutt, Ellen Isham, 1873-1955",malus domestica,apples,"Hancock, Washington County, Maryland, United S...",1 art original : col. ; 17 x 25 cm.,43058,1909.0,,1909-01-27,Brooke Blushed,,apple,True,"Hancock, Washington County, Maryland, United S...",apples,True
319,POM00003596,"Newton, Amanda Almira, ca. 1860-1943",malus domestica,apples,"Smithsburg, Washington County, Maryland, Unite...",1 art original : col. ; 17 x 25 cm.,33289,1905.0,,1905-04-08,Towson,,apple,True,"Smithsburg, Washington County, Maryland, Unite...",apples,True
321,POM00001366,"Newton, Amanda Almira, ca. 1860-1943",malus domestica,apples,"Millbridge, Washington County, Maine, United S...",1 art original : col. ; 17 x 25 cm.,79854a,1915.0,,1915-01-28,Canada Reinette,,apple,True,"Millbridge, Washington County, Maine, United S...",apples,True
323,POM00001566,"Newton, Amanda Almira, ca. 1860-1943",malus domestica,apples,"Millbridge, Washington County, Maine, United S...",1 art original : col. ; 17 x 25 cm.,79854,1915.0,,1915-01-26,Canada Reinette,,apple,True,"Millbridge, Washington County, Maine, United S...",apples,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6452,POM00000983,"Passmore, Deborah Griscom, 1840-1911",malus domestica,apples,"Fayetteville, Washington County, Arkansas, Uni...",1 art original : col. ; 17 x 25 cm.,17660,1899.0,,1899-08-12,Arkansas Red,Alternative variety name(s): August Red,apple,True,"Fayetteville, Washington County, Arkansas, Uni...",apples,True
6483,POM00003286,"Newton, Amanda Almira, ca. 1860-1943",malus domestica,apples,"Marietta, Washington County, Ohio, United States",1 art original : col. ; 17 x 25 cm.,78722,1915.0,,1915-01-06,Rome Beauty Sport,,apple,True,"Marietta, Washington County, Ohio, United States",apples,True
6889,POM00006899,"Heiges, Bertha",pyrus communis,pears,"Weiser, Washington County, Idaho, United States",1 art original : col. ; 17 x 24 cm.,14302,1897.0,,1897-09-03,Bartlett,,,True,"Weiser, Washington County, Idaho, United States",pears,True
6890,POM00006900,"Heiges, Bertha",pyrus communis,pears,"Weiser, Washington County, Idaho, United States",1 art original : col. ; 17 x 24 cm.,14302a,1897.0,,1897-09-04,Bartlett,,,True,"Weiser, Washington County, Idaho, United States",pears,True


In [15]:
pomo_df['is_originpolity_largest_producer'] = pomo_df.apply(lambda x: is_polity_largest_taxon_produer(x, polity_highest_taxon_mapping), axis=1)
pomo_df[pomo_df['is_originpolity_largest_producer'] == True]

Unnamed: 0,pomid,artist,scientific name,common name,geographic origin,physical description,specimen,year,notes on original,date created,variety,nal note,cultivar,is_usda_recognized,geographic_origin,common_name,is_originpolity_largest_producer
78,POM00004277,"Arnold, Mary Daisy, ca. 1873-1955",malus domestica,apples,"Hancock, Washington County, Maryland, United S...",1 art original : col. ; 17 x 25 cm.,[00052],,,,Pedro,Assigned specimen number,apple,True,"Hancock, Washington County, Maryland, United S...",apples,True
80,POM00001717,"Schutt, Ellen Isham, 1873-1955",malus domestica,apples,"Hancock, Washington County, Maryland, United S...",1 art original : col. ; 17 x 25 cm.,43058,1909.0,,1909-01-27,Brooke Blushed,,apple,True,"Hancock, Washington County, Maryland, United S...",apples,True
319,POM00003596,"Newton, Amanda Almira, ca. 1860-1943",malus domestica,apples,"Smithsburg, Washington County, Maryland, Unite...",1 art original : col. ; 17 x 25 cm.,33289,1905.0,,1905-04-08,Towson,,apple,True,"Smithsburg, Washington County, Maryland, Unite...",apples,True
321,POM00001366,"Newton, Amanda Almira, ca. 1860-1943",malus domestica,apples,"Millbridge, Washington County, Maine, United S...",1 art original : col. ; 17 x 25 cm.,79854a,1915.0,,1915-01-28,Canada Reinette,,apple,True,"Millbridge, Washington County, Maine, United S...",apples,True
323,POM00001566,"Newton, Amanda Almira, ca. 1860-1943",malus domestica,apples,"Millbridge, Washington County, Maine, United S...",1 art original : col. ; 17 x 25 cm.,79854,1915.0,,1915-01-26,Canada Reinette,,apple,True,"Millbridge, Washington County, Maine, United S...",apples,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6452,POM00000983,"Passmore, Deborah Griscom, 1840-1911",malus domestica,apples,"Fayetteville, Washington County, Arkansas, Uni...",1 art original : col. ; 17 x 25 cm.,17660,1899.0,,1899-08-12,Arkansas Red,Alternative variety name(s): August Red,apple,True,"Fayetteville, Washington County, Arkansas, Uni...",apples,True
6483,POM00003286,"Newton, Amanda Almira, ca. 1860-1943",malus domestica,apples,"Marietta, Washington County, Ohio, United States",1 art original : col. ; 17 x 25 cm.,78722,1915.0,,1915-01-06,Rome Beauty Sport,,apple,True,"Marietta, Washington County, Ohio, United States",apples,True
6889,POM00006899,"Heiges, Bertha",pyrus communis,pears,"Weiser, Washington County, Idaho, United States",1 art original : col. ; 17 x 24 cm.,14302,1897.0,,1897-09-03,Bartlett,,,True,"Weiser, Washington County, Idaho, United States",pears,True
6890,POM00006900,"Heiges, Bertha",pyrus communis,pears,"Weiser, Washington County, Idaho, United States",1 art original : col. ; 17 x 24 cm.,14302a,1897.0,,1897-09-04,Bartlett,,,True,"Weiser, Washington County, Idaho, United States",pears,True


# Seems a little small for such a large dataset...

In [16]:
def is_polity_largest_taxon_produer(row,mapping):
    for key in mapping:
        if isinstance(row.geographic_origin, str) and key in row.geographic_origin:
            if row.common_name in mapping[key] or row.cultivar in mapping[key]:
                return True
    return False
pomo_df['is_originpolity_largest_producer'] = pomo_df.apply(lambda x: is_polity_largest_taxon_produer(x, polity_highest_taxon_mapping), axis=1)
pomo_df[pomo_df['is_originpolity_largest_producer'] == True]

Unnamed: 0,pomid,artist,scientific name,common name,geographic origin,physical description,specimen,year,notes on original,date created,variety,nal note,cultivar,is_usda_recognized,geographic_origin,common_name,is_originpolity_largest_producer
78,POM00004277,"Arnold, Mary Daisy, ca. 1873-1955",malus domestica,apples,"Hancock, Washington County, Maryland, United S...",1 art original : col. ; 17 x 25 cm.,[00052],,,,Pedro,Assigned specimen number,apple,True,"Hancock, Washington County, Maryland, United S...",apples,True
80,POM00001717,"Schutt, Ellen Isham, 1873-1955",malus domestica,apples,"Hancock, Washington County, Maryland, United S...",1 art original : col. ; 17 x 25 cm.,43058,1909.0,,1909-01-27,Brooke Blushed,,apple,True,"Hancock, Washington County, Maryland, United S...",apples,True
319,POM00003596,"Newton, Amanda Almira, ca. 1860-1943",malus domestica,apples,"Smithsburg, Washington County, Maryland, Unite...",1 art original : col. ; 17 x 25 cm.,33289,1905.0,,1905-04-08,Towson,,apple,True,"Smithsburg, Washington County, Maryland, Unite...",apples,True
321,POM00001366,"Newton, Amanda Almira, ca. 1860-1943",malus domestica,apples,"Millbridge, Washington County, Maine, United S...",1 art original : col. ; 17 x 25 cm.,79854a,1915.0,,1915-01-28,Canada Reinette,,apple,True,"Millbridge, Washington County, Maine, United S...",apples,True
323,POM00001566,"Newton, Amanda Almira, ca. 1860-1943",malus domestica,apples,"Millbridge, Washington County, Maine, United S...",1 art original : col. ; 17 x 25 cm.,79854,1915.0,,1915-01-26,Canada Reinette,,apple,True,"Millbridge, Washington County, Maine, United S...",apples,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6452,POM00000983,"Passmore, Deborah Griscom, 1840-1911",malus domestica,apples,"Fayetteville, Washington County, Arkansas, Uni...",1 art original : col. ; 17 x 25 cm.,17660,1899.0,,1899-08-12,Arkansas Red,Alternative variety name(s): August Red,apple,True,"Fayetteville, Washington County, Arkansas, Uni...",apples,True
6483,POM00003286,"Newton, Amanda Almira, ca. 1860-1943",malus domestica,apples,"Marietta, Washington County, Ohio, United States",1 art original : col. ; 17 x 25 cm.,78722,1915.0,,1915-01-06,Rome Beauty Sport,,apple,True,"Marietta, Washington County, Ohio, United States",apples,True
6889,POM00006899,"Heiges, Bertha",pyrus communis,pears,"Weiser, Washington County, Idaho, United States",1 art original : col. ; 17 x 24 cm.,14302,1897.0,,1897-09-03,Bartlett,,,True,"Weiser, Washington County, Idaho, United States",pears,True
6890,POM00006900,"Heiges, Bertha",pyrus communis,pears,"Weiser, Washington County, Idaho, United States",1 art original : col. ; 17 x 24 cm.,14302a,1897.0,,1897-09-04,Bartlett,,,True,"Weiser, Washington County, Idaho, United States",pears,True


#  ¯\_(ツ)_/¯

In [17]:
pomo_df.to_csv('data/polity_cultivar_pomo_usda.csv', index=False)