In [27]:
# The goal of this notebook will be to clean the 'coffee_origin' data in order to focus on 
# countries of origin for sourcing our coffee beans.
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

coffee_origins = pd.read_csv('clean_coffee.csv', index_col=False)
coffee_origins = coffee_origins.drop('Unnamed: 0', axis=1)
coffee_origins.head()


Unnamed: 0,title,rating,aftertaste,aroma,body,flavor,blind_assessment,coffee_origin,est_price,roast_level,roaster,roaster_location,dollars_per_ounce
0,Bolivia Manantial Gesha,93,8.0,9,8,9,"Richly aromatic, floral-toned. Magnolia, cocoa...","Caranavi, Bolivia",30.00/12 ounces,Medium-Light,Red Rooster Coffee Roaster,"Floyd, Virginia",2.5
1,Ethiopia Gera Genji Challa,94,8.0,9,9,9,"Delicately aromatic, complex. Lilac, cocoa nib...","Agaro Gera, Jimma Zone, Oromia State, Ethiopia",28.00/12 ounces,Medium-Light,Mostra Coffee,"San Diego, California",2.333333
2,Yirgacheffe Mengesha Natural,94,8.0,9,9,9,"High-toned, fruit-driven. Boysenberry, pear, c...","Yirgacheffe growing region, southern Ethiopia",20.50/12 ounces,Medium-Light,Regent Coffee,"Glendale, California",1.708333
3,Tropical Summer Colombia La Sierra,93,8.0,9,8,9,"Fruit-driven, crisply chocolaty. Goji berry, d...","La Sierra, Cauca Department, Colombia",18.99/8 ounces,Medium-Light,Merge Coffee Company,"Harrisonburg, Virginia",2.37375
4,Tinamit Tolimán,93,8.0,9,9,9,"Deeply sweet-tart, chocolate-toned. Dark choco...","San Lucas Tolimán, Lake Atitlán growing region...",16.00/12 ounces,Medium-Light,El Gran Cafe,"Antigua, Guatemala",1.333333


In [28]:
# It looks like the 'coffee_origin' column tends to have the country name as the last item listed.
coffee_origins['coffee_origin'] = coffee_origins['coffee_origin'].str.split(', ')
coffee_origins['coffee_origin'] = coffee_origins['coffee_origin'].str[-1]
coffee_origins['coffee_origin'].head(15)


0                   Bolivia
1                  Ethiopia
2         southern Ethiopia
3                  Colombia
4                 Guatemala
5                  Ethiopia
6                 Guatemala
7                  Colombia
8          Papua New Guinea
9         southern Ethiopia
10     northern Philippines
11                 Colombia
12                 Colombia
13        southern Ethiopia
14    Big Island of Hawai’i
Name: coffee_origin, dtype: object

In [29]:
coffee_origins['coffee_origin'].unique()
# Whew! That's still a lot! Time to clean it up.


array(['Bolivia', 'Ethiopia', 'southern Ethiopia', 'Colombia',
       'Guatemala', 'Papua New Guinea', 'northern Philippines',
       'Big Island of Hawai’i', 'Brazil', 'Peru', 'Ethiopia; Sumatra',
       'El Salvador', 'Southern Ethiopia', 'Tanzania', 'Kenya', 'Panama',
       'Democratic Republic of the Congo', 'Costa Rica', 'Ecuador',
       'southwest Vietnam', 'Indonesia', 'central El Salvador',
       'Nicaragua', '“Big Island” of Hawai’i', 'Mexico', 'Rwanda',
       'Haiti', 'Dominican Republic; Honduras', 'Vietnam',
       'Thailand; Vietnam; Latin America', 'Not disclosed',
       'south-central Kenya', 'northern Nicaragua', 'Burundi',
       'Guatemala; Nicaragua; Ethiopia; Costa Rica; Indonesia; Brazil',
       'Colombia; Ethiopia', 'Honduras', 'Colombia; Guatemala; Mexico',
       'Guatemala; Ethiopia; Colombia', 'south-central Ethiopia',
       'western Panama', 'Philippines', 'southern Colombia',
       'Colombia; Guatemala; Costa Rica; Ethiopia', 'Yemen',
       'Huila C

In [30]:
# I'll start with stripping any whitespace and putting everything in title case.
coffee_origins['coffee_origin'] = coffee_origins['coffee_origin'].str.strip().str.title()
coffee_origins['coffee_origin'].unique()



array(['Bolivia', 'Ethiopia', 'Southern Ethiopia', 'Colombia',
       'Guatemala', 'Papua New Guinea', 'Northern Philippines',
       'Big Island Of Hawai’I', 'Brazil', 'Peru', 'Ethiopia; Sumatra',
       'El Salvador', 'Tanzania', 'Kenya', 'Panama',
       'Democratic Republic Of The Congo', 'Costa Rica', 'Ecuador',
       'Southwest Vietnam', 'Indonesia', 'Central El Salvador',
       'Nicaragua', '“Big Island” Of Hawai’I', 'Mexico', 'Rwanda',
       'Haiti', 'Dominican Republic; Honduras', 'Vietnam',
       'Thailand; Vietnam; Latin America', 'Not Disclosed',
       'South-Central Kenya', 'Northern Nicaragua', 'Burundi',
       'Guatemala; Nicaragua; Ethiopia; Costa Rica; Indonesia; Brazil',
       'Colombia; Ethiopia', 'Honduras', 'Colombia; Guatemala; Mexico',
       'Guatemala; Ethiopia; Colombia', 'South-Central Ethiopia',
       'Western Panama', 'Philippines', 'Southern Colombia',
       'Colombia; Guatemala; Costa Rica; Ethiopia', 'Yemen',
       'Huila Colombia', 'Southern',

In [31]:
# It looks like many of these are blends of beans from several different regions, separated
# by a ';'. I'm not going to include those in my analysis, so I'll filter them out.
coffee_origins = coffee_origins[~(coffee_origins['coffee_origin'].str.contains(';', na=False))]
coffee_origins['coffee_origin'].unique()


array(['Bolivia', 'Ethiopia', 'Southern Ethiopia', 'Colombia',
       'Guatemala', 'Papua New Guinea', 'Northern Philippines',
       'Big Island Of Hawai’I', 'Brazil', 'Peru', 'El Salvador',
       'Tanzania', 'Kenya', 'Panama', 'Democratic Republic Of The Congo',
       'Costa Rica', 'Ecuador', 'Southwest Vietnam', 'Indonesia',
       'Central El Salvador', 'Nicaragua', '“Big Island” Of Hawai’I',
       'Mexico', 'Rwanda', 'Haiti', 'Vietnam', 'Not Disclosed',
       'South-Central Kenya', 'Northern Nicaragua', 'Burundi', 'Honduras',
       'South-Central Ethiopia', 'Western Panama', 'Philippines',
       'Southern Colombia', 'Yemen', 'Huila Colombia', 'Southern',
       'Uganda', 'Africa', 'Apaneca Ilamatepec Mountain Range',
       'Democratic Republic Of Congo', 'Latin America',
       'Northern Burundi', 'Hawaii', 'Big Island Of Hawai‘I',
       'Central America', 'East Malaysia', 'The Philippines', 'Zambia',
       'Southwestern Tanzania', 'Thailand', 'Northern Tanzania',
       

In [32]:
# Many of them have a trailing '.'
coffee_origins['coffee_origin'] = coffee_origins['coffee_origin'].str.replace('.', '')
coffee_origins['coffee_origin'].unique()


  coffee_origins['coffee_origin'] = coffee_origins['coffee_origin'].str.replace('.', '')


array(['Bolivia', 'Ethiopia', 'Southern Ethiopia', 'Colombia',
       'Guatemala', 'Papua New Guinea', 'Northern Philippines',
       'Big Island Of Hawai’I', 'Brazil', 'Peru', 'El Salvador',
       'Tanzania', 'Kenya', 'Panama', 'Democratic Republic Of The Congo',
       'Costa Rica', 'Ecuador', 'Southwest Vietnam', 'Indonesia',
       'Central El Salvador', 'Nicaragua', '“Big Island” Of Hawai’I',
       'Mexico', 'Rwanda', 'Haiti', 'Vietnam', 'Not Disclosed',
       'South-Central Kenya', 'Northern Nicaragua', 'Burundi', 'Honduras',
       'South-Central Ethiopia', 'Western Panama', 'Philippines',
       'Southern Colombia', 'Yemen', 'Huila Colombia', 'Southern',
       'Uganda', 'Africa', 'Apaneca Ilamatepec Mountain Range',
       'Democratic Republic Of Congo', 'Latin America',
       'Northern Burundi', 'Hawaii', 'Big Island Of Hawai‘I',
       'Central America', 'East Malaysia', 'The Philippines', 'Zambia',
       'Southwestern Tanzania', 'Thailand', 'Northern Tanzania',
       

In [33]:
# Lots of these are hawaii...
coffee_hawaii = coffee_origins.loc[coffee_origins['coffee_origin'].str.contains('Hawai', na=False)]
coffee_hawaii['coffee_origin'].unique().tolist()


['Big Island Of Hawai’I',
 '“Big Island” Of Hawai’I',
 'Hawaii',
 'Big Island Of Hawai‘I',
 "“Big Island” Of Hawai'I",
 "Big Island Of Hawai'I",
 '“Big Island” Of Hawaii',
 'Big Island Of Hawaii',
 'Hawai’I',
 '"Big Island" Of Hawai’I',
 "Hawai'I",
 '"Big Island" Of Hawaii',
 'Hawaiian Islands',
 'Southwestern Corner Of The "Big Island" Of Hawaii']

In [34]:
coffee_origins = coffee_origins[~(coffee_origins['coffee_origin'].str.contains('Hawai', na=False))]
coffee_origins['coffee_origin'].unique()
# Great! We're getting to the point now where I should start removing the 'Southern's and 'West-Central's and all of that.


array(['Bolivia', 'Ethiopia', 'Southern Ethiopia', 'Colombia',
       'Guatemala', 'Papua New Guinea', 'Northern Philippines', 'Brazil',
       'Peru', 'El Salvador', 'Tanzania', 'Kenya', 'Panama',
       'Democratic Republic Of The Congo', 'Costa Rica', 'Ecuador',
       'Southwest Vietnam', 'Indonesia', 'Central El Salvador',
       'Nicaragua', 'Mexico', 'Rwanda', 'Haiti', 'Vietnam',
       'Not Disclosed', 'South-Central Kenya', 'Northern Nicaragua',
       'Burundi', 'Honduras', 'South-Central Ethiopia', 'Western Panama',
       'Philippines', 'Southern Colombia', 'Yemen', 'Huila Colombia',
       'Southern', 'Uganda', 'Africa',
       'Apaneca Ilamatepec Mountain Range',
       'Democratic Republic Of Congo', 'Latin America',
       'Northern Burundi', 'Central America', 'East Malaysia',
       'The Philippines', 'Zambia', 'Southwestern Tanzania', 'Thailand',
       'Northern Tanzania', 'Dominican Republic', 'Sumatra',
       'Central Kenya', 'South-Central Guatemala', 'Western R

In [35]:
# Let's look at all of the entries that contain 'South'.
south = coffee_origins[coffee_origins['coffee_origin'].str.contains('South', na=False)]
south['coffee_origin'].unique().tolist()


['Southern Ethiopia',
 'Southwest Vietnam',
 'South-Central Kenya',
 'South-Central Ethiopia',
 'Southern Colombia',
 'Southern',
 'Southwestern Tanzania',
 'South-Central Guatemala',
 'Southern Rwanda',
 'Southwestern Kenya',
 'Southwestern Ethiopia',
 'Southwestern Colombia',
 'South Africa',
 'Southeastern Ethiopia',
 'South America',
 'South-Eastern Ecuador',
 'Southern Ecuador',
 'Southeastern Brazil',
 'Southeastern El Salvador',
 'Southern Costa Rica',
 'South-Central Rwanda',
 'Southwestern Colombianar',
 'Southwest Ethiopia',
 'Southwest Uganda',
 'Southern Province,Rwanda',
 'South-Central Kenyamur',
 'Southern India',
 'South-Central Colombia',
 'Southwestern Mexico',
 'Southern And Western Ethiopia',
 'Southern Laos',
 'Souther Ethiopia',
 'Souther Ecuador',
 'South-Central Brazil',
 'Southern Tanzania',
 'Southern Peru',
 'Southeastern Mexico',
 'Southeastern Peru',
 'Southern Malawi',
 'South Central Guatemala',
 'Southwestern Guatemala',
 'Central And South America',
 'S

In [36]:
# Since only a few countries have two-word names, I'm going to temporarily convert them to one-word names
# until the end of this notebook.
double_dictionary = {'El Salvador': 'ElSalvador', 'Costa Rica': 'CostaRica', 'Papua New Guinea' : 'PapuaNewGuinea', 'Democratic Republic Of The Congo' : 'Congo', 'South Africa' : 'SouthAfrica'}

for key in double_dictionary.keys():
    coffee_origins['coffee_origin'] = coffee_origins['coffee_origin'].str.replace(key, double_dictionary[key])

coffee_origins['coffee_origin'].unique()


array(['Bolivia', 'Ethiopia', 'Southern Ethiopia', 'Colombia',
       'Guatemala', 'PapuaNewGuinea', 'Northern Philippines', 'Brazil',
       'Peru', 'ElSalvador', 'Tanzania', 'Kenya', 'Panama', 'Congo',
       'CostaRica', 'Ecuador', 'Southwest Vietnam', 'Indonesia',
       'Central ElSalvador', 'Nicaragua', 'Mexico', 'Rwanda', 'Haiti',
       'Vietnam', 'Not Disclosed', 'South-Central Kenya',
       'Northern Nicaragua', 'Burundi', 'Honduras',
       'South-Central Ethiopia', 'Western Panama', 'Philippines',
       'Southern Colombia', 'Yemen', 'Huila Colombia', 'Southern',
       'Uganda', 'Africa', 'Apaneca Ilamatepec Mountain Range',
       'Democratic Republic Of Congo', 'Latin America',
       'Northern Burundi', 'Central America', 'East Malaysia',
       'The Philippines', 'Zambia', 'Southwestern Tanzania', 'Thailand',
       'Northern Tanzania', 'Dominican Republic', 'Sumatra',
       'Central Kenya', 'South-Central Guatemala', 'Western Rwanda',
       'Gedeo Zone', 'Taiwan', 

In [37]:
cardinal_varieties = ['Southern', 'Central', 'Southwestern', 'Southeastern', 'South-Central', 'Souther', 'Southwest'
                    , 'Northern', 'Northwestern', 'Northeastern', 'North-Central', 'Norther', 'Northwest',
                     'Eastern', 'East', 'Far', 'Western', 'West', '-', 'Not Disclosed']
for item in cardinal_varieties:
    coffee_origins['coffee_origin'] = coffee_origins['coffee_origin'].str.replace(item, '')
# coffee_origins['coffee_origin'] = coffee_origins['coffee_origin'].str.replace('Southern ', '')
coffee_origins['coffee_origin'].unique().tolist()


['Bolivia',
 'Ethiopia',
 ' Ethiopia',
 'Colombia',
 'Guatemala',
 'PapuaNewGuinea',
 ' Philippines',
 'Brazil',
 'Peru',
 'ElSalvador',
 'Tanzania',
 'Kenya',
 'Panama',
 'Congo',
 'CostaRica',
 'Ecuador',
 ' Vietnam',
 'Indonesia',
 ' ElSalvador',
 'Nicaragua',
 'Mexico',
 'Rwanda',
 'Haiti',
 'Vietnam',
 '',
 'South Kenya',
 ' Nicaragua',
 'Burundi',
 'Honduras',
 'South Ethiopia',
 ' Panama',
 'Philippines',
 ' Colombia',
 'Yemen',
 'Huila Colombia',
 'Uganda',
 'Africa',
 'Apaneca Ilamatepec Mountain Range',
 'Democratic Republic Of Congo',
 'Latin America',
 ' Burundi',
 ' America',
 ' Malaysia',
 'The Philippines',
 'Zambia',
 ' Tanzania',
 'Thailand',
 'Dominican Republic',
 'Sumatra',
 ' Kenya',
 'South Guatemala',
 ' Rwanda',
 'Gedeo Zone',
 'Taiwan',
 ' Peru',
 'Madagascar',
 ' Guatemala',
 'Drc Congo',
 'Nepal',
 'The Congo',
 'China',
 'SouthAfrica',
 '  Panama',
 'South America',
 ' Bolivia',
 'North Ecuador',
 'South Ecuador',
 'North Peru',
 ' Ecuador',
 'Harar (Also Ha

In [38]:
# This data is definitely in need of some stripping again.
coffee_origins['coffee_origin'] = coffee_origins['coffee_origin'].str.strip()
coffee_origins['coffee_origin'].unique().tolist()
# We're getting closer! 


['Bolivia',
 'Ethiopia',
 'Colombia',
 'Guatemala',
 'PapuaNewGuinea',
 'Philippines',
 'Brazil',
 'Peru',
 'ElSalvador',
 'Tanzania',
 'Kenya',
 'Panama',
 'Congo',
 'CostaRica',
 'Ecuador',
 'Vietnam',
 'Indonesia',
 'Nicaragua',
 'Mexico',
 'Rwanda',
 'Haiti',
 '',
 'South Kenya',
 'Burundi',
 'Honduras',
 'South Ethiopia',
 'Yemen',
 'Huila Colombia',
 'Uganda',
 'Africa',
 'Apaneca Ilamatepec Mountain Range',
 'Democratic Republic Of Congo',
 'Latin America',
 'America',
 'Malaysia',
 'The Philippines',
 'Zambia',
 'Thailand',
 'Dominican Republic',
 'Sumatra',
 'South Guatemala',
 'Gedeo Zone',
 'Taiwan',
 'Madagascar',
 'Drc Congo',
 'Nepal',
 'The Congo',
 'China',
 'SouthAfrica',
 'South America',
 'North Ecuador',
 'South Ecuador',
 'North Peru',
 'Harar (Also Harrar) Growing Region Of  Ethiopia',
 'Highlands,PapuaNewGuinea',
 'Jamaica',
 'North Yemen',
 'Brazil And Four Other Origins',
 'Kenya Gatugi Ab',
 'South Rwanda',
 'Center Colombia',
 'Ethiopia Natural Yirgacheffe Ad

In [39]:
# And while we're at it, I'm going to remove a few of these that don't belong or don't make sense:
nuh_uh = ['Indonesia And  America','Indonesia And CostaRica','15% Sumatra','Puerto Rico','Latin America',
 'Africa', '','America','South America','Brazil And Four Other Origins','Colombia And Ethiopia','And  Africa','Various Latin American Origins','And South America','Asia And Latin America','Usa',]
for item in nuh_uh:
    coffee_origins = coffee_origins[~(coffee_origins['coffee_origin'] == item)]
# coffee_origins = coffee_origins[~(coffee_origins['coffee_origin'].str.contains('South America', na=False))]
# coffee_origins = coffee_origins[~(coffee_origins['coffee_origin'] == 'Southern')]
coffee_origins['coffee_origin'].unique().tolist()


['Bolivia',
 'Ethiopia',
 'Colombia',
 'Guatemala',
 'PapuaNewGuinea',
 'Philippines',
 'Brazil',
 'Peru',
 'ElSalvador',
 'Tanzania',
 'Kenya',
 'Panama',
 'Congo',
 'CostaRica',
 'Ecuador',
 'Vietnam',
 'Indonesia',
 'Nicaragua',
 'Mexico',
 'Rwanda',
 'Haiti',
 'South Kenya',
 'Burundi',
 'Honduras',
 'South Ethiopia',
 'Yemen',
 'Huila Colombia',
 'Uganda',
 'Apaneca Ilamatepec Mountain Range',
 'Democratic Republic Of Congo',
 'Malaysia',
 'The Philippines',
 'Zambia',
 'Thailand',
 'Dominican Republic',
 'Sumatra',
 'South Guatemala',
 'Gedeo Zone',
 'Taiwan',
 'Madagascar',
 'Drc Congo',
 'Nepal',
 'The Congo',
 'China',
 'SouthAfrica',
 'North Ecuador',
 'South Ecuador',
 'North Peru',
 'Harar (Also Harrar) Growing Region Of  Ethiopia',
 'Highlands,PapuaNewGuinea',
 'Jamaica',
 'North Yemen',
 'Kenya Gatugi Ab',
 'South Rwanda',
 'Center Colombia',
 'Ethiopia Natural Yirgacheffe Adado Shara',
 'Colombianar',
 'Province,Rwanda',
 'North Nicaragua',
 'South Kenyamur',
 'India',
 

In [40]:
# Time to start grouping countries we see a lot in this list.
kenyas = coffee_origins[coffee_origins['coffee_origin'].str.contains('enya', na=False)]
kenyas['coffee_origin'].unique().tolist()


['Kenya', 'South Kenya', 'Kenya Gatugi Ab', 'South Kenyamur']

In [41]:
for item in kenyas['coffee_origin'].unique().tolist():
    coffee_origins['coffee_origin'] = coffee_origins['coffee_origin'].str.replace(item, 'Kenya')
    
coffee_origins['coffee_origin'].unique().tolist()
# Hmmm.. Looks like 'Kenyamur' didn't end up in the kenya list for some reason.
coffee_origins['coffee_origin'] = coffee_origins['coffee_origin'].str.replace('Kenyamur', 'Kenya')


In [42]:
guatemalas = coffee_origins[coffee_origins['coffee_origin'].str.contains('Guatemala', na=False)]
brazils = coffee_origins[coffee_origins['coffee_origin'].str.contains('Brazil', na=False)]
ethiopias = coffee_origins[coffee_origins['coffee_origin'].str.contains('Ethiopia', na=False)]
columbias = coffee_origins[coffee_origins['coffee_origin'].str.contains('Columbia', na=False)]
nicaraguas = coffee_origins[coffee_origins['coffee_origin'].str.contains('Nicaragua', na=False)]
ecuadors = coffee_origins[coffee_origins['coffee_origin'].str.contains('Ecuador', na=False)]
perus = coffee_origins[coffee_origins['coffee_origin'].str.contains('Peru', na=False)]
yemens = coffee_origins[coffee_origins['coffee_origin'].str.contains('Yemen', na=False)]
rwandas = coffee_origins[coffee_origins['coffee_origin'].str.contains('Rwanda', na=False)]
yirgacheffes = coffee_origins[coffee_origins['coffee_origin'].str.contains('Yirgacheffe', na=False)]

for item in guatemalas['coffee_origin'].unique().tolist():
    coffee_origins['coffee_origin'] = coffee_origins['coffee_origin'].str.replace(item, 'Guatemala')
for item in brazils['coffee_origin'].unique().tolist():
    coffee_origins['coffee_origin'] = coffee_origins['coffee_origin'].str.replace(item, 'Brazil')
for item in ethiopias['coffee_origin'].unique().tolist():
    coffee_origins['coffee_origin'] = coffee_origins['coffee_origin'].str.replace(item, 'Ethiopia')
for item in columbias['coffee_origin'].unique().tolist():
    coffee_origins['coffee_origin'] = coffee_origins['coffee_origin'].str.replace(item, 'Columbia')
for item in nicaraguas['coffee_origin'].unique().tolist():
    coffee_origins['coffee_origin'] = coffee_origins['coffee_origin'].str.replace(item, 'Nicaragua')
for item in ecuadors['coffee_origin'].unique().tolist():
    coffee_origins['coffee_origin'] = coffee_origins['coffee_origin'].str.replace(item, 'Ecaudor')
for item in perus['coffee_origin'].unique().tolist():
    coffee_origins['coffee_origin'] = coffee_origins['coffee_origin'].str.replace(item, 'Peru')
for item in yemens['coffee_origin'].unique().tolist():
    coffee_origins['coffee_origin'] = coffee_origins['coffee_origin'].str.replace(item, 'Yemen')
for item in rwandas['coffee_origin'].unique().tolist():
    coffee_origins['coffee_origin'] = coffee_origins['coffee_origin'].str.replace(item, 'Rwanda')
for item in yirgacheffes['coffee_origin'].unique().tolist():
    coffee_origins['coffee_origin'] = coffee_origins['coffee_origin'].str.replace(item, 'Ethiopia')

coffee_origins['coffee_origin'].unique().tolist()


  coffee_origins['coffee_origin'] = coffee_origins['coffee_origin'].str.replace(item, 'Ethiopia')


['Bolivia',
 'Ethiopia',
 'Colombia',
 'Guatemala',
 'PapuaNewGuinea',
 'Philippines',
 'Brazil',
 'Peru',
 'ElSalvador',
 'Tanzania',
 'Kenya',
 'Panama',
 'Congo',
 'CostaRica',
 'Ecaudor',
 'Vietnam',
 'Indonesia',
 'Nicaragua',
 'Mexico',
 'Rwanda',
 'Haiti',
 'Burundi',
 'Honduras',
 'Yemen',
 'Huila Colombia',
 'Uganda',
 'Apaneca Ilamatepec Mountain Range',
 'Democratic Republic Of Congo',
 'Malaysia',
 'The Philippines',
 'Zambia',
 'Thailand',
 'Dominican Republic',
 'Sumatra',
 'Gedeo Zone',
 'Taiwan',
 'Madagascar',
 'Drc Congo',
 'Nepal',
 'The Congo',
 'China',
 'SouthAfrica',
 'North Ecaudor',
 'South Ecaudor',
 'Harar (Also Harrar) Growing Region Of  Ethiopia',
 'Highlands,PapuaNewGuinea',
 'Jamaica',
 'Center Colombia',
 'Colombianar',
 'India',
 'Colombia Aromas Del Sur Palestina MicroLot',
 'Zimbabwe',
 'Myanmar',
 'South Colombia',
 'Laos',
 'Highlands Of PapuaNewGuinea',
 'Malawi',
 'Minas Gerais State',
 'Java']

In [43]:
# I missed some alternative spellings!

colombias = coffee_origins[coffee_origins['coffee_origin'].str.contains('Colombia', na=False)]
for item in colombias['coffee_origin'].unique().tolist():
    coffee_origins['coffee_origin'] = coffee_origins['coffee_origin'].str.replace(item, 'Colombia')
ecaudors = coffee_origins[coffee_origins['coffee_origin'].str.contains('Ecaudor', na=False)]
for item in ecaudors['coffee_origin'].unique().tolist():
    coffee_origins['coffee_origin'] = coffee_origins['coffee_origin'].str.replace(item, 'Ecuador')
pngs = coffee_origins[coffee_origins['coffee_origin'].str.contains('PapuaNewGuinea', na=False)]
for item in pngs['coffee_origin'].unique().tolist():
    coffee_origins['coffee_origin'] = coffee_origins['coffee_origin'].str.replace(item, 'PapuaNewGuinea')
congo = coffee_origins[coffee_origins['coffee_origin'].str.contains('Congo', na=False)]
for item in congo['coffee_origin'].unique().tolist():
    coffee_origins['coffee_origin'] = coffee_origins['coffee_origin'].str.replace(item, 'Congo')
    
    
# And a few that need extra attention:
coffee_origins['coffee_origin'] = coffee_origins['coffee_origin'].str.replace('Harar (Also Harrar) Growing Region Of  Ethiopia', 'Ethiopia')
coffee_origins['coffee_origin'] = coffee_origins['coffee_origin'].str.replace('The Philippines', 'Philippines')

# And here are a few areas I had to look up:
coffee_origins['coffee_origin'] = coffee_origins['coffee_origin'].str.replace('Apaneca Ilamatepec Mountain Range', 'ElSalvador')
coffee_origins['coffee_origin'] = coffee_origins['coffee_origin'].str.replace('Minas Gerais State', 'Brazil')
coffee_origins['coffee_origin'] = coffee_origins['coffee_origin'].str.replace('Gedeo Zone', 'Ethiopia')

coffee_origins['coffee_origin'].unique().tolist()


  coffee_origins['coffee_origin'] = coffee_origins['coffee_origin'].str.replace('Harar (Also Harrar) Growing Region Of  Ethiopia', 'Ethiopia')


['Bolivia',
 'Ethiopia',
 'Colombia',
 'Guatemala',
 'PapuaNewGuinea',
 'Philippines',
 'Brazil',
 'Peru',
 'ElSalvador',
 'Tanzania',
 'Kenya',
 'Panama',
 'Congo',
 'CostaRica',
 'Ecuador',
 'Vietnam',
 'Indonesia',
 'Nicaragua',
 'Mexico',
 'Rwanda',
 'Haiti',
 'Burundi',
 'Honduras',
 'Yemen',
 'Uganda',
 'Malaysia',
 'Zambia',
 'Thailand',
 'Dominican Republic',
 'Sumatra',
 'Taiwan',
 'Madagascar',
 'Nepal',
 'China',
 'SouthAfrica',
 'North Ecuador',
 'South Ecuador',
 'Harar (Also Harrar) Growing Region Of  Ethiopia',
 'Jamaica',
 'India',
 'Zimbabwe',
 'Myanmar',
 'Laos',
 'Malawi',
 'Java']

In [44]:
# Hmm..
coffee_origins['coffee_origin'] = coffee_origins['coffee_origin'].str.replace('Dominican Republic', 'DominicanRepublic')

coffee_origins['coffee_origin'].unique().tolist()


['Bolivia',
 'Ethiopia',
 'Colombia',
 'Guatemala',
 'PapuaNewGuinea',
 'Philippines',
 'Brazil',
 'Peru',
 'ElSalvador',
 'Tanzania',
 'Kenya',
 'Panama',
 'Congo',
 'CostaRica',
 'Ecuador',
 'Vietnam',
 'Indonesia',
 'Nicaragua',
 'Mexico',
 'Rwanda',
 'Haiti',
 'Burundi',
 'Honduras',
 'Yemen',
 'Uganda',
 'Malaysia',
 'Zambia',
 'Thailand',
 'DominicanRepublic',
 'Sumatra',
 'Taiwan',
 'Madagascar',
 'Nepal',
 'China',
 'SouthAfrica',
 'North Ecuador',
 'South Ecuador',
 'Harar (Also Harrar) Growing Region Of  Ethiopia',
 'Jamaica',
 'India',
 'Zimbabwe',
 'Myanmar',
 'Laos',
 'Malawi',
 'Java']

In [45]:
coffee_origins['coffee_origin'] = coffee_origins['coffee_origin'].str.split(' ')
coffee_origins['coffee_origin'] = coffee_origins['coffee_origin'].str[-1]

coffee_origins['coffee_origin'].unique().tolist()


['Bolivia',
 'Ethiopia',
 'Colombia',
 'Guatemala',
 'PapuaNewGuinea',
 'Philippines',
 'Brazil',
 'Peru',
 'ElSalvador',
 'Tanzania',
 'Kenya',
 'Panama',
 'Congo',
 'CostaRica',
 'Ecuador',
 'Vietnam',
 'Indonesia',
 'Nicaragua',
 'Mexico',
 'Rwanda',
 'Haiti',
 'Burundi',
 'Honduras',
 'Yemen',
 'Uganda',
 'Malaysia',
 'Zambia',
 'Thailand',
 'DominicanRepublic',
 'Sumatra',
 'Taiwan',
 'Madagascar',
 'Nepal',
 'China',
 'SouthAfrica',
 'Jamaica',
 'India',
 'Zimbabwe',
 'Myanmar',
 'Laos',
 'Malawi',
 'Java']

In [46]:
# Yay! Now I'll un-squash all of the countries with multi-word names and save this as a csv.
coffee_origins['coffee_origin'] = coffee_origins['coffee_origin'].str.replace('DominicanRepublic', 'Dominican Republic')
coffee_origins['coffee_origin'] = coffee_origins['coffee_origin'].str.replace('SouthAfrica', 'South Africa')
coffee_origins['coffee_origin'] = coffee_origins['coffee_origin'].str.replace('CostaRica', 'Costa Rica')
coffee_origins['coffee_origin'] = coffee_origins['coffee_origin'].str.replace('ElSalvador', 'El Salvador')
coffee_origins['coffee_origin'] = coffee_origins['coffee_origin'].str.replace('PapuaNewGuinea', 'Papua New Guinea')
coffee_origins.head()


Unnamed: 0,title,rating,aftertaste,aroma,body,flavor,blind_assessment,coffee_origin,est_price,roast_level,roaster,roaster_location,dollars_per_ounce
0,Bolivia Manantial Gesha,93,8.0,9,8,9,"Richly aromatic, floral-toned. Magnolia, cocoa...",Bolivia,30.00/12 ounces,Medium-Light,Red Rooster Coffee Roaster,"Floyd, Virginia",2.5
1,Ethiopia Gera Genji Challa,94,8.0,9,9,9,"Delicately aromatic, complex. Lilac, cocoa nib...",Ethiopia,28.00/12 ounces,Medium-Light,Mostra Coffee,"San Diego, California",2.333333
2,Yirgacheffe Mengesha Natural,94,8.0,9,9,9,"High-toned, fruit-driven. Boysenberry, pear, c...",Ethiopia,20.50/12 ounces,Medium-Light,Regent Coffee,"Glendale, California",1.708333
3,Tropical Summer Colombia La Sierra,93,8.0,9,8,9,"Fruit-driven, crisply chocolaty. Goji berry, d...",Colombia,18.99/8 ounces,Medium-Light,Merge Coffee Company,"Harrisonburg, Virginia",2.37375
4,Tinamit Tolimán,93,8.0,9,9,9,"Deeply sweet-tart, chocolate-toned. Dark choco...",Guatemala,16.00/12 ounces,Medium-Light,El Gran Cafe,"Antigua, Guatemala",1.333333


In [47]:
coffee_origins['title'] = coffee_origins['title'].str.replace(',', '')
coffee_origins['blind_assessment'] = coffee_origins['blind_assessment'].str.replace(',', '')
coffee_origins['roaster'] = coffee_origins['roaster'].str.replace(',', '')
coffee_origins['roaster_location'] = coffee_origins['roaster_location'].str.replace(',', '')
coffee_origins.head()


Unnamed: 0,title,rating,aftertaste,aroma,body,flavor,blind_assessment,coffee_origin,est_price,roast_level,roaster,roaster_location,dollars_per_ounce
0,Bolivia Manantial Gesha,93,8.0,9,8,9,Richly aromatic floral-toned. Magnolia cocoa n...,Bolivia,30.00/12 ounces,Medium-Light,Red Rooster Coffee Roaster,Floyd Virginia,2.5
1,Ethiopia Gera Genji Challa,94,8.0,9,9,9,Delicately aromatic complex. Lilac cocoa nib p...,Ethiopia,28.00/12 ounces,Medium-Light,Mostra Coffee,San Diego California,2.333333
2,Yirgacheffe Mengesha Natural,94,8.0,9,9,9,High-toned fruit-driven. Boysenberry pear coco...,Ethiopia,20.50/12 ounces,Medium-Light,Regent Coffee,Glendale California,1.708333
3,Tropical Summer Colombia La Sierra,93,8.0,9,8,9,Fruit-driven crisply chocolaty. Goji berry dri...,Colombia,18.99/8 ounces,Medium-Light,Merge Coffee Company,Harrisonburg Virginia,2.37375
4,Tinamit Tolimán,93,8.0,9,9,9,Deeply sweet-tart chocolate-toned. Dark chocol...,Guatemala,16.00/12 ounces,Medium-Light,El Gran Cafe,Antigua Guatemala,1.333333


In [48]:
coffee_origins.to_csv('coffee_origins_tableau.csv')

