In [1]:
# The goal of this notebook will be to clean the 'coffee_origin' data in order to focus on 
# countries of origin for sourcing our coffee beans.
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

coffee_origins = pd.read_csv('clean_coffee.csv', index_col=False)
coffee_origins = coffee_origins.drop('Unnamed: 0', axis=1)
coffee_origins.head()


Unnamed: 0,title,rating,aftertaste,aroma,body,flavor,blind_assessment,coffee_origin,est_price,roast_level,roaster,roaster_location,dollars_per_ounce
0,Bolivia Manantial Gesha,93,8.0,9,8,9,"Richly aromatic, floral-toned. Magnolia, cocoa...","Caranavi, Bolivia",30.00/12 ounces,Medium-Light,Red Rooster Coffee Roaster,"Floyd, Virginia",2.5
1,Ethiopia Gera Genji Challa,94,8.0,9,9,9,"Delicately aromatic, complex. Lilac, cocoa nib...","Agaro Gera, Jimma Zone, Oromia State, Ethiopia",28.00/12 ounces,Medium-Light,Mostra Coffee,"San Diego, California",2.333333
2,Yirgacheffe Mengesha Natural,94,8.0,9,9,9,"High-toned, fruit-driven. Boysenberry, pear, c...","Yirgacheffe growing region, southern Ethiopia",20.50/12 ounces,Medium-Light,Regent Coffee,"Glendale, California",1.708333
3,Tropical Summer Colombia La Sierra,93,8.0,9,8,9,"Fruit-driven, crisply chocolaty. Goji berry, d...","La Sierra, Cauca Department, Colombia",18.99/8 ounces,Medium-Light,Merge Coffee Company,"Harrisonburg, Virginia",2.37375
4,Tinamit Tolimán,93,8.0,9,9,9,"Deeply sweet-tart, chocolate-toned. Dark choco...","San Lucas Tolimán, Lake Atitlán growing region...",16.00/12 ounces,Medium-Light,El Gran Cafe,"Antigua, Guatemala",1.333333


In [2]:
# It looks like the 'coffee_origin' column tends to have the country name as the last item listed.
coffee_origins['coffee_origin'] = coffee_origins['coffee_origin'].str.split(', ')
coffee_origins['coffee_origin'] = coffee_origins['coffee_origin'].str[-1]
coffee_origins['coffee_origin'].head(15)


0                   Bolivia
1                  Ethiopia
2         southern Ethiopia
3                  Colombia
4                 Guatemala
5                  Ethiopia
6                 Guatemala
7                  Colombia
8          Papua New Guinea
9         southern Ethiopia
10     northern Philippines
11                 Colombia
12                 Colombia
13        southern Ethiopia
14    Big Island of Hawai’i
Name: coffee_origin, dtype: object

In [3]:
coffee_origins['coffee_origin'].unique()
# Whew! That's still a lot! Time to clean it up.


array(['Bolivia', 'Ethiopia', 'southern Ethiopia', 'Colombia',
       'Guatemala', 'Papua New Guinea', 'northern Philippines',
       'Big Island of Hawai’i', 'Brazil', 'Peru', 'Ethiopia; Sumatra',
       'El Salvador', 'Southern Ethiopia', 'Tanzania', 'Kenya', 'Panama',
       'Democratic Republic of the Congo', 'Costa Rica', 'Ecuador',
       'southwest Vietnam', 'Indonesia', 'central El Salvador',
       'Nicaragua', '“Big Island” of Hawai’i', 'Mexico', 'Rwanda',
       'Haiti', 'Dominican Republic; Honduras', 'Vietnam',
       'Thailand; Vietnam; Latin America', 'Not disclosed',
       'south-central Kenya', 'northern Nicaragua', 'Burundi',
       'Guatemala; Nicaragua; Ethiopia; Costa Rica; Indonesia; Brazil',
       'Colombia; Ethiopia', 'Honduras', 'Colombia; Guatemala; Mexico',
       'Guatemala; Ethiopia; Colombia', 'south-central Ethiopia',
       'western Panama', 'Philippines', 'southern Colombia',
       'Colombia; Guatemala; Costa Rica; Ethiopia', 'Yemen',
       'Huila C

In [4]:
# I'll start with stripping any whitespace and putting everything in title case.
coffee_origins['coffee_origin'] = coffee_origins['coffee_origin'].str.strip().str.title()
coffee_origins['coffee_origin'].unique()



array(['Bolivia', 'Ethiopia', 'Southern Ethiopia', 'Colombia',
       'Guatemala', 'Papua New Guinea', 'Northern Philippines',
       'Big Island Of Hawai’I', 'Brazil', 'Peru', 'Ethiopia; Sumatra',
       'El Salvador', 'Tanzania', 'Kenya', 'Panama',
       'Democratic Republic Of The Congo', 'Costa Rica', 'Ecuador',
       'Southwest Vietnam', 'Indonesia', 'Central El Salvador',
       'Nicaragua', '“Big Island” Of Hawai’I', 'Mexico', 'Rwanda',
       'Haiti', 'Dominican Republic; Honduras', 'Vietnam',
       'Thailand; Vietnam; Latin America', 'Not Disclosed',
       'South-Central Kenya', 'Northern Nicaragua', 'Burundi',
       'Guatemala; Nicaragua; Ethiopia; Costa Rica; Indonesia; Brazil',
       'Colombia; Ethiopia', 'Honduras', 'Colombia; Guatemala; Mexico',
       'Guatemala; Ethiopia; Colombia', 'South-Central Ethiopia',
       'Western Panama', 'Philippines', 'Southern Colombia',
       'Colombia; Guatemala; Costa Rica; Ethiopia', 'Yemen',
       'Huila Colombia', 'Southern',

In [10]:
# It looks like many of these are blends of beans from several different regions, separated
# by a ';'. I'm not going to include those in my analysis, so I'll filter them out.
coffee_origins = coffee_origins[~(coffee_origins['coffee_origin'].str.contains(';', na=False))]
coffee_origins['coffee_origin'].unique()


array(['Bolivia', 'Ethiopia', 'Southern Ethiopia', 'Colombia',
       'Guatemala', 'Papua New Guinea', 'Northern Philippines',
       'Big Island Of Hawai’I', 'Brazil', 'Peru', nan, 'El Salvador',
       'Tanzania', 'Kenya', 'Panama', 'Democratic Republic Of The Congo',
       'Costa Rica', 'Ecuador', 'Southwest Vietnam', 'Indonesia',
       'Central El Salvador', 'Nicaragua', '“Big Island” Of Hawai’I',
       'Mexico', 'Rwanda', 'Haiti', 'Vietnam', 'Not Disclosed',
       'South-Central Kenya', 'Northern Nicaragua', 'Burundi', 'Honduras',
       'South-Central Ethiopia', 'Western Panama', 'Philippines',
       'Southern Colombia', 'Yemen', 'Huila Colombia', 'Southern',
       'Uganda', 'Africa', 'Apaneca Ilamatepec Mountain Range',
       'Democratic Republic Of Congo', 'Latin America',
       'Northern Burundi', 'Hawaii', 'Big Island Of Hawai‘I',
       'Central America', 'East Malaysia', 'The Philippines', 'Zambia',
       'Southwestern Tanzania', 'Thailand', 'Northern Tanzania',
  

In [11]:
# Many of them have a trailing '.'
coffee_origins['coffee_origin'] = coffee_origins['coffee_origin'].str.replace('.', '')
coffee_origins['coffee_origin'].unique()


  coffee_origins['coffee_origin'] = coffee_origins['coffee_origin'].str.replace('.', '')


array(['Bolivia', 'Ethiopia', 'Southern Ethiopia', 'Colombia',
       'Guatemala', 'Papua New Guinea', 'Northern Philippines',
       'Big Island Of Hawai’I', 'Brazil', 'Peru', nan, 'El Salvador',
       'Tanzania', 'Kenya', 'Panama', 'Democratic Republic Of The Congo',
       'Costa Rica', 'Ecuador', 'Southwest Vietnam', 'Indonesia',
       'Central El Salvador', 'Nicaragua', '“Big Island” Of Hawai’I',
       'Mexico', 'Rwanda', 'Haiti', 'Vietnam', 'Not Disclosed',
       'South-Central Kenya', 'Northern Nicaragua', 'Burundi', 'Honduras',
       'South-Central Ethiopia', 'Western Panama', 'Philippines',
       'Southern Colombia', 'Yemen', 'Huila Colombia', 'Southern',
       'Uganda', 'Africa', 'Apaneca Ilamatepec Mountain Range',
       'Democratic Republic Of Congo', 'Latin America',
       'Northern Burundi', 'Hawaii', 'Big Island Of Hawai‘I',
       'Central America', 'East Malaysia', 'The Philippines', 'Zambia',
       'Southwestern Tanzania', 'Thailand', 'Northern Tanzania',
  

In [14]:
# Lots of these are hawaii...
coffee_hawaii = coffee_origins.loc[coffee_origins['coffee_origin'].str.contains('Hawai', na=False)]
coffee_hawaii['coffee_origin'].unique().tolist()


['Big Island Of Hawai’I',
 '“Big Island” Of Hawai’I',
 'Hawaii',
 'Big Island Of Hawai‘I',
 "“Big Island” Of Hawai'I",
 "Big Island Of Hawai'I",
 '“Big Island” Of Hawaii',
 'Big Island Of Hawaii',
 'Hawai’I',
 '"Big Island" Of Hawai’I',
 "Hawai'I",
 '"Big Island" Of Hawaii',
 'Hawaiian Islands',
 'Southwestern Corner Of The "Big Island" Of Hawaii']

In [15]:
coffee_origins = coffee_origins[~(coffee_origins['coffee_origin'].str.contains('Hawai', na=False))]
coffee_origins['coffee_origin'].unique()
# Great! We're getting to the point now where I should start removing the 'Southern's and 'West-Central's and all of that.


array(['Bolivia', 'Ethiopia', 'Southern Ethiopia', 'Colombia',
       'Guatemala', 'Papua New Guinea', 'Northern Philippines', 'Brazil',
       'Peru', nan, 'El Salvador', 'Tanzania', 'Kenya', 'Panama',
       'Democratic Republic Of The Congo', 'Costa Rica', 'Ecuador',
       'Southwest Vietnam', 'Indonesia', 'Central El Salvador',
       'Nicaragua', 'Mexico', 'Rwanda', 'Haiti', 'Vietnam',
       'Not Disclosed', 'South-Central Kenya', 'Northern Nicaragua',
       'Burundi', 'Honduras', 'South-Central Ethiopia', 'Western Panama',
       'Philippines', 'Southern Colombia', 'Yemen', 'Huila Colombia',
       'Southern', 'Uganda', 'Africa',
       'Apaneca Ilamatepec Mountain Range',
       'Democratic Republic Of Congo', 'Latin America',
       'Northern Burundi', 'Central America', 'East Malaysia',
       'The Philippines', 'Zambia', 'Southwestern Tanzania', 'Thailand',
       'Northern Tanzania', 'Dominican Republic', 'Sumatra',
       'Central Kenya', 'South-Central Guatemala', 'West

In [16]:
# Let's look at all of the entries that contain 'South'.
south = coffee_origins[coffee_origins['coffee_origin'].str.contains('South', na=False)]
south['coffee_origin'].unique().tolist()


['Southern Ethiopia',
 'Southwest Vietnam',
 'South-Central Kenya',
 'South-Central Ethiopia',
 'Southern Colombia',
 'Southern',
 'Southwestern Tanzania',
 'South-Central Guatemala',
 'Southern Rwanda',
 'Southwestern Kenya',
 'Southwestern Ethiopia',
 'Southwestern Colombia',
 'South Africa',
 'Southeastern Ethiopia',
 'South America',
 'South-Eastern Ecuador',
 'Southern Ecuador',
 'Southeastern Brazil',
 'Southeastern El Salvador',
 'Southern Costa Rica',
 'South-Central Rwanda',
 'Southwestern Colombianar',
 'Southwest Ethiopia',
 'Southwest Uganda',
 'Southern Province,Rwanda',
 'South-Central Kenyamur',
 'Southern India',
 'South-Central Colombia',
 'Southwestern Mexico',
 'Southern And Western Ethiopia',
 'Southern Laos',
 'Souther Ethiopia',
 'Souther Ecuador',
 'South-Central Brazil',
 'Southern Tanzania',
 'Southern Peru',
 'Southeastern Mexico',
 'Southeastern Peru',
 'Southern Malawi',
 'South Central Guatemala',
 'Southwestern Guatemala',
 'Central And South America',
 'S

In [28]:
cardinal_varieties = ['Souther', 'Central', 'Southwestern', 'Southeastern', 'South-Central', 'Southern', 'Southwest']
for item in cardinal_varieties:
    coffee_origins['coffee_origin'] = coffee_origins['coffee_origin'].str.replace(item, '')
# coffee_origins['coffee_origin'] = coffee_origins['coffee_origin'].str.replace('Southern ', '')
coffee_origins['coffee_origin'].unique().tolist()


['Bolivia',
 'Ethiopia',
 'Colombia',
 'Guatemala',
 'Papua New Guinea',
 'Northern Philippines',
 'Brazil',
 'Peru',
 nan,
 'ElSalvador',
 'Tanzania',
 'Kenya',
 'Panama',
 'Democratic Republic Of The Congo',
 'Costa Rica',
 'Ecuador',
 ' Vietnam',
 'Indonesia',
 ' ElSalvador',
 'Nicaragua',
 'Mexico',
 'Rwanda',
 'Haiti',
 'Vietnam',
 'Not Disclosed',
 'South- Kenya',
 'Northern Nicaragua',
 'Burundi',
 'Honduras',
 'South- Ethiopia',
 'Western Panama',
 'Philippines',
 'Yemen',
 'Huila Colombia',
 'Uganda',
 'Africa',
 'Apaneca Ilamatepec Mountain Range',
 'Democratic Republic Of Congo',
 'Latin America',
 'Northern Burundi',
 ' America',
 'East Malaysia',
 'The Philippines',
 'Zambia',
 ' Tanzania',
 'Thailand',
 'Northern Tanzania',
 'Dominican Republic',
 'Sumatra',
 ' Kenya',
 'South- Guatemala',
 'Western Rwanda',
 'Gedeo Zone',
 'Taiwan',
 'Northern Peru',
 ' Ethiopia',
 'Madagascar',
 ' Colombia',
 'Northern Guatemala',
 'Drc Congo',
 'Nepal',
 ' Rwanda',
 'The Democratic Rep

In [29]:
# Since only a few countries have two-word names, I'm going to temporarily convert them to one-word names
# until the end of this notebook.
double_dictionary = {'El Salvador': 'ElSalvador', 'Costa Rica': 'CostaRica', 'Papua New Guinea' : 'PapuaNewGuinea', 'Democratic Republic Of The Congo' : 'Congo'}

coffee_origins['coffee_origin'] = coffee_origins['coffee_origin'].str.replace(double_dictionary)
# And while we're at it, I'm going to remove a few of these that don't belong or don't make sense:
coffee_origins = coffee_origins[~(coffee_origins['coffee_origin'].str.contains('South America', na=False))]
coffee_origins = coffee_origins[~(coffee_origins['coffee_origin'] == 'Southern')]
coffee_origins['coffee_origin'].unique()


TypeError: replace() missing 1 required positional argument: 'repl'