# Import Libraries

In [1]:
import pandas as pd
import numpy as np
import requests, re
from bs4 import BeautifulSoup 

# Brief Introduction

The purpose of this OCR tool is to recommend related coffees to users based on an image of the coffee label they have just scanned. For this project, I have decided to use Sweet Marias (one of the largest online retailers of coffee beans in the US), as they have a comprehensive list of coffee beans on their website (~350 different types).

In this notebook, I will scrape every coffee and its data from Sweet Marias' website. The fields that I will scrape are: **1) Name of coffee, 2) URL, 3) Description, 4) Processing Method and 5) Type**.

# Scrape Sweet Marias

In [2]:
# def make_soup(url):
#     '''create soup from URL'''
    
#     res = requests.get(url)
#     try:
#         res.raise_for_status()
#     except requests.exceptions.HTTPError as e:
#         return 'Error: ' + str(e)
    
#     soup = BeautifulSoup(res.content)
#     return soup

In [3]:
def get_links(url, timeout=30):
    '''Collect the product-page URLs listed on a catalogue page.

    Parameters
    ----------
    url : str
        Address of the listing page to fetch.
    timeout : float, optional
        Seconds to wait for the server before `requests` gives up
        (default 30). Without a timeout a stalled connection would hang
        the notebook indefinitely.

    Returns
    -------
    list of str, or str
        The `href` of every `<a>` found inside the product-description
        `<div>`s, with spaces percent-encoded. If the server answers with
        an HTTP error status, the string ``'Error: <details>'`` is returned
        instead of a list, so callers should check the type before
        iterating.
    '''

    res = requests.get(url, timeout=timeout)
    try:
        res.raise_for_status()
    except requests.exceptions.HTTPError as e:
        return 'Error: ' + str(e)

    # Name the parser explicitly so the parse tree does not depend on which
    # optional parsers happen to be installed (and to avoid bs4's warning).
    soup = BeautifulSoup(res.content, 'html.parser')
    product_divs = soup.find_all('div', {'class':'product description product-item-description'})

    url_list = []
    for product_div in product_divs:
        for a_tag in product_div.find_all('a'):
            href = a_tag.get('href')
            # An <a> without an href yields None; skip it instead of
            # crashing on None.replace(...).
            if href is None:
                continue
            url_list.append(href.replace(' ','%20'))

    return url_list

In [4]:
def get_details(link, timeout=30):
    '''Scrape name, description, process and variety from one product page.

    Parameters
    ----------
    link : str
        URL of the coffee's product page.
    timeout : float, optional
        Seconds to wait for the server before `requests` gives up
        (default 30).

    Returns
    -------
    list, or str
        ``[coffee, link, description, process, variety]``. When the page
        does not have the expected structure, every field except `link`
        is ``np.nan``. If the server answers with an HTTP error status,
        the string ``'Error: <details>'`` is returned instead of a list.
    '''

    res = requests.get(link, timeout=timeout)
    try:
        res.raise_for_status()
    except requests.exceptions.HTTPError as e:
        return 'Error: ' + str(e)

    # Name the parser explicitly so the parse tree does not depend on which
    # optional parsers happen to be installed.
    soup = BeautifulSoup(res.content, 'html.parser')
    item = soup.find_all('div', {'class':'value'})
    name = soup.find_all('h1', {'class': 'page-title'})
    # Price is deliberately not scraped (it would come from
    # div.product-info-price > span.price).

    try:
        coffee = name[0].find_all('span')[0].text
        description = item[0].find('p').text
        # Process and variety are the first two <span>s of the same
        # list-info block, so look it up once.
        spec_spans = item[1].find_all('div',{'class':'list-info'})[0].find_all('span')
        process = spec_spans[0].text.replace(' Process','')
        # NOTE(review): presumably strips a Yes/No flag that shares this
        # span; it also removes 'Yes'/'No' inside a variety name - confirm.
        variety = spec_spans[1].text.replace(' Types','').replace('Yes', '').replace('No', '')
    except (IndexError, AttributeError):
        # A missing element shows up as an out-of-range index or as
        # `.text`/`.find` on None; anything else is a real bug and should
        # surface rather than be silently turned into NaNs.
        coffee = description = process = variety = np.nan

    details = [coffee, link, description, process, variety]
    return details

In [5]:
# Spot-check the scraper on a single product page.
get_details('https://www.sweetmarias.com/burundi-commune-mutambu-6644.html')

['Burundi Commune Mutambu',
 'https://www.sweetmarias.com/burundi-commune-mutambu-6644.html',
 'Such a versatile Burundi, a neutral sweetness is accented by complex baking spices, creamed honey, loose leaf black tea and bittering cocoa when roasted dark. City to Full City+. Good for espresso.',
 'Wet',
 'Bourbon']

In [6]:
#Get data from Sweet Marias
links = get_links('https://www.sweetmarias.com/green-coffee.html?product_list_limit=all&sm_status=2')

# get_links reports an HTTP failure by returning an 'Error: ...' string.
# Iterating that string would feed single characters to get_details, so
# stop here with the message instead.
if isinstance(links, str):
    raise RuntimeError(links)

data = []
for link in links:
    row = get_details(link)
    # get_details uses the same 'Error: ...' string convention; a string
    # mixed in with the 5-item rows would make the array below ragged.
    if isinstance(row, str):
        continue
    data.append(row)

# Converting to an array turns any np.nan placeholder into the string 'nan',
# which is what the later `== 'nan'` filter looks for.
data = np.array(data)

In [7]:
# Put the scraped rows into a DataFrame, one column per field returned by get_details.
df_sweetmarias = pd.DataFrame(data, columns=['Name','URL','Description','Process','Variety'])

In [8]:
# Drop rows whose scrape failed. The np.array() conversion stores a failed
# field as the string 'nan'; a real NaN is checked too, so the filter still
# works if the frame is built without that conversion.
df_sweetmarias = df_sweetmarias.loc[
    ~(df_sweetmarias['Description'].isna() | (df_sweetmarias['Description'] == 'nan'))
].reset_index(drop=True)

In [9]:
# Write the scraped rows to sweetmarias.csv, leaving out the index column.
df_sweetmarias.to_csv('sweetmarias.csv', index=False)

## Data Cleaning - Sweet Marias

### Processing Method

In [10]:
# Count how many rows carry each Process label.
df_sweetmarias['Process'].value_counts()

Wet                   165
Wet (Washed)           69
Dry                    37
Dry (Natural)          16
Machine Washed         12
Honey                  11
Otheres                 8
Water Decaf             8
Wet Hulled              7
Giling Basah            6
Pulp Natural            4
Wet then SWP Decaf      3
Name: Process, dtype: int64

Combine 'Wet' and 'Washed' / 'Dry' and 'Natural' into the same categories.

In [11]:
# Strip the parentheses so that e.g. 'Wet (Washed)' becomes 'Wet Washed'.
# Inside [...] a '|' is a literal pipe rather than "or", so the character
# class only needs the two bracket characters.
df_sweetmarias['Process'] = df_sweetmarias['Process'].str.replace(r'[()]','', regex=True)

In [12]:
# Series.replace matches whole cell values, so only the bare 'Wet' / 'Dry'
# labels are renamed. The result is assigned back because an in-place
# replace on df[col] acts on an intermediate object and is not guaranteed
# to reach the DataFrame (pandas warns about this chained pattern).
df_sweetmarias['Process'] = df_sweetmarias['Process'].replace({'Wet': 'Wet Washed', 'Dry': 'Dry Natural'})

# The trailing \b stops an already expanded 'Decaffeinated' from matching
# again, so re-running the cell cannot produce 'Decaffeinatedfeinated'.
df_sweetmarias['Process'] = df_sweetmarias['Process'].str.replace(r'Decaf\b','Decaffeinated', regex=True)

Remove 'Otheres' in 'Process'

In [13]:
# Blank out the catch-all 'Otheres' label (whole-value match only).
df_sweetmarias['Process'] = df_sweetmarias['Process'].replace({'Otheres': ''})

In [21]:
# Display the current state of df_sweetmarias.
# NOTE(review): the saved output has an 'Unnamed: 0' column that no visible
# cell creates - presumably the frame was reloaded from a CSV; confirm.
df_sweetmarias

Unnamed: 0,Name,URL,Description,Process,Variety
0,Burundi Commune Mutambu,https://www.sweetmarias.com/burundi-commune-mu...,"Such a versatile Burundi, a neutral sweetness ...",Wet Washed,Bourbon
1,Burundi Dry Process Gaterama Agahore,https://www.sweetmarias.com/burundi-dry-proces...,"Unapologetic ""dry process"" flavor that should ...",Dry Natural,Bourbon
2,Burundi Dry Process Kibingo,https://www.sweetmarias.com/burundi-dry-proces...,Cooked fruit and wheat flavors that bring to m...,Dry Natural,Bourbon
3,Burundi Honey Process Gahahe,https://www.sweetmarias.com/burundi-honey-proc...,"Sweet, clean cup character like wet process Bu...",Honey,Bourbon
4,Burundi Kabarore Commune Yandaro,https://www.sweetmarias.com/burundi-kabarore-c...,"Lighter roasts draw out potent aromatic, like ...",Wet Washed,Bourbon
...,...,...,...,...,...
339,Peru Jaen Granjeros de Huabal,https://www.sweetmarias.com/peru-jaen-granjero...,"Versatile, a sweet cup from light to dark roas...",Wet Washed,"Caturra, Bourbon, Typica, Modern Hybrids"
340,Peru Nuevo Trujillo Marcial Olivera,https://www.sweetmarias.com/peru-nuevo-trujill...,"One of our brighter Peru's, City roasts produc...",Wet Washed,"Caturra, Bourbon, Typica, Modern Hybrids"
341,Peru Pueblo de Piñas,https://www.sweetmarias.com/peru-pueblo-de-pin...,"Exemplary of the flavor balance found in Peru,...",Wet Washed,"Bourbon, Modern Hybrids"
342,Sumatra Raja Batak Peaberry,https://www.sweetmarias.com/sumatra-raja-batak...,"A wet-hulled with classic Lintong character, m...",Giling Basah,Modern Hybrids


### Remove Coffee Blends

Remove coffee blends (since we are focussing on single origins)

In [14]:
# List the rows whose Name mentions 'Blend'.
df_sweetmarias[df_sweetmarias['Name'].str.contains('Blend')]

Unnamed: 0,Name,URL,Description,Process,Variety
344,Sweet Maria's Moka Kadir Blend,https://www.sweetmarias.com/catalog/product/vi...,"Rich chocolate flavors come through in layers,...",Dry Natural,Varies
345,Sweet Maria's Polar Expresso Holiday Blend,https://www.sweetmarias.com/sweet-marias-polar...,Roll out the holiday cheer with our limited ed...,,"Bourbon, Heirloom"


In [15]:
# Keep only the rows whose Name does not mention 'Blend'.
# na=False treats a missing Name as "not a blend"; without it a NaN in the
# mask would make the boolean selection fail.
df_sweetmarias = df_sweetmarias.loc[
    ~df_sweetmarias['Name'].str.contains('Blend', na=False)
].reset_index(drop=True)

Replace NaNs with ''

In [16]:
# Replace every missing value with an empty string, modifying the frame in place.
# The ', '.join aggregations over these columns need each cell to be a string.
df_sweetmarias.fillna('', inplace=True)

# Scrape Monmouth

In [10]:
def get_details_monmouth(link, timeout=30):
    '''Scrape every coffee entry shown on a Monmouth listing page.

    Parameters
    ----------
    link : str
        URL of the listing page.
    timeout : float, optional
        Seconds to wait for the server before `requests` gives up
        (default 30).

    Returns
    -------
    list of list, or str
        One ``[country, farm, url, description, process, variety]`` row per
        ``<li class="coffee_wide">`` element. An entry that lacks the
        expected structure yields a row of ``np.nan``. If the server
        answers with an HTTP error status, the string
        ``'Error: <details>'`` is returned instead.
    '''

    res = requests.get(link, timeout=timeout)
    try:
        res.raise_for_status()
    except requests.exceptions.HTTPError as e:
        return 'Error: ' + str(e)

    # Name the parser explicitly so the parse tree does not depend on which
    # optional parsers happen to be installed.
    soup = BeautifulSoup(res.content, 'html.parser')
    details = soup.find_all('li', {'class':'coffee_wide'})

    all_details = []
    for entry in details:

        try:
            # All text fields are read from the entry's first <div>.
            card = entry.find_all('div')[0]
            country = card.find_all('p', {'class':'country'})[0].text
            farm = card.find_all('h2')[0].text
            # The target URL is the first single-quoted chunk of the
            # element's onclick attribute.
            url = entry.get('onclick').split("'")[1]
            description = card.find_all('div', {'class':'col middle'})[0].find_all('p')[0].text
            process = card.find_all('p', {'class':'process'})[0].text.replace(' Process','')
            variety = card.find_all('p', {'class':'varietal'})[0].text
        except (IndexError, AttributeError):
            # A missing element shows up as an out-of-range index or as an
            # attribute access on None; other errors should surface.
            country = farm = url = description = process = variety = np.nan

        all_details.append([country, farm, url, description, process, variety])
    return all_details

In [12]:
# Scrape the current and the archived listings (in that order) and keep
# each result as an array.
monmouth_current_data, monmouth_archive_data = (
    np.array(get_details_monmouth(listing_url))
    for listing_url in (
        'https://www.monmouthcoffee.co.uk/our-coffees/',
        'https://www.monmouthcoffee.co.uk/past-coffee/',
    )
)

In [20]:
# Wrap both scraped arrays in DataFrames that share the same column layout.
df_monmouth_current_data, df_monmouth_archive_data = (
    pd.DataFrame(scraped, columns=['Country','Farm','URL', 'Description','Process','Variety'])
    for scraped in (monmouth_current_data, monmouth_archive_data)
)

In [273]:
# Stack current and archived coffees into one frame with a fresh 0..n-1 index.
df_monmouth = pd.concat(
    [df_monmouth_current_data, df_monmouth_archive_data],
    ignore_index=True,
)

In [274]:
# Drop entries whose scrape failed. After the np.array() conversion a failed
# field is the string 'nan'; a real NaN is checked too, so the filter does
# not depend on that conversion.
df_monmouth = df_monmouth.loc[
    ~(df_monmouth['Description'].isna() | (df_monmouth['Description'] == 'nan'))
].reset_index(drop=True)

In [275]:
# Write the scraped rows to monmouth.csv, leaving out the index column.
df_monmouth.to_csv('monmouth.csv', index=False)

## Data Cleaning - Monmouth

### Processing Method

Aligning the main processing method categories as per those in Sweet Marias'.

In [276]:
# Rename Monmouth's process wording to the labels used for Sweet Marias.
df_monmouth['Process'] = (
    df_monmouth['Process']
    .str.replace('Traditional Washed','Wet Washed', regex=True)
    # The lookbehind skips a 'Natural' that is already preceded by 'Dry ',
    # so re-running the cell cannot produce 'Dry Dry Natural'.
    # NOTE(review): this is still a substring match, so a value such as
    # 'Pulped Natural' would become 'Pulped Dry Natural' - confirm whether
    # such values occur and how they should be mapped.
    .str.replace(r'(?<!Dry )Natural','Dry Natural', regex=True)
    .str.replace('Mechanical Washed','Machine Washed', regex=True)
    .str.replace('Decaffeination','Decaffeinated', regex=True)
)

In [277]:
# Lower-case the country names.
df_monmouth['Country'] = df_monmouth['Country'].str.lower()

In [278]:
# Normalise the spelling 'brasil' to 'brazil' (whole-value match).
# The result is assigned back because an in-place replace on df[col] acts
# on an intermediate object and is not guaranteed to reach the DataFrame.
df_monmouth['Country'] = df_monmouth['Country'].replace('brasil','brazil')

### Combine Country and Farm to Name

This will help us match the format of the Monmouth dataframe to the Sweet Marias dataframe.

In [290]:
# Build the coffee name as "<country> <farm>".
df_monmouth['Name'] = df_monmouth['Country'] + ' ' + df_monmouth['Farm']

In [298]:
# Move 'Name' to the front and drop the two columns it was built from.
name = df_monmouth['Name']
# columns= already names the axis, so the redundant axis=1 is dropped.
# errors='ignore' lets the cell be re-run after Country/Farm are gone.
df_monmouth = df_monmouth.drop(columns=['Name','Country','Farm'], errors='ignore')
df_monmouth.insert(0, 'Name', name)

## Create new columns of combined strings (for Part 2)

In [22]:
# Join Name, Description, Process and Variety of each row into one
# ', '-separated string.
df_sweetmarias['all_combined'] = df_sweetmarias[['Name','Description','Process','Variety']].agg(', '.join, axis=1)

In [23]:
# Join Description, Process and Variety of each row (Name left out) into
# one ', '-separated string.
# NOTE(review): 'exlc' looks like a typo for 'excl'; the column name is left
# unchanged because other code may read it - confirm before renaming.
df_sweetmarias['exlc_name_combined'] = df_sweetmarias[['Description','Process','Variety']].agg(', '.join, axis=1)

In [24]:
# Write the cleaned frame to cleaned_sweetmarias.csv, leaving out the index column.
df_sweetmarias.to_csv('cleaned_sweetmarias.csv', index=False)

In [319]:
# Join Name, Description, Process and Variety of each row into one
# ', '-separated string.
df_monmouth['all_combined'] = df_monmouth[['Name','Description','Process','Variety']].agg(', '.join, axis=1)

In [320]:
# Join Description, Process and Variety of each row (Name left out) into
# one ', '-separated string.
# NOTE(review): 'exlc' looks like a typo for 'excl'; the column name is left
# unchanged because other code may read it - confirm before renaming.
df_monmouth['exlc_name_combined'] = df_monmouth[['Description','Process','Variety']].agg(', '.join, axis=1)

In [321]:
# Write the cleaned frame to cleaned_monmouth.csv, leaving out the index column.
df_monmouth.to_csv('cleaned_monmouth.csv', index=False)

In [322]:
# Stack the Monmouth rows on top of the Sweet Marias rows with a fresh
# 0..n-1 index.
df_coffees = pd.concat(
    [df_monmouth, df_sweetmarias],
    ignore_index=True,
)

In [325]:
# Write the combined frame to cleaned_coffees.csv, leaving out the index column.
df_coffees.to_csv('cleaned_coffees.csv', index=False)