# Import Libraries

In [1]:
import pandas as pd
import numpy as np
import requests, re
from bs4 import BeautifulSoup 

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics.pairwise import cosine_similarity

# Brief Introduction

The goal of this OCR tool is to recommend related coffees to users based on the image of the coffee label they have just scanned. For this project, I have decided to use Sweet Maria's (one of the largest online retailers of coffee beans in the US), as they have a comprehensive list of coffee beans on their website (~350 different types).

In this notebook, I will web-scrape all the coffees and their data from Sweet Maria's website. The information that I will scrape is: **1) Name of coffee, 2) URL, 3) Description, 4) Processing Method and 5) Type** (stored in the `Variety` column).

# Functions

In [4]:
def make_soup(url, timeout=30):
    '''Download ``url`` and parse the response body with BeautifulSoup.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    timeout : float, optional
        Seconds to wait for the server before ``requests`` gives up
        (default 30). Without a timeout a stalled connection would hang
        the notebook indefinitely.

    Returns
    -------
    bs4.BeautifulSoup or str
        The parsed document, or the string ``'Error: <message>'`` when the
        server answers with an HTTP error status. Callers must check for
        the string case before using the result as a soup.
    '''
    res = requests.get(url, timeout=timeout)
    try:
        res.raise_for_status()
    except requests.exceptions.HTTPError as e:
        return 'Error: ' + str(e)

    # Name the parser explicitly: otherwise BeautifulSoup picks whichever
    # parser happens to be installed (and warns), so results can differ
    # between machines.
    return BeautifulSoup(res.content, 'html.parser')

In [5]:
def get_links(url):
    '''Collect the product-page links found on a listing page.

    Parameters
    ----------
    url : str
        Address of the listing page to scan.

    Returns
    -------
    list of str
        The ``href`` of every ``<a>`` inside each
        ``div.product.description.product-item-description`` block, with
        spaces percent-encoded as ``%20``.

    Raises
    ------
    RuntimeError
        If the listing page could not be fetched (``make_soup`` reported an
        HTTP error).
    '''
    soup = make_soup(url)
    if isinstance(soup, str):
        # make_soup signals HTTP failures with an 'Error: ...' string;
        # surface it instead of failing later on ``str.find_all``.
        raise RuntimeError(soup)

    containers = soup.find_all('div', {'class':'product description product-item-description'})

    url_list = []
    for container in containers:
        for anchor in container.find_all('a'):
            href = anchor.get('href')
            if not href:
                # Anchors without an href have nothing to scrape.
                continue
            url_list.append(href.replace(' ','%20'))

    return url_list

In [8]:
def get_details(link):
    '''Scrape the details of a single coffee from its product page.

    Parameters
    ----------
    link : str
        URL of the product page.

    Returns
    -------
    list
        ``[coffee, link, description, process, variety]``. When the page
        cannot be fetched or does not have the expected layout, every field
        except ``link`` is ``np.nan`` so that each result is always a
        five-item row.
    '''
    missing_row = [np.nan, link, np.nan, np.nan, np.nan]

    # Fetch the page once; make_soup returns an 'Error: ...' string on an
    # HTTP error status.
    soup = make_soup(link)
    if isinstance(soup, str):
        return missing_row

    item = soup.find_all('div', {'class':'value'})
    name = soup.find_all('h1', {'class': 'page-title'})

    try:
        coffee = name[0].find_all('span')[0].text
        description = item[0].find('p').text
        info_spans = item[1].find_all('div',{'class':'list-info'})[0].find_all('span')
        process = info_spans[0].text.replace(' Process','')
        # NOTE(review): these are substring replacements, so a variety that
        # merely contains 'Yes' or 'No' is altered too - confirm intended.
        variety = info_spans[1].text.replace(' Types','').replace('Yes', '').replace('No', '')
    except (IndexError, AttributeError):
        # A missing element shows up as an out-of-range index or as
        # ``None.text``; treat the whole page as unparseable.
        return missing_row

    return [coffee, link, description, process, variety]

In [10]:
# Collect every product link from the Sweet Marias green-coffee listing page
links = get_links('https://www.sweetmarias.com/green-coffee.html?product_list_limit=all&sm_status=2')

# Scrape one row of details per link and stack the rows into a single array.
# For equal-length rows NumPy stores mixed str/NaN cells as text, so a
# missing value ends up as the string 'nan'.
data = np.array([get_details(link) for link in links])

In [11]:
# Label the scraped rows with one column per scraped field, then display
df_sweetmarias = pd.DataFrame(
    data=data,
    columns=['Name', 'URL', 'Description', 'Process', 'Variety'],
)
df_sweetmarias

Unnamed: 0,Name,URL,Description,Process,Variety
0,Burundi Commune Mutambu,https://www.sweetmarias.com/burundi-commune-mu...,"Such a versatile Burundi, a neutral sweetness ...",Wet,Bourbon
1,Burundi Dry Process Gaterama Agahore,https://www.sweetmarias.com/burundi-dry-proces...,"Unapologetic ""dry process"" flavor that should ...",Dry,Bourbon
2,Burundi Dry Process Kibingo,https://www.sweetmarias.com/burundi-dry-proces...,Cooked fruit and wheat flavors that bring to m...,Dry,Bourbon
3,Burundi Honey Process Gahahe,https://www.sweetmarias.com/burundi-honey-proc...,"Sweet, clean cup character like wet process Bu...",Honey,Bourbon
4,Burundi Kabarore Commune Yandaro,https://www.sweetmarias.com/burundi-kabarore-c...,"Lighter roasts draw out potent aromatic, like ...",Wet,Bourbon
...,...,...,...,...,...
351,,https://www.sweetmarias.com/roasted-coffee-jav...,,,
352,,https://www.sweetmarias.com/roasted%20coffee%2...,,,
353,,https://www.sweetmarias.com/roasted-coffee-rwa...,,,
354,,https://www.sweetmarias.com/roasted-espresso-a...,,,


In [12]:
# Drop products whose page could not be scraped. A missing description may be
# the text 'nan' (after the NumPy string conversion above) or a real NaN
# (e.g. when the table is reloaded from CSV), so filter out both.
df_sweetmarias = df_sweetmarias[
    df_sweetmarias['Description'].notna() & df_sweetmarias['Description'].ne('nan')
].reset_index(drop=True)

In [None]:
# Write the scraped table to sweetmarias.csv, leaving out the index column
df_sweetmarias.to_csv(path_or_buf='sweetmarias.csv', index=False)

# Data Cleaning

## Processing Method

In [27]:
# Frequency of each distinct processing-method label
df_sweetmarias.loc[:, 'Process'].value_counts()

Wet Washed            229
Dry Natural            54
Machine Washed         14
Honey                  10
Water Decaf            10
                        8
Wet Hulled              7
Giling Basah            6
Pulp Natural            4
Wet then SWP Decaf      3
Name: Process, dtype: int64

Combine 'Wet' and 'Washed', and 'Dry' and 'Natural', into the same categories ('Wet Washed' and 'Dry Natural')

In [38]:
# Map the different spellings of each method onto one canonical label.
# The result is assigned back to the column: calling
# ``replace(..., inplace=True)`` on ``df['Process']`` is chained in-place
# mutation, which recent pandas warns about and which does not update the
# DataFrame under Copy-on-Write.
process_aliases = {
    'Wet': 'Wet Washed',
    'Wet (Washed)': 'Wet Washed',
    'Dry': 'Dry Natural',
    'Dry (Natural)': 'Dry Natural',
}
df_sweetmarias['Process'] = df_sweetmarias['Process'].replace(process_aliases)

Replace the 'Otheres' label in 'Process' with an empty string

In [26]:
# Blank out the 'Otheres' label so it no longer counts as a process
df_sweetmarias['Process'] = df_sweetmarias['Process'].replace(to_replace='Otheres', value='')

## Misc. Data Cleaning

Remove coffee blends (since we are focussing on single origins)

In [34]:
# Show the rows whose name contains 'Blend'
df_sweetmarias[df_sweetmarias['Name'].str.contains('Blend')]

Unnamed: 0,Name,URL,Description,Process,Variety
343,Sweet Maria's Moka Kadir Blend,https://www.sweetmarias.com/catalog/product/vi...,"Rich chocolate flavors come through in layers,...",Dry Natural,Varies
344,Sweet Maria's Polar Expresso Holiday Blend,https://www.sweetmarias.com/sweet-marias-polar...,Roll out the holiday cheer with our limited ed...,,"Bourbon, Heirloom"


In [36]:
# Index labels of every row whose name contains 'Blend'
blend_index = df_sweetmarias[df_sweetmarias['Name'].str.contains('Blend')].index

# Remove those rows in place, then renumber the remaining rows from 0
df_sweetmarias.drop(index=blend_index, inplace=True)
df_sweetmarias.reset_index(inplace=True, drop=True)

## Create new columns of combined strings (for Part 2)

In [46]:
# Join name, description, process and variety of each row into one
# comma-separated string
df_sweetmarias['all_combined'] = df_sweetmarias[
    ['Name', 'Description', 'Process', 'Variety']
].apply(lambda row: ', '.join(row), axis=1)

In [47]:
# Same combined text but WITHOUT the coffee name. 'Name' was previously left
# in the column list, which made this column identical to 'all_combined'.
# The column key keeps its original spelling ('exlc') so that anything
# reading the saved CSV by that name still finds it.
df_sweetmarias['exlc_name_combined'] = df_sweetmarias[['Description','Process','Variety']].agg(', '.join, axis=1)

In [49]:
# Write the cleaned table to cleaned_sweetmarias.csv, leaving out the index column
df_sweetmarias.to_csv(path_or_buf='cleaned_sweetmarias.csv', index=False)