In [1]:
import os
import requests

def download_image(url, pathname, name, timeout=30):
    """
    Download the file at `url` and save it as `<pathname>/<name>.jpg`.

    Parameters
    ----------
    url : str
        Address of the file to fetch.
    pathname : str
        Destination folder; it is created (including parents) when missing.
    name : str
        Base name of the saved file. Surrounding whitespace is stripped and
        the `.jpg` extension is always appended.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).

    Raises
    ------
    requests.HTTPError
        When the server answers with an error status. The check happens
        before the destination file is opened, so nothing is written.
    """
    # if path doesn't exist, make that path dir (no error when it already does)
    os.makedirs(pathname, exist_ok=True)
    # get the file name
    filename = os.path.join(pathname, name.strip() + '.jpg')
    # stream=True defers the body so it can be written chunk by chunk;
    # the `with` block releases the connection even if writing fails
    with requests.get(url, stream=True, timeout=timeout) as response:
        # never store an error page under an image file name
        response.raise_for_status()
        with open(filename, "wb") as f:
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)

In [2]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re

from urllib.parse import urljoin

site = "https://www.mcsuk.org"
# A timeout keeps a stalled server from hanging the cell, and the status check
# stops the run on an HTTP error instead of parsing an error page as results.
page = requests.get("https://www.mcsuk.org/goodfishguide/search?name=&start=31&perPage=134&min=0&max=5&mp=all&cm=all&tof=all", timeout=30)
page.raise_for_status()
soup = BeautifulSoup(page.content, 'html.parser')

# Only image paths of this form are downloaded (compiled once, outside the loop)
image_path_pattern = re.compile(r'/images/goodfishguide/([\w_-]+[.](jpg|gif|png))$')

# Maps fish name -> alt text of the image inside its `sustainability_rating` element
fishes = {}
for fish in soup.find_all(class_='fish'):
    name_tag = fish.find('strong')
    rating_box = fish.find(class_='sustainability_rating')
    rating_img = rating_box.find('img') if rating_box is not None else None
    if name_tag is None or rating_img is None:
        # Without a name or a rating there is nothing useful to record
        print('Skipping an entry without a name or a rating image')
        continue

    # Name = <strong> text followed by every comma-separated part of the <h3>
    # text after the first one, joined by single spaces (no trailing space,
    # so keys look the same whether or not an <h3> is present).
    name_parts = [name_tag.get_text()]
    heading = fish.find('h3')
    if heading is not None:
        name_parts.extend(part.strip() for part in heading.get_text().split(',')[1:])
    key = " ".join(name_parts)

    fishes[key] = rating_img.get('alt')

    # Downloading images
    img = fish.find('img')
    src = img.get('src') if img is not None else None
    if not src or not image_path_pattern.search(src):
        continue
    # urljoin copes with both site-relative and absolute `src` values
    url = urljoin(site, src)
    try:
        download_image(url, './images', key)
    except (requests.RequestException, OSError) as err:
        # Images are best-effort: one failed download must not lose the ratings
        print('Could not download {}: {}'.format(url, err))

In [3]:
# One row per scraped fish: its name and the raw rating text stored in `fishes`
df_rating = pd.DataFrame(list(fishes.items()), columns=['Fish', 'Rating'])

# Lookup table from the rating text to a compact code; a trailing `*` marks
# the entries whose text mentions a rating under review.
decode = {'one': '1', 
          'two': '2',
          'three': '3', 
          'four': '4', 
          'five': '5', 
          'from one to two': '1-2',
          'from one to three': '1-3',
          'from one to four': '1-4', 
          'from one to five': '1-5', 
          'from two to three' : '2-3', 
          'from two to four': '2-4',
          'from two to five': '2-5', 
          'from three to four': '3-4', 
          'from three to five': '3-5', 
          'from four to five': '4-5', 
          'three with at least one rating under review': '3*',
          'from two to five with at least one rating under review': '2-5*',
          'rating under review': 'rating under review',
         }

# Vectorised lookup; texts missing from `decode` come back as NaN
rating_codes = df_rating['Rating'].map(decode)
unmapped_ratings = df_rating.loc[rating_codes.isna(), 'Rating']
if not unmapped_ratings.empty:
    # Report every unknown text at once so the table can be extended in one pass
    raise KeyError('No entry in `decode` for rating text(s): {}'.format(
        sorted(set(map(str, unmapped_ratings)))))

df_rating['Sustainability rating'] = rating_codes

In [4]:
from google_trans_new import google_translator  
translator = google_translator()

# Italian name for every fish: one `translate` call per value of the `Fish`
# column. Mapping over the column (rather than applying row-wise over the
# whole frame) also yields a proper empty column when the frame has no rows.
df_rating['Pesce'] = df_rating['Fish'].map(
    lambda fish_name: translator.translate(fish_name, lang_tgt='it')
)

In [5]:
from translate import Translator
# Quick check of the `translate` package: build an English -> Italian
# translator and show the translation of a sample sentence as the cell output.
# NOTE: this rebinds `translator`, which an earlier cell bound to the object
# returned by `google_translator()`.
translator = Translator(from_lang='en', to_lang="it")
translator.translate("This is a pen.")

'Questa è una penna.'

In [6]:
# Write the final table to disk; index=False keeps the row index out of the file
df_rating.to_csv(path_or_buf='sustainable_fish.csv', index=False)