## Web Scraping with MechanicalSoup

> Build a Database of House and Office Photos Automatically

In [13]:
import mechanicalsoup as ms

# Start a stateful browsing session; later cells query it for the current
# page, form and URL.
browser = ms.StatefulBrowser()
url = "https://images.google.com/"

# Load the Google Images landing page and echo the URL the browser is now on.
browser.open(url)
print(browser.get_url())

https://images.google.com/


In [14]:
# Select the search form on the landing page and list its input fields.
# select_form() without arguments picks the first <form> on the page.
browser.select_form()
browser.get_current_form().print_summary()

# Fill the query box (the form's "q" input) with the search term.
# search_term is reused later to name the download folder and files.
search_term = "office"
browser["q"] = search_term

# Submit the form; the browser moves on to the results page.
# Do not call browser.launch_browser() here: it is a debugging helper that
# opens the page in a local web browser, which breaks unattended/headless runs.
response = browser.submit_selected()

# Show where we landed and a short preview of the returned HTML.
print("new url:", browser.get_url())
print("my response:\n", response.text[:500])

<input name="tbm" type="hidden" value="isch"/>
<input name="ie" type="hidden" value="ISO-8859-1"/>
<input name="hl" type="hidden" value="id"/>
<input name="source" type="hidden" value="hp"/>
<input name="biw" type="hidden"/>
<input name="bih" type="hidden"/>
<input autocomplete="off" class="lst tiah" maxlength="2048" name="q" size="57" style="margin:0;padding:5px 8px 0 6px;vertical-align:top;color:#000;padding-right:38px" title="Telusuri Gambar" value=""/>
<input class="lsb" name="btnG" type="submit" value="Telusuri Gambar"/>
<input id="gbv" name="gbv" type="hidden" value="1"/>
new url: https://www.google.com/search?tbm=isch&ie=ISO-8859-1&hl=id&source=hp&biw=&bih=&q=office&btnG=Telusuri+Gambar&gbv=1
my response:
 <!DOCTYPE html PUBLIC "-//WAPFORUM//DTD XHTML Mobile 1.0//EN" "http://www.wapforum.org/DTD/xhtml-mobile10.dtd"><html xmlns="http://www.w3.org/1999/xhtml" lang="id"><head><meta content="application/xhtml+xml; charset=UTF-8" http-equiv="Content-Type"/><meta content="no-cache" na

In [15]:
# Re-request the search-results URL the browser is currently on.
# NOTE(review): the browser already reports this URL after the form
# submission, so this second request looks redundant — confirm before removing.
new_url = browser.get_url()
browser.open(new_url)

<Response [200]>

In [16]:
# Parsed HTML of the page the browser currently holds (the search results).
page = browser.get_current_page()
# Collect every <img> tag on the page; this also picks up non-result images
# such as the site logo, which are filtered out in a later cell.
all_images = page.find_all("img")

all_images

[<img alt="Google" class="kgJEQe" src="/images/branding/searchlogo/1x/googlelogo_desk_heirloom_color_150x55dp.gif"/>,
 <img alt="" class="yWs4tf" src="https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcRIVArSGPi8MDZitp_uc79ptLhg6lA2Lc6p8fMGppGELDDvzEPr3y6CuRzhyfI&amp;s"/>,
 <img alt="" class="yWs4tf" src="https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcTeaFYJjeoc3Fm1TR-2SUxU3PHeb4eLc3vXLvOM5ZbkAasQslENAX4_dCVNzmE&amp;s"/>,
 <img alt="" class="yWs4tf" src="https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcTrgB9nlN2zZiXgV-My4UeS6p42X74gxgomeHOl0nfxqLmMFmloqxQ5dQ0B2-s&amp;s"/>,
 <img alt="" class="yWs4tf" src="https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcRWXOQ2Bf50RLW6UWigQpuVIFTIGMKnLfRVITQmTwT1IIJTfheOTXGDlZCBV6M&amp;s"/>,
 <img alt="" class="yWs4tf" src="https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcTomST8obcYY0gzGOhboi10kF9dGo-9qx0d277S8KUbh5tul_tXQnAT-WQGXxM&amp;s"/>,
 <img alt="" class="yWs4tf" src="https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9Gc

In [17]:
# Pull the "src" attribute out of every <img> tag, in page order.
# A tag without a "src" attribute contributes None.
image_source = [img_tag.get("src") for img_tag in all_images]

image_source

['/images/branding/searchlogo/1x/googlelogo_desk_heirloom_color_150x55dp.gif',
 'https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcRIVArSGPi8MDZitp_uc79ptLhg6lA2Lc6p8fMGppGELDDvzEPr3y6CuRzhyfI&s',
 'https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcTeaFYJjeoc3Fm1TR-2SUxU3PHeb4eLc3vXLvOM5ZbkAasQslENAX4_dCVNzmE&s',
 'https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcTrgB9nlN2zZiXgV-My4UeS6p42X74gxgomeHOl0nfxqLmMFmloqxQ5dQ0B2-s&s',
 'https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcRWXOQ2Bf50RLW6UWigQpuVIFTIGMKnLfRVITQmTwT1IIJTfheOTXGDlZCBV6M&s',
 'https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcTomST8obcYY0gzGOhboi10kF9dGo-9qx0d277S8KUbh5tul_tXQnAT-WQGXxM&s',
 'https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcTVPrychlgIgyR8RXjYHuSQqvHUFhmR9aihIw6os8X8fQbinFb0z6ZoD-x_oQ&s',
 'https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcTTvy_zbUO2tAJhLX28DKyN6M0NRxBI0fhXDUJEP5e0qz0_yPGJJ6R2khR1k1A&s',
 'https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcSoAvzivcyaoI

In [18]:
# Keep only absolute https URLs; this drops site-relative paths such as the
# logo. Entries that are None or empty (an <img> without "src") are skipped
# instead of raising AttributeError on .startswith().
image_source = [src for src in image_source if src and src.startswith("https")]
image_source

['https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcRIVArSGPi8MDZitp_uc79ptLhg6lA2Lc6p8fMGppGELDDvzEPr3y6CuRzhyfI&s',
 'https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcTeaFYJjeoc3Fm1TR-2SUxU3PHeb4eLc3vXLvOM5ZbkAasQslENAX4_dCVNzmE&s',
 'https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcTrgB9nlN2zZiXgV-My4UeS6p42X74gxgomeHOl0nfxqLmMFmloqxQ5dQ0B2-s&s',
 'https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcRWXOQ2Bf50RLW6UWigQpuVIFTIGMKnLfRVITQmTwT1IIJTfheOTXGDlZCBV6M&s',
 'https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcTomST8obcYY0gzGOhboi10kF9dGo-9qx0d277S8KUbh5tul_tXQnAT-WQGXxM&s',
 'https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcTVPrychlgIgyR8RXjYHuSQqvHUFhmR9aihIw6os8X8fQbinFb0z6ZoD-x_oQ&s',
 'https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcTTvy_zbUO2tAJhLX28DKyN6M0NRxBI0fhXDUJEP5e0qz0_yPGJJ6R2khR1k1A&s',
 'https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcSoAvzivcyaoIE9-zzmhHLHzCKSdpjvtnOF5n_dO2cIu5U7iRPoEZ9XWJ8mBSE&s',
 'https://encrypted-tbn0.

In [19]:
import os
import wget

# Download folder: "<search_term>s" (e.g. "offices") under the current
# working directory.
path = os.path.join(os.getcwd(), search_term + "s")

# Create the directory; exist_ok=True lets this cell be re-run without
# failing with FileExistsError when the folder is already there.
os.makedirs(path, exist_ok=True)

path

'C:\\Users\\Annisa Nurdiana\\Documents\\Data D\\Algoritma\\1Class\\Projects\\web_scrapp\\offices'

In [20]:
# Download every collected image URL into the target folder as
# "<search_term><index>.png" (office0.png, office1.png, ...).
# NOTE(review): the ".png" extension is hard-coded and not derived from the
# downloaded content — confirm the files really are PNGs.
for counter, image in enumerate(image_source):
    save_as = os.path.join(path, search_term + str(counter) + ".png")
    wget.download(image, save_as)

100% [................................................................................] 5488 / 5488