# ETL Project
----

### Yelp Web Scraping

This notebook includes the following tasks:

* __Extract__

   Search Yelp.com for `Coffee and Tea businesses near Mexico City area` and, using web scraping, get the business name and location from each HTML page.


* __Transform__

   Using Python, Splinter with Chrome, and BeautifulSoup, nearly 1,000 businesses were collected.


* __Load__ 
   
   The dataset was stored permanently in a Mongo cluster.

---


In [None]:
# Dependencies
import os
import time

from bs4 import BeautifulSoup
from pymongo import MongoClient
from splinter import Browser

In [2]:
# Start a visible (non-headless) Chrome session driven by Splinter
executable_path = dict(executable_path='chromedriver.exe')
browser = Browser('chrome', headless=False, **executable_path)

In [62]:
# Yelp search for "Coffee & Tea" around Mexico City; open it in the browser
url = (
    'https://www.yelp.com/search'
    '?find_desc=Coffee+%26+Tea'
    '&find_loc=Mexico+City%2C+D.F.%2C+Mexico'
    '&ns=1'
)
browser.visit(url)

In [46]:
# Take the HTML currently rendered by the browser and parse it with 'html.parser'
html = browser.html
soup = BeautifulSoup(markup=html, features='html.parser')

In [None]:
# Examine the results, then determine element that contains sought info
#print(soup.prettify())

In [47]:
# Read the pagination summary paragraph(s), e.g. "Showing 1-30 of 1220".
# `number` keeps the text of the last matching paragraph.
total = soup.find_all(
    'p',
    class_=(
        'lemon--p__373c0__3Qnnj text__373c0__2pB8f '
        'text-color--normal__373c0__K_MKN text-align--right__373c0__3ARv7'
    ),
)
for x in total:
    number = x.get_text().strip()
    print(number)

Showing 1-30 of 1220


In [63]:
# Calculate the number of pages we have to visit.
# `number` looks like "Showing 1-30 of 1220": the text after the dash holds the
# last result shown on the first page (the page size) and the total, split by "of".
range_and_total = number.split('-')[1].split('of')
results_per_page = int(range_and_total[0])
total_results = int(range_and_total[1])
# Ceiling division: a partially filled last page still has to be visited.
# round() dropped it whenever the remainder was below half a page, and rounds
# exact halves to the nearest even number.
pages = -(-total_results // results_per_page)
pages

41.0

In [64]:
# Get the information from each page.
# The 'Next' link is addressed by absolute XPath; its container is div[10] on the
# first page and div[11] on the following pages.
next_link_xpath_p1 = '//*[@id="wrap"]/div[3]/div[2]/div[2]/div/div[1]/div[1]/div/div[1]/div/div[2]/div/div[10]/a/div/span'
next_link_xpath_ps = '//*[@id="wrap"]/div[3]/div[2]/div[2]/div/div[1]/div[1]/div/div[1]/div/div[2]/div/div[11]/a/div/span'

# Each entry is "title;street;neighborhood;name;link" (split again when loading into Mongo)
unit_list = []
# For each page retrieve the information
for x in range(int(pages)):
    # HTML object
    html = browser.html
    # Parse HTML with Beautiful Soup
    soup = BeautifulSoup(html, 'html.parser')

    # Retrieve all div page elements with the class associated with a result
    elements = soup.find_all('div', class_='lemon--div__373c0__1mboc u-padding-t3 u-padding-b3 border--top__373c0__19Owr border-color--default__373c0__2oFDT')
    for element in elements:
        h3 = element.find('h3')
        # A block without a title or without a link is not a business entry;
        # skip it instead of aborting the whole scrape with an AttributeError.
        if h3 is None or h3.a is None:
            continue

        address_s = element.find('address')
        address = address_s.text.strip() if address_s is not None else 'StreetNA'

        # The neighborhood is the first nested div of the secondary block;
        # either the block or the nested div may be missing.
        div = element.find('div', class_='lemon--div__373c0__1mboc u-space-b1 border-color--default__373c0__2oFDT')
        neigh_s = div.div if div is not None else None
        neigh = neigh_s.text.strip() if neigh_s is not None else 'NeighNA'

        unit = str(h3.text.strip()) + ";" + str(address) + ";" + str(neigh) + ";" + str(h3.a['name']) + ";" + str(h3.a['href'])
        # Keep only one copy of each record (a page may be parsed more than once
        # when the 'Next' click fails)
        if unit not in unit_list:
            unit_list.append(unit)

    # Click the 'Next' button on each page by xpath to avoid exceptions in Chrome
    try:
        # Code that might cause an exception
        if x == 0:
            browser.find_by_xpath(next_link_xpath_p1)[0].click()
        else:
            browser.find_by_xpath(next_link_xpath_ps)[0].click()

        # Save the HTML page to a file for later verification.
        # The file is opened as UTF-8 text and receives the markup itself; writing
        # str(soup.encode("utf-8")) stored the b'...' repr of the bytes instead.
        file_name = "../../Yelp/page_" + str(x + 1) + ".html"
        with open(file_name, "w", encoding="utf-8") as f:
            f.write(str(soup))

    except Exception as e:
        # Best effort: report the failure and continue with the next iteration
        print("Exception - Page "+ str(x + 1) + " - " + str(e) + "\n")

print("Scraping Complete\n")


Exception - Page 3 - no elements could be found with xpath "//*[@id="wrap"]/div[3]/div[2]/div[2]/div/div[1]/div[1]/div/div[1]/div/div[2]/div/div[11]/a/div/span"

Exception - Page 20 - no elements could be found with xpath "//*[@id="wrap"]/div[3]/div[2]/div[2]/div/div[1]/div[1]/div/div[1]/div/div[2]/div/div[11]/a/div/span"

Exception - Page 30 - no elements could be found with xpath "//*[@id="wrap"]/div[3]/div[2]/div[2]/div/div[1]/div[1]/div/div[1]/div/div[2]/div/div[11]/a/div/span"

Exception - Page 37 - no elements could be found with xpath "//*[@id="wrap"]/div[3]/div[2]/div[2]/div/div[1]/div[1]/div/div[1]/div/div[2]/div/div[11]/a/div/span"

Exception - Page 38 - no elements could be found with xpath "//*[@id="wrap"]/div[3]/div[2]/div[2]/div/div[1]/div[1]/div/div[1]/div/div[2]/div/div[11]/a/div/span"

Exception - Page 39 - no elements could be found with xpath "//*[@id="wrap"]/div[3]/div[2]/div[2]/div/div[1]/div[1]/div/div[1]/div/div[2]/div/div[11]/a/div/span"

Exception - Page 40 - n

In [65]:
# View the total number of distinct records collected in unit_list
print(len(unit_list))

990


In [66]:
# View the results: one "title;street;neighborhood;name;link" record per line
for unit in unit_list:
    print(unit)

1. Chiquitito Café;Alfonso Reyes 232;Condesa;Chiquitito Café;/biz/chiquitito-caf%C3%A9-cuidad-de-mexico?osq=Coffee+%26+Tea
2. Tomás;Tamaulipas 66;Condesa;Tomás;/biz/tom%C3%A1s-m%C3%A9xico-2?osq=Coffee+%26+Tea
3. Café Avellaneda;Higuera 40;Coyoacán;Café Avellaneda;/biz/caf%C3%A9-avellaneda-m%C3%A9xico?osq=Coffee+%26+Tea
4. Blend Station;Av. Tamaulipas 60;Condesa;Blend Station;/biz/blend-station-ciudad-de-m%C3%A9xico?osq=Coffee+%26+Tea
5. Buna;Orizaba 42;Roma Norte;Buna;/biz/buna-ciudad-de-m%C3%A9xico-2?osq=Coffee+%26+Tea
6. Cardinal;Calle Córdoba 132;Roma Norte;Cardinal;/biz/cardinal-m%C3%A9xico?osq=Coffee+%26+Tea
7. The Black Rabbit;Xola 1603;Narvarte;The Black Rabbit;/biz/the-black-rabbit-ciudad-de-m%C3%A9xico-2?osq=Coffee+%26+Tea
8. Salem Witch Store & Coffee;Diagonal San Antonio 1747;Narvarte;Salem Witch Store & Coffee;/biz/salem-witch-store-y-coffee-m%C3%A9xico?osq=Coffee+%26+Tea
9. Enhorabuena Café;Calle Atlixco 13;Condesa;Enhorabuena Café;/biz/enhorabuena-caf%C3%A9-ciudad-de-m%C3

In [69]:
# Initialize PyMongo to work with MongoDBs

# Caution. The connection string contains the password, so it is never written in
# the notebook: it is read from the MONGO_CLUSTER_URI environment variable
# (an unset variable falls back to the previous empty placeholder).
_mongoClusterURI = os.environ.get('MONGO_CLUSTER_URI', '')

dbClient = MongoClient(_mongoClusterURI)
db = dbClient["etl_project"]

# Get the information from the list and build one document per business
documents = []
for unit in unit_list:
    fields = unit.split(";")
    if len(fields) != 5:
        # A ';' inside a scraped value breaks the 5-field layout; report the
        # record and keep going instead of failing on the tuple unpacking.
        print("Skipping malformed record: " + unit)
        continue
    sequence, street, neigh, name, link = fields
    documents.append({
        'name': name.strip(),
        'street': street.strip(),
        'neighborhood': neigh.strip(),
        # The scraped title may contain non-breaking spaces; normalise them
        'sequence': sequence.replace('\xa0', ' ').strip(),
        'link': link.strip(),
    })

# Store everything in a single round trip; insert_many rejects an empty list
if documents:
    db.cafesyelp.insert_many(documents)

print("The insertions in the database are finished")

The insertions in the database are finished


In [70]:
# Close the browser session used for the first scraping pass
browser.quit()

# Notes

At this point, we had __two problems__ to solve:

- Yelp reported 1,220 search results; however, after record 990 the `Next` link stopped working and Yelp showed a message saying it could not find more results. After manual requests to retrieve records 991 to 1,220 also failed, we decided to keep only the first 990 records.

     See [yelp_error](yelp_error.png)


- Some records had no neighborhood, or the neighborhood was stored in the `street` field, so additional requests to Yelp were necessary to try to complete the information. Unfortunately, after the 93rd record Yelp blocked access and the process could not be completed. We decided to finish the assignment with the information gathered so far rather than running the process on another computer.

     See [yelp_lock](yelp_lock.png)

In [86]:
# Start a new visible Chrome session (via Splinter) for the second web scraping
executable_path = dict(executable_path='chromedriver.exe')
browser = Browser('chrome', headless=False, **executable_path)

In [85]:
# Find records without neighborhood using the field link stored in each document
base_url = 'https://www.yelp.com'
i = 0
# Each entry is "name;<address block HTML>;<address block text>"
neigh_list = []

cafes = db.cafesyelp.find({"neighborhood": "NeighNA"})
for cafe in cafes:
    i += 1

    # URL of page to be scraped
    biz = cafe['link']
    url = base_url + biz
    # Code that might cause an exception
    browser.visit(url)

    # Create BeautifulSoup object; parse with 'html.parser'
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')

    # Find the element that contains the address
    address = soup.find('div', class_='map-box-address u-space-l4')
    if address is not None:
        neigh = str(cafe['name']) + ";" + str(address) + ";" + str(address.text.strip())
        # The original check used the undefined name `neighs_list`, which raises
        # NameError as soon as an address is found.
        if neigh not in neigh_list:
            neigh_list.append(neigh)

    # Pause between business pages: Yelp blocked this pass after a run of
    # back-to-back requests. NOTE(review): the delay is a guess and may need tuning.
    time.sleep(1)

print("Scraping Complete\n")

Scraping Complete



In [None]:
# Close the browser session used for the second scraping pass
browser.quit()

In [82]:
# View the results that could be obtained via individual requests:
# one "name;address HTML;address text" entry per record
for unit in neigh_list:
    print(unit)

El Minichelista;<div class="map-box-address u-space-l4">
<strong class="street-address">
<address>
        Guanábana 197<br/>Col. Nueva Santa María<br/>02800 México, D.F.<br/>Mexico
    </address>
</strong>
</div>

Café Quintal;<div class="map-box-address u-space-l4">
<strong class="street-address">
<address>
        Av. Lic. Benito Juárez 10<br/>54600 Tepotzotlán, México<br/>Mexico
    </address>
</strong>
</div>

Cafeleería;<div class="map-box-address u-space-l4">
<strong class="street-address">
<address>
        Taxqueña 1832 local b<br/>Col. San Francisco Culhuacán<br/>04470 México, D.F.<br/>Mexico
    </address>
</strong>
</div>

Café Bolshoi;<div class="map-box-address u-space-l4">
<strong class="street-address">
<address>
        Manuel M. Ponce  349<br/>Col. Guadalupe Inn<br/>01020 México, D.F.<br/>Mexico
    </address>
</strong>
</div>

Café Paradiso;<div class="map-box-address u-space-l4">
<strong class="street-address">
<address>
        Miguel Othon de Mendizabal 343 Plaza 

In [131]:
# Caution. The connection string below is left blank on purpose so the password is not published in GitHub; see the note in the Boot Camp Spot
#_mongoClusterURI = ''

# Reconnect to the cluster and select the project database
dbClient = MongoClient(host=_mongoClusterURI)
db = dbClient.etl_project

In [151]:
# Create a temporary list, from the records retrieved before Yelp blocked the requests, to update as many records as possible
mylist = [['48. El Minichelista', 'El Minichelista','<div class="map-box-address u-space-l4"><strong class="street-address"><address>Guanábana 197<br/>Col. Nueva Santa María<br/>02800 México, D.F.<br/>Mexico</address></strong></div>'],
['64. Café Quintal', 'Café Quintal','<div class="map-box-address u-space-l4"><strong class="street-address"><address>Av. Lic. Benito Juárez 10<br/>54600 Tepotzotlán, México<br/>Mexico</address></strong></div>'],
['79. Cafeleería', 'Cafeleería','<div class="map-box-address u-space-l4"><strong class="street-address"><address>Taxqueña 1832 local b<br/>Col. San Francisco Culhuacán<br/>04470 México, D.F.<br/>Mexico</address></strong></div>'],
['86. Café Bolshoi', 'Café Bolshoi','<div class="map-box-address u-space-l4"><strong class="street-address"><address>Manuel M. Ponce349<br/>Col. Guadalupe Inn<br/>01020 México, D.F.<br/>Mexico</address></strong></div>'],
['92. Café Paradiso', 'Café Paradiso','<div class="map-box-address u-space-l4"><strong class="street-address"><address>Miguel Othon de Mendizabal 343 Plaza Torres Lindavista<br/>Col. Lindavista<br/>07700 México, D.F.<br/>Mexico</address></strong></div>'],
['97. Mextli Xocolatl', 'Mextli Xocolatl','<div class="map-box-address u-space-l4"><strong class="street-address"><address>Calzada De La Viga 862<br/>Col. Héroes de Churubusco<br/>09090 México, D.F.<br/>Mexico</address></strong></div>'],
['98. Café del Pueblo', 'Café del Pueblo','<div class="map-box-address u-space-l4"><strong class="street-address"><address>Av. Ignacio Zaragoza 19<br/>62755 Tepoztlán, Morelos<br/>Mexico</address></strong></div>'],
['116. Cassava Roots', 'Cassava Roots','<div class="map-box-address u-space-l4"><strong class="street-address"><address>Colector 13<br/>Col. Lindavista<br/>07820 México, D.F.<br/>Mexico</address></strong></div>'],
['122. El Momentito', 'El Momentito','<div class="map-box-address u-space-l4"><strong class="street-address"><address>República de Brasil 900<br/>Col. Americas<br/>50130 Toluca de Lerdo, México<br/>Mexico</address></strong></div>'],
['124. El Anaquel', 'El Anaquel','<div class="map-box-address u-space-l4"><strong class="street-address"><address>Hda. de Sierra Vieja S/N<br/>Cuautitlán Izcalli, México<br/>Mexico</address></strong></div>'],
['151. Isabeau', 'Isabeau','<div class="map-box-address u-space-l4"><strong class="street-address"><address>Eduardo Molina<br/>Col. Morelos<br/>15300 México, D.F.<br/>Mexico</address></strong></div>'],
['159. Café Abbey Road', 'Café Abbey Road','<div class="map-box-address u-space-l4"><strong class="street-address"><address>Roberto Fulton S/N<br/>54047 Tlanepantla, México<br/>Mexico</address></strong></div>'],
['160. Cassava Roots', 'Cassava Roots','<div class="map-box-address u-space-l4"><strong class="street-address"><address>Blvd. Interlomas 5 Local U-03 PA<br/>Centro Comercial Paseo Interlomas<br/>52760 Huixquilucan de Degollado, México<br/>Mexico</address></strong></div>'],
['161. Latte Art Coffee', 'Latte Art Coffee','<div class="map-box-address u-space-l4"><strong class="street-address"><address>Av. Sur 12 274<br/>08500 Ciudad de México, CDMX<br/>Mexico</address></strong></div>'],
['169. Je t’aime Cafe', 'Je t’aime Cafe','<div class="map-box-address u-space-l4"><strong class="street-address"><address>Felipe Villanueva 97<br/>01020 Ciudad de México, Aguascalientes<br/>Mexico</address></strong>/div>'],
['172. Sweet Berry', 'Sweet Berry','<div class="map-box-address u-space-l4"><strong class="street-address"><address>Arenal 651<br/>16020 Ciudad de México, CDMX<br/>Mexico</address></strong></div>'],
['182. Pasión del Cielo Coffee', 'Pasión del Cielo Coffee','<div class="map-box-address u-space-l4"><strong class="street-address"><address>Blvd. Manuel Ávila Camacho 100<br/>Col. La Florida<br/>Local 1<br/>53160 Naucalpan de Juárez, México<br/>Mexico </address></strong></div>'],
['195. Jemekir Café', 'Jemekir Café','<div class="map-box-address u-space-l4"><strong class="street-address"><address>Calle Isabel la Católica 88 A - B<br/>Col. Centro<br/>México, D.F.<br/>Mexico</address></strong></div>'],
['196. Pasion del Cielo', 'Pasion del Cielo','<div class="map-box-address u-space-l4"><strong class="street-address"><address>Aeropuerto Internacional Benito Juárez Ciudad de México<br/>Terminal 1<br/>Col. Peñon de los Baños<br/>15620 Ciudad de México, CDMX<br/>Mexico</address></strong></div>'],
['202. Punta del Cielo', 'Punta del Cielo','<div class="map-box-address u-space-l4"><strong class="street-address"><address>Av. Díaz Ordaz 50<br/>Col. Acapatzingo<br/>62440 Cuernavaca, Morelos<br/>Mexico</address></strong></div>'],
['206. El Rincón de los Morales', 'El Rincón de los Morales','<div class="map-box-address u-space-l4"><strong class="street-address"><address>Av Sor Juana Inés de La Cruz 112<br/>Col. Tlalnepantla Centro<br/>54000 Tlalnepantla de Baz, México<br/>Mexico</address></strong></div>'],
['212. The Italian Coffee Company', 'The Italian Coffee Company','<div class="map-box-address u-space-l4"><strong class="street-address"><address>Autopista México Cuernavaca km. 49<br/>Col. Fierro del Toro<br/>62514 Huitzilac, Morelos<br/>Mexico</address></strong></div>'],
['213. Starbucks', 'Starbucks','<div class="map-box-address u-space-l4"><strong class="street-address"><address>Calle Alberto Santos Dumont 209<br/>Col. Adolfo López Mateos<br/>15740 México, D.F.<br/>Mexico</address></strong></div>'],
['217. St. Patrick Coffee', 'St. Patrick Coffee','<div class="map-box-address u-space-l4"><strong class="street-address"><address>Adriano Brower 60 y Guido Reni<br/>Col. Sacramento<br/>01460 México, D.F.<br/>Mexico</address></strong></div>'],
['219. Cafe No 1', 'Cafe No 1','<div class="map-box-address u-space-l4"><strong class="street-address"><address>Lic. Rafael Reyes Espindola 18<br/>Col. Periodista<br/>11220 México, D.F.<br/>Mexico</address></strong></div>'],
['220. Starbucks', 'Starbucks','<div class="map-box-address u-space-l4"><strong class="street-address"><address>Carretera Nacional México-Puebla Km 30.3<br/>56577 Ixtapaluca, México<br/>Mexico</address></strong></div>'],
['229. City Market', 'City Market','<div class="map-box-address u-space-l4"><strong class="street-address"><address>Calle Lago Zurich 215<br/>11529 Ciudad de México, CDMX<br/>Mexico</address></strong></div>'],
['230. Murmullos Café', 'Murmullos Café','<div class="map-box-address u-space-l4"><strong class="street-address"><address>José María Velasco4 -E<br/>Col San José Insurgentes<br/>03900 México, D.F.<br/>Mexico</address></strong></div>'],
['231. Cafe Voltaire', 'Cafe Voltaire','<div class="map-box-address u-space-l4"><strong class="street-address"><address>Trojes 45<br/>Col. Minerva<br/>09810 México, D.F.<br/>Mexico</address></strong></div>'],
['234. Bizcochitos Cafetzzería', 'Bizcochitos Cafetzzería','<div class="map-box-address u-space-l4"><strong class="street-address"><address>Desierto de los leones 4991<br/>Col. Tetelpan<br/>01700 México, D.F.<br/>Mexico</address></strong></div>'],
['246. El Chalet', 'El Chalet','<div class="map-box-address u-space-l4"><strong class="street-address"><address>Av. Solidaridad Las Torres<br/>52172 Metepec, México<br/>52172 Metepec, México<br/>Mexico</address></strong></div>'],
['255. El Mito Café', 'El Mito Café','<div class="map-box-address u-space-l4"><strong class="street-address"><address>Eduardo Molina 165 20 de Noviembre<br/>15300 México, D.F.<br/>Mexico</address></strong></div>'],
['260. Café 8', 'Café 8','<div class="map-box-address u-space-l4"><strong class="street-address"><address>Av. Alcanfores y San Juan Totoltepec<br/>53150 Naucalpan de Juárez, México<br/>Mexico</address></strong></div>'],
['261. Las Crepas Bosques', 'Las Crepas Bosques','<div class="map-box-address u-space-l4"><strong class="street-address"><address>Bosques de Argelia 6<br/>Col.Bosques de Aragón<br/>57170 Nezahualcóyotl, México<br/>Mexico</address></strong></div>'],
['267. La Estación Clavería', 'La Estación Clavería','<div class="map-box-address u-space-l4"><strong class="street-address"><address>Clavería 187 esquina Maravatío<br/>Col Claveria<br/>02080 México, D.F.<br/>Mexico</address></strong></div>'],
['269. Coffee Tree', 'Coffee Tree','<div class="map-box-address u-space-l4"><strong class="street-address"><address>Lago Zurich 245,<br/>Col. Ampliación Granada.<br/>11529 México, D.F.<br/>Mexico</address></strong></div>'],
['270. Starbucks', 'Starbucks','<div class="map-box-address u-space-l4"><strong class="street-address"><address>La Cuspide Sky Mall<br/>Av. Lomas Verdes 1200<br/>53124 Naucalpan de Juárez, México<br/>Mexico</address></strong></div>'],
['271. Café Línea Dorada', 'Café Línea Dorada','<div class="map-box-address u-space-l4"><strong class="street-address"><address>AV. Ermita Iztapalapa<br/>Col. Cacama<br/>09080 México, D.F.<br/>Mexico</address></strong></div>'],
['276. Starbucks', 'Starbucks','<div class="map-box-address u-space-l4"><strong class="street-address"><address>Av. Miguel Ángel de Quevedo 279-287<br/>04010 México, D.F.<br/>Mexico</address></strong></div>'],
['285. El Tapanco', 'El Tapanco','<div class="map-box-address u-space-l4"><strong class="street-address"><address>Calle 3 Sur 337<br/>Col. Juan Fernandez Albarran<br/>52169 Metepec, México<br/>Mexico</address></strong></div>'],
['287. El Reloj', 'El Reloj','<div class="map-box-address u-space-l4"><strong class="street-address"><address>Av. Tláhuac 2909<br/>Col. Zapotitlan, Tlahuac<br/>04640 Ciudad de México, CDMX<br/>Mexico</address></strong></div>'],
['288. Café del Árbol', 'Café del Árbol','<div class="map-box-address u-space-l4"><strong class="street-address"><address>Calz. de las Aguilas 453<br/>Col. Las Águilas<br/>01710 México, D.F.<br/>Mexico</address></strong></div>'],
['308. Mamva', 'Mamva','<div class="map-box-address u-space-l4"><strong class="street-address"><address>Blvd. Miguel de Cervantes Saavedra 301<br/>11529 México, D.F.<br/>Mexico</address></strong></div>'],
['313. Piccolo Angolo del Cuore', 'Piccolo Angolo del Cuore','<div class="map-box-address u-space-l4"><strong class="street-address"><address>Blvd. Popocatépetl 88<br/>Fracc. Los Pirules<br/>54040 Tlalnepantla de Baz, México<br/>Mexico</address></strong></div>'],
['314. Bamba café', 'Bamba café','<div class="map-box-address u-space-l4"><strong class="street-address"><address>Rosa María Sequeira s/n<br/>Presidentes Ejidales<br/>04488 México, D.F.<br/>Mexico</address></strong></div>'],
['317. La Kreperia', 'La Kreperia','<div class="map-box-address u-space-l4"><strong class="street-address"><address>Calle Clavelinas 186<br/>02800 Ciudad de México, CDMX<br/>Mexico</address></strong></div>'],
['331. Cafe con Leché', 'Cafe con Leché','<div class="map-box-address u-space-l4"><strong class="street-address"><address>Blvd. Puerto Aéreo 151<br/>Col. Federal del Venustiano Carranza<br/>15700 México, D.F.<br/>Mexico</address></strong></div>'],
['339. Starbucks', 'Starbucks','<div class="map-box-address u-space-l4"><strong class="street-address"><address>Av. Toluca 186<br/>01780 México, D.F.<br/>Mexico</address></strong></div>'],
['342. Boschetti Café', 'Boschetti Café','<div class="map-box-address u-space-l4"><strong class="street-address"><address>Calzada de los Jinetes 14<br/>Col. Valle Dorado<br/>54020 Tlalnepantla, México<br/>Mexico</address></strong></div>'],
['343. La Galeria coffe & lasagna', 'La Galeria coffe & lasagna','<div class="map-box-address u-space-l4"><strong class="street-address"><address>Valle de Zapatas 36<br/>Col. Valle de Aragón 1ra sección<br/>57100 Ciudad Nezahualcóyotl, México<br/>Mexico</address></strong></div>'],
['348. Cielito Querido', 'Cielito Querido','<div class="map-box-address u-space-l4"><strong class="street-address"><address>Galerías Toluca<br/>50130 Toluca, México<br/>Mexico</address></strong></div>'],
['353. El Café', 'El Café','<div class="map-box-address u-space-l4"><strong class="street-address"><address>Puerto Cozumel 86<br/>Col. Héroes de Chapultepec<br/>07930 México, D.F.<br/>Mexico</address></strong></div>'],
['354. Starbucks', 'Starbucks','<div class="map-box-address u-space-l4"><strong class="street-address"><address>Calz. San Juan de Aragón 516<br/>Col. Gustavo A. Madero<br/>07455 Gustavo A Madero, CDMX<br/>Mexico</address></strong></div>'],
['355. Starbucks', 'Starbucks','<div class="map-box-address u-space-l4"><strong class="street-address"><address>Teotihuacan 18<br/>54750 Cuautitlán Izcalli, México<br/>Mexico</address></strong></div>'],
['356. El', 'El','<div class="map-box-address u-space-l4"><strong class="street-address"><address>Ote. 158#39 Col. Moctezuma 2a. Sección<br/>15530 México, D.F.<br/>Mexico</address></strong></div>'],
['357. Calipso Cofee Break', 'Calipso Cofee Break','<div class="map-box-address u-space-l4"><strong class="street-address"><address>Lopez Portillo<br/>Col. Coacalco<br/>55700 San Francisco Coacalco, México<br/>Mexico</address></strong></div>'],
['358. Cassava Roots', 'Cassava Roots','<div class="map-box-address u-space-l4"><strong class="street-address"><address>Centro Comercial Parque Tezontle<br/>Canal de Tezontle 1512<br/>Col. Alfonso Ortiz Tirado<br/>08020 Ciudad de México, CDMX<br/>Mexico</address></strong></div>'],
['371. Expendio de Café Galván', 'Expendio de Café Galván','<div class="map-box-address u-space-l4"><strong class="street-address"><address>Arroyo de Zacatenco 2<br/>Col. Residencial La Escalera<br/>Local 1<br/>07320 Gustavo A Madero, CDMX<br/>Mexico</address></strong></div>'],
['377. Churrolandia', 'Churrolandia','<div class="map-box-address u-space-l4"><strong class="street-address"><address>Av. Toluca 1 esq. Calimaya<br/>Col. Cumbría, Cuautitlán Izcalli<br/>54740 Cuautitlan Izcalli, México<br/>Mexico</address></strong></div>'],
['378. Coffe Shop', 'Coffe Shop','<div class="map-box-address u-space-l4"><strong class="street-address"><address>Nezahualcoyotl esquina Francisco I. Madero<br/>Col. Barrio San Miguel<br/>08610 México, D.F.<br/>Mexico</address></strong></div>'],
['384. Antojitos Lupita', 'Antojitos Lupita','<div class="map-box-address u-space-l4"><strong class="street-address"><address>local 227 Mercado de Jamaica<br/>Av Morelos Esq. Congreso deCol. Jamaica Del. Venustiano Carranza<br/>15800 México, D.F.<br/>Mexico</address></strong></div>'],
['388. Coffee Land Express', 'Coffee Land Express','<div class="map-box-address u-space-l4"><strong class="street-address"><address>Ignacio Commonfort 59<br/>02719 México, D.F.<br/>Mexico</address></strong></div>'],
['392. Lar del Pan de Queso', 'Lar del Pan de Queso','<div class="map-box-address u-space-l4"><strong class="street-address"><address>Patriotismo 604<br/>Col.Nonoalco Mixcoac<br/>03700 México, D.F.<br/>Mexico</address></strong>/div>'],
['396. Hantis Café', 'Hantis Café','<div class="map-box-address u-space-l4"><strong class="street-address"><address>Juarez Pte. 107<br/>Col. Centro<br/>74000 San Martin Texmelucan de Labastida, Puebla<br/>Mexico</address></strong></div>'],
['398. Cafe Cuitlahuac', 'Cafe Cuitlahuac','<div class="map-box-address u-space-l4"><strong class="street-address"><address>Av. Cuitlahuac 1140 Loc. B<br/>Col. Del Gas<br/>02900 México, D.F.<br/>Mexico</address></strong></div>'],
['402. Vida Nuez', 'Vida Nuez','<div class="map-box-address u-space-l4"><strong class="street-address"><address>Av. Insurgentes Sur 1971<br/>01020 México, D.F.<br/>Mexico</address></strong></div>'],
['404. El Mundo del Café', 'El Mundo del Café','<div class="map-box-address u-space-l4"><strong class="street-address"><address>Av. Insurgentes Sur 2340<br/>14630 México, D.F.<br/>Mexico</address></strong></div>'],
['408. Dotcom Café', 'Dotcom Café','<div class="map-box-address u-space-l4"><strong class="street-address"><address>Calle Sonora 4<br/>15620 Ciudad de México, CDMX<br/>Mexico</address></div>'],
['411. El oro de Cuetzalan', 'El oro de Cuetzalan','<div class="map-box-address u-space-l4"><strong class="street-address"><address>A. 5 de mayo<br/>Santiago Tulyehualco<br/>16700 México, D.F.<br/>Mexico</address></strong></div>'],
['414. Paraíso Store Café', 'Paraíso Store Café','<div class="map-box-address u-space-l4"><strong class="street-address"><address>Blvd. de las Naciones 39-C<br/>Col. Bosques de Aragón<br/>76080 Mexico, México<br/>Mexico</address></strong></div>'],
['417. Starbucks', 'Starbucks','<div class="map-box-address u-space-l4"><strong class="street-address"><address>Av. 510 340<br/>Col. Aragon<br/>07950 Ciudad de México, CDMX<br/>Mexico</address></strong></div>'],
['418. Pan y café', 'Pan y café','<div class="map-box-address u-space-l4"><strong class="street-address"><address>Chosica 673<br/>Col. Lindavista<br/>07300 México, D.F.<br/>Mexico</address></strong></div>'],
['420. Fondants Pasteleria', 'Fondants Pasteleria','<div class="map-box-address u-space-l4"><strong class="street-address"><address>Calle Matamaros 236<br/>14250 México, D.F.<br/>Mexico</address></strong></div>'],
['422. The Italian Coffee Company', 'The Italian Coffee Company','<div class="map-box-address u-space-l4"><strong class="street-address"><address>Carretera Ciudad de México - Cuernavaca 95D S/N<br/>14650 México, D.F.<br/>Mexico</address></strong></div>'],
['425. La Libélula Café', 'La Libélula Café','<div class="map-box-address u-space-l4"><strong class="street-address"><address>Norte 48 3803<br/>Col. Emiliano Zapata<br/>07889 México, D.F.<br/>Mexico</address></strong></div>'],
['426. Heladería Santa Clara', 'Heladería Santa Clara','<div class="map-box-address u-space-l4"><strong class="street-address"><address>Felix Parra 165<br/>Col. San José Insurgentes<br/>03900 México, D.F.<br/>Mexico</address></strong></div>'],
['427. Jar’O Café', 'Jar’O Café','<div class="map-box-address u-space-l4"><strong class="street-address"><address>Hidalgo Ote 304<br/>Col. Centro<br/>52400 Tenancingo de Degollado, México<br/>Mexico</address></strong></div>'],
['428. Tender’s Coffee', 'Más Arte','<div class="map-box-address u-space-l4"><strong class="street-address"><address>Manuel M. Ponce 223<br/>Col. Guadalupe Inn<br/>01020 México, D.F.<br/>Mexico</address></strong></div>'],
['443. Arroz con Leche', 'Arroz con Leche','<div class="map-box-address u-space-l4"><strong class="street-address"><address>Itzcóatl 58<br/>Col. Tlaxpana<br/>11370 México, D.F.<br/>Mexico</address></strong></div>'],
['446. Fresh Cup Con Sabor', 'Fresh Cup Con Sabor','<div class="map-box-address u-space-l4"><strong class="street-address"><address>Calle Gutemberg 114<br/>62000 Cuernavaca, Morelos<br/>Mexico</address></strong></div>'],
['448. Le Pain Quotidien', 'Le Pain Quotidien','<div class="map-box-address u-space-l4"><strong class="street-address"><address>Calzada de los Leones 145<br/>01710 Ciudad de México, CDMX<br/>Mexico</address></div>'],
['451. Cuadrante Coffee Shop', 'Cuadrante Coffee Shop','<div class="map-box-address u-space-l4"><strong class="street-address"><address>Eje Norte Sur<br/>62578 Jiutepec, Morelos<br/>Mexico</address></strong></div>'],
['452. Sato & Café', 'Sato & Café','<div class="map-box-address u-space-l4"><strong class="street-address"><address>Av. 625 159-AesquinaAv. 606<br/>Col. San Juan de Aragón<br/>07979 México, D.F.<br/>Mexico</address></strong></div>'],
['465. Kftria 4', 'Kftria 4','<div class="map-box-address u-space-l4"><strong class="street-address"><address>Av. Instituto Politecnico Nacional<br/>Col. Lindavista<br/>07300 México, D.F.<br/>Mexico</address></strong></div>'],
['466. Café El Vitral', 'Café El Vitral','<div class="map-box-address u-space-l4"><strong class="street-address"><address>Laguna Del Carmen 127<br/>Col. Anáhuac<br/>11320 México, D.F.<br/>Mexico</address></strong></div>'],
['474. Café Ethnó', 'Café Ethnó','<div class="map-box-address u-space-l4"><strong class="street-address"><address>613 n.106 San Juan de Aragón<br/>07979 México, D.F.<br/>Mexico</address></strong></div>'],
['477. Starbuks', 'Starbuks','<div class="map-box-address u-space-l4"><strong class="street-address"><address>Gran Sur S/N<br/>Col. el Caracol<br/>México, D.F.<br/>Mexico</address></strong></div>'],
['482. Carabina 30-30', 'Carabina 30-30','<div class="map-box-address u-space-l4"><strong class="street-address"><address>Escuinapa 11<br/>Col. Pedregal de Santo Domingo<br/>04369 Coyoacán, CDMX<br/>Mexico</address></strong></div>'],
['484. Juan Valdez Café', 'Juan Valdez Café','<div class="map-box-address u-space-l4"><strong class="street-address"><address>Capitán Carlos León S/NAeropuerto T1<br/>Col. Peñón de los Baños<br/>15520 México, D.F.<br/>Mexico</address></strong></div>'],
['490. Ascendenza', 'Ascendenza','<div class="map-box-address u-space-l4"><strong class="street-address"><address>Av. Adolfo López Mateos Locales201<br/>Col. Santa Cruz Acatlan<br/>53150 Naucalpan de Juárez, México<br/>Mexico</address></strong></div>'],
['494. Camelot Cafetería', 'Camelot Cafetería','<div class="map-box-address u-space-l4"><strong class="street-address"><address>Paseo de las Haciendas S/N<br/>Col.Conj U los Sauces IV<br/>Manzana 3<br/>50210 San José Guadalupe, México<br/>Mexico</address></strong></div>'],
['507. Garat café', 'Garat café','<div class="map-box-address u-space-l4"><strong class="street-address"><address>Miguel de Cervantes Saavedra 303<br/>Col. Granada<br/>11529 México, D.F.<br/>Mexico</address></strong></div>'],
['510. Deli Café', 'Deli Café', '<div class="map-box-address u-space-l4"><strong class="street-address"><address>Insurgentes 1658<br/>Col. Florida<br/>México, D.F.<br/>Mexico</address></strong>/div>']]

In [109]:
# View the number of records that could be retrieved before Yelp blocked further requests
len(mylist)

93

In [152]:
# Update records with the temporary list.
# Each row is [sequence, name, address HTML]; the address lines are separated by <br/>.
for unit in mylist:
    address_html = unit[2]

    # Preferred anchor: the line that starts with "Col." (tag "1", skip 9 chars).
    # Fallback: the line after the first <br/> (tag "3", skip 5 chars).
    bpos_neigh = address_html.find('<br/>Col.')
    has_col = bpos_neigh > 0
    if has_col:
        tag, offset = "1", 9
    else:
        bpos_neigh = address_html.find('<br/>')
        tag, offset = "3", 5

    # The next <br/> after the anchor closes the neighborhood text
    fpos_neigh = address_html.find('<br/>', bpos_neigh + offset)
    if fpos_neigh > 0:
        # Extract only the neighborhood
        neigh = address_html[bpos_neigh + offset : fpos_neigh]
        print(f"{tag} {address_html}-{bpos_neigh}-{fpos_neigh}-:{neigh}")
        # Update record
        db.cafesyelp.update_one({"sequence": unit[0]}, {"$set": {"neighborhood": neigh.strip()}})
    elif has_col:
        # "Col." was found but nothing closes it: report the raw address only
        print("2 " + str(address_html))
            

1 <div class="map-box-address u-space-l4"><strong class="street-address"><address>Guanábana 197<br/>Col. Nueva Santa María<br/>02800 México, D.F.<br/>Mexico</address></strong></div>-93-120-: Nueva Santa María
3 <div class="map-box-address u-space-l4"><strong class="street-address"><address>Av. Lic. Benito Juárez 10<br/>54600 Tepotzotlán, México<br/>Mexico</address></strong></div>-105-135-:54600 Tepotzotlán, México
1 <div class="map-box-address u-space-l4"><strong class="street-address"><address>Taxqueña 1832 local b<br/>Col. San Francisco Culhuacán<br/>04470 México, D.F.<br/>Mexico</address></strong></div>-101-134-: San Francisco Culhuacán
1 <div class="map-box-address u-space-l4"><strong class="street-address"><address>Manuel M. Ponce349<br/>Col. Guadalupe Inn<br/>01020 México, D.F.<br/>Mexico</address></strong></div>-98-121-: Guadalupe Inn
1 <div class="map-box-address u-space-l4"><strong class="street-address"><address>Miguel Othon de Mendizabal 343 Plaza Torres Lindavista<br/>Col. 

3 <div class="map-box-address u-space-l4"><strong class="street-address"><address>Av. Miguel Ángel de Quevedo 279-287<br/>04010 México, D.F.<br/>Mexico</address></strong></div>-115-138-:04010 México, D.F.
1 <div class="map-box-address u-space-l4"><strong class="street-address"><address>Calle 3 Sur 337<br/>Col. Juan Fernandez Albarran<br/>52169 Metepec, México<br/>Mexico</address></strong></div>-95-128-: Juan Fernandez Albarran
1 <div class="map-box-address u-space-l4"><strong class="street-address"><address>Av. Tláhuac 2909<br/>Col. Zapotitlan, Tlahuac<br/>04640 Ciudad de México, CDMX<br/>Mexico</address></strong></div>-96-125-: Zapotitlan, Tlahuac
1 <div class="map-box-address u-space-l4"><strong class="street-address"><address>Calz. de las Aguilas 453<br/>Col. Las Águilas<br/>01710 México, D.F.<br/>Mexico</address></strong></div>-104-125-: Las Águilas
3 <div class="map-box-address u-space-l4"><strong class="street-address"><address>Blvd. Miguel de Cervantes Saavedra 301<br/>11529 Méx

1 <div class="map-box-address u-space-l4"><strong class="street-address"><address>Manuel M. Ponce 223<br/>Col. Guadalupe Inn<br/>01020 México, D.F.<br/>Mexico</address></strong></div>-99-122-: Guadalupe Inn
1 <div class="map-box-address u-space-l4"><strong class="street-address"><address>Itzcóatl 58<br/>Col. Tlaxpana<br/>11370 México, D.F.<br/>Mexico</address></strong></div>-91-109-: Tlaxpana
3 <div class="map-box-address u-space-l4"><strong class="street-address"><address>Calle Gutemberg 114<br/>62000 Cuernavaca, Morelos<br/>Mexico</address></strong></div>-99-129-:62000 Cuernavaca, Morelos
3 <div class="map-box-address u-space-l4"><strong class="street-address"><address>Calzada de los Leones 145<br/>01710 Ciudad de México, CDMX<br/>Mexico</address></div>-105-138-:01710 Ciudad de México, CDMX
3 <div class="map-box-address u-space-l4"><strong class="street-address"><address>Eje Norte Sur<br/>62578 Jiutepec, Morelos<br/>Mexico</address></strong></div>-93-121-:62578 Jiutepec, Morelos
1 <d

In [156]:
# See how many records have a neighborhood.
# Documents whose "neighborhood" equals "NeighNA" are the ones counted as lacking it
# (presumably a placeholder written when the documents were loaded -- confirm against the load cell).
neigh_missing = db.cafesyelp.count_documents({"neighborhood": "NeighNA"})
# Take the total from the collection itself rather than a hard-coded 990, so the
# figure stays correct if the number of stored documents differs from that constant.
total_docs = db.cafesyelp.count_documents({})

print("Documents with neighborhood: " + str(total_docs - neigh_missing))

Documents with neighborhood: 858
