In [100]:
import re
import requests
from bs4 import BeautifulSoup
import unidecode
import numpy as np
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd
from tqdm import tqdm
from tqdm import trange
import time
from slugify import slugify

# Getting Data

Each establishment will initially consist of following fields:
- id
- name
- description
- phone
- website
- hours
- category
- address
- zip code
- city
- country
- latitude
- longitude

### Yellow Pages

In [89]:
def get_yellow_info(url,category):
    
    names=[]
    city=[]
    address=[]
    phone=[]
    web=[]
    description=[]
    
    response = requests.get(url)
    
    #get soup
    soup = BeautifulSoup(response.text,'lxml')
    
    #getting main box
    general = soup.find_all('div', itemtype="https://schema.org/LocalBusiness")

    
    if len(general)>0:
        for i in range(len(general)):

            #NAMES
            try:
                names.append(general[i].find('a', class_='companyName').text.split('\n')[1].strip())
            except:
                names.append(np.nan)

           #CITY
            try:
                city.append(general[i].find('span', class_='city').text.strip())
            except:
                city.append(np.nan)

            #Address
            try:
                address.append(general[i].find('span', class_='directionFig').text.split('\n')[1].strip())
            except:
                address.append(np.nan)


            #phone
            try:
                phone.append(general[i].find('span', class_='phoneFig hide').text.strip())
            except:
                phone.append(np.nan)

            #web
            try:
                web.append(general[i].find('a', class_='webLink').text.strip())

            except:
                web.append(np.nan)

            #description
            try:
                description.append(general[i].find('div', class_='col-sm-12 infoBox').text.strip())
            except:
                description.append(np.nan)
                
        
        #creating dataframe
        df = pd.DataFrame(zip(names, city, address,phone,web,description),
                              columns=['names','city','address','phone','web','description'])
        
        df['category'] = category
        
        return df
                
            
    else:
        return print('no se pudo, response:', response.status_code)

In [90]:
lista_url = [
        ['https://www.paginasamarillas.com.gt/servicios/gimnasios-de-educacion-fisica?page=1',
         'Gimnasios de Educación Física'],
    
        ['https://www.paginasamarillas.com.gt/servicios/gimnasios-de-educacion-fisica?page=2',
         'Gimnasios de Educación Física'],
    
        ['https://www.paginasamarillas.com.gt/servicios/gimnasios-de-educacion-fisica?page=3',
         'Gimnasios de Educación Física'],
    
        ['https://www.paginasamarillas.com.gt/servicios/federaciones-deportivas',
         'federaciones deportivas'],
    
        ['https://www.paginasamarillas.com.gt/servicios/federaciones',
         'federaciones deportivas'],
    
        ['https://www.paginasamarillas.com.gt/servicios/academias-de-ballet', 
         'ballet'],
    
        ['https://www.paginasamarillas.com.gt/servicios/bolos',
         'boliche'],
    
        ['https://www.paginasamarillas.com.gt/servicios/artes-marciales-ensenanza-de',
         'artes marciales'],
    
        ['https://www.paginasamarillas.com.gt/servicios/deportes?page=1',
         'deportes'],
    
        ['https://www.paginasamarillas.com.gt/servicios/deportes?page=2',
         'deportes'],
    
        ['https://www.paginasamarillas.com.gt/servicios/deportes?page=3',
         'deportes'],
    
        ['https://www.paginasamarillas.com.gt/servicios/deportes?page=4',
         'deportes'],
    
        ['https://www.paginasamarillas.com.gt/servicios/entrenamiento-fisico',
         'entrenamiento fisico'],
        
        ['https://www.paginasamarillas.com.gt/servicios/escuelas-de-artes-marciales',
         'artes marciales'],

        ['https://www.paginasamarillas.com.gt/servicios/escuelas-de-karate',
         'artes marciales'],

        ['https://www.paginasamarillas.com.gt/servicios/escuelas-de-natacion',
         'natacion'],

        ['https://www.paginasamarillas.com.gt/servicios/escuelas-de-vuelo-libre',
         'aviacion'],

        ['https://www.paginasamarillas.com.gt/servicios/escuelas-de-aviacion',
         'aviacion'],

        ['https://www.paginasamarillas.com.gt/servicios/golf',
         'golf'],

        ['https://www.paginasamarillas.com.gt/servicios/hipodromos',
         'hipodromos']
        ]

In [91]:
yellow = pd.DataFrame()
for url in lista_url:
    bucle = get_yellow_info(url[0],url[1])
    yellow = pd.concat([yellow,bucle],ignore_index=True)

In [None]:
yellow.head()

### Dance Academys

In [12]:
url = 'https://www.guatemala.com/guias/pasatiempos/academias-de-baile-en-guatemala/scenic-dance-guatemala.html'

In [13]:
def baile(url,category,web):
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'lxml')
    names = [soup.find('h1').text]
    city = ['Guatemala - Guatemala']
    address = [soup.find('p',class_='address').text]
    phone = [soup.find('div',class_='schedule-box span_mobile12 span_tablet3 span3').text.strip().split('\n')[1]]
    description = [soup.find('div',class_='single-content').p.text.strip()]
    category = [category]
    web = [web]
    return pd.DataFrame(zip(names, city, address,phone,web,description,category),
                              columns=['names','city','address','phone','web','description','category'])
    
    

In [14]:
lista_url = [
        ['https://www.guatemala.com/guias/pasatiempos/academias-de-baile-en-guatemala/scenic-dance-guatemala.html',
         'Baile','https://es-la.facebook.com/ScenicDance/'],
    
        ['https://www.guatemala.com/guias/pasatiempos/academias-de-baile-en-guatemala/salsa-latin-guatemala.html',
        'Baile','https://www.facebook.com/Salsa-Latin-Guatemala-178268719078/'],
        
        ['https://www.guatemala.com/guias/pasatiempos/academias-de-baile-en-guatemala/academia-dance-art.html',
         'Baile','https://es-la.facebook.com/Danceartguatemala/'],
    
        ['https://www.guatemala.com/guias/pasatiempos/academias-de-baile-en-guatemala/academia-bellydance-rashida.html',
        'Baile', 'https://es-la.facebook.com/Rashidaacademy/'],
        
        ['https://www.guatemala.com/guias/pasatiempos/academias-de-baile-en-guatemala/addiction-dance-studio.html',
        'Baile','https://es-la.facebook.com/adsgt/'],
    
        ['https://www.guatemala.com/guias/pasatiempos/academias-de-baile-en-guatemala/escuela-cubana-de-baile-.html',
        'Baile','https://es-la.facebook.com/Escuela-Cubana-de-Baile-110562958135/'],
    
        ['https://www.guatemala.com/guias/pasatiempos/academias-de-baile-en-guatemala/in-motion-dance-fitness-guatemala.html',
        'Baile','http://www.inmotiongt.com/'],
    
        ['https://www.guatemala.com/guias/pasatiempos/academias-de-baile-en-guatemala/academia-de-ballet-danzarte.html',
        'Baile','https://es-la.facebook.com/Danzarte.ballet/'],
    
        ['https://www.guatemala.com/guias/pasatiempos/academias-de-baile-en-guatemala/ritmo-y-sabor-guatemala.html',
        'Baile','https://es-la.facebook.com/ritmoysaborgt/'],
    
        ['https://www.guatemala.com/guias/pasatiempos/academias-de-baile-en-guatemala/escuela-municipal-de-danza-clasica.html',
        'Baile','https://es-la.facebook.com/EMDCgt/'],
    
        ['https://www.guatemala.com/guias/pasatiempos/academias-de-baile-en-guatemala/heroes-academy-guatemala-.html',
        'Baile','https://es-la.facebook.com/HeroesCompanyGT/'],
    
        ['https://www.guatemala.com/guias/pasatiempos/academias-de-baile-en-guatemala/unidanza-guatemala-.html',
        'Baile','http://unidanza.net/'],
    
        ['https://www.guatemala.com/guias/pasatiempos/academias-de-baile-en-guatemala/unlimited-dance-academy.html',
        'Baile','https://unlimited-dance.com.gt/'],
    
        ['https://www.guatemala.com/guias/pasatiempos/academias-de-baile-en-guatemala/the-dance-factory.html',
        'Baile','https://es-la.facebook.com/DanceFactoryGT/']
    

        ]

In [15]:
df=pd.DataFrame()
for url in lista_url:
    bucle = baile(url[0],url[1],url[2])
    df2 = pd.concat([df2,bucle],ignore_index=True)

In [16]:
df2.head()

Unnamed: 0,names,city,address,phone,web,description,categoría,category
0,Cosenza Academy,San José Pinula - Guatemala,"Km 18.9 Carretera A San José Pinula, C.C. la P...",(+502) 6637 5035,www.cosenzaacademy.com,Buscamos poder contribuir a la formación integ...,artes marciales,
1,Cosenza Academy,San José Pinula - Guatemala,"Km 18.9 Carretera A San José Pinula, C.C. la P...",(+502) 6637 5035,www.cosenzaacademy.com,Buscamos poder contribuir a la formación integ...,artes marciales,
2,Schumann'S Cordova,Guatemala - Guatemala,Blvd San Cristobal Centro Comercial San Cristo...,(+502) 2478 1097,http://www.schumanscordova.com,,artes marciales,
3,"Empresas De Competencia, Sociedad Anonima",Guatemala - Guatemala,Boulevard Rafael Landivar Paseo Cayala L 209,(+502) 2493 8000,,,artes marciales,
4,"Grupo Aquatic, S.A.",Guatemala - Guatemala,"Km 17.5 Boulevard San José Villa Nueva, Frente...",(+502) 2312 0808,www.aquaticcenterguate.com,"Clases de natación para bebés, niños y adultos...",natacion,


### Crossfit

In [17]:
url = 'https://www.guatemala.com/deportes/crossfit/gimnasios-box-para-hacer-crossfit-guatemala.html'

In [18]:
response = requests.get(url)
soup = BeautifulSoup(response.text,'lxml')

In [19]:
names = [soup.find_all('h3')[i].text for i in range(len(soup.find_all('h3'))-2)]
names.insert(2,'CrossFit 502')
names.insert(4,'Crossfit FD6')
names.insert(9,'CrossFit Spring City Once')

In [20]:
address = [soup.find_all('ul')[i].li.text.strip() for i in range(2,len(soup.find_all('ul'))-3)]

In [21]:
address.insert(1,address[1].split('|')[0])
address[2] = address[2].split('|')[1]
address.insert(3,address[3].split('|')[0])
address[4] = address[4].split('|')[1]
address.insert(8,address[8].split('|')[0])
address[9] = address[9].split('|')[1]

In [22]:
phone = [soup.find_all('ul')[i].find_all('li')[1].text.strip().split('Contacto:')[1].split('Facebook')[0].strip() for i in range(2,len(soup.find_all('ul'))-3)]
phone.insert(1,phone[1].split('|')[0])
phone[2] = phone[2].split('|')[1]
phone.insert(3,phone[3].split('|')[0])
phone[4] = phone[4].split('|')[1]
phone.insert(8,phone[8].split('|')[0])
phone[9] = phone[9].split('|')[1]

In [23]:
soup.find_all('a', href=True)[72]['href']

'https://www.facebook.com/invictusperformance/'

In [24]:
web = [soup.find_all('a', href=True)[i]['href'] for i in range(72,79)]

In [25]:
web.insert(2,'https://www.facebook.com/CF502/')
web.insert(4,'https://www.facebook.com/d6lifechangingfitness/')
web.insert(9,'https://www.facebook.com/crossfitspringcity7/')

In [26]:
description = ['crossfit' for i in range(len(names))]
category = ['crossfit' for i in range(len(names))]
city = ['Guatemala - Guatemala' for i in range(len(names))]

In [27]:
category = ['crossfit' for i in range(len(names))]

In [28]:
city = ['Guatemala - Guatemala' for i in range(len(names))]

In [29]:
crossfit = pd.DataFrame(zip(names, city, address,phone,web,description,category),
                              columns=['names','city','address','phone','web','description','category'])

In [30]:
def numero(x): #si la fecha tiene formaro de dd-mmm-aa devuelve únicamente el año
    if re.search(r"(\d)", x.lower()):
        ye = re.search(r"^\d?\d-\w\w\w-(\d\d\d?\d?)$", x.lower())
        x = ye.group(1)
        return x
    else:
        return x

In [32]:
crossfit.phone = crossfit.phone.apply(lambda x: re.search(r"(\d+\s\d+)",x).group(1))

In [33]:
crossfit.head()

Unnamed: 0,names,city,address,phone,web,description,category
0,Invictus Performance Training,Guatemala - Guatemala,"Ubicación: 8a. avenida 6-63, Zona 14",2367 1239,https://www.facebook.com/invictusperformance/,crossfit,crossfit
1,CrossFit 502,Guatemala - Guatemala,"Ubicación: Dinamia Cayalá, Zona 16",2368 5013,https://www.facebook.com/CF502/,crossfit,crossfit
2,CrossFit 502,Guatemala - Guatemala,"C.C. Escala, Carretera a El Salvador",6637 3778,https://www.facebook.com/CF502/,crossfit,crossfit
3,CrossFit FD6,Guatemala - Guatemala,"Ubicación: Plaza Futeca, Zona 14",2213 6900,https://www.facebook.com/d6lifechangingfitness/,crossfit,crossfit
4,Crossfit FD6,Guatemala - Guatemala,"C.C. Miraflores, Zona 11",2225 3200,https://www.facebook.com/d6lifechangingfitness/,crossfit,crossfit


### Futeca

In [34]:
url = 'https://futecagym.com/sedes/'
response = requests.get(url).text

In [35]:
soup = BeautifulSoup(response,'lxml')

In [36]:
general = soup.find_all('div', class_='column_attr clearfix')

In [37]:
names= ['Futeca Gym - '+general[i].find_all('h4')[0].text for i in range(0,len(general),2) ]

In [38]:
address = [general[i].find_all('p')[0].text.split('Dirección')[1].strip() for i in range(0,len(general),2) ]

In [39]:
phone = [general[i].find_all('p')[1].text for i in range(0,len(general),2) ]

In [40]:
df = pd.DataFrame(zip(names,address,phone),columns=['names','address','phone'])

In [41]:
df['city'] = 'Guatemala - Guatemala'
df['category'] = 'Gimnasios de Educación Física'
df['web'] = 'https://futecagym.com/'
df['description'] = '''Futeca Gym, es un gimnasio que nace en el 2010. Enfocado en mejorar la calidad de vida de sus socios, a través del ejercicio, una buena alimentación y la creación de comunidades estrechas entre socios.'''

In [42]:
futeca = df[['names','city','address','phone','web','description','category']]

In [43]:
futeca.head()

Unnamed: 0,names,city,address,phone,web,description,category
0,Futeca Gym - Zona 14,Guatemala - Guatemala,"10ma. Avenida 10-50 zona 14. Plaza Futeca, Loc...",Teléfono: +502 22136900,https://futecagym.com/,"Futeca Gym, es un gimnasio que nace en el 2010...",Gimnasios de Educación Física
1,Futeca Gym - Pradera Concepción,Guatemala - Guatemala,"Km. 15.5 Carretera a El Salvador, Finca Concep...",Teléfono: +502 66339866,https://futecagym.com/,"Futeca Gym, es un gimnasio que nace en el 2010...",Gimnasios de Educación Física
2,Futeca Gym - Miraflores,Guatemala - Guatemala,"22 avenida, 7ma. calle zona 11, Paseo Miraflores",Teléfono: +502 22253200,https://futecagym.com/,"Futeca Gym, es un gimnasio que nace en el 2010...",Gimnasios de Educación Física
3,Futeca Gym - San Cristóbal,Guatemala - Guatemala,Blvd. Principal 19-65 sector B1 San Cristóbal ...,Teléfono +502 24796470,https://futecagym.com/,"Futeca Gym, es un gimnasio que nace en el 2010...",Gimnasios de Educación Física
4,Futeca Gym - Naranjo,Guatemala - Guatemala,"23 calle 10-00 Zona 4 Condado Naranjo , local ...",Whatsapp: +502 410001310,https://futecagym.com/,"Futeca Gym, es un gimnasio que nace en el 2010...",Gimnasios de Educación Física


### Fitness One

In [44]:
url = 'https://fitnessone.com.gt/pradera/'
response = requests.get(url).text
soup = BeautifulSoup(response,'lxml')

In [45]:
general = soup.find_all('h1')

In [46]:
locations =['cayala','pradera','rus','sankrismall']
names=[]
city=[]
address =[]
phone =[]
for location in locations:
    url = 'https://fitnessone.com.gt/{}/'.format(location)
    response = requests.get(url).text
    soup = BeautifulSoup(response,'lxml')
    general = soup.find_all('h1')
    names.append('Fitness One - '+general[1].text)
    city.append('Guatemala-Guatemala')
    address.append(soup.find_all('div',class_='et_pb_text_inner')[4].find_all('p')[0].text)
    phone.append(soup.find_all('div',class_='et_pb_text_inner')[4].find_all('p')[1].text)

In [47]:
df = pd.DataFrame(zip(names,city,address,phone),columns=['names','city','address','phone'])

In [48]:
df['city'] = 'Guatemala - Guatemala'
df['category'] = 'Gimnasios de Educación Física'
df['web'] = 'https://fitnessone.com.gt/'
df['description'] = 'Todo lo que necesitas para transformar tu vida y tu cuerpo, las mejores ubicaciones, personal capacitado para hacer del ejercicio parte de tu vida.'
fitnessone = df[['names','city','address','phone','web','description','category']]

In [49]:
fitnessone.head()

Unnamed: 0,names,city,address,phone,web,description,category
0,Fitness One - Cayalá,Guatemala - Guatemala,Diagonal 35 Boulevard Austriaco 16-25 Zona 16 ...,+502 2491-4333,https://fitnessone.com.gt/,Todo lo que necesitas para transformar tu vida...,Gimnasios de Educación Física
1,Fitness One - Pradera,Guatemala - Guatemala,Boulevar Los Próceres 25-74 Zona 10,+502 2423-6000,https://fitnessone.com.gt/,Todo lo que necesitas para transformar tu vida...,Gimnasios de Educación Física
2,Fitness One - RUS,Guatemala - Guatemala,Calzada Roosevelt 12-76 zona 7 C.C. RUS,+502 2226-2226,https://fitnessone.com.gt/,Todo lo que necesitas para transformar tu vida...,Gimnasios de Educación Física
3,Fitness One - Sankris Mall,Guatemala - Guatemala,3ra. Calle Sector A-3 Boulevard San Cristóbal ...,+502 2424-4848,https://fitnessone.com.gt/,Todo lo que necesitas para transformar tu vida...,Gimnasios de Educación Física


### Scandinavia

In [50]:
url = 'https://www.scandinaviagym.com/#gimnasios'
#response = requests.get(url).text

In [51]:
driver = webdriver.Chrome()

driver.get(url)

resp = driver.execute_script("return document.documentElement.outerHTML")

driver.quit()

soup = BeautifulSoup(resp,'lxml')

In [52]:
general = soup.find_all('h3')

In [53]:
names = ['Scandinavia - '+name.text for name in general]

In [55]:
address = [ad.h4.text for ad in soup.find_all('div',class_='team-detail')]

In [56]:
phone= [phone.text for phone in soup.find_all('b')]

In [57]:
phone = [tel for tel in phone if tel!='']

In [58]:
phone[0] = 'Teléfonos: 2459-2586 y 2377-3013'
phone[3] = 'Teléfonos: 2269 5605'
phone[5] = 'Teléfonos: 3502 1704'
phone[6] = 'Teléfonos: 7832 6763'
phone[8] = 'Teléfonos: 2478 1566'
phone[9] = 'Teléfonos: 2212 7513'
phone[10] = 'Teléfonos: 2474 2815'
phone[12] = 'Teléfonos: 2335 7683'
phone[-3] = 'Teléfonos: 2211 0046'

In [59]:
city = ['Guatemala - Guatemala' for i in range(len(phone))]

In [61]:
df = pd.DataFrame(zip(names,city,address,phone),columns=['names','city','address','phone'])

In [62]:
df['category'] = 'Gimnasios de Educación Física'
df['city'] = 'Guatemala - Guatemala'
df['web'] = 'https://www.scandinaviagym.com/'
df['description'] = 'Cadena de 14 gimnasios. Siempre Cerca de Ti. Amplios Horarios. En las principales ciudades de Guatemala.'
scandinavia = df[['names','city','address','phone','web','description','category']]

In [63]:
scandinavia.head()

Unnamed: 0,names,city,address,phone,web,description,category
0,Scandinavia - Aguilar Batres,Guatemala - Guatemala,"Calzada Aguilar Batres14-07, Zona 12",Teléfonos: 2459-2586 y 2377-3013,https://www.scandinaviagym.com/,Cadena de 14 gimnasios. Siempre Cerca de Ti. A...,Gimnasios de Educación Física
1,Scandinavia - Vista Hermosa,Guatemala - Guatemala,Boulevard Vista Hermosa 21-95 zona 15,Teléfonos: 2277-9296 y 2369-1018,https://www.scandinaviagym.com/,Cadena de 14 gimnasios. Siempre Cerca de Ti. A...,Gimnasios de Educación Física
2,Scandinavia - Carretera a El Salvador,Guatemala - Guatemala,"Km. 16.5, Carretera a El Salvador Salida a San...",Teléfonos: 2277-9297 y 6634-0633,https://www.scandinaviagym.com/,Cadena de 14 gimnasios. Siempre Cerca de Ti. A...,Gimnasios de Educación Física
3,Scandinavia - Metronorte,Guatemala - Guatemala,Centro Comercial Metronorte,Teléfonos: 2269 5605,https://www.scandinaviagym.com/,Cadena de 14 gimnasios. Siempre Cerca de Ti. A...,Gimnasios de Educación Física
4,Scandinavia - San Rafael,Guatemala - Guatemala,"km. 7.5 Carretera a El Atlántico zona 18, c.c....",Teléfonos: 2267 2919,https://www.scandinaviagym.com/,Cadena de 14 gimnasios. Siempre Cerca de Ti. A...,Gimnasios de Educación Física


### Smart Fit

In [64]:
url = 'https://www.smartfit.com.gt/gimnasios'
response = requests.get(url).text
soup = BeautifulSoup(response,'lxml')

In [65]:
names = ['Smart Fit - '+soup.find_all('h3')[name].text for name in range(1,len(soup.find_all('h3'))-1)]

In [66]:
dires = soup.find_all('p',class_='Text')
address = [dires[add].text.split('-',1)[1] for add in range(len(dires))]

In [67]:
phone = [np.nan for i in range(len(address))]

In [68]:
city = ['Guatemala - Guatemala' for i in range(len(phone))]

In [69]:
df = pd.DataFrame(zip(names,city,address,phone),columns=['names','city','address','phone'])

In [70]:
df['category'] = 'Gimnasios de Educación Física'
df['city'] = 'Guatemala - Guatemala'
df['web'] = 'https://www.smartfit.com.gt/'
df['description'] = 'Inaugurada en 2009, Smart Fit fue creada con el propósito de democratizar el acceso a la práctica de actividad física de alto nivel, con planes accesibles y adhesión facilitada.'
smartfit = df[['names','city','address','phone','web','description','category']]

In [71]:
smartfit.head()

Unnamed: 0,names,city,address,phone,web,description,category
0,Smart Fit - El Naranjo,Guatemala - Guatemala,"Plaza Kalú, locales R2-R5 - 23 calle 14-50 Zo...",,https://www.smartfit.com.gt/,"Inaugurada en 2009, Smart Fit fue creada con e...",Gimnasios de Educación Física
1,Smart Fit - Tikal Futura,Guatemala - Guatemala,Calzada Roosevelt 23-43 - Ciudad de Guatemal...,,https://www.smartfit.com.gt/,"Inaugurada en 2009, Smart Fit fue creada con e...",Gimnasios de Educación Física
2,Smart Fit - QUO Zona 4,Guatemala - Guatemala,"Via 7, 4-20, Zona 4, Torre Q, Segundo Nivel, ...",,https://www.smartfit.com.gt/,"Inaugurada en 2009, Smart Fit fue creada con e...",Gimnasios de Educación Física
3,Smart Fit - Los Próceres,Guatemala - Guatemala,"3ª Avenida 16-52, Z.10 - Centro Comercial L...",,https://www.smartfit.com.gt/,"Inaugurada en 2009, Smart Fit fue creada con e...",Gimnasios de Educación Física
4,Smart Fit - Rambla,Guatemala - Guatemala,Centro Comercial Rambla 10 - Boulevar Los Pró...,,https://www.smartfit.com.gt/,"Inaugurada en 2009, Smart Fit fue creada con e...",Gimnasios de Educación Física


### Poltec 

In [72]:
url = 'http://www.poltecgym.com/'
#response = requests.get(url).text
#soup = BeautifulSoup(response,'lxml')

In [73]:
driver = webdriver.Chrome()

driver.get(url)

resp = driver.execute_script("return document.documentElement.outerHTML")

driver.quit()

soup = BeautifulSoup(resp,'lxml')

In [74]:
nombres = soup.find_all('h3')

names = ['Poltec - '+nombres[i].text.strip() for i in range(4,len(nombres)) ]

In [75]:
tel = soup.find_all('address')
phone = [tel[i].text.strip().split('T.')[1].split('\n')[0] for i in range(1,len(tel)-1) ]

In [76]:
address = [i for i in range(6)]
address[0]='Km. 5 Carretera al Atlántico Zona 17, Guatemala Guatemala'
address[1]='Ruta al Atlántico CC. Plaza San Rafael Zona 18 Guatemala'
address[2]='45 Calle 19-40, zona 12, C. C. Gran Portal Petapa '
address[3]='Calzada Roosevelt 13-88'
address[4]='Km. 17.5 Pacifico, Terreno #6, C.C. Santa Clara, Villa Nueva., Guatemala 01011, Guatemala'
address[5]='Dirección: Km. 17.9 Carretera a San Jose Pinula, Centro Comercial Local 14, El Faro'

In [77]:
df = pd.DataFrame(zip(names,city,address,phone),columns=['names','city','address','phone'])

In [78]:
df['category'] = 'Gimnasios de Educación Física'
df['city'] = 'Guatemala - Guatemala'
df['web'] = 'http://www.poltecgym.com/'
df['description'] = 'El ejercicio es parte importante de un estilo de vida saludable. El ejercicio previene los problemas de salud, desarrolla resistencia, brinda más energía y puede ayudar a reducir el estrés.'
poltec = df[['names','city','address','phone','web','description','category']]


In [79]:
poltec.head()

Unnamed: 0,names,city,address,phone,web,description,category
0,Poltec - CC METRONORTE,Guatemala - Guatemala,"Km. 5 Carretera al Atlántico Zona 17, Guatemal...",2256-0882,http://www.poltecgym.com/,El ejercicio es parte importante de un estilo ...,Gimnasios de Educación Física
1,Poltec - CC PLAZA SAN RAFAEL,Guatemala - Guatemala,Ruta al Atlántico CC. Plaza San Rafael Zona 18...,2261-6391,http://www.poltecgym.com/,El ejercicio es parte importante de un estilo ...,Gimnasios de Educación Física
2,Poltec - CC GRAN PORTAL PETAPA,Guatemala - Guatemala,"45 Calle 19-40, zona 12, C. C. Gran Portal Pet...",2460-3185,http://www.poltecgym.com/,El ejercicio es parte importante de un estilo ...,Gimnasios de Educación Física
3,Poltec - CC ASIA MALL (ROOSEVELT),Guatemala - Guatemala,Calzada Roosevelt 13-88,24743214,http://www.poltecgym.com/,El ejercicio es parte importante de un estilo ...,Gimnasios de Educación Física
4,Poltec - CC SANTA CLARA,Guatemala - Guatemala,"Km. 17.5 Pacifico, Terreno #6, C.C. Santa Clar...",6644-8279,http://www.poltecgym.com/,El ejercicio es parte importante de un estilo ...,Gimnasios de Educación Física


### Curves

In [80]:
url = 'https://www.curveslatinoamerica.com/guatemala.html#'
driver = webdriver.Chrome()

driver.get(url)

resp = driver.execute_script("return document.documentElement.outerHTML")

driver.quit()

soup = BeautifulSoup(resp,'lxml')

In [81]:
general = soup.find_all('div', class_='col-xs-12 col-sm-12 col-md-6 col-lg-6')

In [82]:
names = ['Curves - '+general[i].find('p').text.strip() for i in range(len(general)) ]

In [83]:
address = [general[i].find('small').text.strip().split('\n')[0] for i in range(len(general)) ]

In [84]:
phone = [general[i].find('small').text.strip().split('\n')[1].strip() for i in range(len(general)) ]

In [85]:
city = ['Guatemala - Guatemala', 'Quetzaltenango']

In [86]:
df = pd.DataFrame(zip(names,city,address,phone),columns=['names','city','address','phone'])

In [87]:
df['category'] = 'Gimnasios de Educación Física'
df['city'] = 'Guatemala - Guatemala'
df['web'] = 'https://www.curveslatinoamerica.com/guatemala.html#'
df['description'] = 'Curves es un gimnasio para mujeres, la franquicia ofrece un programa de perdida de peso saludable y razonable.'
curves = df[['names','city','address','phone','web','description','category']]

In [88]:
curves

Unnamed: 0,names,city,address,phone,web,description,category
0,Curves - Reforma Zona 15,Guatemala - Guatemala,1a calle 22-13 zona 15 Vista Hermosa II C.P. 1...,Teléfonos: (502) 2369 2622 | (502) 2369 2644 |...,https://www.curveslatinoamerica.com/guatemala....,"Curves es un gimnasio para mujeres, la franqui...",Gimnasios de Educación Física
1,Curves - Quetzaltenango,Guatemala - Guatemala,"Avenida 9-66 Zona 3, Plaza Delco C.P. 9001",Teléfono: (502) 7763 6831,https://www.curveslatinoamerica.com/guatemala....,"Curves es un gimnasio para mujeres, la franqui...",Gimnasios de Educación Física


### Dinamic Fitness

In [93]:
dinamic_fitness_url= ['https://dinamic.com.gt/naranjo/',
                      'https://dinamic.com.gt/forum-zona-10/',
                      'https://dinamic.com.gt/san-juan/',
                      'https://dinamic.com.gt/villa-hermosa/',
                      'https://dinamic.com.gt/portales/',
                      'https://dinamic.com.gt/roosevelt/']

In [94]:
def dinamic_fitness(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.text,'lxml')
    names = ['Dinamic Fitness - ' + soup.find('h2').text]
    city = ['Guatemala - Guatemala']
    
    if len(soup.find_all('h3'))<=2:
        address = soup.find_all('h3')[0]
        phone = soup.find_all('h3')[1]
    elif len(soup.find_all('h3'))==4:
        address = soup.find_all('h3')[0]
        phone = soup.find_all('h3')[3]
    else:
        address = soup.find_all('h3')[0]
        phone = soup.find_all('h3')[2]
        
    web = ['https://dinamic.com.gt/']
    description = ['Somos un nuevo tipo de gym, un lugar para ponerte en forma. ¡Un lugar para ser feliz!Nuestros programas de ejercicios están diseñados bajo una planificación de entrenamiento cardiovascular y de fuerza. Todo con un soporte científico que garantiza que obtendrás resultados de forma segura.']
    category = ['Gimnasios de Educación Física']
    return pd.DataFrame(zip(names, city, address,phone,web,description,category),
                              columns=['names','city','address','phone','web','description','category'])

In [95]:
dinamicfitness=pd.DataFrame()
for url in dinamic_fitness_url:
    bucle = dinamic_fitness(url)
    dinamicfitness = pd.concat([dinamicfitness,bucle],ignore_index=True)

In [96]:
dinamicfitness.phone[0] = np.nan

In [97]:
dinamicfitness.head()

Unnamed: 0,names,city,address,phone,web,description,category
0,Dinamic Fitness - PASAJE NARANJO,Guatemala - Guatemala,10 avenida 18-58 zona 4 de mixco Condado Naran...,,https://dinamic.com.gt/,"Somos un nuevo tipo de gym, un lugar para pone...",Gimnasios de Educación Física
1,Dinamic Fitness - FORUM ZONA 10,Guatemala - Guatemala,3ra ave 10-80 Zona 10 Edificio Forum Zona Viva...,TEL: 2214-0707,https://dinamic.com.gt/,"Somos un nuevo tipo de gym, un lugar para pone...",Gimnasios de Educación Física
2,Dinamic Fitness - CENTRO 21,Guatemala - Guatemala,[CALZADA SAN JUAN 21-14 ],[TEL: 2243-0003],https://dinamic.com.gt/,"Somos un nuevo tipo de gym, un lugar para pone...",Gimnasios de Educación Física
3,Dinamic Fitness - VILLA HERMOSA,Guatemala - Guatemala,CENTRO COMERCIAL PACIFIC VILLA HERMOSA,TEL: 2214-0505,https://dinamic.com.gt/,"Somos un nuevo tipo de gym, un lugar para pone...",Gimnasios de Educación Física
4,Dinamic Fitness - ZONA PORTALES,Guatemala - Guatemala,3ra. Av. 10-10 Zona 17 km. 4.5 Ruta al Atlanti...,TEL: 2217-1111,https://dinamic.com.gt/,"Somos un nuevo tipo de gym, un lugar para pone...",Gimnasios de Educación Física


In [None]:
gt = pd.read_csv('gt_V1.csv')
gt.drop(columns='Unnamed: 0',inplace=True)
gt

### GT 

In [102]:
paginas = [i for i in range(1,12)]

In [103]:
h2s=[]
for p in paginas:
    response = requests.get('https://directorio.guatemala.com/listado/guia/gimnasios/pagina/{}'.format(p))
    h2s.append(BeautifulSoup(response.text,'lxml').find_all('h2'))

In [104]:
directorio = []
for h2 in h2s:
    for h in h2:
        directorio.append(h.text.strip())

In [105]:
directorio = set(directorio)
directorio = list(directorio)

In [106]:
slug = [slugify(i) for i in directorio]

In [107]:
url = 'https://directorio.guatemala.com/listado/{}.html'.format(slug[0])
response = requests.get(url)
soup = BeautifulSoup(response.text,'lxml')

In [108]:
def gtcom(url):
    malos=[]
    response = requests.get(url)
    soup = BeautifulSoup(response.text,'lxml')
    business = soup.find('div',class_='content-info-business')
    try:
        names = [business.find('h1').text]
    except:
        names = [np.nan]
        malos.append(sl)
    try:
        address = [business.find('span').text]
        city = ['Guatemala - Guatemala']
    except:
        address =[np.nan]
        city=[np.nan]
    try:
        phone = [soup.find('div',class_='business-buttons').find('a',href=True)['href']]
    except:
        phone=[np.nan]
    try:
        web = [soup.find('div',class_='business-buttons').find_all('a',href=True)[2]['href']]
    except:
        web=[np.nan]
    try:
        description = [soup.find('div',class_='info-description').text.strip().split('Sobre la empresa\n')[1].strip()]
    except:
        description =[np.nan]
    try:
        category = [soup.find('div',class_='business-categories').text.strip().split('Listado en: \n\n')[1].strip()]
    except:
        category=[np.nan]
    
    return pd.DataFrame(zip(names, city, address,phone,web,description,category),
                              columns=['names','city','address','phone','web','description','category'])

In [109]:
gt = pd.DataFrame()
for sl in slug:
    bucle = gtcom('https://directorio.guatemala.com/listado/{}.html'.format(sl))
    
    gt = pd.concat([gt,bucle],ignore_index=True)
    

In [110]:
gt.head()

Unnamed: 0,names,city,address,phone,web,description,category
0,Zenith Training Center,Guatemala - Guatemala,"18 avenida ""A"", 1-17 Vista Hermosa II, Zona 15",tel:+(502) 2310-0546,https://directorio.guatemala.com/listing_repor...,¡Ya olvídate de los gimnasios convencionales! ...,TRX\n \n\n ...
1,Active Mom,Guatemala - Guatemala,13 calle A 7-19 Centro Comercial Tiffany,tel:+(502) 2363-6424,,Active Mom es un gimnasio diseñado para mujere...,Yoga
2,Micheo Boxing by Futeca Zona 14,Guatemala - Guatemala,10a Avenida 10-50 Zona 14,tel:+(502) 2366-8052,,En Micheo Boxing by Futeca somos una academia ...,Boxeo
3,Neurogym Antigua,Guatemala - Guatemala,5a. Avenida Norte #17,tel:(502) 7831-5210 | 5892-2527,https://directorio.guatemala.com/listing_repor...,Neurogym Antigua ofrece un programa de estimul...,Pesas
4,Ananda Yoga Estudio,Guatemala - Guatemala,Diagonal 6 13-63 Zona 10,tel:+(502) 3025-5117,https://directorio.guatemala.com/listing_repor...,"Ananda proviene del sanscrito ""felicidad supre...",Yoga
