In [11]:
import re
import requests
from bs4 import BeautifulSoup
import unidecode
import numpy as np
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd
from tqdm import tqdm
from tqdm import trange

# Obtención de información

Each establishment will initially consist of following fields:
- id
- name
- description
- phone
- website
- hours
- category
- address
- zip code
- city
- country
- latitude
- longitude

In [2]:
def get_yellow_info_improved(url,category):
    
    names=[]
    city=[]
    address=[]
    phone=[]
    web=[]
    description=[]
    
    response = requests.get(url)
    
    #get soup
    soup = BeautifulSoup(response.text,'lxml')
    
    #getting main box
    general = soup.find_all('div', itemtype="https://schema.org/LocalBusiness")

    
    if len(general)>0:
        for i in range(len(general)):

            #NAMES
            try:
                names.append(general[i].find('a', class_='companyName').text.split('\n')[1].strip())
            except:
                names.append(np.nan)

           #CITY
            try:
                city.append(general[i].find('span', class_='city').text.strip())
            except:
                city.append(np.nan)

            #Address
            try:
                address.append(general[i].find('span', class_='directionFig').text.split('\n')[1].strip())
            except:
                address.append(np.nan)


            #phone
            try:
                phone.append(general[i].find('span', class_='phoneFig hide').text.strip())
            except:
                phone.append(np.nan)

            #web
            try:
                web.append(general[i].find('a', class_='webLink').text.strip())

            except:
                web.append(np.nan)

            #description
            try:
                description.append(general[i].find('div', class_='col-sm-12 infoBox').text.strip())
            except:
                description.append(np.nan)
                
        
        #creating dataframe
        df = pd.DataFrame(zip(names, city, address,phone,web,description),
                              columns=['names','city','address','phone','web','description'])
        
        df['categoría'] = category
        
        return df
                
            
    else:
        return print('no se pudo, response:', response.status_code)

# First Batch

In [8]:
lista_url = [
        ['https://www.paginasamarillas.com.gt/servicios/gimnasios-de-educacion-fisica?page=1',
         'Gimnasios de educacion física'],
    
        ['https://www.paginasamarillas.com.gt/servicios/gimnasios-de-educacion-fisica?page=2',
         'Gimnasios de educacion física'],
    
        ['https://www.paginasamarillas.com.gt/servicios/gimnasios-de-educacion-fisica?page=3',
         'Gimnasios de educacion física'],
    
        ['https://www.paginasamarillas.com.gt/servicios/federaciones-deportivas',
         'federaciones deportivas'],
    
        ['https://www.paginasamarillas.com.gt/servicios/federaciones',
         'federaciones deportivas'],
    
        ['https://www.paginasamarillas.com.gt/servicios/academias-de-ballet', 
         'ballet'],
    
        ['https://www.paginasamarillas.com.gt/servicios/bolos',
         'boliche'],
    
        ['https://www.paginasamarillas.com.gt/servicios/artes-marciales-ensenanza-de',
         'artes marciales'],
    
        ['https://www.paginasamarillas.com.gt/servicios/deportes?page=1',
         'deportes'],
    
        ['https://www.paginasamarillas.com.gt/servicios/deportes?page=2',
         'deportes'],
    
        ['https://www.paginasamarillas.com.gt/servicios/deportes?page=3',
         'deportes'],
    
        ['https://www.paginasamarillas.com.gt/servicios/deportes?page=4',
         'deportes'],
    
        ['https://www.paginasamarillas.com.gt/servicios/entrenamiento-fisico',
         'entrenamiento fisico']
        ]

In [10]:
df = pd.DataFrame()

In [12]:

for url in lista_url:
    bucle = get_yellow_info_improved(url[0],url[1])
    df = pd.concat([df,bucle],ignore_index=True)
    

In [19]:
df.tail()

Unnamed: 0,names,city,address,phone,web,description,categoría
164,San Silvestre Spa Sauna Y Jacuzzi,Guatemala - Guatemala,1 Av Cl A 4-64 Z 7 Almolonga Quetzaltenango,(+502) 7766 8587,,,entrenamiento fisico
165,Sporta Guatemala S.A.,Guatemala - Guatemala,KM 18.5 CARRET A EL SALVADOR,(+502) 6625 9100,,,entrenamiento fisico
166,Vmedical Spa,Guatemala - Guatemala,3 Cl A 8-38 Edif Renova Z 10 3 Nivel,(+502) 3006 3604,,,entrenamiento fisico
167,Fiore,Guatemala - Guatemala,Av Americas 9-55 Plaza Oro L 10,(+502) 2366 7166,,,entrenamiento fisico
168,Hostal Los Volcanes,Guatemala - Guatemala,16 Calle 8-00 Zona 13 Aurora 1,(+502) 2360 3232,,,entrenamiento fisico


# Second Batch

In [22]:
lista_url2 = [
                ['https://www.paginasamarillas.com.gt/servicios/escuelas-de-artes-marciales',
                 'artes marciales'],

                ['https://www.paginasamarillas.com.gt/servicios/escuelas-de-karate',
                 'artes marciales'],

                ['https://www.paginasamarillas.com.gt/servicios/escuelas-de-natacion',
                 'natacion'],

                ['https://www.paginasamarillas.com.gt/servicios/escuelas-de-vuelo-libre',
                 'aviacion'],

                ['https://www.paginasamarillas.com.gt/servicios/escuelas-de-aviacion',
                 'aviacion'],

                ['https://www.paginasamarillas.com.gt/servicios/golf',
                 'golf'],

                ['https://www.paginasamarillas.com.gt/servicios/hipodromos',
                 'hipodromos']
                ]

In [23]:
df2 = pd.DataFrame()

In [24]:

for url in lista_url2:
    bucle = get_yellow_info_improved(url[0],url[1])
    df2 = pd.concat([df2,bucle],ignore_index=True)
    

In [25]:
df2

Unnamed: 0,names,city,address,phone,web,description,categoría
0,Cosenza Academy,San José Pinula - Guatemala,"Km 18.9 Carretera A San José Pinula, C.C. la P...",(+502) 6637 5035,www.cosenzaacademy.com,Buscamos poder contribuir a la formación integ...,artes marciales
1,Cosenza Academy,San José Pinula - Guatemala,"Km 18.9 Carretera A San José Pinula, C.C. la P...",(+502) 6637 5035,www.cosenzaacademy.com,Buscamos poder contribuir a la formación integ...,artes marciales
2,Schumann'S Cordova,Guatemala - Guatemala,Blvd San Cristobal Centro Comercial San Cristo...,(+502) 2478 1097,http://www.schumanscordova.com,,artes marciales
3,"Empresas De Competencia, Sociedad Anonima",Guatemala - Guatemala,Boulevard Rafael Landivar Paseo Cayala L 209,(+502) 2493 8000,,,artes marciales
4,"Grupo Aquatic, S.A.",Guatemala - Guatemala,"Km 17.5 Boulevard San José Villa Nueva, Frente...",(+502) 2312 0808,www.aquaticcenterguate.com,"Clases de natación para bebés, niños y adultos...",natacion
5,Waterproof Swim Academy,Guatemala - Guatemala,15 Av A y 3 Cl D Mixco Z 8,(+502) 2375 8195,,,natacion
6,"Federacion Nacional De Natacion, Clavados, Pol...",Guatemala - Guatemala,10 Av Z-4,(+502) 2334 1075,http://www.fenadegua.gt,,natacion
7,Asociacion Nacional De Vuelo Libre De Guatemala,Guatemala - Guatemala,10 Calle 2-28 Z 9 Casa de la Cultura,(+502) 2385 1221,www.vuelolibre.com.gt,Promover la práctica del vuelo libre como herr...,aviacion
8,Gt Aviation,Guatemala - Guatemala,Avenida Hincapié y 18 Calle Zona 13 Hangar K-2,(+502) 2458 4307,,,aviacion
9,"Golf Total, S.A.",Guatemala - Guatemala,Av 30 Final Z.11,(+502) 2417 0000,,,golf


In [26]:
df = pd.concat([df,df2], ignore_index=True)

In [27]:
df.shape

(181, 7)

In [28]:
df.head()

Unnamed: 0,names,city,address,phone,web,description,categoría
0,Full Contact Studio,Guatemala - Guatemala,2 C 10-06 Z-1,(+502) 2232 6575,,,Gimnasios de educacion física
1,Club Atletico Atenas,Guatemala - Guatemala,Diagonal 14. 22-06 Zona 5,(+502) 2335 3680,http://www.clubatenas.com,,Gimnasios de educacion física
2,Le Petit Spa,Guatemala - Guatemala,Dg 6 14-86 Z 10,(+502) 2366 7508,http://www.lepetitspagt.com,,Gimnasios de educacion física
3,Madiel Spa,Guatemala - Guatemala,Av las Americas 7-30 Z 13 Edif Real América L ...,(+502) 2360 5070,http://www.madielspa.es.tl,,Gimnasios de educacion física
4,Armonia Colonial,Guatemala - Guatemala,Boulevard Principal 23 Calle 18-96 Col. Villa ...,(+502) 2448 2553,,,Gimnasios de educacion física


In [72]:
baile('https://www.guatemala.com/guias/pasatiempos/academias-de-baile-en-guatemala/scenic-dance-guatemala.html','baile description','webtest')

Unnamed: 0,names,city,address,phone,web,description,category
0,Scenic Dance Guatemala,Guatemala - Guatemala,"3a. avenida, 14-19 zona 14, Ciudad de Guatemal...",2368-0858 | 2478-4688,webtest,¿Listo para aprender nuevos pasos? Scenic Danc...,baile description


In [58]:
url = 'https://www.guatemala.com/guias/pasatiempos/academias-de-baile-en-guatemala/scenic-dance-guatemala.html'


In [71]:
def baile(url,category,web):
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'lxml')
    names = [soup.find('h1').text]
    city = ['Guatemala - Guatemala']
    address = [soup.find('p',class_='address').text]
    phone = [soup.find('div',class_='schedule-box span_mobile12 span_tablet3 span3').text.strip().split('\n')[1]]
    description = [soup.find('div',class_='single-content').p.text.strip()]
    category = [category]
    web = [web]
    return pd.DataFrame(zip(names, city, address,phone,web,description,category),
                              columns=['names','city','address','phone','web','description','category'])
    
    

In [75]:
lista_url = [
        ['https://www.guatemala.com/guias/pasatiempos/academias-de-baile-en-guatemala/scenic-dance-guatemala.html',
         'Baile','https://es-la.facebook.com/ScenicDance/'],
    
        ['https://www.guatemala.com/guias/pasatiempos/academias-de-baile-en-guatemala/salsa-latin-guatemala.html',
        'Baile','https://www.facebook.com/Salsa-Latin-Guatemala-178268719078/'],
        
        ['https://www.guatemala.com/guias/pasatiempos/academias-de-baile-en-guatemala/academia-dance-art.html',
         'Baile','https://es-la.facebook.com/Danceartguatemala/'],
    
        ['https://www.guatemala.com/guias/pasatiempos/academias-de-baile-en-guatemala/academia-bellydance-rashida.html',
        'Baile', 'https://es-la.facebook.com/Rashidaacademy/'],
        
        ['https://www.guatemala.com/guias/pasatiempos/academias-de-baile-en-guatemala/addiction-dance-studio.html',
        'Baile','https://es-la.facebook.com/adsgt/'],
    
        ['https://www.guatemala.com/guias/pasatiempos/academias-de-baile-en-guatemala/escuela-cubana-de-baile-.html',
        'Baile','https://es-la.facebook.com/Escuela-Cubana-de-Baile-110562958135/'],
    
        ['https://www.guatemala.com/guias/pasatiempos/academias-de-baile-en-guatemala/in-motion-dance-fitness-guatemala.html',
        'Baile','http://www.inmotiongt.com/'],
    
        ['https://www.guatemala.com/guias/pasatiempos/academias-de-baile-en-guatemala/academia-de-ballet-danzarte.html',
        'Baile','https://es-la.facebook.com/Danzarte.ballet/'],
    
        ['https://www.guatemala.com/guias/pasatiempos/academias-de-baile-en-guatemala/ritmo-y-sabor-guatemala.html',
        'Baile','https://es-la.facebook.com/ritmoysaborgt/'],
    
        ['https://www.guatemala.com/guias/pasatiempos/academias-de-baile-en-guatemala/escuela-municipal-de-danza-clasica.html',
        'Baile','https://es-la.facebook.com/EMDCgt/'],
    
        ['https://www.guatemala.com/guias/pasatiempos/academias-de-baile-en-guatemala/heroes-academy-guatemala-.html',
        'Baile','https://es-la.facebook.com/HeroesCompanyGT/'],
    
        ['https://www.guatemala.com/guias/pasatiempos/academias-de-baile-en-guatemala/unidanza-guatemala-.html',
        'Baile','http://unidanza.net/'],
    
        ['https://www.guatemala.com/guias/pasatiempos/academias-de-baile-en-guatemala/unlimited-dance-academy.html',
        'Baile','https://unlimited-dance.com.gt/'],
    
        ['https://www.guatemala.com/guias/pasatiempos/academias-de-baile-en-guatemala/the-dance-factory.html',
        'Baile','https://es-la.facebook.com/DanceFactoryGT/']
    

        ]

In [76]:
df=pd.DataFrame()
for url in lista_url:
    bucle = baile(url[0],url[1],url[2])
    df2 = pd.concat([df2,bucle],ignore_index=True)

In [77]:
df2

Unnamed: 0,names,city,address,phone,web,description,categoría,category
0,Cosenza Academy,San José Pinula - Guatemala,"Km 18.9 Carretera A San José Pinula, C.C. la P...",(+502) 6637 5035,www.cosenzaacademy.com,Buscamos poder contribuir a la formación integ...,artes marciales,
1,Cosenza Academy,San José Pinula - Guatemala,"Km 18.9 Carretera A San José Pinula, C.C. la P...",(+502) 6637 5035,www.cosenzaacademy.com,Buscamos poder contribuir a la formación integ...,artes marciales,
2,Schumann'S Cordova,Guatemala - Guatemala,Blvd San Cristobal Centro Comercial San Cristo...,(+502) 2478 1097,http://www.schumanscordova.com,,artes marciales,
3,"Empresas De Competencia, Sociedad Anonima",Guatemala - Guatemala,Boulevard Rafael Landivar Paseo Cayala L 209,(+502) 2493 8000,,,artes marciales,
4,"Grupo Aquatic, S.A.",Guatemala - Guatemala,"Km 17.5 Boulevard San José Villa Nueva, Frente...",(+502) 2312 0808,www.aquaticcenterguate.com,"Clases de natación para bebés, niños y adultos...",natacion,
5,Waterproof Swim Academy,Guatemala - Guatemala,15 Av A y 3 Cl D Mixco Z 8,(+502) 2375 8195,,,natacion,
6,"Federacion Nacional De Natacion, Clavados, Pol...",Guatemala - Guatemala,10 Av Z-4,(+502) 2334 1075,http://www.fenadegua.gt,,natacion,
7,Asociacion Nacional De Vuelo Libre De Guatemala,Guatemala - Guatemala,10 Calle 2-28 Z 9 Casa de la Cultura,(+502) 2385 1221,www.vuelolibre.com.gt,Promover la práctica del vuelo libre como herr...,aviacion,
8,Gt Aviation,Guatemala - Guatemala,Avenida Hincapié y 18 Calle Zona 13 Hangar K-2,(+502) 2458 4307,,,aviacion,
9,"Golf Total, S.A.",Guatemala - Guatemala,Av 30 Final Z.11,(+502) 2417 0000,,,golf,


# Crossfit

In [78]:
url = 'https://www.guatemala.com/deportes/crossfit/gimnasios-box-para-hacer-crossfit-guatemala.html'

In [80]:
response = requests.get(url)
soup = BeautifulSoup(response.text,'lxml')

In [168]:
names = [soup.find_all('h3')[i].text for i in range(len(soup.find_all('h3'))-2)]
names.insert(2,'CrossFit 502')
names.insert(4,'Crossfit FD6')
names.insert(9,'CrossFit Spring City Once')
names

['Invictus Performance Training',
 'CrossFit 502',
 'CrossFit 502',
 'CrossFit FD6',
 'Crossfit FD6',
 'Sense CrossFit',
 'AMSI CrossFit',
 'Brau Athletics CrossFit',
 'CrossFit Spring City Once',
 'CrossFit Spring City Once',
 'Oaks CrossFit']

In [174]:
address = [soup.find_all('ul')[i].li.text.strip() for i in range(2,len(soup.find_all('ul'))-3)]
address

['Ubicación: 8a. avenida 6-63, Zona 14',
 'Ubicación:\xa0 Dinamia Cayalá, Zona 16 | C.C. Escala, Carretera a El Salvador',
 'Ubicación: Plaza Futeca, Zona 14 | C.C. Miraflores, Zona 11',
 'Ubicación:\xa09a. avenida 16-03, Zona 10',
 'Ubicación: 8a. avenida, 9-28, San Cristóbal, Zona 8 de Mixco',
 'Ubicación: Plaza Empresarial Muxbal, Santa Catarina Pinula',
 'Ubicación: C.C. Via Majadas, Zona 11 | Expobodegas Petapa, Zona 12',
 'Ubicación: Km. 19 Carretera a San José Pinula, San José Pinula']

In [175]:
address.insert(1,address[1].split('|')[0])
address[2] = address[2].split('|')[1]
address.insert(3,address[3].split('|')[0])
address[4] = address[4].split('|')[1]
address.insert(8,address[8].split('|')[0])
address[9] = address[9].split('|')[1]
address

['Ubicación: 8a. avenida 6-63, Zona 14',
 'Ubicación:\xa0 Dinamia Cayalá, Zona 16 ',
 ' C.C. Escala, Carretera a El Salvador',
 'Ubicación: Plaza Futeca, Zona 14 ',
 ' C.C. Miraflores, Zona 11',
 'Ubicación:\xa09a. avenida 16-03, Zona 10',
 'Ubicación: 8a. avenida, 9-28, San Cristóbal, Zona 8 de Mixco',
 'Ubicación: Plaza Empresarial Muxbal, Santa Catarina Pinula',
 'Ubicación: C.C. Via Majadas, Zona 11 ',
 ' Expobodegas Petapa, Zona 12',
 'Ubicación: Km. 19 Carretera a San José Pinula, San José Pinula']

In [176]:
phone = [soup.find_all('ul')[i].find_all('li')[1].text.strip().split('Contacto:')[1].split('Facebook')[0].strip() for i in range(2,len(soup.find_all('ul'))-3)]
phone.insert(1,phone[1].split('|')[0])
phone[2] = phone[2].split('|')[1]
phone.insert(3,phone[3].split('|')[0])
phone[4] = phone[4].split('|')[1]
phone.insert(8,phone[8].split('|')[0])
phone[9] = phone[9].split('|')[1]
phone

['2367 1239 |',
 '2368 5013 en Dinamia Cayalá ',
 ' 6637 3778 en Escala ',
 '2213 6900 en Plaza Futeca ',
 ' 2225 3200 en Miraflores ',
 '2366 5836 |',
 '2480 1592 |',
 '2441 9008 |',
 '2473 7396 en Majadas ',
 ' 2477 4632 en Petapa ',
 '6626 6576 |']

In [142]:
soup.find_all('a', href=True)[72]['href']

'https://www.facebook.com/invictusperformance/'

In [148]:
web = [soup.find_all('a', href=True)[i]['href'] for i in range(72,79)]

In [179]:
web.insert(2,'https://www.facebook.com/CF502/')
web.insert(4,'https://www.facebook.com/d6lifechangingfitness/')
web.insert(9,'https://www.facebook.com/crossfitspringcity7/')

In [180]:
description = ['crossfit' for i in range(len(names))]
category = ['crossfit' for i in range(len(names))]
city = ['Guatemala - Guatemala' for i in range(len(names))]

In [181]:
category = ['crossfit' for i in range(len(names))]

In [183]:
city = ['Guatemala - Guatemala' for i in range(len(names))]

In [189]:
crossfit = pd.DataFrame(zip(names, city, address,phone,web,description,category),
                              columns=['names','city','address','phone','web','description','category'])

In [None]:
def numero(x): #si la fecha tiene formaro de dd-mmm-aa devuelve únicamente el año
    if re.search(r"(\d)", x.lower()):
        ye = re.search(r"^\d?\d-\w\w\w-(\d\d\d?\d?)$", x.lower())
        x = ye.group(1)
        return x
    else:
        return x

In [186]:
crossfit

Unnamed: 0,names,city,address,phone,web,description,category
0,Invictus Performance Training,Guatemala - Guatemala,"Ubicación: 8a. avenida 6-63, Zona 14",2367 1239 |,https://www.facebook.com/invictusperformance/,crossfit,crossfit
1,CrossFit 502,Guatemala - Guatemala,"Ubicación: Dinamia Cayalá, Zona 16",2368 5013 en Dinamia Cayalá,https://www.facebook.com/CF502/,crossfit,crossfit
2,https://www.facebook.com/CF502/,Guatemala - Guatemala,"C.C. Escala, Carretera a El Salvador",6637 3778 en Escala,https://www.facebook.com/CF502/,crossfit,crossfit
3,CrossFit 502,Guatemala - Guatemala,"Ubicación: Plaza Futeca, Zona 14",2213 6900 en Plaza Futeca,https://www.facebook.com/d6lifechangingfitness/,crossfit,crossfit
4,https://www.facebook.com/d6lifechangingfitness/,Guatemala - Guatemala,"C.C. Miraflores, Zona 11",2225 3200 en Miraflores,https://www.facebook.com/d6lifechangingfitness/,crossfit,crossfit
5,CrossFit FD6,Guatemala - Guatemala,"Ubicación: 9a. avenida 16-03, Zona 10",2366 5836 |,https://www.facebook.com/amsicrossfit/,crossfit,crossfit
6,Crossfit FD6,Guatemala - Guatemala,"Ubicación: 8a. avenida, 9-28, San Cristóbal, Z...",2480 1592 |,https://www.facebook.com/brauathletics/,crossfit,crossfit
7,Sense CrossFit,Guatemala - Guatemala,"Ubicación: Plaza Empresarial Muxbal, Santa Cat...",2441 9008 |,https://www.facebook.com/crossfitspringcity7/,crossfit,crossfit
8,AMSI CrossFit,Guatemala - Guatemala,"Ubicación: C.C. Via Majadas, Zona 11",2473 7396 en Majadas,https://www.facebook.com/OaksCrossFit/,crossfit,crossfit
9,https://www.facebook.com/crossfitspringcity7/,Guatemala - Guatemala,"Expobodegas Petapa, Zona 12",2477 4632 en Petapa,https://www.facebook.com/crossfitspringcity7/,crossfit,crossfit


In [190]:
crossfit.phone = crossfit.phone.apply(lambda x: re.search(r"(\d+\s\d+)",x).group(1))

In [191]:
crossfit

Unnamed: 0,names,city,address,phone,web,description,category
0,Invictus Performance Training,Guatemala - Guatemala,"Ubicación: 8a. avenida 6-63, Zona 14",2367 1239,https://www.facebook.com/invictusperformance/,crossfit,crossfit
1,CrossFit 502,Guatemala - Guatemala,"Ubicación: Dinamia Cayalá, Zona 16",2368 5013,https://www.facebook.com/CF502/,crossfit,crossfit
2,https://www.facebook.com/CF502/,Guatemala - Guatemala,"C.C. Escala, Carretera a El Salvador",6637 3778,https://www.facebook.com/CF502/,crossfit,crossfit
3,CrossFit 502,Guatemala - Guatemala,"Ubicación: Plaza Futeca, Zona 14",2213 6900,https://www.facebook.com/d6lifechangingfitness/,crossfit,crossfit
4,https://www.facebook.com/d6lifechangingfitness/,Guatemala - Guatemala,"C.C. Miraflores, Zona 11",2225 3200,https://www.facebook.com/d6lifechangingfitness/,crossfit,crossfit
5,CrossFit FD6,Guatemala - Guatemala,"Ubicación: 9a. avenida 16-03, Zona 10",2366 5836,https://www.facebook.com/amsicrossfit/,crossfit,crossfit
6,Crossfit FD6,Guatemala - Guatemala,"Ubicación: 8a. avenida, 9-28, San Cristóbal, Z...",2480 1592,https://www.facebook.com/brauathletics/,crossfit,crossfit
7,Sense CrossFit,Guatemala - Guatemala,"Ubicación: Plaza Empresarial Muxbal, Santa Cat...",2441 9008,https://www.facebook.com/crossfitspringcity7/,crossfit,crossfit
8,AMSI CrossFit,Guatemala - Guatemala,"Ubicación: C.C. Via Majadas, Zona 11",2473 7396,https://www.facebook.com/OaksCrossFit/,crossfit,crossfit
9,https://www.facebook.com/crossfitspringcity7/,Guatemala - Guatemala,"Expobodegas Petapa, Zona 12",2477 4632,https://www.facebook.com/crossfitspringcity7/,crossfit,crossfit


In [192]:
url = 'https://dinamic.com.gt/forum-zona-10/'

response = requests.get(url)
soup = BeautifulSoup(response.text,'lxml')


In [195]:
names = soup.find('h2').text

In [196]:
city = ''

In [198]:
address = soup.find_all('h3')[0]

In [199]:
phone = soup.find_all('h3')[1]

In [200]:
web = 'https://dinamic.com.gt/'

In [201]:
description = 'Somos un nuevo tipo de gym, un lugar para ponerte en forma. ¡Un lugar para ser feliz!Nuestros programas de ejercicios están diseñados bajo una planificación de entrenamiento cardiovascular y de fuerza. Todo con un soporte científico que garantiza que obtendrás resultados de forma segura.'

In [202]:
category = 'Gimnasios de Educación Física'

In [207]:
def dinamic_fitness(url):
    names = [soup.find('h2').text]
    city = ['Guatemala - Guatemala']
    address = soup.find_all('h3')[0]
    phone = soup.find_all('h3')[1]
    web = ['https://dinamic.com.gt/']
    description = ['Somos un nuevo tipo de gym, un lugar para ponerte en forma. ¡Un lugar para ser feliz!Nuestros programas de ejercicios están diseñados bajo una planificación de entrenamiento cardiovascular y de fuerza. Todo con un soporte científico que garantiza que obtendrás resultados de forma segura.']
    category = ['Gimnasios de Educación Física']
    return pd.DataFrame(zip(names, city, address,phone,web,description,category),
                              columns=['names','city','address','phone','web','description','category'])

In [208]:
dinamic_fitness(url)

Unnamed: 0,names,city,address,phone,web,description,category
0,FORUM ZONA 10,Guatemala - Guatemala,3ra ave 10-80 Zona 10 Edificio Forum Zona Viva...,TEL: 2214-0707,https://dinamic.com.gt/,"Somos un nuevo tipo de gym, un lugar para pone...",Gimnasios de Educación Física


In [None]:
dinamic_fitness_url= ['https://dinamic.com.gt/naranjo/',
                      'https://dinamic.com.gt/forum-zona-10/',
                      'https://dinamic.com.gt/san-juan/',
                      'https://dinamic.com.gt/villa-hermosa/',
                      'https://dinamic.com.gt/portales/',
                      'https://dinamic.com.gt/roosevelt/']

In [210]:
len(soup.find_all('h3'))

2

In [220]:
from selenium.webdriver.common.keys import Keys


In [219]:
html = driver.find_element_by_tag_name('html')
html.send_keys(Keys.END)

'esto-es-un-test'

In [232]:
import time
scrap='https://directorio.guatemala.com/listado/guia/gimnasios/pagina/{}'.format(l)

soup = BeautifulSoup(response.text,'lxml')

In [236]:
paginas = [i for i in range(1,12)]

In [245]:
h2s=[]
for p in paginas:
    response = requests.get('https://directorio.guatemala.com/listado/guia/gimnasios/pagina/{}'.format(p))
    h2s.append(BeautifulSoup(response.text,'lxml').find_all('h2'))

In [262]:
h2s[0][0].text

'Filtra tu búsqueda'

In [265]:
directorio = []
for h2 in h2s:
    for h in h2:
        directorio.append(h.text.strip())

In [269]:
directorio = set(directorio)
directorio = list(directorio)

In [275]:
len(directorio)

101

In [277]:
from slugify import slugify
slug = [slugify(i) for i in directorio]

In [279]:
url = 'https://directorio.guatemala.com/listado/{}.html'.format(slug[0])
response = requests.get(url)
soup = BeautifulSoup(response.text,'lxml')

In [349]:
def gtcom(url):
    malos=[]
    response = requests.get(url)
    soup = BeautifulSoup(response.text,'lxml')
    business = soup.find('div',class_='content-info-business')
    try:
        names = [business.find('h1').text]
    except:
        names = [np.nan]
        malos.append(sl)
    try:
        address = [business.find('span').text]
        city = ['Guatemala - Guatemala']
    except:
        address =[np.nan]
        city=[np.nan]
    try:
        phone = [soup.find('div',class_='business-buttons').find('a',href=True)['href']]
    except:
        phone=[np.nan]
    try:
        web = [soup.find('div',class_='business-buttons').find_all('a',href=True)[2]['href']]
    except:
        web=[np.nan]
    try:
        description = [soup.find('div',class_='info-description').text.strip().split('Sobre la empresa\n')[1].strip()]
    except:
        description =[np.nan]
    try:
        category = [soup.find('div',class_='business-categories').text.strip().split('Listado en: \n\n')[1].strip()]
    except:
        category=[np.nan]
    
    return pd.DataFrame(zip(names, city, address,phone,web,description,category),
                              columns=['names','city','address','phone','web','description','category'])

In [350]:
gt = pd.DataFrame()
for sl in slug:
    bucle = gtcom('https://directorio.guatemala.com/listado/{}.html'.format(sl))
    
    gt = pd.concat([gt,bucle],ignore_index=True)
    

In [352]:
gt.head()

Unnamed: 0,names,city,address,phone,web,description,category
0,Fight Club,Guatemala - Guatemala,24 ave 47-64 zona 12 Prados de Monte Maria,tel:(5025) 4522-0530,https://directorio.guatemala.com/listing_repor...,La cualidad dual de Fight Club Guatemala le de...,Artes Marciales\n ...
1,CrossFit 502 Carr. a El Salvador,Guatemala - Guatemala,Km. 14.5 Carretera a El Salvador,tel:+(502) 6637-3778,https://directorio.guatemala.com/listing_repor...,En CrossFit 502 nos enfocamos en que nuestros ...,Crossfit
2,Neurogym Antigua,Guatemala - Guatemala,5a. Avenida Norte #17,tel:(502) 7831-5210 | 5892-2527,https://directorio.guatemala.com/listing_repor...,Neurogym Antigua ofrece un programa de estimul...,Pesas
3,Dinamic Fitness Zona Portales,Guatemala - Guatemala,3a Avenida 10-10 Zona 17,tel:+(502) 2217-1111,https://directorio.guatemala.com/listing_repor...,"Dinamic es un concepto nuevo de fitness, un lu...",Pesas
4,,,,,,,


In [354]:
gt = gt.dropna(axis=0,how='all')

In [355]:
gt.to_csv('gt_V1.csv')

In [285]:
business = soup.find('div',class_='content-info-business')

In [331]:
names = [business.find('h1').text]

In [332]:
city = ['Guatemala - Guatemala']

In [333]:
address = [business.find('span').text]

In [334]:
phone = [soup.find('div',class_='business-buttons').find('a',href=True)['href']]

In [335]:
web = [soup.find('div',class_='business-buttons').find_all('a',href=True)[2]['href']]

In [336]:
description = [soup.find('div',class_='info-description').text.strip().split('Sobre la empresa\n')[1].strip()]

In [337]:
category = [soup.find('div',class_='business-categories').text.strip().split('Listado en: \n\n')[1].strip()]

In [338]:
pd.DataFrame(zip(names, city, address,phone,web,description,category),
                              columns=['names','city','address','phone','web','description','category'])

Unnamed: 0,names,city,address,phone,web,description,category
0,Fight Club,Guatemala - Guatemala,24 ave 47-64 zona 12 Prados de Monte Maria,tel:(5025) 4522-0530,https://directorio.guatemala.com/listing_repor...,La cualidad dual de Fight Club Guatemala le de...,Artes Marciales\n ...
