In [1]:
import re
import requests
from bs4 import BeautifulSoup
import unidecode
import numpy as np
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd
from tqdm import tqdm
from tqdm import trange
import time
from slugify import slugify

# Getting Data

Each establishment will initially consist of following fields:
- id
- name
- description
- phone
- website
- hours
- category
- address
- zip code
- city
- country
- latitude
- longitude

### Yellow Pages

In [2]:
def get_yellow_info(url,category):
    
    names=[]
    city=[]
    address=[]
    phone=[]
    web=[]
    description=[]
    
    response = requests.get(url)
    
    #get soup
    soup = BeautifulSoup(response.text,'lxml')
    
    #getting main box
    general = soup.find_all('div', itemtype="https://schema.org/LocalBusiness")

    
    if len(general)>0:
        for i in range(len(general)):

            #NAMES
            try:
                names.append(general[i].find('a', class_='companyName').text.split('\n')[1].strip())
            except:
                names.append(np.nan)

           #CITY
            try:
                city.append(general[i].find('span', class_='city').text.strip())
            except:
                city.append(np.nan)

            #Address
            try:
                address.append(general[i].find('span', class_='directionFig').text.split('\n')[1].strip())
            except:
                address.append(np.nan)


            #phone
            try:
                phone.append(general[i].find('span', class_='phoneFig hide').text.strip())
            except:
                phone.append(np.nan)

            #web
            try:
                web.append(general[i].find('a', class_='webLink').text.strip())

            except:
                web.append(np.nan)

            #description
            try:
                description.append(general[i].find('div', class_='col-sm-12 infoBox').text.strip())
            except:
                description.append(np.nan)
                
        
        #creating dataframe
        df = pd.DataFrame(zip(names, city, address,phone,web,description),
                              columns=['names','city','address','phone','web','description'])
        
        df['category'] = category
        
        return df
                
            
    else:
        return print('no se pudo, response:', response.status_code)

In [3]:
lista_url = [
        ['https://www.paginasamarillas.com.gt/servicios/gimnasios-de-educacion-fisica?page=1',
         'Gimnasios de Educación Física'],
    
        ['https://www.paginasamarillas.com.gt/servicios/gimnasios-de-educacion-fisica?page=2',
         'Gimnasios de Educación Física'],
    
        ['https://www.paginasamarillas.com.gt/servicios/gimnasios-de-educacion-fisica?page=3',
         'Gimnasios de Educación Física'],
    
        ['https://www.paginasamarillas.com.gt/servicios/federaciones-deportivas',
         'federaciones deportivas'],
    
        ['https://www.paginasamarillas.com.gt/servicios/federaciones',
         'federaciones deportivas'],
    
        ['https://www.paginasamarillas.com.gt/servicios/academias-de-ballet', 
         'ballet'],
    
        ['https://www.paginasamarillas.com.gt/servicios/bolos',
         'boliche'],
    
        ['https://www.paginasamarillas.com.gt/servicios/artes-marciales-ensenanza-de',
         'artes marciales'],
    
        ['https://www.paginasamarillas.com.gt/servicios/deportes?page=1',
         'deportes'],
    
        ['https://www.paginasamarillas.com.gt/servicios/deportes?page=2',
         'deportes'],
    
        ['https://www.paginasamarillas.com.gt/servicios/deportes?page=3',
         'deportes'],
    
        ['https://www.paginasamarillas.com.gt/servicios/deportes?page=4',
         'deportes'],
    
        ['https://www.paginasamarillas.com.gt/servicios/entrenamiento-fisico',
         'entrenamiento fisico'],
        
        ['https://www.paginasamarillas.com.gt/servicios/escuelas-de-artes-marciales',
         'artes marciales'],

        ['https://www.paginasamarillas.com.gt/servicios/escuelas-de-karate',
         'artes marciales'],

        ['https://www.paginasamarillas.com.gt/servicios/escuelas-de-natacion',
         'natacion'],

        ['https://www.paginasamarillas.com.gt/servicios/escuelas-de-vuelo-libre',
         'aviacion'],

        ['https://www.paginasamarillas.com.gt/servicios/escuelas-de-aviacion',
         'aviacion'],

        ['https://www.paginasamarillas.com.gt/servicios/golf',
         'golf'],

        ['https://www.paginasamarillas.com.gt/servicios/hipodromos',
         'hipodromos']
        ]

In [4]:
yellow = pd.DataFrame()
for url in lista_url:
    bucle = get_yellow_info(url[0],url[1])
    yellow = pd.concat([yellow,bucle],ignore_index=True)

In [5]:
yellow.head()

Unnamed: 0,names,city,address,phone,web,description,category
0,Full Contact Studio,Guatemala - Guatemala,2 C 10-06 Z-1,(+502) 2232 6575,,,Gimnasios de Educación Física
1,Club Atletico Atenas,Guatemala - Guatemala,Diagonal 14. 22-06 Zona 5,(+502) 2335 3680,http://www.clubatenas.com,,Gimnasios de Educación Física
2,Le Petit Spa,Guatemala - Guatemala,Dg 6 14-86 Z 10,(+502) 2366 7508,http://www.lepetitspagt.com,,Gimnasios de Educación Física
3,Madiel Spa,Guatemala - Guatemala,Av las Americas 7-30 Z 13 Edif Real América L ...,(+502) 2360 5070,http://www.madielspa.es.tl,,Gimnasios de Educación Física
4,Armonia Colonial,Guatemala - Guatemala,Boulevard Principal 23 Calle 18-96 Col. Villa ...,(+502) 2448 2553,,,Gimnasios de Educación Física


### Dance Academys

In [6]:
url = 'https://www.guatemala.com/guias/pasatiempos/academias-de-baile-en-guatemala/scenic-dance-guatemala.html'

In [7]:
def baile(url,category,web):
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'lxml')
    names = [soup.find('h1').text]
    city = ['Guatemala - Guatemala']
    address = [soup.find('p',class_='address').text]
    phone = [soup.find('div',class_='schedule-box span_mobile12 span_tablet3 span3').text.strip().split('\n')[1]]
    description = [soup.find('div',class_='single-content').p.text.strip()]
    category = [category]
    web = [web]
    return pd.DataFrame(zip(names, city, address,phone,web,description,category),
                              columns=['names','city','address','phone','web','description','category'])
    
    

In [8]:
lista_url = [
        ['https://www.guatemala.com/guias/pasatiempos/academias-de-baile-en-guatemala/scenic-dance-guatemala.html',
         'Baile','https://es-la.facebook.com/ScenicDance/'],
    
        ['https://www.guatemala.com/guias/pasatiempos/academias-de-baile-en-guatemala/salsa-latin-guatemala.html',
        'Baile','https://www.facebook.com/Salsa-Latin-Guatemala-178268719078/'],
        
        ['https://www.guatemala.com/guias/pasatiempos/academias-de-baile-en-guatemala/academia-dance-art.html',
         'Baile','https://es-la.facebook.com/Danceartguatemala/'],
    
        ['https://www.guatemala.com/guias/pasatiempos/academias-de-baile-en-guatemala/academia-bellydance-rashida.html',
        'Baile', 'https://es-la.facebook.com/Rashidaacademy/'],
        
        ['https://www.guatemala.com/guias/pasatiempos/academias-de-baile-en-guatemala/addiction-dance-studio.html',
        'Baile','https://es-la.facebook.com/adsgt/'],
    
        ['https://www.guatemala.com/guias/pasatiempos/academias-de-baile-en-guatemala/escuela-cubana-de-baile-.html',
        'Baile','https://es-la.facebook.com/Escuela-Cubana-de-Baile-110562958135/'],
    
        ['https://www.guatemala.com/guias/pasatiempos/academias-de-baile-en-guatemala/in-motion-dance-fitness-guatemala.html',
        'Baile','http://www.inmotiongt.com/'],
    
        ['https://www.guatemala.com/guias/pasatiempos/academias-de-baile-en-guatemala/academia-de-ballet-danzarte.html',
        'Baile','https://es-la.facebook.com/Danzarte.ballet/'],
    
        ['https://www.guatemala.com/guias/pasatiempos/academias-de-baile-en-guatemala/ritmo-y-sabor-guatemala.html',
        'Baile','https://es-la.facebook.com/ritmoysaborgt/'],
    
        ['https://www.guatemala.com/guias/pasatiempos/academias-de-baile-en-guatemala/escuela-municipal-de-danza-clasica.html',
        'Baile','https://es-la.facebook.com/EMDCgt/'],
    
        ['https://www.guatemala.com/guias/pasatiempos/academias-de-baile-en-guatemala/heroes-academy-guatemala-.html',
        'Baile','https://es-la.facebook.com/HeroesCompanyGT/'],
    
        ['https://www.guatemala.com/guias/pasatiempos/academias-de-baile-en-guatemala/unidanza-guatemala-.html',
        'Baile','http://unidanza.net/'],
    
        ['https://www.guatemala.com/guias/pasatiempos/academias-de-baile-en-guatemala/unlimited-dance-academy.html',
        'Baile','https://unlimited-dance.com.gt/'],
    
        ['https://www.guatemala.com/guias/pasatiempos/academias-de-baile-en-guatemala/the-dance-factory.html',
        'Baile','https://es-la.facebook.com/DanceFactoryGT/']
    

        ]

In [9]:
df2=pd.DataFrame()
for url in lista_url:
    bucle = baile(url[0],url[1],url[2])
    df2 = pd.concat([df2,bucle],ignore_index=True)

In [10]:
df2.head()

Unnamed: 0,names,city,address,phone,web,description,category
0,Scenic Dance Guatemala,Guatemala - Guatemala,"3a. avenida, 14-19 zona 14, Ciudad de Guatemal...",2368-0858 | 2478-4688,https://es-la.facebook.com/ScenicDance/,¿Listo para aprender nuevos pasos? Scenic Danc...,Baile
1,Salsa Latin Guatemala,Guatemala - Guatemala,"18 calle 15-51, Zona 13 C.C. Paseo Real, Ciuda...",4217-2567 | 4008-6112,https://www.facebook.com/Salsa-Latin-Guatemala...,¡Atención amantes del baile! Salsa Latin Guate...,Baile
2,Academia Dance Art,Guatemala - Guatemala,"Bulevar Vista Hermosa 25-80, zona 15 Vista Her...",3093-8434,https://es-la.facebook.com/Danceartguatemala/,La Academia Dance Art ofrece profesionales exp...,Baile
3,Academia BellyDance Rashida,Guatemala - Guatemala,"Plaza Las Cañas, Local 201, zona 10, Ciudad de...",2337-0980,https://es-la.facebook.com/Rashidaacademy/,Esta academia de baile es ideal para las perso...,Baile
4,Addiction Dance Studio,Guatemala - Guatemala,"Galerías Primma Locales 203 y 204, Ciudad de G...",5466-3773,https://es-la.facebook.com/adsgt/,¿Con ganas de bailar? Addiction Dance Studio e...,Baile


### Crossfit

In [11]:
url = 'https://www.guatemala.com/deportes/crossfit/gimnasios-box-para-hacer-crossfit-guatemala.html'

In [12]:
response = requests.get(url)
soup = BeautifulSoup(response.text,'lxml')

In [13]:
names = [soup.find_all('h3')[i].text for i in range(len(soup.find_all('h3'))-2)]
names.insert(2,'CrossFit 502')
names.insert(4,'Crossfit FD6')
names.insert(9,'CrossFit Spring City Once')

In [14]:
address = [soup.find_all('ul')[i].li.text.strip() for i in range(2,len(soup.find_all('ul'))-3)]

In [15]:
address.insert(1,address[1].split('|')[0])
address[2] = address[2].split('|')[1]
address.insert(3,address[3].split('|')[0])
address[4] = address[4].split('|')[1]
address.insert(8,address[8].split('|')[0])
address[9] = address[9].split('|')[1]

In [16]:
phone = [soup.find_all('ul')[i].find_all('li')[1].text.strip().split('Contacto:')[1].split('Facebook')[0].strip() for i in range(2,len(soup.find_all('ul'))-3)]
phone.insert(1,phone[1].split('|')[0])
phone[2] = phone[2].split('|')[1]
phone.insert(3,phone[3].split('|')[0])
phone[4] = phone[4].split('|')[1]
phone.insert(8,phone[8].split('|')[0])
phone[9] = phone[9].split('|')[1]

In [17]:
soup.find_all('a', href=True)[72]['href']

'https://www.facebook.com/invictusperformance/'

In [18]:
web = [soup.find_all('a', href=True)[i]['href'] for i in range(72,79)]

In [19]:
web.insert(2,'https://www.facebook.com/CF502/')
web.insert(4,'https://www.facebook.com/d6lifechangingfitness/')
web.insert(9,'https://www.facebook.com/crossfitspringcity7/')

In [20]:
description = ['crossfit' for i in range(len(names))]
category = ['crossfit' for i in range(len(names))]
city = ['Guatemala - Guatemala' for i in range(len(names))]

In [21]:
category = ['crossfit' for i in range(len(names))]

In [22]:
city = ['Guatemala - Guatemala' for i in range(len(names))]

In [23]:
crossfit = pd.DataFrame(zip(names, city, address,phone,web,description,category),
                              columns=['names','city','address','phone','web','description','category'])

In [24]:
def numero(x): #si la fecha tiene formaro de dd-mmm-aa devuelve únicamente el año
    if re.search(r"(\d)", x.lower()):
        ye = re.search(r"^\d?\d-\w\w\w-(\d\d\d?\d?)$", x.lower())
        x = ye.group(1)
        return x
    else:
        return x

In [25]:
crossfit.phone = crossfit.phone.apply(lambda x: re.search(r"(\d+\s\d+)",x).group(1))

In [26]:
crossfit.head()

Unnamed: 0,names,city,address,phone,web,description,category
0,Invictus Performance Training,Guatemala - Guatemala,"Ubicación: 8a. avenida 6-63, Zona 14",2367 1239,https://www.facebook.com/invictusperformance/,crossfit,crossfit
1,CrossFit 502,Guatemala - Guatemala,"Ubicación: Dinamia Cayalá, Zona 16",2368 5013,https://www.facebook.com/CF502/,crossfit,crossfit
2,CrossFit 502,Guatemala - Guatemala,"C.C. Escala, Carretera a El Salvador",6637 3778,https://www.facebook.com/CF502/,crossfit,crossfit
3,CrossFit FD6,Guatemala - Guatemala,"Ubicación: Plaza Futeca, Zona 14",2213 6900,https://www.facebook.com/d6lifechangingfitness/,crossfit,crossfit
4,Crossfit FD6,Guatemala - Guatemala,"C.C. Miraflores, Zona 11",2225 3200,https://www.facebook.com/d6lifechangingfitness/,crossfit,crossfit


### Futeca

In [27]:
url = 'https://futecagym.com/sedes/'
response = requests.get(url).text

In [28]:
soup = BeautifulSoup(response,'lxml')

In [29]:
general = soup.find_all('div', class_='column_attr clearfix')

In [30]:
names= ['Futeca Gym - '+general[i].find_all('h4')[0].text for i in range(0,len(general),2) ]

In [31]:
address = [general[i].find_all('p')[0].text.split('Dirección')[1].strip() for i in range(0,len(general),2) ]

In [32]:
phone = [general[i].find_all('p')[1].text for i in range(0,len(general),2) ]

In [33]:
df = pd.DataFrame(zip(names,address,phone),columns=['names','address','phone'])

In [34]:
df['city'] = 'Guatemala - Guatemala'
df['category'] = 'Gimnasios de Educación Física'
df['web'] = 'https://futecagym.com/'
df['description'] = '''Futeca Gym, es un gimnasio que nace en el 2010. Enfocado en mejorar la calidad de vida de sus socios, a través del ejercicio, una buena alimentación y la creación de comunidades estrechas entre socios.'''

In [35]:
futeca = df[['names','city','address','phone','web','description','category']]

In [36]:
futeca.head()

Unnamed: 0,names,city,address,phone,web,description,category
0,Futeca Gym - Zona 14,Guatemala - Guatemala,"10ma. Avenida 10-50 zona 14. Plaza Futeca, Loc...",Teléfono: +502 22136900,https://futecagym.com/,"Futeca Gym, es un gimnasio que nace en el 2010...",Gimnasios de Educación Física
1,Futeca Gym - Pradera Concepción,Guatemala - Guatemala,"Km. 15.5 Carretera a El Salvador, Finca Concep...",Teléfono: +502 66339866,https://futecagym.com/,"Futeca Gym, es un gimnasio que nace en el 2010...",Gimnasios de Educación Física
2,Futeca Gym - Miraflores,Guatemala - Guatemala,"22 avenida, 7ma. calle zona 11, Paseo Miraflores",Teléfono: +502 22253200,https://futecagym.com/,"Futeca Gym, es un gimnasio que nace en el 2010...",Gimnasios de Educación Física
3,Futeca Gym - San Cristóbal,Guatemala - Guatemala,Blvd. Principal 19-65 sector B1 San Cristóbal ...,Teléfono +502 24796470,https://futecagym.com/,"Futeca Gym, es un gimnasio que nace en el 2010...",Gimnasios de Educación Física
4,Futeca Gym - Naranjo,Guatemala - Guatemala,"23 calle 10-00 Zona 4 Condado Naranjo , local ...",Whatsapp: +502 410001310,https://futecagym.com/,"Futeca Gym, es un gimnasio que nace en el 2010...",Gimnasios de Educación Física


### Fitness One

In [37]:
url = 'https://fitnessone.com.gt/pradera/'
response = requests.get(url).text
soup = BeautifulSoup(response,'lxml')

In [38]:
general = soup.find_all('h1')

In [39]:
locations =['cayala','pradera','rus','sankrismall']
names=[]
city=[]
address =[]
phone =[]
for location in locations:
    url = 'https://fitnessone.com.gt/{}/'.format(location)
    response = requests.get(url).text
    soup = BeautifulSoup(response,'lxml')
    general = soup.find_all('h1')
    names.append('Fitness One - '+general[1].text)
    city.append('Guatemala-Guatemala')
    address.append(soup.find_all('div',class_='et_pb_text_inner')[4].find_all('p')[0].text)
    phone.append(soup.find_all('div',class_='et_pb_text_inner')[4].find_all('p')[1].text)

In [40]:
df = pd.DataFrame(zip(names,city,address,phone),columns=['names','city','address','phone'])

In [41]:
df['city'] = 'Guatemala - Guatemala'
df['category'] = 'Gimnasios de Educación Física'
df['web'] = 'https://fitnessone.com.gt/'
df['description'] = 'Todo lo que necesitas para transformar tu vida y tu cuerpo, las mejores ubicaciones, personal capacitado para hacer del ejercicio parte de tu vida.'
fitnessone = df[['names','city','address','phone','web','description','category']]

In [42]:
fitnessone.head()

Unnamed: 0,names,city,address,phone,web,description,category
0,Fitness One - Cayalá,Guatemala - Guatemala,Diagonal 35 Boulevard Austriaco 16-25 Zona 16 ...,+502 2491-4333,https://fitnessone.com.gt/,Todo lo que necesitas para transformar tu vida...,Gimnasios de Educación Física
1,Fitness One - Pradera,Guatemala - Guatemala,Boulevar Los Próceres 25-74 Zona 10,+502 2423-6000,https://fitnessone.com.gt/,Todo lo que necesitas para transformar tu vida...,Gimnasios de Educación Física
2,Fitness One - RUS,Guatemala - Guatemala,Calzada Roosevelt 12-76 zona 7 C.C. RUS,+502 2226-2226,https://fitnessone.com.gt/,Todo lo que necesitas para transformar tu vida...,Gimnasios de Educación Física
3,Fitness One - Sankris Mall,Guatemala - Guatemala,3ra. Calle Sector A-3 Boulevard San Cristóbal ...,+502 2424-4848,https://fitnessone.com.gt/,Todo lo que necesitas para transformar tu vida...,Gimnasios de Educación Física


### Scandinavia

In [43]:
url = 'https://www.scandinaviagym.com/#gimnasios'
#response = requests.get(url).text

In [44]:
driver = webdriver.Chrome()

driver.get(url)

resp = driver.execute_script("return document.documentElement.outerHTML")

driver.quit()

soup = BeautifulSoup(resp,'lxml')

In [45]:
general = soup.find_all('h3')

In [46]:
names = ['Scandinavia - '+name.text for name in general]

In [47]:
address = [ad.h4.text for ad in soup.find_all('div',class_='team-detail')]

In [48]:
phone= [phone.text for phone in soup.find_all('b')]

In [49]:
phone = [tel for tel in phone if tel!='']

In [50]:
phone[0] = 'Teléfonos: 2459-2586 y 2377-3013'
phone[3] = 'Teléfonos: 2269 5605'
phone[5] = 'Teléfonos: 3502 1704'
phone[6] = 'Teléfonos: 7832 6763'
phone[8] = 'Teléfonos: 2478 1566'
phone[9] = 'Teléfonos: 2212 7513'
phone[10] = 'Teléfonos: 2474 2815'
phone[12] = 'Teléfonos: 2335 7683'
phone[-3] = 'Teléfonos: 2211 0046'

In [51]:
city = ['Guatemala - Guatemala' for i in range(len(phone))]

In [52]:
df = pd.DataFrame(zip(names,city,address,phone),columns=['names','city','address','phone'])

In [53]:
df['category'] = 'Gimnasios de Educación Física'
df['city'] = 'Guatemala - Guatemala'
df['web'] = 'https://www.scandinaviagym.com/'
df['description'] = 'Cadena de 14 gimnasios. Siempre Cerca de Ti. Amplios Horarios. En las principales ciudades de Guatemala.'
scandinavia = df[['names','city','address','phone','web','description','category']]

In [54]:
scandinavia.head()

Unnamed: 0,names,city,address,phone,web,description,category
0,Scandinavia - Aguilar Batres,Guatemala - Guatemala,"Calzada Aguilar Batres14-07, Zona 12",Teléfonos: 2459-2586 y 2377-3013,https://www.scandinaviagym.com/,Cadena de 14 gimnasios. Siempre Cerca de Ti. A...,Gimnasios de Educación Física
1,Scandinavia - Vista Hermosa,Guatemala - Guatemala,Boulevard Vista Hermosa 21-95 zona 15,Teléfonos: 2277-9296 y 2369-1018,https://www.scandinaviagym.com/,Cadena de 14 gimnasios. Siempre Cerca de Ti. A...,Gimnasios de Educación Física
2,Scandinavia - Carretera a El Salvador,Guatemala - Guatemala,"Km. 16.5, Carretera a El Salvador Salida a San...",Teléfonos: 2277-9297 y 6634-0633,https://www.scandinaviagym.com/,Cadena de 14 gimnasios. Siempre Cerca de Ti. A...,Gimnasios de Educación Física
3,Scandinavia - Metronorte,Guatemala - Guatemala,Centro Comercial Metronorte,Teléfonos: 2269 5605,https://www.scandinaviagym.com/,Cadena de 14 gimnasios. Siempre Cerca de Ti. A...,Gimnasios de Educación Física
4,Scandinavia - San Rafael,Guatemala - Guatemala,"km. 7.5 Carretera a El Atlántico zona 18, c.c....",Teléfonos: 2267 2919,https://www.scandinaviagym.com/,Cadena de 14 gimnasios. Siempre Cerca de Ti. A...,Gimnasios de Educación Física


### Smart Fit

In [55]:
url = 'https://www.smartfit.com.gt/gimnasios'
response = requests.get(url).text
soup = BeautifulSoup(response,'lxml')

In [56]:
names = ['Smart Fit - '+soup.find_all('h3')[name].text for name in range(1,len(soup.find_all('h3'))-1)]

In [57]:
dires = soup.find_all('p',class_='Text')
address = [dires[add].text.split('-',1)[1] for add in range(len(dires))]

In [58]:
phone = [np.nan for i in range(len(address))]

In [59]:
city = ['Guatemala - Guatemala' for i in range(len(phone))]

In [60]:
df = pd.DataFrame(zip(names,city,address,phone),columns=['names','city','address','phone'])

In [61]:
df['category'] = 'Gimnasios de Educación Física'
df['city'] = 'Guatemala - Guatemala'
df['web'] = 'https://www.smartfit.com.gt/'
df['description'] = 'Inaugurada en 2009, Smart Fit fue creada con el propósito de democratizar el acceso a la práctica de actividad física de alto nivel, con planes accesibles y adhesión facilitada.'
smartfit = df[['names','city','address','phone','web','description','category']]

In [62]:
smartfit.head()

Unnamed: 0,names,city,address,phone,web,description,category
0,Smart Fit - El Naranjo,Guatemala - Guatemala,"Plaza Kalú, locales R2-R5 - 23 calle 14-50 Zo...",,https://www.smartfit.com.gt/,"Inaugurada en 2009, Smart Fit fue creada con e...",Gimnasios de Educación Física
1,Smart Fit - Tikal Futura,Guatemala - Guatemala,Calzada Roosevelt 23-43 - Ciudad de Guatemal...,,https://www.smartfit.com.gt/,"Inaugurada en 2009, Smart Fit fue creada con e...",Gimnasios de Educación Física
2,Smart Fit - QUO Zona 4,Guatemala - Guatemala,"Via 7, 4-20, Zona 4, Torre Q, Segundo Nivel, ...",,https://www.smartfit.com.gt/,"Inaugurada en 2009, Smart Fit fue creada con e...",Gimnasios de Educación Física
3,Smart Fit - Los Próceres,Guatemala - Guatemala,"3ª Avenida 16-52, Z.10 - Centro Comercial L...",,https://www.smartfit.com.gt/,"Inaugurada en 2009, Smart Fit fue creada con e...",Gimnasios de Educación Física
4,Smart Fit - Rambla,Guatemala - Guatemala,Centro Comercial Rambla 10 - Boulevar Los Pró...,,https://www.smartfit.com.gt/,"Inaugurada en 2009, Smart Fit fue creada con e...",Gimnasios de Educación Física


### Poltec 

In [63]:
url = 'http://www.poltecgym.com/'
#response = requests.get(url).text
#soup = BeautifulSoup(response,'lxml')

In [64]:
driver = webdriver.Chrome()

driver.get(url)

resp = driver.execute_script("return document.documentElement.outerHTML")

driver.quit()

soup = BeautifulSoup(resp,'lxml')

In [65]:
nombres = soup.find_all('h3')

names = ['Poltec - '+nombres[i].text.strip() for i in range(4,len(nombres)) ]

In [66]:
tel = soup.find_all('address')
phone = [tel[i].text.strip().split('T.')[1].split('\n')[0] for i in range(1,len(tel)-1) ]

In [67]:
address = [i for i in range(6)]
address[0]='Km. 5 Carretera al Atlántico Zona 17, Guatemala Guatemala'
address[1]='Ruta al Atlántico CC. Plaza San Rafael Zona 18 Guatemala'
address[2]='45 Calle 19-40, zona 12, C. C. Gran Portal Petapa '
address[3]='Calzada Roosevelt 13-88'
address[4]='Km. 17.5 Pacifico, Terreno #6, C.C. Santa Clara, Villa Nueva., Guatemala 01011, Guatemala'
address[5]='Dirección: Km. 17.9 Carretera a San Jose Pinula, Centro Comercial Local 14, El Faro'

In [68]:
df = pd.DataFrame(zip(names,city,address,phone),columns=['names','city','address','phone'])

In [69]:
df['category'] = 'Gimnasios de Educación Física'
df['city'] = 'Guatemala - Guatemala'
df['web'] = 'http://www.poltecgym.com/'
df['description'] = 'El ejercicio es parte importante de un estilo de vida saludable. El ejercicio previene los problemas de salud, desarrolla resistencia, brinda más energía y puede ayudar a reducir el estrés.'
poltec = df[['names','city','address','phone','web','description','category']]


In [70]:
poltec.head()

Unnamed: 0,names,city,address,phone,web,description,category
0,Poltec - CC METRONORTE,Guatemala - Guatemala,"Km. 5 Carretera al Atlántico Zona 17, Guatemal...",2256-0882,http://www.poltecgym.com/,El ejercicio es parte importante de un estilo ...,Gimnasios de Educación Física
1,Poltec - CC PLAZA SAN RAFAEL,Guatemala - Guatemala,Ruta al Atlántico CC. Plaza San Rafael Zona 18...,2261-6391,http://www.poltecgym.com/,El ejercicio es parte importante de un estilo ...,Gimnasios de Educación Física
2,Poltec - CC GRAN PORTAL PETAPA,Guatemala - Guatemala,"45 Calle 19-40, zona 12, C. C. Gran Portal Pet...",2460-3185,http://www.poltecgym.com/,El ejercicio es parte importante de un estilo ...,Gimnasios de Educación Física
3,Poltec - CC ASIA MALL (ROOSEVELT),Guatemala - Guatemala,Calzada Roosevelt 13-88,24743214,http://www.poltecgym.com/,El ejercicio es parte importante de un estilo ...,Gimnasios de Educación Física
4,Poltec - CC SANTA CLARA,Guatemala - Guatemala,"Km. 17.5 Pacifico, Terreno #6, C.C. Santa Clar...",6644-8279,http://www.poltecgym.com/,El ejercicio es parte importante de un estilo ...,Gimnasios de Educación Física


### Curves

In [71]:
url = 'https://www.curveslatinoamerica.com/guatemala.html#'
driver = webdriver.Chrome()

driver.get(url)

resp = driver.execute_script("return document.documentElement.outerHTML")

driver.quit()

soup = BeautifulSoup(resp,'lxml')

In [72]:
general = soup.find_all('div', class_='col-xs-12 col-sm-12 col-md-6 col-lg-6')

In [73]:
names = ['Curves - '+general[i].find('p').text.strip() for i in range(len(general)) ]

In [74]:
address = [general[i].find('small').text.strip().split('\n')[0] for i in range(len(general)) ]

In [75]:
phone = [general[i].find('small').text.strip().split('\n')[1].strip() for i in range(len(general)) ]

In [76]:
city = ['Guatemala - Guatemala', 'Quetzaltenango']

In [77]:
df = pd.DataFrame(zip(names,city,address,phone),columns=['names','city','address','phone'])

In [78]:
df['category'] = 'Gimnasios de Educación Física'
df['city'] = 'Guatemala - Guatemala'
df['web'] = 'https://www.curveslatinoamerica.com/guatemala.html#'
df['description'] = 'Curves es un gimnasio para mujeres, la franquicia ofrece un programa de perdida de peso saludable y razonable.'
curves = df[['names','city','address','phone','web','description','category']]

In [79]:
curves

Unnamed: 0,names,city,address,phone,web,description,category
0,Curves - Reforma Zona 15,Guatemala - Guatemala,1a calle 22-13 zona 15 Vista Hermosa II C.P. 1...,Teléfonos: (502) 2369 2622 | (502) 2369 2644 |...,https://www.curveslatinoamerica.com/guatemala....,"Curves es un gimnasio para mujeres, la franqui...",Gimnasios de Educación Física
1,Curves - Quetzaltenango,Guatemala - Guatemala,"Avenida 9-66 Zona 3, Plaza Delco C.P. 9001",Teléfono: (502) 7763 6831,https://www.curveslatinoamerica.com/guatemala....,"Curves es un gimnasio para mujeres, la franqui...",Gimnasios de Educación Física


### Dinamic Fitness

In [80]:
dinamic_fitness_url= ['https://dinamic.com.gt/naranjo/',
                      'https://dinamic.com.gt/forum-zona-10/',
                      'https://dinamic.com.gt/san-juan/',
                      'https://dinamic.com.gt/villa-hermosa/',
                      'https://dinamic.com.gt/portales/',
                      'https://dinamic.com.gt/roosevelt/']

In [81]:
def dinamic_fitness(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.text,'lxml')
    names = ['Dinamic Fitness - ' + soup.find('h2').text]
    city = ['Guatemala - Guatemala']
    
    if len(soup.find_all('h3'))<=2:
        address = soup.find_all('h3')[0]
        phone = soup.find_all('h3')[1]
    elif len(soup.find_all('h3'))==4:
        address = soup.find_all('h3')[0]
        phone = soup.find_all('h3')[3]
    else:
        address = soup.find_all('h3')[0]
        phone = soup.find_all('h3')[2]
        
    web = ['https://dinamic.com.gt/']
    description = ['Somos un nuevo tipo de gym, un lugar para ponerte en forma. ¡Un lugar para ser feliz!Nuestros programas de ejercicios están diseñados bajo una planificación de entrenamiento cardiovascular y de fuerza. Todo con un soporte científico que garantiza que obtendrás resultados de forma segura.']
    category = ['Gimnasios de Educación Física']
    return pd.DataFrame(zip(names, city, address,phone,web,description,category),
                              columns=['names','city','address','phone','web','description','category'])

In [82]:
dinamicfitness=pd.DataFrame()
for url in dinamic_fitness_url:
    bucle = dinamic_fitness(url)
    dinamicfitness = pd.concat([dinamicfitness,bucle],ignore_index=True)

In [83]:
dinamicfitness.phone[0] = np.nan

In [84]:
dinamicfitness.head()

Unnamed: 0,names,city,address,phone,web,description,category
0,Dinamic Fitness - PASAJE NARANJO,Guatemala - Guatemala,10 avenida 18-58 zona 4 de mixco Condado Naran...,,https://dinamic.com.gt/,"Somos un nuevo tipo de gym, un lugar para pone...",Gimnasios de Educación Física
1,Dinamic Fitness - FORUM ZONA 10,Guatemala - Guatemala,3ra ave 10-80 Zona 10 Edificio Forum Zona Viva...,TEL: 2214-0707,https://dinamic.com.gt/,"Somos un nuevo tipo de gym, un lugar para pone...",Gimnasios de Educación Física
2,Dinamic Fitness - CENTRO 21,Guatemala - Guatemala,[CALZADA SAN JUAN 21-14 ],[TEL: 2243-0003],https://dinamic.com.gt/,"Somos un nuevo tipo de gym, un lugar para pone...",Gimnasios de Educación Física
3,Dinamic Fitness - VILLA HERMOSA,Guatemala - Guatemala,CENTRO COMERCIAL PACIFIC VILLA HERMOSA,TEL: 2214-0505,https://dinamic.com.gt/,"Somos un nuevo tipo de gym, un lugar para pone...",Gimnasios de Educación Física
4,Dinamic Fitness - ZONA PORTALES,Guatemala - Guatemala,3ra. Av. 10-10 Zona 17 km. 4.5 Ruta al Atlanti...,TEL: 2217-1111,https://dinamic.com.gt/,"Somos un nuevo tipo de gym, un lugar para pone...",Gimnasios de Educación Física


### GT 

In [86]:
paginas = [i for i in range(1,12)]

In [87]:
h2s=[]
for p in paginas:
    response = requests.get('https://directorio.guatemala.com/listado/guia/gimnasios/pagina/{}'.format(p))
    h2s.append(BeautifulSoup(response.text,'lxml').find_all('h2'))

In [88]:
directorio = []
for h2 in h2s:
    for h in h2:
        directorio.append(h.text.strip())

In [89]:
directorio = set(directorio)
directorio = list(directorio)

In [90]:
slug = [slugify(i) for i in directorio]

In [91]:
url = 'https://directorio.guatemala.com/listado/{}.html'.format(slug[0])
response = requests.get(url)
soup = BeautifulSoup(response.text,'lxml')

In [92]:
def gtcom(url):
    malos=[]
    response = requests.get(url)
    soup = BeautifulSoup(response.text,'lxml')
    business = soup.find('div',class_='content-info-business')
    try:
        names = [business.find('h1').text]
    except:
        names = [np.nan]
        malos.append(sl)
    try:
        address = [business.find('span').text]
        city = ['Guatemala - Guatemala']
    except:
        address =[np.nan]
        city=[np.nan]
    try:
        phone = [soup.find('div',class_='business-buttons').find('a',href=True)['href']]
    except:
        phone=[np.nan]
    try:
        web = [soup.find('div',class_='business-buttons').find_all('a',href=True)[2]['href']]
    except:
        web=[np.nan]
    try:
        description = [soup.find('div',class_='info-description').text.strip().split('Sobre la empresa\n')[1].strip()]
    except:
        description =[np.nan]
    try:
        category = [soup.find('div',class_='business-categories').text.strip().split('Listado en: \n\n')[1].strip()]
    except:
        category=[np.nan]
    
    return pd.DataFrame(zip(names, city, address,phone,web,description,category),
                              columns=['names','city','address','phone','web','description','category'])

In [102]:
gt = pd.DataFrame()
with tqdm(total=100) as pbar:
    for sl in slug:
        bucle = gtcom('https://directorio.guatemala.com/listado/{}.html'.format(sl))

        gt = pd.concat([gt,bucle],ignore_index=True)
        pbar.update(100/len(slug))


  full_bar = Bar(
100%|██████████| 100.00000000000001/100 [00:54<00:00,  1.82it/s]


In [94]:
gt.head()

Unnamed: 0,names,city,address,phone,web,description,category
0,World Gym Pradera,Guatemala - Guatemala,Blvd. Los Próceres 25-74 Zona 10,tel:+(502) 2423-6000,https://directorio.guatemala.com/listing_repor...,En World Gym contamos con un centro de orienta...,Pesas
1,Akasha Yoga Centro Holístico,Guatemala - Guatemala,8a. Calle 28-00 Zona 11,tel:+(502) 2473-7844,https://directorio.guatemala.com/listing_repor...,"Akasha Yoga Centro Holístico, abre las puertas...",Yoga\n \n\n...
2,Scandinavia Gym Vista Hermosa,Guatemala - Guatemala,"Blvd. Vista Hermosa, 21-95 Zona 15",tel:+(502) 2369-1018,https://directorio.guatemala.com/listing_repor...,Scandinavia Gym inicó en 1992 atentiendo a la ...,Pesas\n \n\...
3,Zardoz Fitness,Guatemala - Guatemala,16 Calle final 41-85 Zona 5,tel:+(502) 2336-0091,https://directorio.guatemala.com/listing_repor...,Zardoz Fitness es un centro de entrenamiento e...,Pesas
4,,,,,,,


In [103]:
gymy = pd.read_csv('../../gymy_V3 - gymy_V3.csv')
gymy.head(1)

Unnamed: 0,id,names,address,area code,phone,web,description,category,city,link,openstreetmap
0,1,180 Grados Fitness,Km. 22.5 Carretera a El Salvador. Portal del B...,502.0,6662-5087,https://www.180gradosfitness.com/,180 Grados Fitness tiene como objetivo fundame...,"Crossfit, Funcional, Yoga, Pilates",Guatemala - Guatemala,https://www.google.com.ar/maps/place/180+grado...,"geo:14.49796,-90.48129?z=19"


In [106]:
gymy['lat'] = gymy.openstreetmap.apply(lambda x : float(re.search(r'(\d+\.\d+),(-?\d+\.\d+)',x).group(1)))
gymy['long'] = gymy.openstreetmap.apply(lambda x : float(re.search(r'(\d+\.\d+),(-?\d+\.\d+)',x).group(2)))
gymy['latlong'] = tuple(list(zip((gymy.lat), (gymy.long))))


In [126]:
gymy.phone = gymy.phone.str.replace('-','')
gymy.phone = gymy.phone.str.replace(' ','')
gymy.phone = gymy.phone.str.strip()
gymy.phone.fillna('0', inplace=True)


In [212]:
gymy.category = gymy.category.str.replace('Natacion','Natación')

In [213]:
gymy.category = gymy.category.str.strip()

In [214]:
gymy.phone.astype('int32')


0      66625087
1      23370980
2      30938434
3      40445174
4      46626874
         ...   
155    23798195
156    23758195
157    24769841
158    22961976
159    23360091
Name: phone, Length: 160, dtype: int32

In [215]:
gymy['area code'].fillna('0', inplace=True)
gymy['area code'].astype('int32')

0      502
1      502
2      502
3      502
4      502
      ... 
155    502
156    502
157    502
158    502
159    502
Name: area code, Length: 160, dtype: int32

In [216]:
type(gymy.latlong[0])

tuple

In [217]:
gymy.latlong[0][0]

14.49796

In [218]:
gymy.category[gymy.names == 'Bilanz'] = 'Pilates, TRX'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gymy.category[gymy.names == 'Bilanz'] = 'Pilates, TRX'


In [234]:
gymy.drop(34, inplace=True)

In [235]:
gymy[gymy.category == 'Gimnasios de Educación Física']

Unnamed: 0,id,names,address,area code,phone,web,description,category,city,link,openstreetmap,lat,long,latlong


In [236]:
gymy.to_csv('/Users/EstebanCardona/Documents/gymy/csv/gymy_final.csv')

In [237]:
categorias = gymy.category.unique()

In [238]:
type(categorias)

numpy.ndarray

In [239]:
cat = categorias.tolist()

In [240]:
cat_ind = [item for sublist in cat for item in sublist]
listas = [elemento.split(', ') for elemento in cat]


In [241]:
cat_ind = []
for elemento in listas:
    for nombre in elemento:
        cat_ind.append(nombre)

In [242]:
print(cat_ind)

['Crossfit', 'Funcional', 'Yoga', 'Pilates', 'Baile', 'Funcional', 'Yoga', 'Pilates', 'TRX', 'Baile', 'Spinning', 'Yoga', 'Yoga', 'Funcional', 'Crossfit', 'Artes Marciales', 'Gimnasia', 'Natación', 'Pentatlon moderno', 'Aviacion', 'Escalada', 'Pilates', 'TRX', 'Billar', 'Pilates', 'Yoga', 'Zumba', 'Yoga', 'Zumba', 'Pilates', 'Pesas', 'Pesas', 'Baile', 'Tenis', 'Ajedrez', 'Baloncesto', 'Beisbol', 'Boxeo', 'Ciclismo', 'Esgrima', 'Remo', 'Voleibol', 'Tenis de mesa', 'Pesas', 'Natación', 'Pole Fitness', 'Pesas', 'Funcional', 'Baile', 'Crossfit', 'Pesas', 'Funcional', 'Baile', 'Crossfit', 'Natación', 'Golf', 'Boliche', 'Calistenia', 'Futbol', 'Pesas', 'Baile', 'Natación', 'Funcional', 'Natación', 'Pesas', 'Funcional', 'Pesas', 'Pilates', 'Funcional', 'Pesas', 'Crossfit']


In [243]:
cat_ind = set(cat_ind)

In [244]:
cat_ind = list(cat_ind)

In [245]:
cat_ind.sort()
cat_ind

['Ajedrez',
 'Artes Marciales',
 'Aviacion',
 'Baile',
 'Baloncesto',
 'Beisbol',
 'Billar',
 'Boliche',
 'Boxeo',
 'Calistenia',
 'Ciclismo',
 'Crossfit',
 'Escalada',
 'Esgrima',
 'Funcional',
 'Futbol',
 'Gimnasia',
 'Golf',
 'Natación',
 'Pentatlon moderno',
 'Pesas',
 'Pilates',
 'Pole Fitness',
 'Remo',
 'Spinning',
 'TRX',
 'Tenis',
 'Tenis de mesa',
 'Voleibol',
 'Yoga',
 'Zumba']

In [246]:
cats = ['Ajedrez', 'Artes Marciales', 'Aviacion', 'Baile', 'Baloncesto', 'Beisbol', 'Billar', 'Boliche',
        'Boxeo', 'Calistenia', 'Ciclismo', 'Crossfit', 'Escalada', 'Esgrima', 'Funcional', 'Futbol',
        'Gimnasia', 'Golf', 'Natación', 'Pentatlon moderno', 'Pesas', 'Pilates', 'Pole Fitness', 'Remo', 
        'Spinning', 'TRX', 'Tenis', 'Tenis de mesa', 'Voleibol', 'Yoga', 'Zumba']

In [251]:
string.split(', ') in cats

False

In [262]:
print(string.split(', ')[4])

IndexError: list index out of range

In [263]:
string

'Crossfit, Funcional, Yoga, Pilates'

In [285]:
string = 'Crossfit, Funcional, Yoga, Pilates, Pesas'
pesas = ['Pesas']



In [286]:
for s in string.split(', '):
    if s in pesas:
        print(s)

Pesas


In [307]:
def filtro(string,user_input):
    for s in string.split(', '):
        if s in user_input:
            return True
    return False

In [308]:
filtro(string,pesas)

True

In [309]:
user = 'Pesas'

In [310]:
gymy['filtro'] = gymy.category.apply(filtro, user_input=user)

In [311]:
gymy

Unnamed: 0,id,names,address,area code,phone,web,description,category,city,link,openstreetmap,lat,long,latlong,filtro
0,1,180 Grados Fitness,Km. 22.5 Carretera a El Salvador. Portal del B...,502,66625087,https://www.180gradosfitness.com/,180 Grados Fitness tiene como objetivo fundame...,"Crossfit, Funcional, Yoga, Pilates",Guatemala - Guatemala,https://www.google.com.ar/maps/place/180+grado...,"geo:14.49796,-90.48129?z=19",14.49796,-90.48129,"(14.49796, -90.48129)",False
1,2,Academia BellyDance Rashida,"Plaza Las Cañas, Local 201, zona 10",502,23370980,https://es-la.facebook.com/Rashidaacademy/,Esta academia de baile es ideal para las perso...,Baile,Guatemala - Guatemala,https://www.google.com.ar/maps/place/Academia+...,"geo:14.58151,-90.49617?z=19",14.58151,-90.49617,"(14.58151, -90.49617)",False
2,3,Academia Dance Art,"Bulevar Vista Hermosa 25-80, zona 15 Vista Her...",502,30938434,https://es-la.facebook.com/Danceartguatemala/,La Academia Dance Art ofrece profesionales exp...,Baile,Guatemala - Guatemala,https://www.google.com.ar/maps/place/edificio+...,"geo:14.58096,-90.48487?z=19",14.58096,-90.48487,"(14.58096, -90.48487)",False
3,4,Academia de Baile Español Mary Farrington,19 calle 12-34 zona 10,502,40445174,https://www.facebook.com/acamaryfarrrington/,Academia de Baile Español Mary Farrington te i...,Baile,Guatemala - Guatemala,https://www.google.com/maps/place/19+Calle+123...,"geo:14.59010,-90.50806?z=19",14.59010,-90.50806,"(14.5901, -90.50806)",False
4,5,Academia de Ballet DanzArte,Avenida Elena 3-31 zona 1,502,46626874,http://academia-danzarte.com/,La Academia de Ballet DanzArte cuenta con tres...,Baile,Guatemala - Guatemala,https://www.google.com/maps/place/Avenida+Elen...,"geo:14.64664,-90.52080?z=17",14.64664,-90.52080,"(14.64664, -90.5208)",False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
155,156,Waterproof Swim Academy,"12 calle 16-02 zona 16, Blvd San Isidro, Acatán",502,23798195,https://www.giselamorales.com.gt,Natacion,Natación,,https://www.google.com.ar/maps/place/Waterproo...,"geo:14.61550,-90.46963?z=18",14.61550,-90.46963,"(14.6155, -90.46963)",False
156,157,Waterproof Swim Academy,"15 avenida A y 3 calle D, zona 8, San Cristóbal",502,23758195,https://www.giselamorales.com.gt,Natacion,Natación,,,"geo:14.58914,-90.59433?z=18",14.58914,-90.59433,"(14.58914, -90.59433)",False
157,158,Xsport Fitness Center,"Calz. Aguilar Batres, 37-95 Zona 12",502,24769841,https://www.facebook.com/xsportfitnesscenter,Ofrecemos servicio de gimnasio completo. Conta...,"Pesas, Crossfit",Guatemala - Guatemala,https://www.google.com.ar/maps/place/Xsport+Fi...,"geo:14.58745,-90.56182?z=17",14.58745,-90.56182,"(14.58745, -90.56182)",True
158,159,Xtream Gym,6a. Avenida 6-63 Zona 10,502,22961976,https://www.facebook.com/Xtreamgymguate,Nuestro gimnasio ofrece un entrenamiento por m...,Artes Marciales,Guatemala - Guatemala,https://www.google.com.ar/maps/place/XTREAM+GY...,"geo:14.60688,-90.50995?z=18",14.60688,-90.50995,"(14.60688, -90.50995)",False
