In [None]:
import time
from contextlib import closing

import pandas as pd
import requests
from bs4 import BeautifulSoup
from requests.exceptions import RequestException
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait as wait

In [None]:
def simple_get(url, timeout=30):
    """
    Attempts to get the content at `url` by making an HTTP GET request.

    Parameters
    ----------
    url : str
        Address to request.
    timeout : float or tuple, optional
        Forwarded to `requests.get`. Defaults to 30 seconds so that a
        server which stops answering cannot block the crawl forever.

    Returns
    -------
    bytes or None
        The raw response body (`resp.content`) when `is_good_response`
        accepts the response, otherwise None. None is also returned, after
        the error has been passed to `log_error`, when the request raises
        a `RequestException` (timeouts included).
    """
    try:
        # closing() guarantees the streamed connection is released on exit.
        with closing(requests.get(url, stream=True, timeout=timeout)) as resp:
            if is_good_response(resp):
                return resp.content
            return None
    except RequestException as e:
        log_error('Error during requests to {0} : {1}'.format(url, str(e)))
        return None

In [None]:
def is_good_response(resp):
    """
    Returns True if the response seems to be HTML, False otherwise.

    A response qualifies when its status code is 200 and its Content-Type
    header contains 'html' (case-insensitive). A response without a
    Content-Type header is reported as not good instead of raising.
    """
    # .get() instead of [...]: a missing header must not raise KeyError.
    content_type = (resp.headers.get('Content-Type') or '').lower()
    return resp.status_code == 200 and 'html' in content_type

In [None]:
def log_error(e):
    """
    Report an error.

    The error is converted to text and printed to standard output; replace
    the body with real logging if something more durable is needed.
    """
    message = str(e)
    print(message)

In [None]:
def getCategory():
    """
    Collects the category links listed on the indonetwork category index.

    Returns
    -------
    tuple of (list, list)
        `(category, text)`: the `href` of the first anchor inside every
        `div.sub1cat`, and that anchor's text, in page order. Both lists
        are empty when the index page could not be downloaded.
    """
    category = []
    text = []

    page1 = simple_get('https://www.indonetwork.co.id/categories')
    if page1 is None:
        # simple_get() already reported request errors; nothing to parse.
        log_error('Could not download the category index page')
        return category, text

    soup1 = BeautifulSoup(page1, 'html.parser')
    for block in soup1.find_all('div', class_='sub1cat'):
        link = block.find('a', href=True)
        if link is None:
            # A block without a link would otherwise raise a TypeError.
            continue
        category.append(link['href'])
        text.append(link.text)

    return category, text


In [None]:
def filterLinkProvince(category, max_attempts=3):
    """
    Collects the per-province listing URLs offered for a category.

    Parameters
    ----------
    category : str
        Category URL; '/perusahaan' is appended to reach the company list.
    max_attempts : int, optional
        How many times the page is downloaded and searched before giving
        up. Defaults to 3.

    Returns
    -------
    list
        The `href` of every anchor inside `div.filkat-sub`, in page order,
        or an empty list when no attempt produced that div.
    """
    for _attempt in range(max_attempts):
        # Download inside the loop: re-parsing the same failed download can
        # never succeed, so retrying only makes sense with a fresh request.
        categoryPage = simple_get(category + '/perusahaan')
        if categoryPage is None:
            continue

        soup2 = BeautifulSoup(categoryPage, 'html.parser')
        divProv = soup2.find('div', class_='filkat-sub')
        if divProv is not None:
            return [a['href'] for a in divProv.find_all('a', href=True)]

    log_error('No province filter found for {0}'.format(category))
    return []

In [None]:
def filterTextProvince(category, max_attempts=3):
    """
    Collects the province names offered as filters for a category.

    Parameters
    ----------
    category : str
        Category URL; '/perusahaan' is appended to reach the company list.
    max_attempts : int, optional
        How many times the page is downloaded and searched before giving
        up. Defaults to 3.

    Returns
    -------
    list
        The text of every anchor (with an `href`) inside `div.filkat-sub`,
        in page order, or an empty list when no attempt produced that div.
    """
    for _attempt in range(max_attempts):
        categoryPage = simple_get(category + '/perusahaan')
        if categoryPage is None:
            # Failed download: try again instead of crashing in the parser.
            continue

        soup2 = BeautifulSoup(categoryPage, 'html.parser')
        divProv = soup2.find('div', class_='filkat-sub')
        if divProv is not None:
            return [a.text for a in divProv.find_all('a', href=True)]

    log_error('No province filter found for {0}'.format(category))
    return []

In [None]:
def getLinkCompany(province, max_pages=9, max_attempts=2):
    """
    Collects company profile URLs from the listing pages of one province.

    Parameters
    ----------
    province : str
        URL of the first listing page; later pages are requested as
        `province + '?page=N'`.
    max_pages : int, optional
        Number of listing pages to visit, starting at page 1. Defaults to 9.
    max_attempts : int, optional
        Download attempts per page before the page is skipped. Defaults
        to 2.

    Returns
    -------
    list
        'https:' + the `href` of each `a.link_product` found inside a
        `div.list-item-company`, in page order. Items without such an
        anchor, and pages that could not be downloaded, contribute nothing.
    """
    linkCompany = []

    for page in range(1, max_pages + 1):
        url = province if page == 1 else province + '?page=' + str(page)

        listCompany = None
        for _attempt in range(max_attempts):
            perusahaanPage = simple_get(url)
            if perusahaanPage is not None:
                soup3 = BeautifulSoup(perusahaanPage, 'html.parser')
                listCompany = soup3.find_all('div', class_='list-item-company')
                break

        if listCompany is None:
            # Skip this page. Falling through would either hit an undefined
            # name (page 1) or re-append the previous page's companies.
            print('error')
            continue

        for item in listCompany:
            productInfo = item.find('a', class_='link_product')
            if productInfo is not None:
                linkCompany.append('https:' + productInfo['href'])

    return linkCompany

In [None]:
def getLinkWhatsapp(url, driver_path='/Users/abdulsalam/Documents/dll/chromedriver'):
    """
    Opens a company page in headless Chrome and collects its WhatsApp links.

    The element with class 'wa-call' is clicked, the page is given one
    second to update, and the `href` of every `a.nobor` in the resulting
    page source is collected.

    Parameters
    ----------
    url : str
        Company page to open.
    driver_path : str, optional
        Location of the chromedriver executable. Defaults to the path the
        notebook was originally written with.

    Returns
    -------
    list or None
        The collected `href` values (possibly empty), or None when the page
        has no 'wa-call' element.
    """
    options = Options()
    options.headless = True
    driver = webdriver.Chrome(driver_path, chrome_options=options)
    try:
        driver.get(url)
        try:
            driver.find_element_by_class_name('wa-call').click()
        except NoSuchElementException:
            return None

        # Give the page a moment to render the links revealed by the click.
        time.sleep(1)
        page_source = driver.page_source
    finally:
        # Always shut the browser down; otherwise every call leaves a
        # Chrome process running, including the early return above.
        driver.quit()

    anchors = BeautifulSoup(page_source, 'html.parser').find_all('a', class_='nobor')
    return [anchor['href'] for anchor in anchors]

In [None]:
def getPhoneWA(url, max_attempts=3):
    """
    Reads the phone number shown on a WhatsApp link page.

    Parameters
    ----------
    url : str
        WhatsApp link to download.
    max_attempts : int, optional
        How many times to download and parse before giving up. Defaults
        to 3.

    Returns
    -------
    str or None
        Text of the first `span` matched by `find('span', class_='')`, or
        None when every attempt failed.
    """
    for _attempt in range(max_attempts):
        try:
            whatsapp = simple_get(url)
            soup1 = BeautifulSoup(whatsapp, 'html.parser')
            return soup1.find('span', class_='').text
        except Exception:
            # Deliberately best-effort: a failed download (None) or a page
            # without the span simply triggers the next attempt.
            continue

    return None

In [None]:
# Scrape the category index once. getCategory() returns a (urls, names) pair,
# so category[0] holds the category URLs and category[1] their display texts.
category = getCategory()
# Display the category names; later cells select a category by its position.
category[1]

In [95]:
# Province listing URLs and province names for the category at position 195.
# NOTE(review): 195 is a hard-coded position in the scraped category list, so
# it selects a different category if the site's ordering changes.
linkProv = filterLinkProvince(category[0][195])
textProv = filterTextProvince(category[0][195])

# Display the province names.
textProv

['Kepulauan Riau',
 'Sulawesi Tengah',
 'Kalimantan Timur',
 'Bengkulu',
 'Sumatera Barat',
 'Banten',
 'Bali',
 'Sumatera Utara',
 'Jawa Tengah',
 'Sulawesi Utara',
 'Papua',
 'Sulawesi Selatan',
 'Jawa Barat',
 'Jawa Timur',
 'Nusa Tenggara Barat',
 'Daerah Istimewa Yogyakarta',
 'Sulawesi Tenggara',
 'Lampung',
 'DKI Jakarta',
 'Kalimantan Selatan']

In [96]:
# Display the province listing URLs returned by filterLinkProvince().
linkProv

['https://www.indonetwork.co.id/kepulauan-riau/kemasan-plastik/perusahaan',
 'https://www.indonetwork.co.id/sulawesi-tengah/kemasan-plastik/perusahaan',
 'https://www.indonetwork.co.id/kalimantan-timur/kemasan-plastik/perusahaan',
 'https://www.indonetwork.co.id/bengkulu/kemasan-plastik/perusahaan',
 'https://www.indonetwork.co.id/sumatera-barat/kemasan-plastik/perusahaan',
 'https://www.indonetwork.co.id/banten/kemasan-plastik/perusahaan',
 'https://www.indonetwork.co.id/bali/kemasan-plastik/perusahaan',
 'https://www.indonetwork.co.id/sumatera-utara/kemasan-plastik/perusahaan',
 'https://www.indonetwork.co.id/jawa-tengah/kemasan-plastik/perusahaan',
 'https://www.indonetwork.co.id/sulawesi-utara/kemasan-plastik/perusahaan',
 'https://www.indonetwork.co.id/papua/kemasan-plastik/perusahaan',
 'https://www.indonetwork.co.id/sulawesi-selatan/kemasan-plastik/perusahaan',
 'https://www.indonetwork.co.id/jawa-barat/kemasan-plastik/perusahaan',
 'https://www.indonetwork.co.id/jawa-timur/kema

In [None]:
# Map each province name to the company URLs gathered from its listing pages.
# Names and listing URLs are paired by position in textProv / linkProv.
linkCompany = {}
for position, provinceName in enumerate(textProv):
    linkCompany[provinceName] = getLinkCompany(linkProv[position])

# Echo every collected company URL, province by province.
for provinceName in textProv:
    for companyUrl in linkCompany[provinceName]:
        print(companyUrl)


In [91]:
# Spot-check the company URLs collected for one province.
linkCompany['DKI Jakarta']

['https://www.indonetwork.co.id/company/minaalkaliwater',
 'https://www.indonetwork.co.id/company/indonesiabersihrapi',
 'https://www.indonetwork.co.id/company/cv_jayasantikahgroup',
 'https://www.indonetwork.co.id/company/buana-jaya',
 'https://www.indonetwork.co.id/company/teknik-mandiri',
 'https://www.indonetwork.co.id/company/kkkkkkk',
 'https://www.indonetwork.co.id/company/metrixinspira',
 'https://www.indonetwork.co.id/company/andalbangunsejahtera',
 'https://www.indonetwork.co.id/company/mega_promosindo']

In [92]:
# Visit every company page listed in `linkCompany` and build two parallel
# lists: `data_company` (one dict of profile fields per company) and
# `phone_wa` (exactly three phone slots per company, padded with 'None').
data_company = []
phone_wa = []
phone_temp = []

for j in textProv:
    for k in linkCompany[j]:
        # Retry the download itself: re-parsing the same failed response
        # (None) can never succeed.
        soup = None
        for attempt in range(3):
            company = simple_get(k)
            if company is not None:
                soup = BeautifulSoup(company, 'html.parser')
                break
        if soup is None:
            print('error')
            continue

        try:
            companyName = soup.find('h1', class_='sc-company__title').text
            membershipLvl = soup.find('span', class_='sc-company__lb').text
            companyDesc = soup.find('div', class_='rc-company__description').text
            companyAddr = soup.find('address').text
            companyCity = soup.find('span', class_='text-capitalize').text
        except AttributeError:
            # find() returned None: the page lacks an expected element.
            # Skip this company instead of aborting the whole crawl.
            log_error('Unexpected page layout, skipped: {0}'.format(k))
            continue
        companyCtgry = category[1][195]
        companyProv = j

        linkWA = getLinkWhatsapp(k)
        if linkWA is not None:
            for wa in linkWA:
                phone_temp.append(getPhoneWA(wa))
        # Keep exactly three slots: pad short lists with 'None' and drop
        # anything past the third number. (Deleting indexes 3, 4, 5 one
        # after another raised IndexError because the list shrinks.)
        phone_temp = (phone_temp + ['None'] * 3)[:3]

        data_company.append({
            'Name': companyName,
            'Membership': membershipLvl,
            'Category': companyCtgry,
            'Description': companyDesc,
            'Address': companyAddr,
            'City': companyCity,
            'Province': companyProv
        })

        phone_wa.append(phone_temp)

        print(companyName)
        print(companyCity)
        print(companyProv)
        print(phone_temp)
        print('')
        # Start the next company with a fresh list; phone_wa keeps this one.
        phone_temp = []

zeropromosi
Tangerang
Banten
['087781857176', '087781867176', '081808064176']

PT. Hanja Prima Loka
Tangerang
Banten
['08558800556', 'None', 'None']

PT. Haliplast Century Indonesia
Jakarta
Banten
['+62 815-8812-741', 'None', 'None']

PT PUTRA ANDALAN JAYA
Tangerang
Banten
['+62 878-7883-6908', 'None', 'None']

Dunia Rak Minimarket
Tangerang
Banten
['+62 821-1364-7899', 'None', 'None']

PT. ANUGRAH PUTRA KENCANA
Bekasi
Jawa Barat
['+62 821-1399-3562', '+62 877-4142-8040', '+62 896-0555-3944']

CV.BINTANG BOTOL PLASTINDO
bogor
Jawa Barat
['+62 816-533-839', '+62 857-2000-0439', 'None']

Omah Dodolan Fiafi
Bekasi
Jawa Barat
['085889688462', 'None', 'None']

CV.changdong Indonesia
Bekasi
Jawa Barat
['+62 852-8793-8886', 'None', 'None']

CV. HERRY JAYA UTAMA
Depok
Jawa Barat
['+62 812-1226-5508', 'None', 'None']

CV Pratama Sains Global
BANDUNG BARAT
Jawa Barat
['None', 'None', 'None']

PT. Hinoka Alsindo
Bekasi
Jawa Barat
['+62 812-1208-6288', '+62 812-9412-5622', 'None']

Graha Mulia Tek

In [93]:
# `tempData` is a second name for the same list of dicts, so the phone columns
# written below also appear in the records of `data_company`.
tempData = data_company
for index, record in enumerate(tempData):
    # Phones become columns telp1, telp2, ... in the order they were collected.
    for number, phone in enumerate(phone_wa[index], start=1):
        record['telp' + str(number)] = phone


In [94]:
# One row per company record, including the telp columns added to tempData.
df = pd.DataFrame(tempData)

# Write the table to disk; the "196" suffix in the file name is hard-coded.
df.to_csv('data_company196.csv', index=False, encoding="utf-8")

### LOOPING ###

In [None]:
# Batch version of the cells above: scrape five consecutive categories and
# write one CSV per category.
# NOTE(review): the original cell left `z` blank (a syntax error). 195 assumes
# the batch continues right after the single category processed above, which
# was written to data_company196.csv. Confirm before running.
z = 195

for i in range(1, 6):
    linkProv = filterLinkProvince(category[0][z + i])
    textProv = filterTextProvince(category[0][z + i])

    # Province name -> company URLs; names and listing URLs pair by position.
    linkCompany = {}
    for x in range(len(textProv)):
        linkCompany[textProv[x]] = getLinkCompany(linkProv[x])

    for provinceName in textProv:
        for companyUrl in linkCompany[provinceName]:
            print(companyUrl)

    data_company = []
    phone_wa = []
    phone_temp = []

    for j in textProv:
        for k in linkCompany[j]:
            # Retry the download itself: re-parsing the same failed response
            # (None) can never succeed.
            soup = None
            for attempt in range(3):
                company = simple_get(k)
                if company is not None:
                    soup = BeautifulSoup(company, 'html.parser')
                    break
            if soup is None:
                print('error')
                continue

            try:
                companyName = soup.find('h1', class_='sc-company__title').text
                membershipLvl = soup.find('span', class_='sc-company__lb').text
                companyDesc = soup.find('div', class_='rc-company__description').text
                companyAddr = soup.find('address').text
                companyCity = soup.find('span', class_='text-capitalize').text
            except AttributeError:
                # find() returned None: the page lacks an expected element.
                # Skip this company instead of aborting the whole batch.
                log_error('Unexpected page layout, skipped: {0}'.format(k))
                continue
            companyCtgry = category[1][z + i]
            companyProv = j

            linkWA = getLinkWhatsapp(k)
            if linkWA is not None:
                for wa in linkWA:
                    phone_temp.append(getPhoneWA(wa))
            # Keep exactly three slots: pad short lists with 'None' and drop
            # anything past the third number. (Deleting indexes 3, 4, 5 one
            # after another raised IndexError because the list shrinks.)
            phone_temp = (phone_temp + ['None'] * 3)[:3]

            data_company.append({
                'Name': companyName,
                'Membership': membershipLvl,
                'Category': companyCtgry,
                'Description': companyDesc,
                'Address': companyAddr,
                'City': companyCity,
                'Province': companyProv
            })

            phone_wa.append(phone_temp)

            print(companyName)
            print(companyCity)
            print(companyProv)
            print(phone_temp)
            print('')
            # Start the next company with a fresh list.
            phone_temp = []

    # tempData aliases data_company; the telp columns are added in place.
    tempData = data_company
    for index in range(len(data_company)):
        for number in range(len(phone_wa[index])):
            tempData[index]['telp' + str(number + 1)] = phone_wa[index][number]

    df = pd.DataFrame(tempData)

    # The file number is the 1-based position of the category.
    df.to_csv('data_company' + str(z + i + 1) + '.csv', index=False, encoding="utf-8")