In [1]:
import random
import time
import requests
import pandas as pd
from bs4 import BeautifulSoup
from configs import headers_list

In [2]:
URL = 'https://www.value.today/headquarters/{}?title=&field_headquarters_of_company_target_id&field_company_category_primary_target_id&field_market_value_jan_2020_value_1=&page={}'
URL

'https://www.value.today/headquarters/{}?title=&field_headquarters_of_company_target_id&field_company_category_primary_target_id&field_market_value_jan_2020_value_1=&page={}'

In [6]:
def scrape_value_today(country):
    page_no = 0
    companies = []
    start_time = time.time()
    print('Scraping companies from {}'.format(country.title()))
    while True:
        page_time = time.time()
        page = requests.get(URL.format(country, page_no), headers=random.choice(headers_list))
        soup = BeautifulSoup(page.content, 'html.parser')
        results = soup.find(class_='item-list')
        # End of page
        if results is None:
            break
        job_elems = results.find_all('li', class_='row well')
        for job_elem in job_elems:
            companies.append(extract_company(job_elem))
        page_no += 1
        # Wait for a few seconds
        time.sleep(random.uniform(4, 12))
        print('Page no {} done. Time taken: {}s'.format(page_no, int(time.time() - page_time)))
    df = pd.DataFrame.from_dict(companies)
    print('Total time taken: {}s'.format(int(time.time() - start_time)))
    return df

def find_class(job_elem, field, _type, sm=6, label='above'):
    _class = 'clearfix col-sm-{} field field--name-field-{} field--type-{} field--label-{}'\
        .format(sm, field, _type, label)
    return job_elem.find('div', class_=_class)

def get_texts(field):
    try:
        texts = [c.text for c in field.find_all('div', class_='field--item')]
        return '; '.join(texts)
    except:
        return None

def get_text(field):
    try:
        return field.find('div', class_='field--item').text
    except:
        return None

def extract_company(job_elem):
    company = {}
    # Name
    name = job_elem.find('h2', class_='text-primary')
    company['name'] = name.find('a').text
    # Market value 2021
    market_value = find_class(job_elem, 'market-value-jan012021', 'float')
    company['market_value_21'] = get_text(market_value)
    # World rank 2021
    world_rank = find_class(job_elem, 'world-rank-jan012021', 'integer')
    company['world_rank_21'] = get_text(world_rank)
    # Market value 2020
    market_value = find_class(job_elem, 'market-value-jan012021', 'float')
    company['market_value_20'] = get_text(market_value)
    # World rank 2020
    world_rank = find_class(job_elem, 'world-rank-jan012021', 'integer')
    company['world_rank_20'] = get_text(world_rank)
    # Country
    country = find_class(job_elem, 'headquarters-of-company', 'entity-reference', sm=12)
    company['country'] = get_texts(country)
    # CEO
    ceo = find_class(job_elem, 'ceo', 'entity-reference', sm=12)
    company['ceo'] = get_text(ceo)
    # Website
    website = find_class(job_elem, 'company-website', 'link', sm=12)
    try:
        company['website'] = website.find('a', href=True)['href']
    except:
        company['website'] = None
    # Sector
    category = find_class(job_elem, 'company-category-primary', 'entity-reference', sm=12)
    company['sector'] = get_texts(category)
    # Employee count
    employee_count = find_class(job_elem, 'employee-count', 'integer', sm=12, label='inline')
    employee_count = get_text(employee_count)
    if employee_count is not None:
        company['employee_count'] = int(employee_count.replace(',', ''))
    # Annual revenue
    annual_revenue = find_class(job_elem, 'annual-revenue', 'string', sm=12)
    annual_revenue = get_text(annual_revenue)
    if annual_revenue is not None:
        company['annual_revenue'] = ' '.join(annual_revenue.split()[-3:])
    # Annual net income
    annual_net_income = find_class(job_elem, 'annual-net-income-lc', 'string', sm=12)
    annual_net_income = get_text(annual_net_income)
    if annual_net_income is not None:
        company['annual_net_income'] = ' '.join(annual_net_income.split()[-3:])
    # Stock exchange
    stock_exchange = find_class(job_elem, 'stock-exchange-lc', 'entity-reference', sm=12, label='inline')
    company['stock_exchange'] = get_texts(stock_exchange)
    # Stock category
    stock_category = find_class(job_elem, 'stock-category-lc', 'entity-reference', sm=12, label='inline')
    company['stock_category'] = get_texts(stock_category)
    # Description
    summary = job_elem.find('div', class_='clearfix col-sm-12 field field--name-body field--type-text-with-summary field--label-above')
    try:
        company['description'] = summary.find('div', class_='field--item').text
    except:
        company['description'] = None
    return company

In [7]:
df_sg = scrape_value_today('singapore')
df_sg.head()

Scraping companies from Singapore
Page no 1 done. Time taken: 7s
Page no 2 done. Time taken: 10s
Page no 3 done. Time taken: 7s
Page no 4 done. Time taken: 7s
Page no 5 done. Time taken: 6s
Page no 6 done. Time taken: 10s
Page no 7 done. Time taken: 8s
Page no 8 done. Time taken: 12s
Page no 9 done. Time taken: 5s
Page no 10 done. Time taken: 7s
Page no 11 done. Time taken: 8s
Page no 12 done. Time taken: 8s
Page no 13 done. Time taken: 9s
Page no 14 done. Time taken: 8s
Page no 15 done. Time taken: 9s
Page no 16 done. Time taken: 9s
Page no 17 done. Time taken: 10s
Page no 18 done. Time taken: 11s
Page no 19 done. Time taken: 7s
Page no 20 done. Time taken: 11s
Page no 21 done. Time taken: 9s
Page no 22 done. Time taken: 11s
Page no 23 done. Time taken: 6s
Page no 24 done. Time taken: 10s
Page no 25 done. Time taken: 6s
Page no 26 done. Time taken: 11s
Total time taken: 237s


Unnamed: 0,name,market_value_21,world_rank_21,market_value_20,world_rank_20,country,ceo,website,sector,employee_count,annual_revenue,annual_net_income,stock_exchange,stock_category,description
0,DBS GROUP HOLDINGS,48.149 Billion USD,339,48.149 Billion USD,339,Singapore,Piyush Gupta,https://www.dbs.com,Financial Services; Banking Services,28526.0,14.54 Billion SGD,6.39 Billion SGD,Singapore; Frankfurt,Singapore ADRs,DBS GROUP HOLDINGS is Banking and Financial Se...
1,SINGAPORE TELECOMMUNICATIONS (SINGTEL),28.541 Billion USD,642,28.541 Billion USD,642,Singapore,Chua Sock Koong,https://www.singtel.com/,Communication Services; Telecom; Mobile Services,25000.0,"17,371 Million SGD","3,094 Million SGD",Singapore,Singapore ADRs,Telecom Company with headquarters in Singapore...
2,OVERSEA-CHINESE BANKING CORPORATION,33.538 Billion USD,533,33.538 Billion USD,533,Singapore,Samuel N. Tsien,http://www.ocbc.com,Financial Services; Banking Services; Asset Ma...,29706.0,10777 Million USD,4810 Million USD,Singapore,,Banking and Financial services company with he...
3,UNITED OVERSEAS BANK,28.536 Billion USD,643,28.536 Billion USD,643,Singapore,Wee Ee Cheong,http://www.uobgroup.com/,Financial Services; Banking Services,25000.0,"10,029 Million SGD","4,245 Million SGD",Singapore,Singapore ADRs,Multinational banking company with headquarter...
4,WILMAR INTERNATIONAL,22.223 Billion USD,828,22.223 Billion USD,828,Singapore,,https://www.wilmar-international.com/,Consumer Defensive; Agriculture Products; FMCG,90000.0,42640 Million USD,1293 Million USD,Singapore; Frankfurt,Singapore ADRs,Agriculture related business with headquarters...


In [8]:
df_sg.to_csv('results/value_singapore.csv', index=False)

In [9]:
df_id = scrape_value_today('indonesia')
df_id.head()

Scraping companies from Indonesia
Page no 1 done. Time taken: 7s
Page no 2 done. Time taken: 9s
Page no 3 done. Time taken: 12s
Page no 4 done. Time taken: 11s
Page no 5 done. Time taken: 11s
Page no 6 done. Time taken: 13s
Page no 7 done. Time taken: 8s
Page no 8 done. Time taken: 10s
Page no 9 done. Time taken: 7s
Page no 10 done. Time taken: 11s
Page no 11 done. Time taken: 12s
Page no 12 done. Time taken: 11s
Page no 13 done. Time taken: 11s
Page no 14 done. Time taken: 12s
Page no 15 done. Time taken: 6s
Page no 16 done. Time taken: 7s
Page no 17 done. Time taken: 12s
Page no 18 done. Time taken: 6s
Page no 19 done. Time taken: 13s
Page no 20 done. Time taken: 8s
Page no 21 done. Time taken: 12s
Page no 22 done. Time taken: 9s
Page no 23 done. Time taken: 8s
Page no 24 done. Time taken: 9s
Page no 25 done. Time taken: 7s
Page no 26 done. Time taken: 6s
Page no 27 done. Time taken: 11s
Page no 28 done. Time taken: 8s
Page no 29 done. Time taken: 10s
Page no 30 done. Time taken: 6s


Unnamed: 0,name,market_value_21,world_rank_21,market_value_20,world_rank_20,country,ceo,website,sector,employee_count,annual_revenue,annual_net_income,stock_exchange,stock_category,description
0,PT BANK CENTRAL ASIA TBK,60.700 Billion USD,253,60.700 Billion USD,253,Indonesia,Jahja Setiaatmadja,https://www.bca.co.id/en,Financial Services; Banking Services; Credit C...,24789.0,"72,968 Billion IDR","28,565 Billion IDR",Frankfurt,,PT BANK CENTRAL ASIA TBK is Bank with headquar...
1,PT BANK RAKYAT INDONESIA (PERSERO) TBK,38.040 Billion USD,457,38.040 Billion USD,457,Indonesia,,http://www.ir-bri.com/,Financial Services; Banking Services,61800.0,108468 Billion IDR,34372 Billion IDR,Frankfurt,,PT BANK RAKYAT INDONESIA (PERSERO) TBK is Bank...
2,PT TELEKOMUNIKASI INDONESIA TBK,23.081 Billion USD,799,23.081 Billion USD,799,Indonesia,,https://www.telkom.co.id,Communication Services; Telecom; Mobile Services,24272.0,Year ending December-2019,Year ending December-2019,NYSE; Frankfurt,Indonesia ADRs,"Telecom company with headquarters in Jakarta, ..."
3,PT BANK MANDIRI (PERSERO) TBK,21.820 Billion USD,847,21.820 Billion USD,847,Indonesia,,https://www.bankmandiri.co.id/,Financial Services; Banking Services,39000.0,Year ending December-2020,Year ending December-2020,Jakarta,,PT BANK MANDIRI (PERSERO) TBK is Banking and F...
4,PT UNILEVER INDONESIA TBK,20.530 Billion USD,891,20.530 Billion USD,891,Indonesia,,https://www.unilever.co.id/,Consumer Defensive; FMCG; Consumer Goods,5400.0,Year ending December-2020,Year ending December-2020,Jakarta,,FMCG sector which manufacture and markets like...


In [10]:
df_id.to_csv('results/value_indonesia.csv', index=False)

In [11]:
df_th = scrape_value_today('thailand')
df_th.head()

Scraping companies from Thailand
Page no 1 done. Time taken: 5s
Page no 2 done. Time taken: 6s
Page no 3 done. Time taken: 13s
Page no 4 done. Time taken: 7s
Page no 5 done. Time taken: 11s
Page no 6 done. Time taken: 11s
Page no 7 done. Time taken: 6s
Page no 8 done. Time taken: 8s
Page no 9 done. Time taken: 8s
Page no 10 done. Time taken: 10s
Page no 11 done. Time taken: 6s
Page no 12 done. Time taken: 7s
Page no 13 done. Time taken: 9s
Page no 14 done. Time taken: 12s
Page no 15 done. Time taken: 6s
Page no 16 done. Time taken: 6s
Page no 17 done. Time taken: 13s
Page no 18 done. Time taken: 12s
Page no 19 done. Time taken: 13s
Page no 20 done. Time taken: 9s
Page no 21 done. Time taken: 8s
Page no 22 done. Time taken: 7s
Page no 23 done. Time taken: 8s
Page no 24 done. Time taken: 12s
Page no 25 done. Time taken: 9s
Page no 26 done. Time taken: 6s
Page no 27 done. Time taken: 14s
Page no 28 done. Time taken: 13s
Page no 29 done. Time taken: 10s
Page no 30 done. Time taken: 6s
Page

Unnamed: 0,name,market_value_21,world_rank_21,market_value_20,world_rank_20,country,ceo,website,sector,employee_count,annual_revenue,annual_net_income,stock_exchange,stock_category,description
0,PTT PUBLIC COMPANY,40.930 Billion USD,423,40.930 Billion USD,423,Thailand,Chansin Treenuchagron,http://www.pttplc.com,Energy; Oil and Gas; Coal Mining,10875.0,"2,219 Billion THB",91.4 Billion THB,Thailand,,Oil and Gas company with headquarters in Thail...
1,AIRPORTS OF THAILAND,29.643 Billion USD,611,29.643 Billion USD,611,Thailand,,https://www.airportthai.co.th/en/,Industries; Airports Operating; Infrastructure,7230.0,Year ending September-2020,Year ending September-2020,Frankfurt,Thailand ADRs,AIRPORTS OF THAILAND is Thailand based company...
2,ADVANCED INFO SERVICE PUBLIC COMPANY,17.448 Billion USD,1034,17.448 Billion USD,1034,Thailand,,http://investor.ais.co.th/,Communication Services; Telecom; Mobile Services,12701.0,Year ending December-2020,Year ending December-2020,Frankfurt,Thailand ADRs,AIS is telecom company with headquarters in Th...
3,CP ALL PUBLIC COMPANY,17.442 Billion USD,1035,17.442 Billion USD,1035,Thailand,,https://www.cpall.co.th/en/home/,Consumer Defensive; Retail; Convenience Stores,62556.0,Year ending December-2020,Year ending December-2020,Thailand,Thailand ADRs,CP All operates chain of convenience stores ac...
4,THAI BEVERAGE,13.968 Billion USD,1262,13.968 Billion USD,1262,Thailand,,http://www.thaibev.com/,Consumer Defensive; Alcoholic Beverages; Bever...,,Year ending September-2020,Year ending September-2020,Singapore,,Thai Beverage is alcoholic beverages company w...


In [12]:
df_th.to_csv('results/value_thailand.csv', index=False)

In [13]:
df_ph = scrape_value_today('philippines')
df_ph.head()

Scraping companies from Philippines
Page no 1 done. Time taken: 5s
Page no 2 done. Time taken: 11s
Total time taken: 19s


Unnamed: 0,name,market_value_21,world_rank_21,market_value_20,world_rank_20,country,ceo,website,sector,employee_count,annual_revenue,annual_net_income,stock_exchange,stock_category,description
0,SM INVESTMENTS CORPORATION,26.304 Billion USD,699,26.304 Billion USD,699,Philippines,,https://www.sminvestments.com/,Conglomerate; Investments; Retail,157288.0,Year ending December-2020,Year ending December-2020,,,SM Investments Corporation is Conglomerate com...
1,AYALA CORPORATION,17.688 Billion USD,1015,17.688 Billion USD,1015,Philippines,,https://www.ayala.com.ph/,Industries; Conglomerate; Real Estate,64373.0,,,PSE,Philippines ADRs,Ayala Corporation is the oldest and one of the...
2,BDO UNIBANK,9.747 Billion USD,1713,9.747 Billion USD,1713,Philippines,,https://www.bdo.com.ph,Financial Services; Banking Services,38510.0,,,PSE,Philippines ADRs,BDO Unibank provides banking and financial ser...
3,MANILA ELECTRIC COMPANY,6.851 Billion USD,2287,6.851 Billion USD,2287,Philippines,,https://www.meralco.com.ph/,Utilities; Electric Utilities,5700.0,,,PSE,Philippines ADRs,Meralco also knows as Manila Electric Company ...
4,JOLLIBEE FOODS CORPORATION,4.501 Billion USD,3194,4.501 Billion USD,3194,Philippines,,https://www.jollibee.com.ph/,Consumer Cyclical; Retail; Food Chains / Distr...,16690.0,,,PSE,,Jollibee Foods Corporation is the largest fast...


In [14]:
df_ph.to_csv('results/value_philippines.csv', index=False)

In [21]:
df_sg['country'] = 'Singapore'
df_id['country'] = 'Indonesia'
df_th['country'] = 'Thailand'
df_ph['country'] = 'Philippines'
df = df_sg.append(df_id)
df = df.append(df_th)
df = df.append(df_ph)
print(len(df))
df = df.reset_index(drop=True)
df.head()

1479


Unnamed: 0,name,market_value_21,world_rank_21,market_value_20,world_rank_20,country,ceo,website,sector,employee_count,annual_revenue,annual_net_income,stock_exchange,stock_category,description
0,DBS GROUP HOLDINGS,48.149 Billion USD,339,48.149 Billion USD,339,Singapore,Piyush Gupta,https://www.dbs.com,Financial Services; Banking Services,28526.0,14.54 Billion SGD,6.39 Billion SGD,Singapore; Frankfurt,Singapore ADRs,DBS GROUP HOLDINGS is Banking and Financial Se...
1,SINGAPORE TELECOMMUNICATIONS (SINGTEL),28.541 Billion USD,642,28.541 Billion USD,642,Singapore,Chua Sock Koong,https://www.singtel.com/,Communication Services; Telecom; Mobile Services,25000.0,"17,371 Million SGD","3,094 Million SGD",Singapore,Singapore ADRs,Telecom Company with headquarters in Singapore...
2,OVERSEA-CHINESE BANKING CORPORATION,33.538 Billion USD,533,33.538 Billion USD,533,Singapore,Samuel N. Tsien,http://www.ocbc.com,Financial Services; Banking Services; Asset Ma...,29706.0,10777 Million USD,4810 Million USD,Singapore,,Banking and Financial services company with he...
3,UNITED OVERSEAS BANK,28.536 Billion USD,643,28.536 Billion USD,643,Singapore,Wee Ee Cheong,http://www.uobgroup.com/,Financial Services; Banking Services,25000.0,"10,029 Million SGD","4,245 Million SGD",Singapore,Singapore ADRs,Multinational banking company with headquarter...
4,WILMAR INTERNATIONAL,22.223 Billion USD,828,22.223 Billion USD,828,Singapore,,https://www.wilmar-international.com/,Consumer Defensive; Agriculture Products; FMCG,90000.0,42640 Million USD,1293 Million USD,Singapore; Frankfurt,Singapore ADRs,Agriculture related business with headquarters...


In [22]:
for i, row in df.iterrows():
    try:
        df.loc[i, 'world_rank_21'] = int(row['world_rank_21'])
    except:
        pass
    try:
        df.loc[i, 'world_rank_20'] = int(row['world_rank_20'])
    except:
        pass

df = df.sort_values(by='world_rank_21')
df.head()

Unnamed: 0,name,market_value_21,world_rank_21,market_value_20,world_rank_20,country,ceo,website,sector,employee_count,annual_revenue,annual_net_income,stock_exchange,stock_category,description
5,SEA LIMITED,101.750 Billion USD,124,101.750 Billion USD,124,Singapore,,https://www.seagroup.com,Communication Services; Gaming; Software Products,29800.0,2175 Million USD,-1462 Million USD,NYSE,Singapore ADRs,Seal Limited is Electronic gaming company with...
253,PT BANK CENTRAL ASIA TBK,60.700 Billion USD,253,60.700 Billion USD,253,Indonesia,Jahja Setiaatmadja,https://www.bca.co.id/en,Financial Services; Banking Services; Credit C...,24789.0,"72,968 Billion IDR","28,565 Billion IDR",Frankfurt,,PT BANK CENTRAL ASIA TBK is Bank with headquar...
0,DBS GROUP HOLDINGS,48.149 Billion USD,339,48.149 Billion USD,339,Singapore,Piyush Gupta,https://www.dbs.com,Financial Services; Banking Services,28526.0,14.54 Billion SGD,6.39 Billion SGD,Singapore; Frankfurt,Singapore ADRs,DBS GROUP HOLDINGS is Banking and Financial Se...
866,PTT PUBLIC COMPANY,40.930 Billion USD,423,40.930 Billion USD,423,Thailand,Chansin Treenuchagron,http://www.pttplc.com,Energy; Oil and Gas; Coal Mining,10875.0,"2,219 Billion THB",91.4 Billion THB,Thailand,,Oil and Gas company with headquarters in Thail...
254,PT BANK RAKYAT INDONESIA (PERSERO) TBK,38.040 Billion USD,457,38.040 Billion USD,457,Indonesia,,http://www.ir-bri.com/,Financial Services; Banking Services,61800.0,108468 Billion IDR,34372 Billion IDR,Frankfurt,,PT BANK RAKYAT INDONESIA (PERSERO) TBK is Bank...


In [23]:
df.to_csv('results/value_today.csv', index=False)