In [1]:
import requests, json
from multiprocessing.pool import ThreadPool
from urllib.parse import urljoin
from urllib.parse import urlparse
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
# Browser-like User-Agent header passed along with every HTTP request made by the scraper functions.
headers = {'user-agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36'}

In [2]:
def requests_retry(
    retries=3,
    backoff_factor=0.3,
    status_forcelist=(500, 502, 503, 504),
    session=None,
):
    """Return a `requests` session whose HTTP and HTTPS adapters retry failed requests.

    Args:
        retries: Limit used for the total, read and connect retry counters.
        backoff_factor: Back-off factor handed to urllib3's `Retry`.
        status_forcelist: HTTP status codes that trigger a retry.
        session: Existing session to configure; a new one is created when falsy.

    Returns:
        The configured session (the one passed in, or the newly created one).
    """
    if not session:
        session = requests.Session()

    retry_policy = Retry(
        total=retries,
        read=retries,
        connect=retries,
        backoff_factor=backoff_factor,
        status_forcelist=status_forcelist,
    )
    retrying_adapter = HTTPAdapter(max_retries=retry_policy)

    # The same adapter instance serves both schemes.
    for scheme in ('http://', 'https://'):
        session.mount(scheme, retrying_adapter)
    return session

In [3]:
def get_soup_handle(url):
    """Download `url` and return the page parsed with BeautifulSoup's "html.parser".

    The request goes through a retrying session from `requests_retry()`, sends the
    module-level `headers`, and times out after 10 seconds.

    Args:
        url: Address of the page to fetch.

    Returns:
        A `BeautifulSoup` object built from the response body.

    Raises:
        Whatever `requests` raises for an unreachable or invalid URL; callers decide
        whether to skip the page.
    """
    # A fresh session is created per call, so close it (and its pooled connections)
    # as soon as the body has been read instead of leaking one session per page.
    with requests_retry() as session:
        response = session.get(url, headers=headers, timeout=10)
        return BeautifulSoup(response.content, "html.parser")


In [4]:
#### get about links
def get_about_links(soup,frame,home_url):
    """Return absolute URLs for every `<a>` on the page whose href contains "about".

    Args:
        soup: Parsed page; only `soup.find_all("a")` and each tag's `.get("href")` are used.
        frame: 0 for a normal page, 1 when `soup` is the content of a frame. Any other
            value yields an empty list.
        home_url: URL the page was loaded from; relative hrefs are resolved against it.

    Returns:
        List of URL strings in document order (duplicates are kept).
    """
    about_urls = []
    if frame not in (0, 1):
        return about_urls

    for anchor in soup.find_all("a"):
        href = anchor.get("href")
        if href is None or 'about' not in str(href).lower():
            continue
        # urljoin leaves absolute hrefs untouched and resolves relative ones, so the
        # result is fetchable whether the link was "/about", "about.html" or a full URL.
        # Plain concatenation (home_url + href) broke the last two cases.
        about_urls.append(urljoin(home_url, href))
    return about_urls

In [5]:
#### get prod and service links
def get_service_product_links(soup,frame,home_url):
    """Return absolute URLs for every `<a>` whose href mentions a product/service keyword.

    A link matches when its lower-cased href contains any of "service", "product",
    "offerings" or "item".

    Args:
        soup: Parsed page; only `soup.find_all("a")` and each tag's `.get("href")` are used.
        frame: 0 for a normal page, 1 when `soup` is the content of a frame. Any other
            value yields an empty list.
        home_url: URL the page was loaded from; relative hrefs are resolved against it.

    Returns:
        List of URL strings in document order (duplicates are kept).
    """
    keywords = ('service', 'product', 'offerings', 'item')
    matching_urls = []
    if frame not in (0, 1):
        return matching_urls

    for anchor in soup.find_all("a"):
        href = anchor.get("href")
        if href is None:
            continue
        lowered = str(href).lower()
        if not any(keyword in lowered for keyword in keywords):
            continue
        # Resolve against the page URL instead of concatenating strings, so absolute
        # hrefs stay intact and relative ones become fetchable.
        matching_urls.append(urljoin(home_url, href))
    return matching_urls

In [6]:
def get_text(soup):
    """Concatenate the text of all `<p>` tags followed by all `<ul>` tags.

    Args:
        soup: Parsed page; only `soup.find_all(tag)` and each result's `.text` are used.

    Returns:
        The joined text (no separator), or the marker "Null scraped" when it is empty.
    """
    pieces = [
        str(node.text)
        for tag_name in ('p', 'ul')
        for node in soup.find_all(tag_name)
    ]
    combined = "".join(pieces)
    return combined if combined != "" else "Null scraped"

In [7]:
def _scrape_linked_pages(page_urls):
    """Fetch each distinct URL once and return the concatenated text of the pages that load."""
    if len(page_urls) == 0:
        # No matching links at all: keep the empty string rather than the marker.
        return ""

    collected = ""
    # dict.fromkeys de-duplicates while keeping first-seen order, so the combined text
    # is the same from run to run (a set would reorder the pages arbitrarily).
    for page_url in dict.fromkeys(page_urls):
        try:
            collected = collected + get_text(get_soup_handle(page_url))
        except Exception:
            # Best effort: an unreachable or malformed link is skipped. Catching
            # Exception (not a bare except) lets Ctrl-C still stop a long crawl.
            continue
    return collected if collected != "" else "Null scraped"


def get_content(home_soup,home_url,frame=0):
    """Scrape the text of a site's home page and of its "about" and product/service pages.

    Args:
        home_soup: Parsed home page (or frame content).
        home_url: URL that `home_soup` was loaded from.
        frame: 0 for a normal page, 1 when `home_soup` is the content of a frame;
            forwarded to the link-collection helpers.

    Returns:
        Dict with three string values:
          "home"  - text of the home page,
          "about" - combined text of the linked "about" pages,
          "prod"  - combined text of the linked product/service pages.
        "about"/"prod" are "" when no matching link exists and "Null scraped" when
        links exist but none of them could be fetched.
    """
    return {
        "home": get_text(home_soup),
        "about": _scrape_linked_pages(get_about_links(home_soup, frame, home_url)),
        "prod": _scrape_linked_pages(get_service_product_links(home_soup, frame, home_url)),
    }

In [8]:
def get_total_content(url):
    """Scrape one website and return its home/about/product text.

    If the home page contains a `<frame>`, the frame's document is scraped instead
    (with `frame=1`); when that fails for any reason the home page itself is scraped.

    Args:
        url: Address of the site's home page.

    Returns:
        Dict with the keys "about", "home" and "prod". All three are
        "url not reachable" when the initial request raises.
    """
    try:
        response = requests.get(url, headers=headers, timeout=10)
    except Exception:
        # Exception rather than a bare except, so KeyboardInterrupt can stop the crawl.
        return {
            'about': "url not reachable",
            'home': "url not reachable",
            'prod': "url not reachable",
        }

    soup = BeautifulSoup(response.content, "html.parser")

    # Detect the frame explicitly instead of relying on an AttributeError from None.
    frame_tag = soup.frame
    if frame_tag is not None:
        frame_tag.extract()  # as before, the frame tag is removed from the home page soup
        frame_src = frame_tag.get("src")
        if frame_src:
            # Frame sources are frequently relative ("main.html"); resolve them against
            # the page URL so the request does not fail on a missing scheme.
            frame_link = urljoin(url, frame_src)
            try:
                frame_response = requests.get(frame_link, headers=headers, timeout=10)
                frame_soup = BeautifulSoup(frame_response.content, "html.parser")
                return get_content(frame_soup, frame_link, frame=1)
            except Exception:
                # Fall through to scraping the home page itself.
                pass

    return get_content(soup, url, frame=0)
    



In [9]:
### read data

# Pass the encoding explicitly: without it open() uses the platform default, which is
# not UTF-8 on every system. The other JSON reads and writes in this notebook already
# pass encoding='utf8'.
with open('C:/Users/arvind.baranwal/Downloads/website_to_NAICS/companies_with_website.json', encoding='utf8') as json_file:
    data = json.load(json_file)

In [10]:
# Number of records loaded from companies_with_website.json.
len(data)

848284

In [11]:
# Keep only the records from index 420000 onward for this run.
# NOTE(review): this rebinds `data` to a slice of itself, so running the cell a second time
# slices the already-shortened list again and silently drops most of the records.
data = data[420000:848284]

In [12]:
# Number of records left after slicing.
len(data)

428284

In [13]:
# Collect the 'website_1' field of every record, preserving record order.
urls = [record['website_1'] for record in data]

In [14]:
# One URL per record is expected, so this should equal len(data).
len(urls)

428284

In [15]:
# Give every entry an explicit scheme so `requests` accepts it. Entries that already
# start with one are left alone, which makes the cell safe to re-run (the in-place
# version produced "http://http://..." on a second run).
urls = [
    u if u.lower().startswith(("http://", "https://")) else "http://" + u
    for u in urls
]

In [16]:
# Eyeball the prefixed URLs.
# NOTE(review): this displays the entire list; a slice such as urls[:10] would keep the
# notebook output small.
urls

['http://www.metalmag.com',
 'http://www.cm201u.org',
 'http://www.laurathompsondesign.net',
 'http://www.rafscleaning.com',
 'http://www.adamsservices.com',
 'http://www.mackenzieco.com',
 'http://www.royalbankpa.com',
 'http://www.pinkerton.com',
 'http://www.pacificcoastcopperrepipe.com',
 'http://www.mcdavid.com',
 'http://www.1mbh.com',
 'http://www.thepismobeachhotel.com',
 'http://www.avalon.net',
 'http://www.ci.wixom.mi.us',
 'http://www.beckerperio.com',
 'http://www.shelby-group.com',
 'http://www.williamstown.k12.ma.us',
 'http://www.gregorypoole.com',
 'http://www.wakemed.org',
 'http://www.keywestlawyer.com',
 'http://www.vcsd.org',
 'http://www.schaumburgarchitects.com',
 'http://www.winstonpublishing.com',
 'http://www.leealanbryant.com',
 'http://www.kofflerboats.com',
 'http://www.tlsinc.com',
 'http://www.dillard.edu',
 'http://www.booksnmagz.com',
 'http://www.univision.com',
 'http://www.ccnsy.com',
 'http://www.hermosawaveinternet.com',
 'http://www.rossstores.com

In [17]:
# How many URLs contain the substring "www"?
count = sum(1 for candidate in urls if "www" in candidate)

In [18]:
# Number of URLs containing "www"; compare with len(urls).
count

428284

In [None]:
# Drop the reference to the parsed records; the scraping loop below only reads `urls`.
data = None

In [None]:
# Scrape every URL from START_INDEX onward, checkpointing the results to disk.
START_INDEX = 25650        # NOTE(review): presumably where an earlier run stopped - confirm
CHECKPOINT_EVERY = 50      # write the checkpoint file each time (i + 1) is a multiple of this
CHECKPOINT_PATH = 'webdata25650+.json'


def _save_results(records, path):
    """Write all records collected so far to `path` as UTF-8 JSON (overwrites the file)."""
    with open(path, 'w', encoding='utf8') as outfile:
        json.dump(records, outfile, indent=4, sort_keys=True, ensure_ascii=False)


results = []
try:
    for i in range(START_INDEX, len(urls)):
        url = urls[i]
        content = get_total_content(url)
        content["url"] = str(url)
        results.append(content)
        if (i + 1) % CHECKPOINT_EVERY == 0:
            print(str(i+1)+" records done")
            _save_results(results, CHECKPOINT_PATH)
finally:
    # Also persist whatever was scraped since the last checkpoint - both the tail at
    # the end of the list and a partial batch when the run is interrupted or fails.
    if results:
        _save_results(results, CHECKPOINT_PATH)


In [13]:
# Write everything currently in `results` to one JSON file.
results_path = 'C:/Users/arvind.baranwal/Downloads/webdata3042more.json'
with open(results_path, 'w+', encoding='utf8') as results_fh:
    json.dump(results, results_fh, indent=4, sort_keys=True, ensure_ascii=False)


In [28]:
# NOTE(review): `names` is not defined in any cell visible in this notebook, so this cell
# depends on leftover kernel state and would fail after a kernel restart.
names[7]

'Advantage Storage Company LLC'

In [29]:
# NOTE(review): the saved output ('http://uhaul.com') differs from entry 7 of the list
# displayed earlier ('http://www.pinkerton.com'), so this output presumably comes from a
# different `urls` list built in another session - re-run to confirm.
urls[7]

'http://uhaul.com'

In [21]:
# Inspect the first scraped record.
results[0]

{'about': "Millcreek Tile & Stone is a commercial tile and stone installer based in Salt Lake City, Utah. Started in 2004 and built from an experienced management and installation core, Millcreek Tile & Stone focuses on commercial projects large and small.Along with our expertise in both project management and execution, we are committed to delivering a quality and durable finished product to our clients. We adhere to established TCNA and ICPI industry standards and also seek for creative and economical solutions to construction problems. We belong to the National Tile Contractors Association (NTCA) and are members of the Associated General Contractors of Utah (AGC).We specialize in any size commercial installation of porcelain and ceramic tile, stone tile, dairy and fully vitrified paving systems and interlocking concrete or stone paving systems. CSI sections 093000 ceramic tile, 096000 stone tile, and 027800 interlocking concrete (or stone) paving systems. We also install mechanicall

In [11]:
# Load the previously scraped records from webdata.json (this reuses the name `data`).
with open('C:/Users/arvind.baranwal/Downloads/website_to_NAICS/webdata.json',encoding='utf8') as scraped_fh:
    data = json.load(scraped_fh)


In [12]:
# Number of records in webdata.json.
len(data)

30400

In [13]:
# Size of the leading slice that is merged with `data1` further down.
len(data[0:2800])

2800

In [14]:
# Load the second batch of scraped records from webdata2800+.json.
with open('C:/Users/arvind.baranwal/Downloads/website_to_NAICS/webdata2800+.json',encoding='utf8') as batch_fh:
    data1 = json.load(batch_fh)


In [15]:
# Number of records in the second batch.
len(data1)

1200

In [18]:
# First 2800 records of `data`, followed by every record of `data1`.
total_data = [*data[:2800], *data1]

In [19]:
# Size of the merged list (2800 + len(data1)).
len(total_data)

4000

In [20]:
# Persist the merged records as a single JSON file.
merged_path = 'C:/Users/arvind.baranwal/Downloads/webdata_till_4000.json'
with open(merged_path, 'w+', encoding='utf8') as merged_fh:
    json.dump(total_data, merged_fh, indent=4, sort_keys=True, ensure_ascii=False)


In [17]:
# Field names present on the first loaded record.
data[0].keys()

dict_keys(['_id', 'about', 'home', 'meta', 'name', 'prod', 'url'])

In [13]:
import time

In [21]:
print("starting to measure time")
start = time.time()

# Sequential baseline: scrape the first ten URLs one after another.
results = [get_total_content(urls[position]) for position in range(10)]

end = time.time()
print("time ended")

print(end - start)

starting to measure time
time ended
468.04769134521484


In [12]:
from multiprocessing import Pool

In [14]:
# Shrink the URL list to its first ten entries for the parallel timing experiment.
# NOTE(review): this overwrites `urls`; the full list has to be rebuilt before any later
# cell can scrape beyond these ten.
urls = urls[0:10]

In [None]:
start = time.time()
# Use threads rather than processes. A process Pool has to import the worker function
# in each child, which does not work for functions defined in an interactive session;
# the scraping is also network-bound, so threads overlap the waiting just as well.
# The `with` block shuts the pool down once map() has returned.
with ThreadPool(10) as pool:  # at most 10 requests in flight at a time
    # NOTE(review): only urls[0:1] (a single URL) is mapped, as in the original cell -
    # pass `urls` to time all ten.
    records = pool.map(get_total_content, urls[0:1])
end = time.time()
print(end - start)

In [None]:
# Scratch note (a bare path is not valid Python and would raise a SyntaxError if this cell ran):
# C:\Users\arvind.baranwal\Downloads\website_to_NAICS