In [8]:
import requests, re, time, warnings, s3fs
import pandas as pd
from pandas.io.json import json_normalize
from bs4 import BeautifulSoup as bs
from multiprocessing import Pool
from urllib.parse import urlparse
from fuzzywuzzy import fuzz

warnings.filterwarnings('ignore')

In [9]:
# function to create domain based off website
def create_do(web):
    
    if type(web)==str and web!='none':
        if '&' in web:
            web = web.split('&')[0].strip()
        web = web.lower().replace('https://','').replace('http://','').replace('htt:','').replace('www.','').strip()
        if '/' in web:
            web = web.split('/')[0].replace(' ','').strip()
        if '?' in web:
            web = web.split('?')[0].replace(' ','').strip()
        if '@' in web or '.' not in web or ' ' in web:
            return None
        
        return web
    else:
        return None

In [10]:
# create website using http:// + domain
def create_website_1(domain_clean):
    
    if domain_clean:
        website = 'http://' + domain_clean
    else:
        website = None
        
    return website

# create website using https:// + domain
def create_website_2(domain_clean):
    
    if domain_clean:
        website = 'https://' + domain_clean
    else:
        website = None
        
    return website

# create website using https://www. + domain
def create_website_3(domain_clean):
    
    if domain_clean:
        if 'ww' not in domain_clean:
            website = 'https://www.' + domain_clean
        else:
            website = None
    else:
        website = None
        
    return website

# clean web page text
def clean_text(html_text):
    try:
        clean_text = re.sub(r"\s+", " ", html_text).strip()    
        return clean_text
    except:
        return None
    
# find meta descriptions from webpage
def get_meta(soup_object):
    
    if soup_object:
        try:
            meta1 = soup_object.find("meta", {"property":"og:description"})['content'].replace('\n','').replace('\t','').strip()
        except:
            meta1=None
        
        try:
            meta2 = soup_object.find("meta", {"name":"description"})['content'].replace('\n','').replace('\t','').strip()   
        except:
            meta2=None
        
        # if both exists, choose the meta description with longest length
        if meta1 and meta2:
            
            if len(meta1)>=len(meta2):

                return meta1
            else:
                return meta2
            
        elif meta1:
            return meta1
        
        elif meta2:
            return meta2
        
        else:
            return None
            
    else:
        return None

# find keywords from meta tag
def get_keywords(soup_object):
    
    if soup_object:
        try:
            keywords = soup_object.find("meta", {"name":"keywords"})['content'].replace('\n','').replace('\t','').lower().strip()
        except:
            keywords = None
            
        return keywords  
    else:
        return None

In [11]:
# import data
df = pd.read_csv('s3://gazelle-webscraping/web_scraper_urls.csv',
                index_col=False)

In [22]:
# indication of a website now for sale i.e if word appear in redirected domain --- >> sale
for_sale = ['forsale','godaddy', 'domain']

In [13]:
def get_data(company_website):
    
    try:
        
        domain = create_do(company_website)
        
        # first try to scrape the website
        webs = create_website_1(domain)
        r = requests.get(webs,
                             timeout=18.0,
                             allow_redirects=True,
                             verify=False)
        soup = bs(r.text)
        
        if len(soup.text)<30:
            # second try to scrape the website
            webs = create_website_2(domain)
            r = requests.get(webs,
                             timeout=18.0,
                             allow_redirects=True,
                             verify=False)
            soup = bs(r.text)
            
            if len(soup.text)<30:
                # third try to scrape the website
                webs = create_website_3(domain)
                r = requests.get(webs,
                                 timeout=18.0,
                                 allow_redirects=True,
                                 verify=False)
                soup = bs(r.text)
        
    except:
        
        soup=None
        
    if soup:
        
        status = r.status_code
        returned_web = r.url
        
        # create domain of the redirected website
        try:
            returned_dom = create_do(returned_web)
        except:
            returned_dom = ''
        
        # if redirected domain is in the for_sale list, indicate website is not valid
        try:
            website_sale = 0
            for sale in for_sale:
                if sale in returned_dom:
                    website_sale = 1
                    break
        except:
            website_sale = 0
            
        # find fuzzy match ratios of initial domain and redirected domain
        try:
            ratio = fuzz.ratio(domain, returned_dom)
            ratio_part = fuzz.partial_ratio(domain, returned_dom)
        except:
            ratio = ''
            ratio_part = ''
            
        meta_description = get_meta(soup)
        keyword = get_keywords(soup)
        text = clean_text(soup.text)
            
        return {
            'website': company_website,
            'redirected_domain': returned_dom,
            'ratio': ratio,
            'partial_ratio': ratio_part,
            'status_code': status,
            'website_for_sale': website_sale,
            'meta_description': meta_description,
            'keywords': keyword,
            'page_text': text
        }

In [99]:
%%time
workers = 10
with Pool(workers) as p:
    data = p.map(get_data, list(set(df['website'])))

CPU times: user 2.38 s, sys: 963 ms, total: 3.34 s
Wall time: 1h 9min 2s


In [101]:
valid_returns = []
for d in list(data):
    if d:
        valid_returns.append(d)
result_df = pd.DataFrame(valid_returns)

In [16]:
result_df

Unnamed: 0,website,redirected_domain,ratio,partial_ratio,status_code,website_for_sale,meta_description,keywords,page_text
0,www.vancouverwe.com,vancouverwe.com,100,100,200,0,Help inner-city kids get excited about reading...,,The Writers’ Exchange | Vancouver Volunteer Op...
1,www.mmuc.org,mmuc.org,100,100,200,0,,,"Marshall Memorial United Church - Ancaster, ON..."
2,www.prydelearningcentres.ca,prydelearningcentres.ca,100,100,200,0,Childcare and early learning programs,,P.R.Y.D.E. Learning Centres Home About Parent ...
3,www.zacjc.ca,zacjc.ca,100,100,200,0,Zion Apostolic Church of Jesus Christ invites ...,"bishop lindsay, apostolic, sunrise avenue, zac...",Home | Toronto | Zion Apostolic Church of Jesu...
4,www.melfortmfa.webs.com,melfortmfa.webs.com,100,100,200,0,This is the link to the Melfort Music Festival,melfort music festival,Melfort Music Festival Association Melfort Mus...
...,...,...,...,...,...,...,...,...,...
26210,questchc.ca,questchc.ca,100,100,200,0,,,Quest | Community Health Centre About Quest CH...
26211,www.elkriveralliance.ca,elkriveralliance.ca,100,100,200,0,,,Elk River Alliance About About Team Board of D...
26212,ctvlionstelethon.ca,ctvlionstelethon.ca,100,100,403,0,,,Forbidden
26213,www.howickunitedchurch.ca,howickunitedchurch.ca,100,100,200,0,,,Welcome to Howick United Church Website We wou...


In [18]:
# merge back
df1 = df.merge(result_df,on='website',how='left')
df1

Unnamed: 0,BN,website,redirected_domain,ratio,partial_ratio,status_code,website_for_sale,meta_description,keywords,page_text
0,106693120RR0001,www.albertadebate.com,albertadebate.com,100.0,100.0,200.0,0.0,,,Home
1,118799154RR0001,www.barbercollins.com,barbercollins.ca,91.0,94.0,200.0,0.0,,,Home Home Home Home Home Home Home Home Home H...
2,119033702RR0001,www.marmorahistory.ca,marmorahistory.ca,100.0,100.0,429.0,0.0,,,429 Too Many Requests 429 Too Many Requests Pl...
3,119232957RR0001,www.winnipegfiremuseum.ca,winnipegfiremuseum.ca,100.0,100.0,401.0,0.0,,,401 Unauthorized Unauthorized This server coul...
4,805695517RR0001,www.thepinkshields.org,thepinkshields.org,100.0,100.0,200.0,0.0,,,The Pink Shield Charitable Foundation The Pink...
...,...,...,...,...,...,...,...,...,...,...
33664,894396340RR0001,geocities.com/the kwia,yahoo.com,45.0,44.0,200.0,0.0,"Latest news coverage, email, free stock quotes...","yahoo, yahoo home page, yahoo homepage, yahoo ...","Yahoo | Mail, Weather, Search, Politics, News,..."
33665,808165484RR0001,ministerionuevatierra.org,ministerionuevatierra.org,100.0,100.0,200.0,0.0,,brooks new land ministriesbrooks new land mini...,Ministerio Nueva Tierra Copyright © 2018 · All...
33666,119229938RR0004,www.calgary.anglican.ca,calgary.anglican.ca:443,90.0,100.0,200.0,0.0,Our Diocesan Vision Statement: - To be a chur...,"anglican, christian, missional, calgary, alber...","Anglican Diocese of Calgary | Calgary, AB Sear..."
33667,831632641RR0001,www.pilgrimbaptist.com,pilgrimchurch.ca,53.0,56.0,200.0,0.0,We look like Vancouver. Find your people and s...,"church, anabaptist, baptist, vancouver, eastva...",PILGRIM CHURCH - Home GET CONNECTED NEW HERE? ...


In [19]:
df1.to_csv('britt.csv.zip',
           index=False,
           quoting=2,
          compression='zip')