In [1]:
# Imports 
import json
import os
import re
import urllib
from itertools import product
from typing import List

import pandas as pd
import requests
from bs4 import BeautifulSoup
from googleapiclient.discovery import build
from requests.auth import HTTPBasicAuth
from requests_html import HTML, HTMLSession

#### Creating the Scraper

In [2]:
class Scraper: 
    """
    Small wrapper around the Google Custom Search client.

    Holds the API credentials plus the HTTP headers that are reused when the
    individual result pages are downloaded later on.
    """

    # 1-based `start` offsets requested from the API, one per page of results
    PAGE_STARTS = (1, 11, 21, 31, 41, 51, 61)

    # Fields copied from each raw search item; everything else is discarded
    RESULT_FIELDS = ('title', 'link', 'snippet')

    def __init__(self, api_key: str, cse_id: str, headers: dict): 
        """
        api_key: str -> developer key passed to the Custom Search client
        cse_id: str -> identifier of the custom search engine to query
        headers: dict -> HTTP headers stored for later page downloads
        """

        # Grab args
        self.api_key = api_key
        self.cse_id = cse_id
        self.headers = headers

    def search(self, keyword: str, skip_links: list = None): 

        """
        Run the google search and get list of top 70 results.
        -- 
        keyword: str -> phrase to search on google 
        skip_links: list -> substrings (e.g. 'yelp.com'); any result whose link
                            contains one of them is dropped. None or an empty
                            list keeps every result.
        -- 
        Returns a flat list of dicts holding the 'title', 'link' and 'snippet'
        of each remaining result, in the order the API returned them.
        """

        # Treat "no skip list" as an empty one so the filter below never iterates None
        skip_links = skip_links or []

        # Setup service operator 
        service = build("customsearch", "v1", developerKey = self.api_key)

        # Make one request per page and collect the trimmed items
        res_list = []
        for start in self.PAGE_STARTS:
            response = service.cse().list(q = keyword, cx = self.cse_id, start = start).execute()

            # A response without 'items' means there are no further results;
            # stop paging instead of failing with a KeyError
            items = response.get('items')
            if not items:
                break

            res_list.extend(
                {k: v for k, v in item.items() if k in self.RESULT_FIELDS}
                for item in items
            )

        # Remove all that contain a skip link
        return [page for page in res_list
                if not any(substring in page['link'] for substring in skip_links)]

In [3]:
class SearchResult: 
    """
    On-page statistics for a single search-result URL.

    The page is downloaded once in the constructor; the `get_*` methods then
    derive word counts, keyword counts and tag information from the parsed
    document. `get_word_counts` must run before `get_keyword_counts` because it
    populates the text attributes the latter reads (`get_all_info` does this
    in the right order).
    """

    # Seconds to wait for the page download before giving up
    REQUEST_TIMEOUT = 30

    def __init__(self, scraper: "Scraper", url: str, lawyer_type: str, city: str): 
        """
        scraper: Scraper -> only its `headers` attribute is used, for the download
        url: str -> page to download and analyse
        lawyer_type: str -> practice area, e.g. 'Car Accident' (stored lower-cased)
        city: str -> city name, e.g. 'Denver' (stored lower-cased)
        """

        # Grab attributes
        self.url = url
        self.lawyer_type = lawyer_type.lower()
        self.lawyer_type_one_word = self.lawyer_type.replace(" ", "")
        self.city = city.lower()
        self.keyword = (self.lawyer_type + ' Lawyer ' + self.city).lower()

        # Make request to get bs4 object; the timeout keeps one unresponsive
        # site from stalling the whole run
        req = requests.get(self.url, headers = scraper.headers, timeout = self.REQUEST_TIMEOUT)
        self.soup = BeautifulSoup(req.content, 'html.parser')

    @staticmethod
    def _clean_text(raw_text: str) -> str:
        """Collapse whitespace, lower-case, and add a space after . , ? ! when one is missing."""
        collapsed = re.sub(r'\s+', ' ', raw_text).strip().lower()
        return re.sub(r'(?<=[.,?!])(?=[^\s])', r' ', collapsed)

    def _section_text(self, tag_name: str) -> str:
        """Return the cleaned text of the first `tag_name` element, or '' when the page has none."""
        element = self.soup.find(tag_name)
        if element is None:
            return ""
        return self._clean_text(element.text)

    def _tag_texts(self, tag_name: str) -> list:
        """Return the stripped, newline-free text of every `tag_name` element."""
        return [el.text.strip().replace('\n', '') for el in self.soup.find_all(tag_name)]

    def get_word_counts(self): 
        """
        Count the words in the page body, header and footer.

        Stores `text` (body with the header and footer text removed),
        `body_text`, `header_text` and `footer_text` on the instance and
        returns a dict with 'total_words', 'header_words' and 'footer_words'.
        Raises ValueError when the document has no <body> element.
        """

        # Body text
        body = self.soup.find('body')
        if body is None:
            raise ValueError(f'No <body> element found in {self.url}')
        body_text = self._clean_text(body.text)

        # Header / footer text ('' when the page has no such element)
        header_text = self._section_text('header')
        footer_text = self._section_text('footer')

        # Remove header/footer text
        text = body_text.replace(header_text, '').replace(footer_text, '').strip()

        # Compute word counts 
        word_count_info = {'total_words': len(text.split()),
                           'header_words': len(header_text.split()),
                           'footer_words': len(footer_text.split())}

        # Assign stuff
        self.text = text
        self.body_text = body_text
        self.header_text = header_text
        self.footer_text = footer_text

        return word_count_info

    def get_keyword_counts(self): 

        """
        Computes the keyword counts in various spots within a webpage

        Reads `self.text` and `self.body_text`, so `get_word_counts` must have
        been called first. The instance is not modified, so repeated calls
        return the same values. 'first_keyword_spot' is the 1-based position of
        the first word containing the lawyer type, and both it and
        'keyword_in_200_words' are the string 'NA' when the lawyer type does
        not occur.
        """

        # The city and lawyer type are plain phrases, not patterns: escape them
        # so characters such as '.', '(' or '+' are matched literally
        city_count = len(re.findall(re.escape(self.city), self.body_text))

        # Lawyer/attorney count 
        lawyer_count = len(re.findall(r'lawyer|attorney', self.text))

        # Type count
        type_count = len(re.findall(re.escape(self.lawyer_type), self.text))

        # First keyword occurrence: join the phrase into one token on a local
        # copy so the position can be found in the word list
        joined_text = self.text.replace(self.lawyer_type, self.lawyer_type_one_word)

        # Substring match per token: the keyword is usually glued to
        # punctuation ("caraccident."), which an exact-token lookup would miss
        first_keyword_spot = next(
            (position for position, token in enumerate(joined_text.split(), start = 1)
             if self.lawyer_type_one_word in token),
            'NA',
        )
        keyword_in_200 = 'NA' if first_keyword_spot == 'NA' else first_keyword_spot < 200

        # Output as dict 
        keyword_count_info = {'keyword_count': type_count, 'city_count': city_count, 
                              'lawyer_attorney_count': lawyer_count, 'first_keyword_spot': first_keyword_spot, 
                              'keyword_in_200_words': keyword_in_200}
        return keyword_count_info

    def get_alt_text(self, img): 
        """Return the image's `alt` attribute, or 'NO ALT' when it has none."""
        try: 
            return img['alt']
        except (KeyError, TypeError): 
            return 'NO ALT' 

    def get_tag_info(self): 
        """
        Collect the title, h1-h3 headings and image alt texts, and flag which
        of them contain the lawyer type.

        The 'key_in_h*' and 'key_in_alt_text' values are lists of booleans, one
        per element. A page without a <title> yields an empty title string.
        """

        # Title tag 
        title_tag = self.soup.find('title')
        title = title_tag.text.strip().replace('\n', '') if title_tag is not None else ''
        tag_info = {'title': title, 'key_in_title': self.lawyer_type in title.lower()}

        # H1 / H2 / H3 share the same three statistics
        for level in ('h1', 'h2', 'h3'):
            texts = self._tag_texts(level)
            tag_info[f'{level}_count'] = len(texts)
            tag_info[f'{level}_text'] = texts
            tag_info[f'key_in_{level}'] = [self.lawyer_type in el.lower() for el in texts]

        # Image 
        images = self.soup.find_all('img')
        alt_text = [self.get_alt_text(img) for img in images]
        tag_info['image_count'] = len(images)
        tag_info['image_alt_text'] = alt_text
        tag_info['key_in_alt_text'] = [self.lawyer_type in el.lower() for el in alt_text]

        return tag_info

    def get_all_info(self): 
        """
        Run every extractor and merge the results into one flat dict, adding
        the url, an 'is_wordpress' flag and a 'has_table_of_contents' flag.
        """

        info = {'url':  self.url}

        # Word Counts (also populates the text attributes used below)
        info.update(self.get_word_counts())

        # Keyword counts 
        info.update(self.get_keyword_counts())

        # Tags
        info.update(self.get_tag_info())

        # Check if wordpress site: any <link> whose href mentions wp-content
        wp_link_tags = self.soup.find_all('link', {'href': re.compile(r'wp-content')})
        info.update({'is_wordpress': len(wp_link_tags) > 0})

        # Check for table of contents
        info.update({'has_table_of_contents': 'table of contents' in self.body_text})

        return info

#### Run Scraper

In [4]:
# Search terms: every practice area can be paired with every city.
type_list = [
    'Personal Injury',
    'Car Accident',
    'Motorcycle Accident',
    'Truck Accident',
    'Medical Malpractice',
    'Wrongful Death',
]
city_list = [
    'Los Angeles',
    'New York',
    'Miami',
    'Washington DC',
    'Denver',
    'Chicago',
]
# The full search phrase has the form '<type> Lawyer <city>'.

In [34]:
# Scraper Config
# The Google API key is a credential and must not live in the notebook: export
# GOOGLE_API_KEY in the environment before starting the kernel. Any key that
# was previously saved in this file should be treated as leaked and revoked.
SECRET_KEY = os.environ.get('GOOGLE_API_KEY', '')
if not SECRET_KEY:
    print('WARNING: GOOGLE_API_KEY is not set; Google Custom Search requests will not be authorised.')

# Identifier of the custom search engine to query
CSE_ID = "e340b4c39a82947c4"

# Browser-like User-Agent sent when downloading the result pages
HEADERS = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}

# Results whose link contains any of these substrings are dropped
skip_links = ['yelp.com', 'forbes.com', 'expertise.com', 'findlaw.com', 
              'superlawyers.com', 'martindale.com', 'justia.com', 'indeed.com']

In [45]:
# Main scraping loop
# For each (lawyer type, city) pair: search, scrape every result page, and
# write one CSV named after the keyword.

# The credentials and headers never change, so one client serves every search
scraper = Scraper(SECRET_KEY, CSE_ID, HEADERS)

# NOTE: the [5:6] slice restricts this run to the last entry of type_list
for lawyer_type in type_list[5:6]: 
    for city in city_list: 

        # Generate keyword
        keyword = lawyer_type + ' Lawyer ' + city

        # Get list of searchable results 
        result_list = scraper.search(keyword, skip_links = skip_links)

        # Iterate through each result
        data = []
        for idx, result in enumerate(result_list): 
            url = result['link']
            try: 
                res = SearchResult(scraper = scraper, url = url, city = city, lawyer_type = lawyer_type)
                res_vals = res.get_all_info()
            except Exception as exc: 
                # Best effort: one bad page must not abort the run, but report
                # it so failures are visible (Ctrl-C still interrupts)
                print(f'Skipping {url}: {exc!r}')
                continue

            # Record the position in the result list as the rank, so a skipped
            # page does not shift the rank of every page after it
            data.append({'rank': idx, **res_vals})
            if idx % 10 == 0: 
                print(idx)

        # Create dataframe (keep the 'rank' column even when nothing was scraped)
        df = pd.DataFrame(data) if data else pd.DataFrame(columns = ['rank'])
        df.insert(0, 'keyword', keyword)
        df.to_csv(keyword.lower().replace(" ", "_") + ".csv", index = False)

        print(f'-- {keyword} COMPLETE -- ')

#### Moz API Connection

In [52]:
def divide_chunks(l, n):
    """
    Lazily split `l` into consecutive slices of at most `n` items.

    l -> any sliceable sequence with a length (list, str, tuple, ...)
    n -> chunk size, used as the step between slice starts

    Yields the slices in order; the final one may be shorter than `n`.
    """
    chunk_starts = range(0, len(l), n)
    yield from (l[start:start + n] for start in chunk_starts)

In [202]:
# Display the configured cities; the list shown below is this cell's saved output.
city_list

['Los Angeles', 'New York', 'Miami', 'Washington DC', 'Denver', 'Chicago']

In [481]:
# Read one scraped results file and batch its URLs for the metrics requests.
FILE_PATH = 'data/medical_malpractice/medical_malpractice_lawyer_los_angeles.csv'
df = pd.read_csv(FILE_PATH)

# Plain Python list of the page URLs, in file order
url_list = df.loc[:, 'url'].tolist()

# Groups of at most 50 URLs, one group per request
url_list_chunked = [*divide_chunks(url_list, 50)]

In [482]:
# Config 
# Moz credentials are read from the environment (export MOZ_API_ID and
# MOZ_SECRET_KEY before starting the kernel). Any credentials that were
# previously saved in this file should be treated as leaked and rotated.
MOZ_API_ID = os.environ.get('MOZ_API_ID', '')
MOZ_SECRET_KEY = os.environ.get('MOZ_SECRET_KEY', '')
auth = (MOZ_API_ID, MOZ_SECRET_KEY)

# Request params and auth
url = 'https://lsapi.seomoz.com/v2/url_metrics'

# Metrics kept from each result: DA, PA, spam score, backlinks, nofollow backlinks
keys = ['domain_authority', 'page_authority', 'spam_score', 
        'pages_to_root_domain', 'nofollow_pages_to_root_domain']

# Iterate through each chunk, collecting one dict per URL; the dataframe is
# built once at the end instead of being re-copied by pd.concat every pass
metric_rows = []
for url_chunk in url_list_chunked: 

    # Make request
    payload = json.dumps({'targets': url_chunk})
    req = requests.post(url, data = payload, auth = auth, timeout = 60)

    # Fail loudly on an HTTP error instead of a confusing KeyError on 'results'
    req.raise_for_status()
    res = req.json()['results']

    metric_rows.extend({k: v for k, v in item.items() if k in keys} for item in res)

chunk_df = pd.DataFrame(metric_rows)

In [483]:
# Remove every Moz column left over from an earlier run so the fresh metrics
# replace them instead of producing duplicate column names. errors='ignore'
# lets this also run on a newly scraped file that has none of these columns.
drop_cols = ['domain_authority', 'page_authority', 'spam_score', 
             'pages_to_root_domain', 'nofollow_pages_to_root_domain']
df = df.drop(columns = drop_cols, errors = 'ignore')

# The two frames are joined by row position, so a count mismatch would attach
# metrics to the wrong URLs; stop before the source file is overwritten
if len(df) != len(chunk_df):
    raise ValueError(f'Row count mismatch: {len(df)} scraped rows vs {len(chunk_df)} Moz results')

combined_df = pd.concat([df.reset_index(drop=True), chunk_df.reset_index(drop=True)], axis = 1)

# Put the identifying and Moz columns first, followed by everything else
front_cols = ['keyword', 'rank', 'url', 'domain_authority', 'page_authority', 'spam_score', 'pages_to_root_domain', 
             'nofollow_pages_to_root_domain']
front_cols = [col for col in front_cols if col in combined_df]
cols = front_cols + [col for col in combined_df if col not in front_cols]

# Overwrites the file the data was loaded from
combined_df[cols].to_csv(FILE_PATH, index = False)