In [1]:
# Imports 
from googleapiclient.discovery import build
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
from itertools import product

import urllib
from requests_html import HTML
from requests_html import HTMLSession

#### Create GoogleScraper Object

In [2]:
# Config Information 
SECRET_KEY = "AIzaSyDjwQM14wUBM40e5xtL7Df3Qe4my03iuTA"
CSE_ID = "e340b4c39a82947c4"
HEADERS = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}

In [3]:
def count_indexed_pages(url):
    
    query = urllib.parse.quote_plus(url)


    # Get the response
    try:
        session = HTMLSession()
        response = session.get(u"https://www.google.com/search?q=site%3A" + url)

    except requests.exceptions.RequestException as e:
        print(e)

    # Parse response
    string = response.html.find("#result-stats", first=True).text
    indexed = int(string.split(' ')[1].replace(',',''))

    return indexed

In [4]:
class GoogleScraper: 
    
    def __init__(self, api_key: str, cse_id: str, headers: dict): 
        
        # Grab args
        self.api_key = api_key
        self.cse_id = cse_id
        self.headers = headers
        
    def search(self, keyword: str, skip_links: list = None): 
        
        """
        Run the google search and get list of top 70 results.
        -- 
        keyword: str -> phrase to search on google 
        skip_links: list -> list of links that we should skip such as 
        """
        
        # Setup service operator 
        service = build("customsearch", "v1", developerKey = self.api_key)
        
        # Make request 
        res_list = []
        for start in [1, 11, 21, 31, 41, 51, 61]:
            res = service.cse().list(q = keyword, cx = self.cse_id, start = start).execute()
            results = [{k:v for k,v in res.items() if k in ['title', 'link', 'snippet']} for res in res['items']]
            res_list.append(results)
            
        # Flatten List
        res_list = [item for sublist in res_list for item in sublist]
        
        # Remove all that contain a skip link
        res_list = [page for page in res_list if not any(substring in page['link'] for substring in skip_links)]
        
        
        return res_list

    def make_request(self, url): 
        
        """
        Makes the GET request and uses BS4 to parse it
        -- 
        url: string of the url to search
        """
        
        # Make request with proper headers 
        req = requests.get(url, headers = self.headers)
        
        # Parse via BS4
        soup = BeautifulSoup(req.content, 'html.parser')
        
        return soup
    
    def count_indexed_pages(url):
    
        query = urllib.parse.quote_plus(url)


        # Get the response
        try:
            session = HTMLSession()
            response = session.get(u"https://www.google.com/search?q=site%3A" + url)

        except requests.exceptions.RequestException as e:
            print(e)

        # Parse response
        string = response.html.find("#result-stats", first=True).text
        indexed = int(string.split(' ')[1].replace(',',''))

        return indexed

    def get_word_count_info(self, soup): 
        
        """
        Compute all the word count information we want from the parsed soup object. 
        Note that it removes any text in a header tag or footer tag
        """
    
        # Grab body text 
        body_text = re.sub(r'\s+', ' ', soup.find('body').text).rstrip().lstrip()
        body_text = re.sub(r'(?<=[.,?!])(?=[^\s])', r' ', body_text).lower()
        self.body_text = body_text.replace('\n','')

        # Grab header text
        try: 
            header = soup.find('header')
            header_text = re.sub(r'\s+', ' ', header.text).rstrip().lstrip()
            header_text = re.sub(r'(?<=[.,?!])(?=[^\s])', r' ', header_text).lower()
            header_text = header_text.replace('\n','')
            header_words = len(header_text.split())
        except: 
            header_text = ''
            header_words = 0


        # Grab footer text 
        try: 
            footer = soup.find('footer')
            footer_text = re.sub(r'\s+', ' ', footer.text).rstrip().lstrip()
            footer_text = re.sub(r'(?<=[.,?!])(?=[^\s])', r' ', footer_text).lower()
            footer_text = footer_text.replace('\n','')
            footer_words = len(footer_text.split())
        except: 
            footer_text = ''
            footer_words = 0


        # Remove header/footer text
        text = body_text.replace(header_text, '')
        text = text.replace(footer_text, '').rstrip().lstrip()

        # Compute word counts
        total_words = len(text.split())

        # Compute keyword frequency
        keyword_freq = len(re.findall(r'car accident', text))
        lawyer_freq = len(re.findall(r'lawyer|attorney', text))
        city_freq = len(re.findall('los angeles', body_text))

        # Combine the keyword into single word 
        text = text.replace('car accident', 'caraccident')

        # Find first spot of keyword
        first_keyword_spot = text.split().index('caraccident') + 1 if 'caraccident' in text.split() else 'NA'
        keyword_in_200 = first_keyword_spot <= 200 if 'caraccident' in text.split() else False

        word_count_info = {'total_words': total_words, 
                           'header_words': header_words, 
                           'footer_words': footer_words, 
                           'keyword_count': keyword_freq, 
                           'lawyer_attorney_count': lawyer_freq, 
                       'city_count': city_freq, 
                       'first_keyword_spot': first_keyword_spot, 
                       'keyword_in_200': keyword_in_200}
    
    
        return word_count_info
    
    def get_alt(self, img): 
        try: 
            return img['alt']
        except: 
            return 'NO ALT' 
    
    def get_tag_info(self, soup): 
    
        # Title tag info 
        title = soup.find('title').text
        keyword_in_title = 'car accident' in title.lower()

        # H1 info
        h1 = soup.find_all('h1')
        h1_count = len(h1)
        h1_text = [el.text for el in h1]
        keyword_in_h1 = ['car accident' in el.lower() for el in h1_text]

        # H2 info
        h2 = soup.find_all('h2')
        h2_count = len(h2)
        h2_text = [el.text for el in h2]
        keyword_in_h2 = ['car accident' in el.lower() for el in h2_text]

        # H3 info
        h3 = soup.find_all('h3')
        h3_count = len(h3)
        h3_text = [el.text for el in h3]
        keyword_in_h3 = ['car accident' in el.lower() for el in h3_text]

        # Image info
        images = soup.find_all('img')
        image_count = len(images)
        alt_text = [self.get_alt(img) for img in images]
        keyword_in_alt_text = ['car accident' in el.lower() for el in alt_text]

        # Combine 
        tag_info = {'title': title, 'key_in_title': keyword_in_title, 
                    'h1_count': h1_count, 'h1_text': h1_text, 'key_in_h1': keyword_in_h1, 
                    'h2_count': h2_count, 'h2_text': h2_text, 'key_in_h2': keyword_in_h2, 
                    'h3_count': h3_count, 'h3_text': h3_text, 'key_in_h3': keyword_in_h3, 
                    'image_count': image_count, 'image_alt_text': alt_text, 
                    'key_in_alt_text': keyword_in_alt_text
                   }

        return tag_info
    
    def get_link_info(self, soup): 
        
        # Get word count info 
        info = self.get_word_count_info(soup)
        
        # Get tag info
        tag_info = self.get_tag_info(soup)
        info.update(tag_info)
        
        # Check if wordpress site
        wp_script_tags = soup.find_all('link', {'href': re.compile(r'wp-content')})
        info.update({'is_wordpress': bool(len(wp_script_tags) > 0)})
        
        # Check for table of contents
        info.update({'has_table_of_contents': bool('table of contents' in self.body_text)})
        
        
        
        return info

#### Grab Links

In [1]:
k_list = ['Personal Injury Lawyer',
          'Car Accident Lawyer', 
          'Motorcycle Accident Lawyer', 
          'Truck Accident Lawyer', 
          'Medical Malpratice Lawyer', 
          'Wrongful Death Lawyer']
city_list = ['Los Angeles', 'New York', 'Miami', 'Washington DC', 'Denver', 'Chicago']
keyword_list = [f'{k} {city}' for k in k_list for city in city_list]

In [2]:
keyword_list

['Personal Injury Lawyer Los Angeles',
 'Personal Injury Lawyer New York',
 'Personal Injury Lawyer Miami',
 'Personal Injury Lawyer Washington DC',
 'Personal Injury Lawyer Denver',
 'Personal Injury Lawyer Chicago',
 'Car Accident Lawyer Los Angeles',
 'Car Accident Lawyer New York',
 'Car Accident Lawyer Miami',
 'Car Accident Lawyer Washington DC',
 'Car Accident Lawyer Denver',
 'Car Accident Lawyer Chicago',
 'Motorcycle Accident Lawyer Los Angeles',
 'Motorcycle Accident Lawyer New York',
 'Motorcycle Accident Lawyer Miami',
 'Motorcycle Accident Lawyer Washington DC',
 'Motorcycle Accident Lawyer Denver',
 'Motorcycle Accident Lawyer Chicago',
 'Truck Accident Lawyer Los Angeles',
 'Truck Accident Lawyer New York',
 'Truck Accident Lawyer Miami',
 'Truck Accident Lawyer Washington DC',
 'Truck Accident Lawyer Denver',
 'Truck Accident Lawyer Chicago',
 'Medical Malpratice Lawyer Los Angeles',
 'Medical Malpratice Lawyer New York',
 'Medical Malpratice Lawyer Miami',
 'Medical M

In [6]:
# Init scraper
scraper = GoogleScraper(SECRET_KEY, CSE_ID, HEADERS)
skip_links = ['yelp.com', 'forbes.com', 
              'expertise.com', 'findlaw.com', 
              'superlawyers.com', 'martindale.com', 
              'justia.com', ]

for keyword in keyword_list[12:]: 
    
    # Get top 70 links
    res_list = scraper.search(keyword, skip_links)
    
    data = []
    for idx, res in enumerate(res_list[:10]): 

        # Make the get request
        soup = scraper.make_request(res['link'])

        # Parse information
        info = scraper.get_link_info(soup)
        info.update({'rank': idx + 1, 'url': res['link']})

        data.append(info)

        print(f"Page {idx+1} of {10} complete")
        
    # Convert to df
    df = pd.DataFrame(data)
    df['keyword'] = keyword
    df['performer'] = df['rank'] <= 10
    cols = ['keyword', 'rank', 'performer', 'url']
    df = df[cols + [col for col in df if col not in cols]]
    df.to_csv(f'{keyword}.csv')
    print(f'--{keyword} COMPLETE--')

Page 1 of 10 complete
Page 2 of 10 complete
Page 3 of 10 complete
Page 4 of 10 complete
Page 5 of 10 complete
Page 6 of 10 complete
Page 7 of 10 complete
Page 8 of 10 complete
Page 9 of 10 complete
Page 10 of 10 complete
--Motorcycle Accident Lawyer Los Angeles COMPLETE--
Page 1 of 10 complete
Page 2 of 10 complete
Page 3 of 10 complete
Page 4 of 10 complete
Page 5 of 10 complete
Page 6 of 10 complete
Page 7 of 10 complete
Page 8 of 10 complete
Page 9 of 10 complete
Page 10 of 10 complete
--Motorcycle Accident Lawyer New York COMPLETE--
Page 1 of 10 complete
Page 2 of 10 complete
Page 3 of 10 complete
Page 4 of 10 complete
Page 5 of 10 complete
Page 6 of 10 complete
Page 7 of 10 complete
Page 8 of 10 complete
Page 9 of 10 complete
Page 10 of 10 complete
--Motorcycle Accident Lawyer Miami COMPLETE--
Page 1 of 10 complete
Page 2 of 10 complete
Page 3 of 10 complete
Page 4 of 10 complete
Page 5 of 10 complete
Page 6 of 10 complete
Page 7 of 10 complete
Page 8 of 10 complete
Page 9 of 10 c

AttributeError: 'NoneType' object has no attribute 'text'

In [25]:
len(city_list)

6

#### Extract Information

In [15]:
data = []
for idx, res in enumerate(res_list[:10]): 
    
    # Make the get request
    soup = scraper.make_request(res['link'])
    
    # Parse information
    info = scraper.get_link_info(soup)
    info.update({'rank': idx + 1, 'url': res['link']})
    
    data.append(info)
    
    print(f"Page {idx+1} of {10}")

Page 0 of 10
Page 1 of 10
Page 2 of 10
Page 3 of 10
Page 4 of 10
Page 5 of 10
Page 6 of 10
Page 7 of 10
Page 8 of 10
Page 9 of 10


In [17]:
df = pd.DataFrame(data)
df['keyword'] = keyword
df['performer'] = df['rank'] <= 10
cols = ['keyword', 'rank', 'performer', 'url']
df = df[cols + [col for col in df if col not in cols]]

In [10]:
data

NameError: name 'data' is not defined

In [288]:
df.to_excel('test_matador_scrape.xlsx', index = False)