In [1]:
import os 
import requests
from bs4 import BeautifulSoup
import time
import numpy as np
import datetime
from urllib.request import urlparse

In [2]:
# Output directory as a relative path; it is resolved against the current
# working directory of the process, not against the location of this file.
# (Taking os.path.dirname of the quoted string '__file__' always yields '',
# so spelling the resulting value out directly is equivalent and clearer.)
output_path = '../SERP/output/'

In [6]:
def _random_sleep(minimum=3, maximum=10, sd=1):
        """
        sets a random time to sleep between requests, drawing random values from an exponential distribution
        if no parameters are set, the values [3, 10, 1] have been pre-specified

        :param minimum: the min time of the timer <float>
        :param maximum: the max time of the timer <float>
        :param sd: the standard deviation <float>
        :return: time.sleep( value drawn )
        """
        value = minimum + sd * np.random.exponential()
        if value > maximum:
            value = maximum + np.random.uniform(0, 1)
        time.sleep(value)

In [8]:
def _make_request(url, method, **kwargs):
        """
        sends a request to Google and returns the html code as bs4 element

        :param url: the url to which the request will be sent
        :param method: the HTTP method ('get' or 'post')
        :param kwargs: any extra arguments passed to the request builder
        :return: bs4.element
        """
        if method == 'post':
            response = requests.post(url, **kwargs)
        else:
            response = requests.get(url, **kwargs)

        # check if the response contains html or text:
        if 'text' or 'html' in response.headers['Content-Type']:
            content = response.content
            response.close()
            return content
        else:
            # THIS IS A GOOD PLACE TO ADD A CATCH EXCEPTIONS!!
            response.close()

In [9]:
def _get_page_results(html):
    """
    Gets the html as input and returns a list of urls that are to be fund on the given page
    """
    url_list = []
    soup = BeautifulSoup(html, 'lxml')
    for tag in soup.findAll('div', attrs={'class':'g'}):
        try:
            raw_url = tag.find('a').get('href').split('url?q=')[1]
            real_url = urlparse(raw_url)[1]
            url_list.append(real_url)
        except:
            pass
    return(url_list)

In [10]:
def _search_me(keyword, url):
    url_list = []
    counter = 0
    while url not in url_list:
        search_url = 'https://www.google.com/search?q={}&start={}'.format(keyword, counter * 10)
        html = _make_request(search_url, method = 'get')
        for link in _get_page_results(html):
            url_list.append(link)
            if link == url:
                rank = len(url_list)
        print(len(url_list))
        _random_sleep()
        counter += 1
        print('page: {}'.format(counter))
    print('{} ranks number {} for {}'.format(url, rank, keyword))    
    return(url_list, rank)

In [11]:
# Query passed to _search_me as the search keyword.
search_term = 'how to buy property in Budapest'
# Host name whose position in the results is looked up; it is compared for
# equality with the network location of each result link.
target_site = 'www.budapestestate.com'


In [12]:
# Run the search; the returned (url_list, rank) tuple is the cell's displayed
# output. This performs live HTTP requests with random pauses between pages.
_search_me(search_term, target_site)

10
page: 1
20
page: 2
30
page: 3
www.budapestestate.com ranks number 29 for how to buy property in Budapest


(['www.globalpropertyguide.com',
  'www.budapestbylocals.com',
  'www.towerbudapest.com',
  'helpers.hu',
  'www.gatewayproperties.co.uk',
  'www.rightmove.co.uk',
  'propertiesinbudapest.com',
  'britishexpats.com',
  'www.engelvoelkers.com',
  'www.mybudapesthome.com',
  'www.portfolio.hu',
  'www.portfolio.hu',
  'www.expat.com',
  'www.tower-investments.com',
  'dh.hu',
  'realestate.hu',
  'www.nytimes.com',
  'www.hungarianhouses.com',
  'www.easyexpat.com',
  'www.capitalrealestate.hu',
  'www.capitalrealestate.hu',
  'clarkeandwhite.com',
  'www.thenational.ae',
  'www.realtor.com',
  'www.ft.com',
  'www.justlanded.com',
  'www.flottinvest.hu',
  'gurdinc.com',
  'www.budapestestate.com',
  'www.irishtimes.com'],
 29)