# Search results scraping

Getting the first few search results for a topic. Links have been changed for anonymity purposes.

# Import

In [1]:
import pandas as pd
import numpy as np
import time
from selenium.webdriver.common.keys import Keys
import re

# Pandas display options: show up to 6000 columns / rows before truncating
pd.options.display.max_columns = 6000
pd.options.display.max_rows = 6000

# Show the output of every top-level expression in a cell, not only the last one
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [2]:
def add_data_from_search_element(el):
    """Collect the fields of one search result into a dict.

    el: element holding a single search result. It must provide
        ``find_element_by_class_name`` and ``find_elements_by_class_name``.

    Returns a dict with the keys:
        Title    -- text of the child element with class "st"
        Website  -- text of the first child element with class "iUh30"
        Full_URL -- ``href`` attribute of the first match for "r [href]"
        Date     -- text of the child element with class "f", or '' when
                    that lookup fails

    Failures while reading Title, Website or Full_URL are not handled here
    and propagate to the caller.
    """

    out_dict = dict(Title = el.find_element_by_class_name('st').text,
            Website = el.find_elements_by_class_name("iUh30")[0].text,
            Full_URL = el.find_elements_by_class_name("r [href]")[0].get_attribute('href'))
    try:
        out_dict['Date'] = el.find_element_by_class_name("f").text
    except Exception:
        # The date is optional, so any lookup error falls back to ''.
        # Catching Exception (not a bare except) keeps KeyboardInterrupt and
        # SystemExit working while a long scrape is running.
        out_dict['Date'] = ''

    return out_dict

In [3]:
from selenium import webdriver
# Start a Chrome session through Selenium and load the search page
driver = webdriver.Chrome()
driver.get("https://search.gr")

In [4]:
## Choose an indicative phrase to search
# The phrase is wrapped in double quotes inside the string (presumably to request an exact-phrase match)
phrase = '"οι καταναλωτές μπορούν να αναζητήσουν το αγαπημένο τους κατάστημα, να ανακαλύψουν τις ειδικά διαμορφωμένες προσφορές της Coca-Cola"'

# Conduct search

Selenium reference for locating elements: https://selenium-python.readthedocs.io/locating-elements.html

In [5]:
# Locate the search input by its class name; the phrase is typed into this element
el = driver.find_element_by_class_name('gf.gfi')

In [6]:
# Type the phrase, then press Enter; pause 3 seconds after each step
for keystrokes in (phrase, Keys.RETURN):
    el.send_keys(keystrokes)
    time.sleep(3)

# Find Elements from the publications
* Find all element titles (class name = LC20lb)
* Find all element URLs
* Find all element dates (if applicable)

Load all results (including similar ones). Then loop through the first 2 pages to get the results.

In [8]:
# Number of result pages to scrape
N_PAGES = 2

# Expand the omitted ("similar") results when the link offering them is present.
# If there is no such link, the results are used as they are.
els = driver.find_elements_by_class_name('card-section')
omitted_links = els[0].find_elements_by_partial_link_text('omitted results') if els else []
if omitted_links:
    omitted_links[0].click()
    time.sleep(3)

elements_list = []

for page in range(N_PAGES):
    # Each element with class "g" is handled as one search result
    elements = driver.find_elements_by_class_name('g')
    elements_list += [add_data_from_search_element(el) for el in elements]

    # Nothing left to load after the last requested page
    if page == N_PAGES - 1:
        break

    # Find the links for the next page (the last match is clicked);
    # stop early when the page offers none
    next_links = driver.find_elements_by_class_name("S1aj.bCSr")
    if not next_links:
        break
    next_links[-1].click()
    time.sleep(3)

# Preprocess data

Bring the data into a readable format.

## Make series

In [9]:
# Wrap the list of result dicts in a Series (one dict per entry)
publications = pd.Series(elements_list)

In [10]:
# Expand every dict into its own row, so the dict keys become the columns
publications = publications.apply(pd.Series)

In [11]:
# Keep only the part of the displayed URL before the first ' › ' separator
publications['Website'] = publications['Website'].map(lambda shown_url: re.split(' › ', shown_url)[0])

In [12]:
# (rows, columns) before duplicate links are dropped
publications.shape

(20, 4)

## Drop duplicate links

In [13]:
# Keep only the first row for each distinct Full_URL (modifies the frame in place)
publications.drop_duplicates(subset='Full_URL', keep='first', inplace=True)

In [14]:
# (rows, columns) after duplicate links have been dropped
publications.shape

(20, 4)

## Check which results come from Facebook

In [15]:
# Boolean flag: True where the Website text contains the substring 'facebook'
publications['Facebook'] = publications['Website'].str.contains('facebook')

## Get publication date

In [16]:
# Entries containing 'Rating' are not dates (presumably review ratings): blank them
has_rating = publications['Date'].str.contains('Rating')
publications.loc[has_rating, 'Date'] = ''

In [17]:
from datetime import datetime, date, timedelta

In [18]:
# Convert relative dates ("3 days ago") into absolute ones, written in the
# '%b %d, %Y' layout (e.g. "Jul 08, 2020").
# The pattern accepts both "day" and "days", so "1 day ago" is converted too.
# Entries that do not start with "<number> day(s)" are left untouched.
# NOTE: the offset is counted back from today's date, so the result is only
# right when this cell runs on the same day the results were scraped.
days_ago = publications['Date'].str.extract(r'^\s*(\d+)\s+days?\b', expand=False)
is_relative = days_ago.notna()
publications.loc[is_relative, 'Date'] = days_ago[is_relative].apply(
    lambda n_days: (date.today() - timedelta(int(n_days))).strftime('%b %d, %Y'))

In [19]:
# Empty strings mean "no date": store them as missing values (NaN) instead
publications['Date'] = publications['Date'].replace(to_replace='', value=np.nan)

In [20]:
# Parse the date strings (e.g. "Jul 8, 2020 -") into datetime objects, row by row.
# Entries that cannot be parsed keep their current value.
for i in publications.index:
    raw_date = publications.loc[i,'Date']
    try:
        publications.loc[i,'Date'] = datetime.strptime(re.sub(' -','',raw_date),'%b %d, %Y')
    except (TypeError, ValueError):
        # TypeError: the entry is not a string (e.g. NaN for a missing date)
        # ValueError: the text does not match the '%b %d, %Y' layout
        # Any other error is a real problem and is allowed to surface.
        continue

## Write to file

In [21]:
# Preview the first rows of the cleaned table
publications.head()

Unnamed: 0,Title,Website,Full_URL,Date,Facebook
0,"Jul 11, 2020 - ... οι καταναλωτές μπορούν να α...",grillmagazine.gr,https://grillmagazine.gr/2020/07/12/%CE%B7-coc...,2020-07-11 00:00:00,False
1,"Jul 8, 2020 - ... οι καταναλωτές μπορούν να αν...",www.liberal.gr,https://www.liberal.gr/economy/i-coca-colatria...,2020-07-08 00:00:00,False
2,"Jul 9, 2020 - ... οι καταναλωτές μπορούν να αν...",www.forin.gr,https://www.forin.gr/articles/article/36249/co...,2020-07-09 00:00:00,False
3,"Jul 8, 2020 - ... οι καταναλωτές μπορούν να αν...",www.documentonews.gr,https://www.documentonews.gr/article/h-coca-co...,2020-07-08 00:00:00,False
4,"Jul 9, 2020 - ... οι καταναλωτές μπορούν να αν...",www.newmoney.gr,https://www.newmoney.gr/roh/palmos-oikonomias/...,2020-07-09 00:00:00,False


In [21]:
# Export the table to 'Publications.xlsx' (relative path, so it lands in the current working directory)
publications.to_excel('Publications.xlsx')