# Web Scraping the Daily Mail Archives with Selenium Firefox

Author: Adil Khan

In [1]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time
import os
import re
import pandas as pd
from tqdm import tqdm_notebook as tqdm
from nltk.sentiment.vader import SentimentIntensityAnalyzer



### Create browser profile

In [2]:
## Get profile class
from selenium.webdriver.firefox.firefox_profile import FirefoxProfile
## get the Firefox profile object
firefoxProfile = FirefoxProfile()
## Disable CSS
firefoxProfile.set_preference('permissions.default.stylesheet', 2)
## Disable images
firefoxProfile.set_preference('permissions.default.image', 2)
## Disable JavaScript
firefoxProfile.set_preference('javascript.enabled', False)
## Disable Flash
firefoxProfile.set_preference('dom.ipc.plugins.enabled.libflashplayer.so','false')
## Adblock path
adblock ='/Users/Adil/Library/Application Support/Firefox/Profiles/3if0ne0t.default/extensions/{d10d0bf8-f5b5-c8b4-a8b2-2b9879e08c5d}.xpi'

### Input date

In [3]:
year ='2017'
month ='01'
day = '05'

path = 'http://www.dailymail.co.uk/home/sitemaparchive/day_'+year+month+day+'.html'

### Define functions

In [4]:
def openbrowser():
    global browser
    browser = webdriver.Firefox(firefoxProfile)
    browser.install_addon(adblock, temporary=True)
    time.sleep(10)
    browser.switch_to_window(browser.window_handles[0])
    browser.get(path)


def sentiment(text):   
    sid = SentimentIntensityAnalyzer()
    return [sid.polarity_scores(sentence)['compound'] for sentence in text]

numReg = re.compile(r'\d+')

name = 'Dailymail_'+ numReg.search(path).group() +'.csv'
def save():
    df.to_csv(name)

### Open browser

In [5]:
openbrowser()

### Get list of articles

In [6]:
x=browser.find_element_by_xpath('//*[@id="content"]/div[1]/div[1]/ul[2]')
titles=x.text.split('\n')
num_titles = len(titles)

### Create Database. Perform sentiment analysis

In [7]:
df = pd.DataFrame({'Title':titles,'Sentiment':sentiment(titles), 'Shares':None,'Comments':None, 'Genre':None, 'Author':None})

### Save dataframe to csv

In [None]:
save()

### Open link i, download metadata, go back

In [None]:
#Extract the first 1000 articles or all articles, whichever is lesser
for i in tqdm(range(min(1000,num_titles))):
    g,a,s,c = [None]*4

#Click on link i. If it is less than 15 characters, then pass
    try:
        browser.find_element_by_partial_link_text(titles[i][5:15]).click()
    except:
        pass

#Extract genre if available
    try:
        g = browser.find_element_by_xpath('//span[@class="link-wocc linkro-wocc"]').text
    except:
        g=None    
        
#Extract author name if available
    try:
        a = browser.find_element_by_class_name('author').text
    except:
        a=None    

#Extract share count if available
    try:
        shares = browser.find_element_by_class_name('share-count').text
        s=int(numReg.search(shares).group())
    except:
        s=None
#Extract comment count if available
    try:
        comments = browser.find_element_by_class_name('count-number').text
        c=int(numReg.search(comments).group())
    except:
        c=None
        
#Insert into dataframe
    df['Genre'].iloc[i]=g
    df['Author'].iloc[i]=a
    df['Shares'].iloc[i]=s
    df['Comments'].iloc[i]=c
    
#Go back or reload page if connection has failed
    try:
        browser.back()
    except:
        openbrowser()
        
#Save after every 25 iterations
    if i%25==0:
        save()
save()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)
