# Web Scraping the Daily Mail Archives with Selenium Firefox

Author: Adil Khan

In [141]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import NoSuchElementException
import time
import os
import re
import calendar
import pandas as pd
from IPython.display import display, HTML
from nltk.sentiment.vader import SentimentIntensityAnalyzer

### Create browser profile

In [142]:
## Build the Firefox profile used by openbrowser(): stylesheets, images,
## JavaScript and the Flash plugin are switched off (presumably to speed up
## page loads - only the headline links are read from each page).
## Get profile class
from selenium.webdriver.firefox.firefox_profile import FirefoxProfile
## get the Firefox profile object
firefoxProfile = FirefoxProfile()
## Disable CSS
firefoxProfile.set_preference('permissions.default.stylesheet', 2)
## Disable images
firefoxProfile.set_preference('permissions.default.image', 2)
## Disable JavaScript
firefoxProfile.set_preference('javascript.enabled', False)
## Disable Flash. This is an on/off preference, so pass a real boolean like the
## JavaScript setting above; the string 'false' is a string value, not a bool.
firefoxProfile.set_preference('dom.ipc.plugins.enabled.libflashplayer.so', False)
## Adblock path. Machine-specific; referenced by the commented-out
## install_addon call in openbrowser().
adblock ='/Users/Adil/Library/Application Support/Firefox/Profiles/3if0ne0t.default/extensions/{d10d0bf8-f5b5-c8b4-a8b2-2b9879e08c5d}.xpi'

### Define filename and functions

In [143]:
# CSV file the headline dataset is read from and written to.
filename = 'DailyMail2018.csv'

# Group 2 captures the first path segment of an article URL (e.g. "sport").
# The dots of the host name are escaped so they match literal dots only,
# rather than any character.
genreReg = re.compile(r'(https://www\.dailymail\.co\.uk/)(\w+)')

#Open the browser. Remove hash tags if adverts are a problem
def openbrowser(url):
    """Start a Firefox session with ``firefoxProfile`` and load ``url``.

    The driver is stored in the module-level ``browser`` so the parsing
    functions can use it.

    Raises:
        Whatever ``browser.get`` raises. In that case the freshly started
        browser is shut down first, so a failed page load does not leave
        an orphaned Firefox session behind.
    """
    global browser
    browser = webdriver.Firefox(firefoxProfile)
    #browser.install_addon(adblock, temporary=True)
    #time.sleep(10)
    #browser.switch_to_window(browser.window_handles[0])
    try:
        browser.get(url)
    except Exception:
        # The caller only sees the exception and moves on, so clean up here.
        browser.quit()
        raise

# Sentiment analysis gives a pos, neg, neutral and compound score per sentence.
# Only the compound score is kept: one value per sentence, in input order
def sentiment(text):
    """Return the list of VADER compound scores for the sentences in ``text``."""
    analyser = SentimentIntensityAnalyzer()
    compound_scores = []
    for sentence in text:
        scores = analyser.polarity_scores(sentence)
        compound_scores.append(scores['compound'])
    return compound_scores

# Reads the news article headlines from the daily mail archive page
# that is currently loaded in the browser
def get_titles():
    """Store the headlines in ``titles`` and their count in ``num_titles``.

    The text of the archive list element is split on newlines, one headline
    per line. Both results are module-level globals.
    """
    global titles, num_titles
    headline_list = browser.find_element_by_xpath('//*[@id="content"]/div[1]/div[1]/ul[2]')
    raw_text = headline_list.text
    titles = raw_text.split('\n')
    num_titles = len(titles)
    
def extract():
    """Fill the globals ``urls``, ``genres`` and ``sentiments`` for the current page.

    Relies on ``num_titles`` and ``titles`` as set by get_titles().
    """
    global urls, genres, sentiments

    # Get list of article urls: one lookup per headline, li positions are 1-based
    collected = []
    for position in range(1, num_titles + 1):
        link = browser.find_element_by_xpath('//*[@id="content"]/div[1]/div[1]/ul[2]/li[' + str(position) + ']/a')
        collected.append(link.get_attribute('href'))
    urls = collected

    # Extract genre: second regex group of every url
    genres = [genreReg.search(url).group(2) for url in urls]

    # Perform Sentiment Analysis on the headlines
    sentiments = sentiment(titles)
    
    
def save():
    """Write the in-memory dataframe ``df`` to the CSV file named by ``filename``."""
    destination = filename
    df.to_csv(destination)
    
def log(date,time,status):
    """Print one run-log entry and append it to the global ``runlog`` dataframe.

    ``date``, ``time`` and ``status`` become the single row's column values.
    """
    global runlog
    row = {"date":[date],"time":[time],"status":[status]}
    entry = pd.DataFrame(row)
    # Echo the entry as a bare line (no header, no index) before storing it
    line = entry.to_string(header=False,index=False)
    print(line)
    runlog = pd.concat([runlog,entry])
    

### Create blank database
Change the year, month and day ranges to the desired interval. Exception handling means that you don't need to worry about non-existent dates, e.g. 31st February.

Typically, >1000 articles are published daily. The scraper saves every day's worth of headlines to the CSV file named by `filename`, using save().

If creating a blank database, be sure to change `filename` (the CSV path that save() writes to) first, so that an existing dataset is not overwritten.

In [59]:
# Optionally reset the in-memory dataframe. Anything other than "y" leaves it alone.
print('Enter "y" to clear dataframe in memory.')
answer = input()
if answer != 'y':
    print('Dataframe in memory not cleared.')
else:
    # The fresh frame holds a single placeholder row titled 'blank'
    df=pd.DataFrame({'Title':['blank'],'Sentiment':None, 'Genre':None, 'Date':None})
    print('Dataframe in memory cleared.')

Enter "y" to clear dataframe.
y
Dataframe cleared.


### Load dataset if exists
If a data set already exists then load it.

In [144]:
# Read the saved dataset and discard the 'Unnamed: 0' column, then show the last rows.
# NOTE(review): save() writes with to_csv's default encoding while this reads
# ISO-8859-1 - confirm the round trip keeps non-ASCII headlines intact.
df = pd.read_csv(filename, encoding='ISO-8859-1').drop('Unnamed: 0', axis=1)
df.tail()

Unnamed: 0,Date,Genre,Sentiment,Title
251627,27/05/18,sport,-0.9001,BUMBLE AT THE TEST: No hiding place for deject...
251628,27/05/18,sport,0.4215,Manchester United and Chelsea stars who featur...
251629,27/05/18,wires,0.0,"Conservative governor, liberal lawmakers clash..."
251630,27/05/18,sport,-0.7579,'We are seeing collapse after collapse... are ...
251631,27/05/18,wires,0.34,Motor racing-Ricciardo takes tense Monaco win ...


## Start logging

In [145]:
# Empty run log; log() appends one row per scraped day
runlog = pd.DataFrame({column: [] for column in ("date", "time", "status")})


In [146]:
# Scrape one archive page per day in the chosen interval. A day that fails at
# any stage is logged and skipped; a successful day is appended to df and the
# whole dataframe is saved straight away.
for year in [2018]:
    for month in (range(5,6)):
        for day in (range(28,32)):
            date  = str(year)+'-'+str(month).zfill(2)+'-'+str(day).zfill(2)
            path = 'http://www.dailymail.co.uk/home/sitemaparchive/day_'+str(year)+str(month).zfill(2)+str(day).zfill(2)+'.html'
            start = time.time()
            try:
                openbrowser(path)
            except Exception:
                # Exception rather than a bare except, so Ctrl-C still stops the run.
                print('Loading error on:\t',date)
                # log() takes (date, time, status); pass the elapsed seconds,
                # the same quantity the success path logs.
                log(date,time.time()-start,'Loading error')
                continue
            try:
                get_titles()
            except NoSuchElementException:
                print('NoSuchElementException on:\t',date)
                log(date,time.time()-start,'NoSuchElementException')
                browser.close()
                continue
            try:
                extract()
            except AttributeError:
                # Raised when a url does not match genreReg (search() returns None)
                print('AttributeError on:\t',date)
                log(date,time.time()-start,'AttributeError')
                browser.close()
                continue
            except NoSuchElementException:
                # extract() looks up one link per headline line; a missing link
                # skips the day instead of aborting the whole run.
                print('NoSuchElementException on:\t',date)
                log(date,time.time()-start,'NoSuchElementException')
                browser.close()
                continue
            browser.close()
            log(date,time.time()-start,'{} records'.format(len(titles)))
            df_temp = pd.DataFrame({'Title':titles,'Sentiment':sentiments, 'Genre':genres, 'Date':date})
            df=pd.concat([df,df_temp])
            save()
        print(calendar.month_name[month] + ' completed.')
        

2018-05-28  1426 records  41.808184
2018-05-29  1947 records  54.464499
2018-05-30  2000 records  55.123299
2018-05-31  2000 records  53.890962
May completed.
