# Web Scraping the Daily Mail Archives with Selenium and Firefox

Author: Adil Khan

In [1]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import NoSuchElementException
import time
import os
import re
import pandas as pd
from tqdm import tqdm_notebook as tqdm
from nltk.sentiment.vader import SentimentIntensityAnalyzer



### Create browser profile

In [2]:
## Build a stripped-down Firefox profile: stylesheets, images, JavaScript
## and the Flash plugin are switched off (presumably to speed up page loads,
## since only the page's text and links are read later on).
## Get profile class
from selenium.webdriver.firefox.firefox_profile import FirefoxProfile
## Get the Firefox profile object
firefoxProfile = FirefoxProfile()
## Disable CSS
firefoxProfile.set_preference('permissions.default.stylesheet', 2)
## Disable images
firefoxProfile.set_preference('permissions.default.image', 2)
## Disable JavaScript
firefoxProfile.set_preference('javascript.enabled', False)
## Disable Flash -- this is a boolean preference, so pass a real bool
## (the string 'false' would be written as a string-typed preference)
firefoxProfile.set_preference('dom.ipc.plugins.enabled.libflashplayer.so', False)
## Adblock path (machine-specific .xpi; only referenced by the commented-out
## install step in openbrowser)
adblock ='/Users/Adil/Library/Application Support/Firefox/Profiles/3if0ne0t.default/extensions/{d10d0bf8-f5b5-c8b4-a8b2-2b9879e08c5d}.xpi'

### Define functions

In [None]:

# Matches a Daily Mail article URL: group(1) is the site prefix and group(2)
# is the first path segment, which is used as the article's genre.
# Both http and https are accepted, and the dots are escaped so they only
# match a literal '.'.
genreReg = re.compile(r'(https?://www\.dailymail\.co\.uk/)(\w+)')


def openbrowser(url):
    """Start a fresh Firefox session and load ``url`` in it.

    The driver is created with the module-level ``firefoxProfile`` and
    stored in the module-level ``browser`` so the other helpers can query
    the loaded page.

    If the page fails to load, the Firefox instance that was just started
    is shut down before the exception is re-raised, so a failed load does
    not leave an orphaned browser behind.
    """
    global browser
    browser = webdriver.Firefox(firefoxProfile)
    # Adblock installation is currently disabled:
    #browser.install_addon(adblock, temporary=True)
    #time.sleep(10)
    #browser.switch_to_window(browser.window_handles[0])
    try:
        browser.get(url)
    except Exception:
        browser.quit()
        raise

    
def sentiment(text):
    """Return the VADER 'compound' score for each item of ``text``.

    ``text`` is iterated item by item (the caller passes a list of
    titles); the result is a list of scores in the same order.
    """
    analyzer = SentimentIntensityAnalyzer()
    compound_scores = []
    for sentence in text:
        scores = analyzer.polarity_scores(sentence)
        compound_scores.append(scores['compound'])
    return compound_scores


def get_titles():
    """Read the article titles from the page currently loaded in ``browser``.

    Sets the module-level ``titles`` (one string per line of the article
    list's visible text) and ``num_titles`` (how many there are).

    Raises:
        NoSuchElementException: if the article list is not on the page.
    """
    global titles, num_titles
    # The generic find_element(by, value) call is available on every
    # Selenium release; the find_element_by_* shortcuts were removed in 4.3.
    article_list = browser.find_element('xpath', '//*[@id="content"]/div[1]/div[1]/ul[2]')
    list_text = article_list.text
    # ''.split('\n') yields [''], which would be counted as one phantom title.
    titles = list_text.split('\n') if list_text else []
    num_titles = len(titles)
    

def extract():
    """Collect the URL, genre and sentiment for the titles found by get_titles().

    Reads the module-level ``browser``, ``titles`` and ``num_titles`` and
    sets the module-level ``urls``, ``genres`` and ``sentiments``.

    Raises:
        NoSuchElementException: if one of the first ``num_titles`` list
            items has no link.
    """
    global urls, genres, sentiments
    # Get list of article urls: the first link inside each list item
    # (XPath positions are 1-based). The generic find_element(by, value)
    # call is used because the find_element_by_* shortcuts were removed in
    # Selenium 4.3.
    item_xpath = '//*[@id="content"]/div[1]/div[1]/ul[2]/li[{}]/a'
    urls = [
        browser.find_element('xpath', item_xpath.format(position)).get_attribute('href')
        for position in range(1, num_titles + 1)
    ]

    # Extract genre; None when the link has no href or does not match the
    # Daily Mail URL pattern, instead of crashing on the failed match.
    genres = []
    for url in urls:
        match = genreReg.search(url) if url else None
        genres.append(match.group(2) if match else None)

    # Perform Sentiment Analysis
    sentiments = sentiment(titles)
    
def save(path='DailyMail2016.csv'):
    """Write the module-level ``df`` to a CSV file.

    Args:
        path: Destination file. Defaults to the previously hard-coded
            'DailyMail2016.csv', so existing ``save()`` calls behave as
            before.
    """
    df.to_csv(path)

### Extract Data

In [None]:
import calendar

# Seed frame with a single placeholder row; each day's results are appended to it.
df=pd.DataFrame({'Title':['blank'],'Sentiment':None, 'Genre':None, 'Date':None})

for year in [2016]:
    for month in tqdm(range(1,13)):
        # Only visit days that exist in this month (no 30 February, etc.).
        days_in_month = calendar.monthrange(year, month)[1]
        for day in tqdm(range(1, days_in_month + 1)):
            date  = str(year)+'-'+str(month).zfill(2)+'-'+str(day).zfill(2)
            path = 'http://www.dailymail.co.uk/home/sitemaparchive/day_'+str(year)+str(month).zfill(2)+str(day).zfill(2)+'.html'
            try:
                openbrowser(path)
            except Exception:
                # One failed load should not drop the rest of the month,
                # so move on to the next day instead of leaving the loop.
                print('Loading error on:\t',date)
                continue

            try:
                get_titles()
                extract()
            except NoSuchElementException:
                # The expected article list (or one of its links) is missing.
                print('No article list on:\t',date)
                continue
            finally:
                # Always end the Firefox session, even when scraping failed.
                browser.quit()

            df_temp = pd.DataFrame({'Title':titles,'Sentiment':sentiments, 'Genre':genres, 'Date':date})
            df=pd.concat([df,df_temp])
            # Rewrite the CSV after every day so progress survives a later crash.
            save()


