___
## __Analysing Web Pages Using Beautiful Soup__

- The second part of this code will be analysing the webpages returned from scrapy
- Using the library Beautiful soup we can scrape the whole page and search for specific words on the webpage

### __Process__
__1) Scrape Webpage__
- Use the library BeautifulSoup to scrape a webpage

__2) Clean Data__
- Remove HTML code, non-letters and change all to lower case

__3) Analyse Webpages__
- Create a function that flags if a particular word has been mentioned on the webpage 
- This enables us to see if multiple words have been mentioned on one webpage
___

In [1]:
import requests
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import re
import urllib.request
import re
from bs4 import BeautifulSoup
from urllib.request import Request, urlopen
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\robert.lowe\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [40]:
# Example of using BeautifulSoup to scrape a webpage
site= "https://www.flyertalk.com/articles/love-the-smell-of-airports-you-can-spray-your-home-with-a-special-airport-fragrance.html"
hdr = {'User-Agent': 'Mozilla/5.0'}
req = Request(site,headers=hdr)
page = urlopen(req)
soup = BeautifulSoup(page)
#print(soup)

In [41]:
# Create a simple function to scrape a webpage
def scraper(site):
    hdr = {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'}
    req = Request(site,headers=hdr)
    page = urlopen(req)
    return BeautifulSoup(page)
    

In [42]:
# Open the csv file created from scrapy
df = pd.read_csv('C:/Users/robert.lowe/wordlist/hotel_info_sample.csv')

_____
#### __Sample__
- For examples purposes, create a sample set of data (100 rows)
___

In [43]:
df = df.head(100)

In [45]:
df.head()

Unnamed: 0,word,site
0,phone,https://www.flyertalk.com
1,phone,https://www.flyertalk.com/articles/love-the-sm...
2,phone,https://www.flyertalk.com/articles/author/jacq...
3,phone,https://www.flyertalk.com/articles/category/pa...
4,phone,https://www.flyertalk.com/articles/author/anya...


In [48]:
# Example link
x = df.iloc[1, -1]
x

'https://www.flyertalk.com/articles/love-the-smell-of-airports-you-can-spray-your-home-with-a-special-airport-fragrance.html'

____
__1) Scrape Webpage__
- Use the library BeautifulSoup to scrape a webpage
____

In [51]:
# Apply scraper function on all the sites
df['website_info'] = df.site.apply(scraper)

In [52]:
df.head()

Unnamed: 0,word,site,website_info
0,phone,https://www.flyertalk.com,"<!DOCTYPE html> <html lang=""en-US""> <head> <me..."
1,phone,https://www.flyertalk.com/articles/love-the-sm...,"<!DOCTYPE html> <html lang=""en-US""> <head> <me..."
2,phone,https://www.flyertalk.com/articles/author/jacq...,"<!DOCTYPE html> <html lang=""en-US""> <head> <me..."
3,phone,https://www.flyertalk.com/articles/category/pa...,"<!DOCTYPE html> <html lang=""en-US""> <head> <me..."
4,phone,https://www.flyertalk.com/articles/author/anya...,"<!DOCTYPE html> <html lang=""en-US""> <head> <me..."


In [53]:
df['website_info'] = df['website_info'].astype(str)

___
__2) Clean Data__
- Remove HTML code, non-letters and change all to lower case
___

In [55]:
def clean_data( raw_review ):
    # Function to convert a raw review to a string of words
    #
    # 1. Remove HTML
    review_text = BeautifulSoup(raw_review).get_text() 
    #
    # 2. Remove non-letters        
    letters_only = re.sub("[^a-zA-Z]", " ", raw_review) 
    #
    # 3. Convert to lower case, split into individual words
    words = letters_only.lower().split()                                             
    #
    return words

In [56]:
# apply clean_data function
df['Clean_text'] = df.website_info.apply(clean_data)

In [57]:
df.head()

Unnamed: 0,word,site,website_info,Clean_text
0,phone,https://www.flyertalk.com,"<!DOCTYPE html>\n<html lang=""en-US"">\n<head>\n...",doctype html html lang en us head meta charset...
1,phone,https://www.flyertalk.com/articles/love-the-sm...,"<!DOCTYPE html>\n<html lang=""en-US"">\n<head>\n...",doctype html html lang en us head meta charset...
2,phone,https://www.flyertalk.com/articles/author/jacq...,"<!DOCTYPE html>\n<html lang=""en-US"">\n<head>\n...",doctype html html lang en us head meta charset...
3,phone,https://www.flyertalk.com/articles/category/pa...,"<!DOCTYPE html>\n<html lang=""en-US"">\n<head>\n...",doctype html html lang en us head meta charset...
4,phone,https://www.flyertalk.com/articles/author/anya...,"<!DOCTYPE html>\n<html lang=""en-US"">\n<head>\n...",doctype html html lang en us head meta charset...


___

__3) Analyse Webpages__
- Create a function that flags if a particular word has been mentioned on the webpage 
- This enables us to see if multiple words have been mentioned on one webpage
___

In [1]:
def Reservation(str):
    words = str.split()
    counts = 0
    for word in words:
        if word == 'reservation':
            counts += 1
        else:
            counts == 0
    return counts

In [2]:
def Booking(str):
    words = str.split()
    counts = 0
    for word in words:
        if word == 'booking' or word == 'booked':
            counts += 1
        else:
            counts == 0
    return counts

In [60]:
df['Reservation_flag'] = df['Clean_text'].apply(Reservation)
df['Booking_flag'] = df['Clean_text'].apply(Booking)

In [61]:
df['Reservation_flag'] = np.where(((df['Reservation_flag'] >= 1)), 1, 0)
df['Booking_flag'] = np.where(((df['Booking_flag'] >= 1)), 1, 0)

In [62]:
df.head()

Unnamed: 0,word,site,website_info,Clean_text,Reservation_flag,Booking_flag
0,phone,https://www.flyertalk.com,"<!DOCTYPE html>\n<html lang=""en-US"">\n<head>\n...",doctype html html lang en us head meta charset...,0,0
1,phone,https://www.flyertalk.com/articles/love-the-sm...,"<!DOCTYPE html>\n<html lang=""en-US"">\n<head>\n...",doctype html html lang en us head meta charset...,0,0
2,phone,https://www.flyertalk.com/articles/author/jacq...,"<!DOCTYPE html>\n<html lang=""en-US"">\n<head>\n...",doctype html html lang en us head meta charset...,0,0
3,phone,https://www.flyertalk.com/articles/category/pa...,"<!DOCTYPE html>\n<html lang=""en-US"">\n<head>\n...",doctype html html lang en us head meta charset...,0,1
4,phone,https://www.flyertalk.com/articles/author/anya...,"<!DOCTYPE html>\n<html lang=""en-US"">\n<head>\n...",doctype html html lang en us head meta charset...,0,1


In [64]:
df = df.drop(['website_info'], axis = 1)

In [65]:
df.to_csv('Phone_booking_project.csv')