# Data Collection

Our objective is to collect customer review data on British Gas, a subsidiary of Centrica. The data are collected by web scraping the customer review website https://uk.trustpilot.com.

In [1]:
# import libraries
from bs4 import BeautifulSoup
import urllib.request
from time import sleep
import pandas as pd
from datetime import datetime

In [2]:
# function to scrape Trustpilot for British Gas
def BG_scrape(address, page_num):
    """Scrape ratings, review texts and dates from paginated review pages.

    Pages ``address + '1'`` through ``address + str(page_num)`` are fetched
    in order, with a two-second pause before each request so the server is
    not flooded. No request is made to the bare ``address`` itself.

    Parameters
    ----------
    address : str
        URL prefix to which the 1-based page number is appended.
    page_num : int
        Number of pages to fetch. Zero or a negative value fetches nothing.

    Returns
    -------
    pandas.DataFrame
        Columns ``rating``, ``reviews`` and ``date`` with one row per scraped
        review, in page order. ``rating`` is the star digit as a string and
        ``date`` is the raw ``datetime`` attribute of the review's ``<time>``
        tag. A value is ``None`` where the element could not be found. The
        frame is empty (but still has the three columns) when nothing was
        fetched.

    Raises
    ------
    urllib.error.URLError
        Propagated from ``urllib.request.urlopen`` when a page cannot be
        fetched.
    """
    import warnings  # local import: only needed for the mismatch warning below

    # Plain lists are accumulated and the frame is built once at the end;
    # DataFrame.append was removed in pandas 2.0 and re-concatenating on
    # every page is wasted work.
    ratings = []
    reviews = []
    dates = []

    for page_no in range(1, page_num + 1):
        print(str(page_no) + '/' + str(page_num) + ' pages parsed')  # keep track of pages parsed
        sleep(2)  # to not spam the server
        # the context manager closes the HTTP response even if parsing fails
        with urllib.request.urlopen(address + str(page_no)) as response:
            soup = BeautifulSoup(response.read(), 'html.parser')

        # star ratings, taken only from review headers (not page-level stars)
        page_ratings = [
            _extract_star_rating(header)
            for header in soup.find_all('div', {'class': 'review-info__header__verified'})
        ]

        # review body texts
        page_reviews = [
            body.text.strip('\n')
            for body in soup.find_all(class_='review-info__body__text')
        ]

        # review dates, taken only from review headers
        page_dates = []
        for date_div in soup.find_all('div', {'class': 'header__verified__date'}):
            time_tag = date_div.find('time')
            # a header without <time> yields None instead of crashing
            page_dates.append(time_tag.get('datetime') if time_tag is not None else None)

        # The three columns are collected independently, so a review missing
        # one element would shift every later row. Pad the short columns so
        # the damage is confined to this page, and make the problem visible.
        longest = max(len(page_ratings), len(page_reviews), len(page_dates))
        if not (len(page_ratings) == len(page_reviews) == len(page_dates)):
            warnings.warn(
                'page ' + str(page_no) + ': found ' + str(len(page_ratings))
                + ' ratings, ' + str(len(page_reviews)) + ' reviews and '
                + str(len(page_dates)) + ' dates; rows on this page may be misaligned'
            )
        for column, page_values in ((ratings, page_ratings),
                                    (reviews, page_reviews),
                                    (dates, page_dates)):
            column.extend(page_values)
            column.extend([None] * (longest - len(page_values)))

    return pd.DataFrame(
        {'rating': ratings, 'reviews': reviews, 'date': dates},
        columns=['rating', 'reviews', 'date'],
    )


def _extract_star_rating(header):
    """Return the star digit found inside a review header tag, or None."""
    star_divs = header.find_all('div', {'class': 'star-rating'})
    # Presumably the markup is class="star-rating star-rating-N ..." (the
    # original positional lookup at index 37 lands exactly on N for that
    # layout) -- TODO confirm against the live page.
    prefix = 'star-rating-'
    for star_div in star_divs:
        for css_class in star_div.get('class', []):
            suffix = css_class[len(prefix):]
            if css_class.startswith(prefix) and suffix.isdigit():
                return suffix
    # Fall back to the original positional lookup so pages whose markup does
    # not match the assumption above still yield what they did before.
    rendered = str(star_divs)
    return rendered[37] if len(rendered) > 37 else None

The data on British Gas are collected from 3713 unique reviews (as of 08/07/2018) available at https://uk.trustpilot.com/review/www.britishgas.co.uk.

In [4]:
# Scrape every British Gas review page on Trustpilot and persist the result.
BG_address = "https://uk.trustpilot.com/review/www.britishgas.co.uk?page="
# BG_page = 5  # quick test run over 5 pages
BG_page = 186

# measure how long the whole scrape takes
startTime = datetime.now()
BG_df = BG_scrape(address=BG_address, page_num=BG_page)
print(datetime.now() - startTime)

# write the collected reviews to a .csv file
BG_df.to_csv(path_or_buf='BG.csv', encoding='utf-8', index=False)

1/186 pages parsed
2/186 pages parsed
3/186 pages parsed
4/186 pages parsed
5/186 pages parsed
6/186 pages parsed
7/186 pages parsed
8/186 pages parsed
9/186 pages parsed
10/186 pages parsed
11/186 pages parsed
12/186 pages parsed
13/186 pages parsed
14/186 pages parsed
15/186 pages parsed
16/186 pages parsed
17/186 pages parsed
18/186 pages parsed
19/186 pages parsed
20/186 pages parsed
21/186 pages parsed
22/186 pages parsed
23/186 pages parsed
24/186 pages parsed
25/186 pages parsed
26/186 pages parsed
27/186 pages parsed
28/186 pages parsed
29/186 pages parsed
30/186 pages parsed
31/186 pages parsed
32/186 pages parsed
33/186 pages parsed
34/186 pages parsed
35/186 pages parsed
36/186 pages parsed
37/186 pages parsed
38/186 pages parsed
39/186 pages parsed
40/186 pages parsed
41/186 pages parsed
42/186 pages parsed
43/186 pages parsed
44/186 pages parsed
45/186 pages parsed
46/186 pages parsed
47/186 pages parsed
48/186 pages parsed
49/186 pages parsed
50/186 pages parsed
51/186 pa

The data are then saved to a CSV file for future use and reference. The next step, pre-processing, is performed in preprocessing.ipynb.