## Social Media Analytics

# Webscraping Project


##### Felix Funes 20220306 | Paula Catalan 20221048 | Efstathia Styliagkatzi 20220078 | Alisson Tapia 20221156 | S M Abrar Hossain Asif 20220223




### Step 1: Load packages and do the initializations

In [108]:
# Load libraries
import numpy as np
import pandas as pd
from urllib.request import urlopen
from bs4 import BeautifulSoup
import ssl
from selenium import webdriver
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver import Firefox
from selenium.webdriver.firefox.firefox_profile import FirefoxProfile
import time
import re
from datetime import datetime, date, timedelta
import requests
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\madel\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [109]:
# Allow not verified SSL (Secure Socket Layer) certificates to be opened
ssl._create_default_https_context = ssl._create_unverified_context

In [110]:
# Get Firefox options (configurations)
options = Options()

In [111]:
# Load the list of the pages to read the content
reviews_to_scrape = pd.read_excel("iphone-reviews-to-scrape.xlsx", sheet_name="Sheet1", index_col="ID", engine='openpyxl')


In [112]:
# Create an empty dataframe for the resuls
iphone_reviews = pd.DataFrame({'device': pd.Series([], dtype='string'),
                             'user': pd.Series([], dtype='string'),
                             'rating': pd.Series([], dtype='float'),
                             'text': pd.Series([], dtype='string')
                             })

### Step 2: Functions to use in the Main Loop

In [113]:
# Open page and read HTML
def openPageReadHTML(url):
    # Create a Firefox profile with permissions disabled
    firefox_options = Options()
    firefox_options.set_preference('geo.enabled', False)
    firefox_options.set_preference('geo.provider.network.url', '')
    firefox_options.set_preference('dom.webnotifications.enabled', False)
    firefox_options.set_preference('dom.disable_open_during_load', False)

    # Launch Firefox with the custom profile
    browser = webdriver.Firefox(options=firefox_options)
    browser.get(url)
    time.sleep(1)

    # If there is a privacy pop-up, click the OK button
    privacy_button = browser.find_elements(By.CLASS_NAME,"us-link")
    if len(privacy_button)>0:
        browser.execute_script("arguments[0].click()", privacy_button[0])
        time.sleep(0.5)

    # Read the content and close the browser
    html_source = browser.page_source  
    browser.quit()

    # Transform the html into a BeautifulSoup object
    soupObj = BeautifulSoup(html_source) 

    return soupObj


In [114]:
# Process each page
def processPage(soupObj, ID, extractedDF):   

    # Read reviews
    reviews = soupObj.find_all("li", class_="review-item")

    # Loop thru each review
    for i in range(0,len(reviews)):

        # Get Rating
        rating = reviews[i].select_one("p[class*=visually-hidden]")
        if rating:
            reviewRating = rating.text.strip()[6]

        # Get User
        user = reviews[i].select_one("div[class*=ugc-author]")
        if user:
            user = user.text.strip()

        # Get Review Text
        reviewText = reviews[i].select_one("div[class=ugc-review-body]")
        if reviewText:
            reviewText = reviewText.text.strip()

        # Get sentiment of the review
        # Create a SentimentIntensityAnalyzer object
        sid = SentimentIntensityAnalyzer()
        # Calculate the sentiment scores for the review
        scores = sid.polarity_scores(reviewText)

        # Determine the overall sentiment based on the compound score
        if scores['compound'] > 0.05:
            sentiment = 'Positive'
        elif scores['compound'] < -0.05:
            sentiment = 'Negative'
        else:
            sentiment = 'Neutral'
            

        # Update extracted reviews dataframe
        tDF = pd.DataFrame({'device': [ID],
                             'user': [user],
                             'rating': [reviewRating],
                             'text': [reviewText],
                            'sentiment_score': [scores],
                             'sentiment': [sentiment]
                             })
        extractedDF = pd.concat([extractedDF,tDF],ignore_index=True)
        
     # Return the resulting dataframe
    return extractedDF


### Step 3: Main loop

In [116]:
# Loop for all pages
for index, row in reviews_to_scrape.iterrows():

    # Present feedback on which page is being processed
    print("Processing ", index)

    # Reset counter per page
    reviewsExtracted = 0    

    # Loop until it extracts all available reviews
    while True:

        # Define URL to use based on the number of reviews extracted so far
        urlToUse = row['URL']
        if reviewsExtracted > 0:
            repText = "-Reviews-or" + str(reviewsExtracted) + "-"
            urlToUse = urlToUse.replace("-Reviews-", repText, 1)

        # Open and read the web page content
        soup = openPageReadHTML(urlToUse)

        # Process web page
        iphone_reviews = processPage(soup, index, iphone_reviews)

        # Update counter
        reviewsExtracted += 100 if reviewsExtracted + 100 <= len(iphone_reviews) else len(iphone_reviews) - reviewsExtracted

        # Present feedback on the number of extracted reviews
        print("Extracted ", reviewsExtracted, "/", len(iphone_reviews))

        # Exit loop if all available reviews have been extracted
        if reviewsExtracted == len(iphone_reviews):
            break



Processing  Apple - iPhone 14 128GB - Midnight (Verizon)
Extracted  80 / 80
Processing  Apple - iPhone 14 128GB - Midnight (AT&T)
Extracted  100 / 100
Processing  Apple - iPhone 14 256GB - Midnight (AT&T)
Extracted  100 / 120
Extracted  140 / 140
Processing  Apple - iPhone 14 256GB - Purple (Verizon)
Extracted  100 / 160
Extracted  180 / 180
Processing  Apple - iPhone 14 512GB - Midnight (AT&T)
Extracted  100 / 182
Extracted  184 / 184
Processing  Apple - iPhone 14 512GB - Yellow (Verizon)
Extracted  100 / 185
Extracted  186 / 186
Processing  Apple - iPhone 14 128GB - Purple (T-Mobile)
Extracted  100 / 206
Extracted  200 / 226
Extracted  246 / 246


In [117]:
iphone_reviews = iphone_reviews.astype(str)
sum(iphone_reviews.duplicated())

142

### Step 4: The Final Excel File

In [120]:
iphone_reviews

Unnamed: 0,device,user,rating,text,Sentiment_score,sentiment
0,Apple - iPhone 14 128GB - Midnight (Verizon),BigG,5,Apple makes the best cellphone on the market h...,"{'neg': 0.0, 'neu': 0.682, 'pos': 0.318, 'comp...",Positive
1,Apple - iPhone 14 128GB - Midnight (Verizon),Jp44087,5,"Ease of use, good battery life, 128gb fits me ...","{'neg': 0.0, 'neu': 0.526, 'pos': 0.474, 'comp...",Positive
2,Apple - iPhone 14 128GB - Midnight (Verizon),GamerDadLife,5,Love it works great and the red color is the m...,"{'neg': 0.0, 'neu': 0.488, 'pos': 0.512, 'comp...",Positive
3,Apple - iPhone 14 128GB - Midnight (Verizon),Ohio,5,The perfect iPhone! this thing is amazing for ...,"{'neg': 0.0, 'neu': 0.599, 'pos': 0.401, 'comp...",Positive
4,Apple - iPhone 14 128GB - Midnight (Verizon),ChristynM,5,"So far, so good. I used android since I got a ...","{'neg': 0.109, 'neu': 0.647, 'pos': 0.244, 'co...",Positive
...,...,...,...,...,...,...
241,Apple - iPhone 14 128GB - Purple (T-Mobile),CMSnewtoy,5,Love my new I-phone 14\nAllen did an EXCELLENT...,"{'neg': 0.0, 'neu': 0.688, 'pos': 0.312, 'comp...",Positive
242,Apple - iPhone 14 128GB - Purple (T-Mobile),Bree,5,Love my new phone! You able to do so much more...,"{'neg': 0.0, 'neu': 0.757, 'pos': 0.243, 'comp...",Positive
243,Apple - iPhone 14 128GB - Purple (T-Mobile),DeborahB,5,Great phone after I learned how to use it beca...,"{'neg': 0.0, 'neu': 0.76, 'pos': 0.24, 'compou...",Positive
244,Apple - iPhone 14 128GB - Purple (T-Mobile),Priceright,5,Never disappointed with Apple Products- awesom...,"{'neg': 0.089, 'neu': 0.665, 'pos': 0.246, 'co...",Positive


In [119]:
# Save the extracted reviews data frame to an Excel file
iphone_reviews.to_excel("ExtractedReviewsData1.xlsx")