In [12]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

import time
import traceback
from retrying import retry

In [13]:
def fetch_webpage(url):
    """
    Fetches the content of a webpage using requests.
    
    Parameters:
        url (str): The URL of the webpage.
        
    Returns:
        str: The HTML content of the webpage.
    """
    response = requests.get(url)
    if response.status_code == 200:
        return response.text
    else:
        print(f"Failed to fetch webpage. Status code: {response.status_code}")
        return None

def parse_html(html_content):
    """
    Parses HTML content using Beautiful Soup.
    
    Parameters:
        html_content (str): The HTML content to parse.
        
    Returns:
        BeautifulSoup: A BeautifulSoup object representing the parsed HTML.
    """
    return BeautifulSoup(html_content, 'html.parser')

In [6]:
MAX_PAGES=23
list_url = [f'https://www.airlinequality.com/airline-reviews/aer-lingus/page/{page}/?sortby=post_date%3ADesc&pagesize=100' for page in range(1, MAX_PAGES + 1)]

In [7]:
comments_data = pd.DataFrame(columns=['Date Published', 'Overall Rating', 'Passenger Country', 'Trip_verified', 'Comment title','Comment', 
                                       'Aircraft', 'Type Of Traveller', 'Seat Type', 'Origin', 'Destination' 'Date Flown', 
                                       'Seat Comfort', 'Cabin Staff Service', 'Food & Beverages', 'Ground Service', 
                                       'Value For Money', 'Recommended'])
comments_data_list = [] 

class_to_label = {
    'aircraft': 'Aircraft',
    'type_of_traveller': 'Type Of Traveller',
    'cabin_flown': 'Seat Type',
    'route': 'Route',
    'date_flown': 'Date Flown',
    'seat_comfort': 'Seat Comfort',
    'cabin_staff_service': 'Cabin Staff Service',
    'food_and_beverages': 'Food & Beverages',
    'inflight_entertainment':'Inflight Entertainment',
    'ground_service': 'Ground Service',
    'wifi_and_connectivity':'Wifi & Connectivity',
    'value_for_money': 'Value For Money',
    'recommended': 'Recommended'
}

In [11]:
# Loop through each URL in the list of URLs
for url in list_url:
    # Fetch the webpage content
    html_content = fetch_webpage(url)
    
    # Proceed if the webpage content was successfully fetched
    if html_content:
        # Parse the HTML content
        soup = parse_html(html_content)
        
        # Find all comment elements (reviews) within 'article' tags
        comments = soup.find_all('article', itemprop='review')  # Retrieve only the first 5 comments
        
        # Process each comment found on the page
        for comment in comments:
            try:
                # Extract the publication date
                date_published = comment.find('meta', itemprop='datePublished')['content']
                
                # Retrieve the rating value, if available
                rating = comment.find('span', itemprop='ratingValue')
                rating = rating.text if rating else ''
                
                # Extract the comment header (title)
                text_header = comment.find('h2', class_='text_header').text
                
                # Retrieve sub-header text, which contains the user's country
                text_sub_header_text = comment.find('h3', class_='text_sub_header userStatusWrapper').get_text(strip=True)
                country = text_sub_header_text.split('(')[-1].split(')')[0]
                
                # Extract the main content of the review
                text_content = comment.find('div', class_='text_content', itemprop='reviewBody')
                
                # Determine whether the trip was verified or not
                verification = text_content.find('strong')
                verification = verification.text.strip() if verification else ''
                
                # Clean up the text content of the review
                text_content = text_content.text.strip()
                if '|' in text_content:
                    text_content = text_content.split('|')[1].strip()
                
                # Locate the review ratings table for specific feature ratings
                review_ratings = comment.find('table', class_='review-ratings')
                review_ratings = review_ratings.find_all('tr')
                
                # Dictionary to store extracted ratings data
                table_data = {}
                for row in review_ratings:
                    # Extract the header and value cells
                    header_cell = row.find('td', class_='review-rating-header')
                    value_cell = row.find('td', class_='review-value')
                    value2_cell = row.find('td', class_='review-rating-stars')
                    
                    # If both header and either value cell exist, proceed to extract
                    if header_cell and (value_cell or value2_cell):
                        # Retrieve the feature label from the cell class
                        class_name = header_cell['class'][1]
                        data_label = class_to_label.get(class_name, '')
                        
                        # Process the rating value for each feature
                        if value_cell:
                            value = value_cell.text.strip()
                            # For 'Route' values, split into origin and destination
                            if data_label == 'Route':
                                if 'to' in value:
                                    origin, destination = value.split(' to ')
                                elif '-' in value:
                                    origin, destination, _ = value.split('-')
                                table_data['Origin'] = origin.strip()
                                table_data['Destination'] = destination.strip()
                            else:
                                table_data[data_label] = value
                        else:
                            # If using stars, count filled stars
                            filled_star_spans = value2_cell.find_all('span', class_='star fill')
                            table_data[data_label] = int(len(filled_star_spans))

                # Add the extracted data to the comments data list
                comments_data_list.append({
                    'Date Published': date_published,
                    'Overall Rating': rating,
                    'Passenger Country': country,
                    'Trip Verified': verification,
                    'Comment Title': text_header,
                    'Comment': text_content,
                    **table_data
                })
                
            except Exception as e:
                print(f"Error in processing comment at URL section '{url[60:62]}', Comment Index: {comments.index(comment)}")
                traceback.print_exc()


Error in processing comment at URL section 'ge', Comment Index: 18


Traceback (most recent call last):
  File "/var/folders/tr/cyhtsz016w388n4xcfp_x8w40000gn/T/ipykernel_40138/92741421.py", line 69, in <module>
    origin, destination, _ = value.split('-')
ValueError: too many values to unpack (expected 3)


In [9]:
comments_data = pd.DataFrame(comments_data_list)

In [10]:
comments_data.to_csv('Aerlingus_reviews.csv', encoding='utf-8')