In [55]:
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd
from datetime import datetime
import numpy as np

def convert_date(date_string):
    """Convert a date like '21st November 2023' into 'DD-MM-YYYY' form.

    Args:
        date_string: Text in the form '<day><ordinal suffix> <full month name> <year>'.
            The ordinal suffix (st/nd/rd/th) is optional.

    Returns:
        str: The date formatted as '%d-%m-%Y', or ``np.nan`` when the input is
        not a string or cannot be parsed with the '%d %B %Y' format.
    """
    if not isinstance(date_string, str):
        # None / NaN / numbers cannot be parsed; treat them like any other bad date.
        return np.nan

    # Strip the ordinal suffix only when it directly follows the day digits.
    # A blanket str.replace('st', '') would also turn 'August' into 'Augu'
    # and make every August date unparseable.
    clean_date = re.sub(r'(?<=\d)(?:st|nd|rd|th)\b', '', date_string.strip())

    try:
        return datetime.strptime(clean_date, '%d %B %Y').strftime('%d-%m-%Y')
    except ValueError:
        # Return NaN if parsing fails
        return np.nan


In [56]:
def airlines_List_Finder():
    """Fetch the A-Z airline review index and collect the airline review links.

    Only anchors inside ``<section>`` elements that carry an ``id`` other than
    'Latest Airline Reviews' are considered, and only those whose ``href``
    contains '/airline-reviews/'.

    Returns:
        dict: Maps the stripped link text to the link's ``href`` exactly as it
        appears in the page. If the same link text occurs more than once, the
        last occurrence wins.

    Raises:
        requests.RequestException: If the request times out, fails, or the
            server answers with an HTTP error status.
    """
    url = "https://www.airlinequality.com/review-pages/a-z-airline-reviews/"
    # requests has no default timeout; without one a stalled server blocks the cell forever.
    response = requests.get(url, timeout=30)
    # Fail loudly instead of parsing an error page into an empty result.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    # Targeting the specific sections that contain airline reviews
    # Assuming that the links we are interested in are not in the 'Latest Airline Reviews' section
    review_sections = soup.find_all('section', id=lambda x: x and x != 'Latest Airline Reviews')

    return {
        link.get_text().strip(): link['href']
        for section in review_sections
        for link in section.find_all('a', href=True)
        if '/airline-reviews/' in link['href']
    }

# Demo: fetch the airline index and list every "name: link" pair, one per line
airline_links = airlines_List_Finder()
for name, link in airline_links.items():
    print(name, link, sep=": ")

In [57]:
# Captures the text inside the first pair of parentheses, e.g. 'Name (Country) date'.
_PARENTHESIZED_RE = re.compile(r'\((.*?)\)')


def _first_text(parent, *args, **kwargs):
    """Return the stripped text of the first ``parent.find(...)`` match, or '' if none."""
    node = parent.find(*args, **kwargs)
    return node.get_text(strip=True) if node is not None else ''


def _extract_ratings(container):
    """Collect the rating-table rows of one review into a header -> value dict."""
    ratings = {}
    for row in container.find_all('tr'):
        header = row.find('td', class_='review-rating-header')
        value = row.find('td', class_='review-value')
        stars = row.find('td', class_='review-rating-stars')

        if header and value:  # Text value
            ratings[header.get_text(strip=True)] = value.get_text(strip=True)
        elif header and stars:  # Star rating: count the filled stars
            filled_stars = stars.find_all('span', class_='star fill')
            ratings[header.get_text(strip=True)] = str(len(filled_stars))
    return ratings


def scrape_page(base_url, pages=10, timeout=30):
    """Scrape review data from consecutive pages of an airline review listing.

    Args:
        base_url: URL prefix to which the page number (1..pages) is appended.
        pages: Number of pages to request, starting at page 1.
        timeout: Seconds to wait for each HTTP response before giving up.

    Returns:
        list[dict]: One dict per ``<div class="body">`` container. Each dict
        holds the rating-table entries (header text -> text value, or the
        number of filled stars as a string) plus the keys 'name', 'country',
        'date' and 'comment'. A field whose element is missing from the
        container becomes '' ('date' becomes whatever ``convert_date('')``
        returns) instead of aborting the whole scrape.

    Raises:
        requests.RequestException: If a request times out or fails at the
            connection level.
    """
    all_reviews = []

    for page in range(1, pages + 1):
        url = base_url + str(page)
        # requests has no default timeout; without one a stalled server blocks forever.
        response = requests.get(url, timeout=timeout)
        soup = BeautifulSoup(response.text, 'html.parser')

        # Each review is expected inside a 'div' with class 'body'
        for container in soup.find_all('div', class_='body'):
            table_data = _extract_ratings(container)

            # Extracting additional information; missing elements yield '' rather than raising
            dirty_date = _first_text(container, 'time', itemprop="datePublished")
            name = _first_text(container, 'span', itemprop="name")
            dirty_country = _first_text(container, 'h3')
            comment = _first_text(container, 'h2')

            # The country is the text between the first pair of parentheses in the h3 text
            country_match = _PARENTHESIZED_RE.search(dirty_country)
            country = country_match.group(1) if country_match else ''
            date = convert_date(dirty_date)

            # The reviewer fields override any rating-table entry with the same key
            table_data.update({
                'name': name,
                'country': country,
                'date': date,
                'comment': comment
            })
            all_reviews.append(table_data)

    return all_reviews

# Demo: scrape the Saudi Arabian Airlines review listing (page number gets appended to the URL)
SA_URL = "https://www.airlinequality.com/airline-reviews/saudi-arabian-airlines/page/"
result = scrape_page(base_url=SA_URL)



In [58]:
# Tabulate the scraped review dicts and preview the first five rows
df1 = pd.DataFrame(data=result)
df1.head(n=5)

Unnamed: 0,Type Of Traveller,Seat Type,Route,Date Flown,Seat Comfort,Cabin Staff Service,Food & Beverages,Inflight Entertainment,Ground Service,Wifi & Connectivity,Value For Money,Recommended,name,country,date,comment,Aircraft
0,Couple Leisure,Economy Class,Jeddah to Lahore,November 2023,3,2,1,1.0,1,1.0,2,no,Muhammad Irfan Masood,Pakistan,21-11-2023,"“staff was rude, unprofessional and cheap”",
1,Solo Leisure,Economy Class,Addis Ababa to Jeddah,November 2023,4,4,5,,3,,4,yes,L Wilson,United States,21-11-2023,“Really good flight”,
2,Family Leisure,Economy Class,Riyadh to London,September 2023,5,5,5,5.0,5,5.0,5,yes,Dhay Alharbi,Saudi Arabia,19-11-2023,"""It was a great experience""",Boeing 777
3,Solo Leisure,Business Class,Kuwait to Riyadh via Jeddah & Bangkok,October 2023,5,3,2,3.0,3,,5,yes,Anders Pedersen,Denmark,31-10-2023,"""largest SkyTeam lounge worldwide""",A320 / Boeing 787-10
4,Family Leisure,Economy Class,Cairo to Jeddah,October 2023,4,4,1,4.0,1,,2,no,Rashawn Hughes,United States,29-10-2023,"""tried changing our meals to vegan at check in""",


In [60]:
def transform_and_rename_dataframe(dictionary, column_order, new_column_names):
    """Build a DataFrame from records, then select, order and rename its columns.

    Args:
        dictionary: Data accepted by ``pd.DataFrame`` (e.g. a list of dicts).
        column_order: Source column names to keep, in the desired order.
            Columns not listed are dropped; listed names that are absent from
            the data appear as all-NaN columns (``DataFrame.reindex`` behavior).
        new_column_names: Replacement names, positionally matched to
            ``column_order``.

    Returns:
        pd.DataFrame: A new frame with the reordered, renamed columns.

    Raises:
        ValueError: If ``new_column_names`` and ``column_order`` differ in length.
    """
    column_order = list(column_order)
    new_column_names = list(new_column_names)

    # After reindexing, the frame has exactly len(column_order) columns, so the
    # mismatch can be detected before any DataFrame is built.
    if len(new_column_names) != len(column_order):
        raise ValueError("The number of new column names must match the number of columns in the DataFrame")

    # Build from the argument, not from a notebook-level global, so the function
    # works on whatever data the caller passes in.
    df1 = pd.DataFrame(dictionary)

    # Reorder the columns
    df2 = df1.reindex(columns=column_order)

    # Rename the columns
    df2.columns = new_column_names

    return df2

# Scraped column -> display name, listed in the order the columns should appear.
# Keeping each pair together avoids the two parallel lists drifting out of sync.
column_renames = {
    'name': 'Name',
    'country': 'Country',
    'date': 'Date',
    'Type Of Traveller': 'Traveller Type',
    'Seat Type': 'Seat Type',
    'Route': 'Route',
    'Date Flown': 'Date Flown',
    'Seat Comfort': 'Seat Comfort /5',
    'Cabin Staff Service': 'Cabin Staff Service /5',
    'Food & Beverages': 'Food & Beverages /5',
    'Inflight Entertainment': 'Inflight Entertainment /5',
    'Ground Service': 'Ground Service /5',
    'Wifi & Connectivity': 'Wifi & Connectivity /5',
    'Value For Money': 'Value For Money /5',
    'Recommended': 'Recommended',
    'comment': 'Comment',
}

transform_and_rename_dataframe(result, list(column_renames), list(column_renames.values()))


Unnamed: 0,Name,Country,Date,Traveller Type,Seat Type,Route,Date Flown,Seat Comfort /5,Cabin Staff Service /5,Food & Beverages /5,Inflight Entertainment /5,Ground Service /5,Wifi & Connectivity /5,Value For Money /5,Recommended,Comment
0,Muhammad Irfan Masood,Pakistan,21-11-2023,Couple Leisure,Economy Class,Jeddah to Lahore,November 2023,3,2,1,1,1,1,2,no,"“staff was rude, unprofessional and cheap”"
1,L Wilson,United States,21-11-2023,Solo Leisure,Economy Class,Addis Ababa to Jeddah,November 2023,4,4,5,,3,,4,yes,“Really good flight”
2,Dhay Alharbi,Saudi Arabia,19-11-2023,Family Leisure,Economy Class,Riyadh to London,September 2023,5,5,5,5,5,5,5,yes,"""It was a great experience"""
3,Anders Pedersen,Denmark,31-10-2023,Solo Leisure,Business Class,Kuwait to Riyadh via Jeddah & Bangkok,October 2023,5,3,2,3,3,,5,yes,"""largest SkyTeam lounge worldwide"""
4,Rashawn Hughes,United States,29-10-2023,Family Leisure,Economy Class,Cairo to Jeddah,October 2023,4,4,1,4,1,,2,no,"""tried changing our meals to vegan at check in"""
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,Mazher Khan,Saudi Arabia,27-06-2022,Family Leisure,Economy Class,Riyadh to Mumbai,May 2022,3,3,2,2,1,2,2,no,"""aircraft are dated and unkept"""
96,S Diego,Indonesia,12-06-2022,Solo Leisure,Economy Class,Jakarta to Jeddah,June 2022,5,5,1,1,1,,1,no,"""arm rests in coach class are randomly broken"""
97,C Han,Malaysia,07-06-2022,Solo Leisure,Economy Class,Amsterdam to Kuala Lumpur via Jeddah,June 2022,3,3,1,2,1,,5,no,"""almost 2 hours to queue for checkin"""
98,J Way,Malaysia,06-06-2022,Solo Leisure,Economy Class,Amsterdam to Jeddah via Kuala Lumpur,May 2022,1,1,2,1,2,,1,no,"""Very bad service!"""
