<div style="background-color: #1a1a2e; padding: 15px; border-radius: 20px; border: 1px solid #16213e; max-width: 700px; margin: auto; text-align: center;">
    <h2 style="font-family: 'Helvetica Neue', Arial, sans-serif; color: #e94560; font-size: 32px; margin-bottom: 12px; font-weight: bold;">
        Data Collection
    </h2>
    <p style="font-family: 'Roboto', sans-serif; color: #c4c4c4; font-size: 20px; margin-top: 10px; text-align: center; ">
          Web Scraping using Beautiful Soup 
    </p>
</div>

#### Scraping data from Skytrax
#### Link: https://www.airlinequality.com/airline-reviews/emirates

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import os 

In [2]:
#!pip install beautifulsoup4 requests

In [10]:
#create an empty list to collect reviews, rating stars, collect date and reviewer's country.

reviews  = []
stars    = []
date     = []
country  = []

In [11]:
base_url = "https://www.airlinequality.com/airline-reviews/emirates"
pages = 200
page_size = 10

for i in range(1, pages + 1):
    # Create URL to collect links from paginated data
    url = f"{base_url}/page/{i}/?sortby=post_date%3ADesc&pagesize={page_size}"
    # Collect HTML data from this page
    page = requests.get(url)
    
    if page.status_code != 200:
        print(f"Failed to retrieve page {i}")
        continue

    # Parse content
    soup = BeautifulSoup(page.content, 'html.parser')

    # Extract reviews
    for item in soup.find_all("div", class_="text_content"):
        reviews.append(item.text.strip())

    # Extract star ratings
    for item in soup.find_all("div", class_="rating-10"):
        try:
            stars.append(item.find("span").text.strip())
        except AttributeError:
            print(f"Error on page {i}: Unable to extract star rating")
            stars.append("None")

    # Extract dates
    for item in soup.find_all("time"):
        date.append(item.text.strip())
        
    # Extract countries
    for item in soup.find_all("h3"):
        country_span = item.find("span")
        if country_span:
            country.append(country_span.next_sibling.text.strip(" ()"))
        else:
            print(f"Error on page {i}: Unable to extract country")
            country.append("None")

In [12]:
print(f"Total reviews: {len(reviews)}")
print(f"Total stars: {len(stars)}")
print(f"Total dates: {len(date)}")
print(f"Total countries: {len(country)}")

Total reviews: 2000
Total stars: 2200
Total dates: 2000
Total countries: 2000


In [13]:
stars = stars[:2000]

In [17]:
#create  a dataframe from these collected lists of data
df = pd.DataFrame({"reviews":reviews,"stars": stars, "date":date, "country": country})
df.head()

Unnamed: 0,reviews,stars,date,country
0,✅ Trip Verified | I flew a return trip DXB-L...,5,12th August 2024,United Arab Emirates
1,✅ Trip Verified | We were 2 people travellin...,10,11th August 2024,India
2,"Not Verified | First time flying Emirates, w...",4,29th July 2024,Netherlands
3,✅ Trip Verified | Ground Staff and Service Fa...,5,18th July 2024,Singapore
4,Not Verified | The staff are friendly and the...,1,13th July 2024,Hong Kong


In [18]:
df.shape

(2000, 4)

In [25]:
# Storing reviews into a csv format
df.to_csv("Emirates_reviews.csv",index=False)