# Airlines Insights

---

## Web scraping and analysis

This Jupyter notebook includes some code to get you started with web scraping. We use a package called `BeautifulSoup` to collect the data from the web. The collected data is then saved to a local `.csv` file, which serves as the starting point for the analysis.

### Scraping data from Skytrax

The data comes from [https://www.airlinequality.com](https://www.airlinequality.com). For this task, we are only interested in reviews related to Alaska Airlines and the airline itself.

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re

In [None]:
# Scrape paginated Skytrax reviews for one airline.
#
# Results are collected in module-level lists that later cells consume:
#   reviews   - text of every <div class="text_content"> element
#   traveller - flat list of "Category: value" strings; for each
#               <div class="review-stats"> element the matched categories are
#               appended in the order they appear in `regex_categories`
base_url = "https://www.airlinequality.com/airline-reviews/alaska-airlines"
pages = 50
page_size = 100
# Seconds to wait for the server; without a timeout requests.get() can block
# indefinitely on a stalled connection.
request_timeout = 30

reviews = []
traveller = []
labels_star = []

# Regular expression patterns for each category. Built once, outside the loops,
# because the mapping is the same for every review.
regex_categories = {'Type Of Traveller': r'Type Of Traveller(.+)',
                    'Seat Type': r'Seat Type(.+)',
                    'Route': r'Route(.+)',
                    'Date Flown': r'Date Flown(.+)',
                    'Seat Comfort': r'Seat Comfort(\d+)',
                    'Cabin Staff Service': r'Cabin Staff Service(\d+)',
                    'Food & Beverages': r'Food & Beverages(\d+)',
                    'Inflight Entertainment': r'Inflight Entertainment(\d+)',
                    'Ground Service': r'Ground Service(\d+)',
                    'Wifi & Connectivity': r'Wifi & Connectivity(\d+)',
                    'Value For Money': r'Value For Money(\d+)',
                    'Recommended': r'Recommended(.+)'
                    }

for i in range(1, pages + 1):

    print(f"Scraping page {i}")

    # Create URL to collect links from paginated data
    url = f"{base_url}/page/{i}/?sortby=post_date%3ADesc&pagesize={page_size}"

    # Collect HTML data from this page. A failed request is reported and the
    # page skipped, so an error page is never parsed as if it held reviews and
    # the data gathered so far is kept.
    try:
        response = requests.get(url, timeout=request_timeout)
        response.raise_for_status()
    except requests.RequestException as exc:
        print(f"   ---> skipping page {i}: {exc}")
        continue

    # Parse content
    content = response.content
    parsed_content = BeautifulSoup(content, 'html.parser')

    # Reviews
    page_reviews = [
        para.get_text()
        for para in parsed_content.find_all("div", {"class": "text_content"})
    ]
    if not page_reviews:
        # NOTE(review): a page without any review text is taken to mean we ran
        # past the last available page, so the remaining requests are skipped.
        print(f"   ---> no reviews on page {i}, stopping")
        break
    reviews.extend(page_reviews)

    # Travel Stats
    for stats in parsed_content.find_all("div", {"class": "review-stats"}):
        # Extract text content from the stats element
        stats_text = stats.get_text()

        # Record every category whose pattern matches, as "Category: value"
        for category, regex_pattern in regex_categories.items():
            match = re.search(regex_pattern, stats_text)
            if match:
                traveller.append(f"{category}: {match.group(1).strip()}")

    print(f"   ---> {len(reviews)} total reviews")

In [None]:
# Rebuild one dict per review from the flat `traveller` list of
# "Category: value" strings, then turn the dicts into a DataFrame.
#
# The scraping cell appends only the categories that matched, so the number of
# entries per review can vary and fixed-size slicing would mix fields from
# neighbouring reviews. Entries are always emitted in the category order below,
# so a new review starts whenever a category repeats or the order moves
# backwards.
# NOTE(review): a review with no matched category at all leaves no entry in
# `traveller` and therefore cannot be detected here.
category_order = [
    'Type Of Traveller',
    'Seat Type',
    'Route',
    'Date Flown',
    'Seat Comfort',
    'Cabin Staff Service',
    'Food & Beverages',
    'Inflight Entertainment',
    'Ground Service',
    'Wifi & Connectivity',
    'Value For Money',
    'Recommended',
]
category_rank = {name: position for position, name in enumerate(category_order)}

data_dicts = []
data_dict = {}
previous_rank = -1

for entry in traveller:
    # partition() splits on the first ": " only, so a value that itself
    # contains ": " stays intact instead of breaking the unpacking.
    key, _, value = entry.partition(': ')
    rank = category_rank.get(key)

    starts_new_review = key in data_dict or (
        rank is not None and rank <= previous_rank
    )
    if data_dict and starts_new_review:
        data_dicts.append(data_dict)
        data_dict = {}
        previous_rank = -1

    data_dict[key] = value
    if rank is not None:
        previous_rank = rank

# Flush the last review, which has no following entry to close it.
if data_dict:
    data_dicts.append(data_dict)

df_traveller = pd.DataFrame(data_dicts)
df_traveller

In [None]:
# Put the review texts in a "Reviews" column and attach the traveller columns
# by row position (left join on the index of both frames).
reviews_frame = pd.DataFrame().assign(Reviews=reviews)
df = pd.merge(
    reviews_frame,
    df_traveller,
    how='left',
    left_index=True,
    right_index=True,
)
df

In [None]:
from datetime import datetime
from pathlib import Path

# Write the combined DataFrame `df` to a timestamped CSV file.
# The target directory is created first: to_csv() does not create missing
# directories and would otherwise fail on a fresh checkout.
output_dir = Path("alaska-airlines")
output_dir.mkdir(parents=True, exist_ok=True)

timestamp = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
df.to_csv(output_dir / f"AlaskaAirlinesReviews{timestamp}.CSV", index=False)