In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

# Skytrax listing URL for British Airways reviews; "{}" is filled with the page number
BASE_URL = "https://www.airlinequality.com/airline-reviews/british-airways/page/{}/"

# How many listing pages to fetch (raise this for a larger sample)
NUM_PAGES = 5

# Accumulator for scraped rows; scrape_reviews() appends to it
reviews_list = list()

# Function to scrape reviews
def scrape_reviews(num_pages=None, delay=2, timeout=30):
    """Fetch review listing pages and append one row per review to ``reviews_list``.

    Each appended row is ``[title, rating, date, content]`` (all strings), built
    from the elements matched inside every ``<article class="comp_reviews-review">``.

    Parameters
    ----------
    num_pages : int, optional
        Number of listing pages to request, starting at page 1.
        Defaults to the module-level ``NUM_PAGES``.
    delay : float, optional
        Seconds to wait between consecutive page requests (default 2).
    timeout : float, optional
        Per-request timeout in seconds passed to ``requests.get`` (default 30).

    Returns
    -------
    int
        Number of rows appended to ``reviews_list`` by this call.

    Notes
    -----
    Pages that cannot be fetched and reviews with missing fields are skipped,
    but each skip is reported so an empty result is never silent.
    """
    if num_pages is None:
        num_pages = NUM_PAGES

    # Simulate a real user request
    headers = {"User-Agent": "Mozilla/5.0"}
    added = 0

    for page in range(1, num_pages + 1):
        # Delay between requests to avoid bot detection. Sleeping *before* every
        # page except the first also rate-limits after a failed fetch and avoids
        # a pointless wait after the last page.
        if page > 1:
            time.sleep(delay)

        url = BASE_URL.format(page)
        print(f"Scraping page {page}...")

        try:
            # A timeout keeps a stalled connection from hanging the notebook forever
            response = requests.get(url, headers=headers, timeout=timeout)
        except requests.RequestException as exc:
            print(f"Failed to fetch page {page} ({exc}). Skipping...")
            continue

        if response.status_code != 200:
            print(f"Failed to fetch page {page}. Skipping...")
            continue

        # Parse HTML content
        soup = BeautifulSoup(response.text, "html.parser")

        # Find all review containers
        # NOTE(review): the saved outputs of this notebook show zero rows scraped,
        # so these selectors may not match the live page markup - verify them
        # against the actual HTML.
        reviews = soup.find_all("article", class_="comp_reviews-review")
        if not reviews:
            print(f"Warning: no review containers matched on page {page}; "
                  "the selectors may not match the page markup.")

        skipped = 0
        for review in reviews:
            try:
                # NOTE(review): this reads the element marked itemprop="author" but
                # the value is stored under the "Title" column - confirm the intent.
                title = review.find("span", itemprop="author").text.strip()
                # Rating is taken from the suffix of the element's second CSS class
                rating = review.find("div", class_="reviewRating").get("class")[1].split("-")[-1]
                date = review.find("time", itemprop="datePublished").text.strip()
                content = review.find("div", class_="text_content").text.strip()
            except (AttributeError, IndexError):
                # AttributeError: a find() returned None; IndexError: the rating
                # element has fewer than two classes. Skip, but keep count.
                skipped += 1
                continue

            # Store in list
            reviews_list.append([title, rating, date, content])
            added += 1

        if skipped:
            print(f"Warning: skipped {skipped} of {len(reviews)} reviews on page {page} "
                  "because an expected field was missing.")

    return added

# Run scraping function
scrape_reviews()

# Save data to CSV
import os

# Create the "data" directory if it doesn't exist
# (exist_ok=True replaces the check-then-create pair and is safe on re-runs)
os.makedirs("data", exist_ok=True)

# Save data to CSV
df = pd.DataFrame(reviews_list, columns=["Title", "Rating", "Date", "Review"])
df.to_csv("data/british_airways_reviews.csv", index=False)

# Do not claim success when nothing was collected: every later cell reads this
# file and would otherwise process an empty table without any hint of why.
if df.empty:
    print("Warning: no reviews were scraped; 'data/british_airways_reviews.csv' contains only the header row.")
else:
    print("Scraping complete! Data saved to 'data/british_airways_reviews.csv'.")



Scraping page 1...
Scraping page 2...
Scraping page 3...
Scraping page 4...
Scraping page 5...
Scraping complete! Data saved to 'data/british_airways_reviews.csv'.


In [None]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import os

# Resources needed by clean_text(): stop-word list, tokenizer models, WordNet
nltk.download("stopwords")
nltk.download("punkt")
# Newer NLTK releases make word_tokenize() load the "punkt_tab" resource instead
# of the pickled "punkt" models; fetch both so the cell works on either version.
nltk.download("punkt_tab")
nltk.download("wordnet")

# Load original raw data
df = pd.read_csv("data/british_airways_reviews.csv")

# Built once here so clean_text() does not rebuild them for every review
stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

def clean_text(text):
    """Normalise a single review string for word-level analysis.

    Missing values (anything ``pd.isna`` reports as missing) yield ``""``.
    Otherwise the text is lowercased, characters that are neither word
    characters nor whitespace are removed, the result is tokenized, tokens in
    ``stop_words`` are dropped, and the remaining tokens are lemmatized and
    joined with single spaces.
    """
    if pd.isna(text):
        return ""

    # Lowercase first, then strip punctuation in one pass
    stripped = re.sub(r"[^\w\s]", "", text.lower())

    kept = []
    for token in word_tokenize(stripped):
        if token in stop_words:
            continue
        kept.append(lemmatizer.lemmatize(token))

    return " ".join(kept)

# Add the normalised text as a new column next to the raw review
df["Cleaned_Review"] = df["Review"].apply(clean_text)

# Keep only rows whose cleaned text is non-empty after trimming whitespace
has_text = df["Cleaned_Review"].str.strip().astype(bool)
df = df[has_text]

# Persist the cleaned table for the later cells
df.to_csv("data/cleaned_british_airways_reviews.csv", index=False)
print("✅ Cleaned reviews ready!")


✅ Cleaned reviews ready!


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
import pandas as pd
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk

# Download VADER lexicon
nltk.download("vader_lexicon")

# Load cleaned dataset (it still carries the original "Review" column)
df = pd.read_csv("data/cleaned_british_airways_reviews.csv")

# Initialize sentiment analyzer
sid = SentimentIntensityAnalyzer()

# Score the ORIGINAL review text rather than "Cleaned_Review".
# VADER's rules use punctuation, capitalisation and words such as "not", "no"
# and "but"; the cleaning step lowercases the text, strips punctuation and
# removes NLTK stop words, so scoring the cleaned text discards exactly the
# signals VADER relies on (e.g. "not good" would be scored as "good").
raw_reviews = df["Review"].fillna("").astype(str)
df["Sentiment_Score"] = raw_reviews.apply(lambda review: sid.polarity_scores(review)["compound"])

# Map a compound score to a sentiment label
def get_sentiment(score):
    """Return "Positive", "Negative" or "Neutral" for a compound score.

    Scores of 0.05 and above are "Positive", scores of -0.05 and below are
    "Negative", and everything else is "Neutral".
    """
    if score <= -0.05:
        return "Negative"
    if score >= 0.05:
        return "Positive"
    return "Neutral"

# Turn each numeric score into its label
sentiment_labels = df["Sentiment_Score"].apply(get_sentiment)
df["Sentiment_Label"] = sentiment_labels

# Write the scored table to disk and confirm
df.to_csv("data/sentiment_british_airways_reviews.csv", index=False)
print("Sentiment analysis complete! Results saved to 'data/sentiment_british_airways_reviews.csv'.")

Sentiment analysis complete! Results saved to 'data/sentiment_british_airways_reviews.csv'.


[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [None]:
import pandas as pd
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Load cleaned dataset
df = pd.read_csv("data/cleaned_british_airways_reviews.csv")

# Combine all reviews into one big string
text = " ".join(df["Cleaned_Review"].dropna().astype(str).tolist())

if not text.strip():
    # WordCloud.generate() raises "ValueError: We need at least 1 word to plot a
    # word cloud" on empty input; report the real problem instead of crashing.
    print("No review text available - skipping the word cloud. "
          "Check that the scraping and cleaning steps produced rows.")
else:
    # Create the word cloud
    wordcloud = WordCloud(width=1000, height=600, background_color='white', colormap='viridis').generate(text)

    # Display it (explicit figure/axes rather than the pyplot state machine)
    fig, ax = plt.subplots(figsize=(12, 6))
    ax.imshow(wordcloud, interpolation='bilinear')
    ax.axis("off")
    ax.set_title("Most Frequent Words in British Airways Reviews", fontsize=16)
    plt.show()


ValueError: We need at least 1 word to plot a word cloud, got 0.

In [None]:
# Reload the cleaned file and check how much usable text it holds
df = pd.read_csv("data/cleaned_british_airways_reviews.csv")
cleaned_col = df["Cleaned_Review"]
print(cleaned_col.head(10))

# A row counts as non-empty when its text is not missing and not just whitespace
has_text = cleaned_col.dropna().str.strip().astype(bool)
print("Total non-empty reviews:", has_text.sum())


Series([], Name: Cleaned_Review, dtype: object)
Total non-empty reviews: 0


In [None]:
import pandas as pd

# Go back to the scraper's output to see whether any review text was captured
df_raw = pd.read_csv("data/british_airways_reviews.csv")
raw_reviews = df_raw["Review"]

print("Sample raw reviews:\n", raw_reviews.dropna().head(5))
print("Total reviews:", len(df_raw))
print("Missing review text:", raw_reviews.isna().sum())


Sample raw reviews:
 Series([], Name: Review, dtype: object)
Total reviews: 0
Missing review text: 0


In [None]:
import pandas as pd

# Read the scraper's CSV output
df_raw = pd.read_csv("data/british_airways_reviews.csv")
review_col = df_raw["Review"]

# Show the first rows and count the missing entries
print("Sample Raw Reviews:\n", review_col.head(10))
print("Missing Reviews:", review_col.isna().sum())


Sample Raw Reviews:
 Series([], Name: Review, dtype: object)
Missing Reviews: 0
