# **Import Libraries**

In [15]:
import pandas as pd                     # Data Manipulation and Analysis
import requests                         # Web Scraping and HTTP Requests
from bs4 import BeautifulSoup           # Web Scraping
import matplotlib.pyplot as plt         # Data Visualization
import plotly.graph_objects as go       # Data Visualization
import plotly.express as px             # Data Visualization
import re                               # Text Processing
import nltk                             # Text Processing
from nltk.corpus import stopwords       # Text Processing
from nltk.tokenize import word_tokenize # Text Processing
from nltk.stem import WordNetLemmatizer # Text Processing
from nltk import ngrams                 # Text Processing
from nltk.collocations import BigramCollocationFinder, TrigramCollocationFinder, QuadgramCollocationFinder
                                        # Text Processing
from nltk.metrics import BigramAssocMeasures, TrigramAssocMeasures, QuadgramAssocMeasures
                                        # Text Processing
from collections import Counter         # Text Processing

!pip install vaderSentiment             # Install vaderSentiment library
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
                                        # Sentiment Analysis

# Download necessary NLTK resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

# **Data Collection**

In [16]:
# Set the base URL for scraping airline reviews
base_url = "https://www.airlinequality.com/airline-reviews/british-airways"

# Set the number of pages to scrape
pages = 10

# Set the page size (number of reviews per page)
page_size = 350

# Accumulators for the scraped data
reviews = []   # review body text, one entry per "text_content" div
records = []   # one {category: value} dict per "review-stats" block

# NOTE(review): these four lists are never filled by this cell. They are kept
# only in case a cell outside this view still refers to them.
aircraft = []
seat_type = []
route = []
recommended = []

# Scrape the reviews from each page
for i in range(1, pages + 1):
    print(f"Scraping page {i}")

    # Build the URL for the current page
    url = f"{base_url}/page/{i}/?sortby=post_date%3ADesc&pagesize={page_size}"

    # Fetch the page. The timeout stops a stalled connection from hanging the
    # notebook, and raise_for_status() turns an HTTP error page into an
    # exception instead of silently parsing it as "zero reviews".
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    parsed_content = BeautifulSoup(response.content, 'html.parser')

    # Extract the review texts
    for para in parsed_content.find_all("div", {"class": "text_content"}):
        reviews.append(para.get_text())

    # Extract the ratings, recommendation, and categories of every review
    for stats in parsed_content.find_all("div", {"class": "review-stats"}):
        # Plain-text cells; the last one is the recommendation value
        text_values = [
            cell.get_text()
            for cell in stats.find_all('td', {'class': 'review-value'})
        ]
        # Star-rated cells: the rating is the number of filled stars
        star_counts = [
            len(cell.find_all('span', {'class': 'star fill'}))
            for cell in stats.find_all('td', {'class': 'review-rating-stars stars'})
        ]
        # Category names, in page order
        category = [
            cell.get_text()
            for cell in stats.find_all('td', {'class': 'review-rating-header'})
        ]

        # Values are lined up with the headers as: text cells (minus the
        # recommendation), then star ratings, then the recommendation.
        # Slicing (rather than indexing [-1]) keeps this safe when a block
        # has no text cells at all.
        rating = text_values[:-1] + star_counts + text_values[-1:]

        if len(rating) != len(category):
            raise ValueError(
                f"Page {i}: found {len(rating)} rating values for "
                f"{len(category)} categories in a review-stats block"
            )

        records.append(dict(zip(category, rating)))

    print(f"   ---> {len(reviews)} total reviews")

# Build the DataFrame once from all records; categories missing from a
# review become NaN. This avoids re-copying the frame for every review.
df = pd.DataFrame(records)

# Reviews and rating blocks are collected independently, so make sure they
# still pair up one-to-one before joining them.
if len(reviews) != len(df):
    raise ValueError(
        f"Scraped {len(reviews)} review texts but {len(df)} rating blocks; "
        "cannot align them row by row"
    )

# Add the reviews to the DataFrame
df["reviews"] = reviews
df.head()

Scraping page 1
   ---> 350 total reviews
Scraping page 2
   ---> 700 total reviews
Scraping page 3
   ---> 1050 total reviews
Scraping page 4
   ---> 1400 total reviews
Scraping page 5
   ---> 1750 total reviews
Scraping page 6
   ---> 2100 total reviews
Scraping page 7
   ---> 2450 total reviews
Scraping page 8
   ---> 2800 total reviews
Scraping page 9
   ---> 3150 total reviews
Scraping page 10
   ---> 3500 total reviews


Unnamed: 0,Type Of Traveller,Seat Type,Route,Date Flown,Seat Comfort,Cabin Staff Service,Food & Beverages,Inflight Entertainment,Ground Service,Wifi & Connectivity,Value For Money,Recommended,Aircraft,reviews
0,Solo Leisure,Economy Class,Bucharest to Dallas via London,June 2023,2.0,4.0,3.0,3.0,1.0,1.0,1,no,,"Not Verified | I flew with numerous airlines, ..."
1,Family Leisure,Economy Class,Gatwick to Venice,June 2023,3.0,5.0,4.0,,1.0,,3,no,,✅ Trip Verified | We were traveling as a fami...
2,Solo Leisure,Economy Class,Dublin to London City,June 2023,1.0,1.0,,,1.0,,1,no,,✅ Trip Verified | Flight at 8.40am from DUB to...
3,Business,Business Class,Tokyo to Manchester via Heathrow,June 2023,1.0,1.0,1.0,1.0,1.0,1.0,1,no,,✅ Trip Verified | Terrible. I have traveled t...
4,Solo Leisure,Economy Class,San Francisco to London,June 2023,,,,,1.0,,1,no,,✅ Trip Verified | The customer service is ugl...


# **Data Cleaning**

In [17]:
# Remove any rows with missing reviews. The .copy() makes the result an
# independent frame so the column assignments below do not write to a slice.
df = df.dropna(subset=['reviews']).copy()

# Each scraped review looks like "<verification status> | <review text>".
# Split at the FIRST pipe only, so a pipe inside the review body cannot create
# extra columns. A review with no pipe keeps its full text in 'Reviews' and
# gets a missing verification status instead of losing the text.
status_prefix = r'^([^|]*)\|'
df['Verification Status'] = df['reviews'].str.extract(status_prefix, expand=False)
df['Reviews'] = df['reviews'].str.replace(status_prefix, '', regex=True)

# Remove the original reviews column
df = df.drop(columns=['reviews'])

# Remove tick signs, leading and trailing spaces from the verification status.
# regex=False: the marks are literal characters, and the default of `regex`
# differs between pandas versions.
df['Verification Status'] = (
    df['Verification Status']
    .str.replace('✅', '', regex=False)
    .str.replace('❎', '', regex=False)
    .str.strip()
)

# Built once and shared by every preprocess_text call, instead of being
# re-created for each review.
_STOP_WORDS = frozenset(stopwords.words('english'))
_LEMMATIZER = WordNetLemmatizer()


# Define a preprocessing function
def preprocess_text(text):
    """Normalize a review into a space-separated string of lemmatized tokens.

    Steps: lowercase, delete digit runs, replace every non-word character
    with a space, tokenize, drop English stop words, lemmatize, re-join.

    Parameters
    ----------
    text : str or None
        Raw review text. Any non-string value (None, NaN, ...) is treated as
        missing.

    Returns
    -------
    str
        The cleaned text, or '' when the input is missing.
    """
    # Missing values reach this function as None or float NaN
    if not isinstance(text, str):
        return ''

    # Convert text to lowercase
    text = text.lower()

    # Remove numbers and special characters
    text = re.sub(r'\d+', '', text)
    text = re.sub(r'\W', ' ', text)

    # Tokenize, drop stop words, and lemmatize what remains
    tokens = word_tokenize(text)
    return ' '.join(
        _LEMMATIZER.lemmatize(token)
        for token in tokens
        if token not in _STOP_WORDS
    )

# Apply the preprocessing function to the 'Reviews' column
df['Preprocessed_Reviews'] = df['Reviews'].apply(preprocess_text)
df.head()

Unnamed: 0,Type Of Traveller,Seat Type,Route,Date Flown,Seat Comfort,Cabin Staff Service,Food & Beverages,Inflight Entertainment,Ground Service,Wifi & Connectivity,Value For Money,Recommended,Aircraft,Verification Status,Reviews,Preprocessed_Reviews
0,Solo Leisure,Economy Class,Bucharest to Dallas via London,June 2023,2.0,4.0,3.0,3.0,1.0,1.0,1,no,,Not Verified,"I flew with numerous airlines, but I gotta ad...",flew numerous airline got ta admit british air...
1,Family Leisure,Economy Class,Gatwick to Venice,June 2023,3.0,5.0,4.0,,1.0,,3,no,,Trip Verified,We were traveling as a family (5 people). Be...,traveling family people accident airport arriv...
2,Solo Leisure,Economy Class,Dublin to London City,June 2023,1.0,1.0,,,1.0,,1,no,,Trip Verified,Flight at 8.40am from DUB to LCY cancelled 11...,flight dub lcy cancelled pm night text message...
3,Business,Business Class,Tokyo to Manchester via Heathrow,June 2023,1.0,1.0,1.0,1.0,1.0,1.0,1,no,,Trip Verified,Terrible. I have traveled twice with them th...,terrible traveled twice year via business clas...
4,Solo Leisure,Economy Class,San Francisco to London,June 2023,,,,,1.0,,1,no,,Trip Verified,The customer service is ugly. Tried calling ...,customer service ugly tried calling two week a...


# **Sentiment Analysis**

In [18]:
# Initialize the SentimentIntensityAnalyzer
analyzer = SentimentIntensityAnalyzer()

# Compound scores strictly above +threshold are 'Positive', strictly below
# -threshold are 'Negative', everything else is 'Neutral'. A threshold of 0.0
# splits on the sign of the score.
SENTIMENT_THRESHOLD = 0.0


def _sentiment_label(text):
    """Return 'Positive', 'Negative' or 'Neutral' for one review text.

    The label is derived from the 'compound' entry of VADER's
    polarity_scores() for `text`, compared against SENTIMENT_THRESHOLD.
    """
    compound_score = analyzer.polarity_scores(text)['compound']
    if compound_score > SENTIMENT_THRESHOLD:
        return 'Positive'
    if compound_score < -SENTIMENT_THRESHOLD:
        return 'Negative'
    return 'Neutral'


# Remove rows without review text up front (covers both None and NaN),
# rather than dropping rows from the frame while iterating over it.
df = df[df['Reviews'].notna()].copy()

# Label every remaining review in one pass
df['Sentiment'] = df['Reviews'].map(_sentiment_label)
df.head()

Unnamed: 0,Type Of Traveller,Seat Type,Route,Date Flown,Seat Comfort,Cabin Staff Service,Food & Beverages,Inflight Entertainment,Ground Service,Wifi & Connectivity,Value For Money,Recommended,Aircraft,Verification Status,Reviews,Preprocessed_Reviews,Sentiment
0,Solo Leisure,Economy Class,Bucharest to Dallas via London,June 2023,2.0,4.0,3.0,3.0,1.0,1.0,1,no,,Not Verified,"I flew with numerous airlines, but I gotta ad...",flew numerous airline got ta admit british air...,Positive
1,Family Leisure,Economy Class,Gatwick to Venice,June 2023,3.0,5.0,4.0,,1.0,,3,no,,Trip Verified,We were traveling as a family (5 people). Be...,traveling family people accident airport arriv...,Positive
2,Solo Leisure,Economy Class,Dublin to London City,June 2023,1.0,1.0,,,1.0,,1,no,,Trip Verified,Flight at 8.40am from DUB to LCY cancelled 11...,flight dub lcy cancelled pm night text message...,Negative
3,Business,Business Class,Tokyo to Manchester via Heathrow,June 2023,1.0,1.0,1.0,1.0,1.0,1.0,1,no,,Trip Verified,Terrible. I have traveled twice with them th...,terrible traveled twice year via business clas...,Negative
4,Solo Leisure,Economy Class,San Francisco to London,June 2023,,,,,1.0,,1,no,,Trip Verified,The customer service is ugly. Tried calling ...,customer service ugly tried calling two week a...,Negative


# **Visualization**

In [19]:
import plotly.graph_objects as go
import pandas as pd

# Uses the DataFrame 'df' with the 'Sentiment' column built earlier

# Count the number of occurrences for each sentiment category
sentiment_counts = df['Sentiment'].value_counts()

# Fixed colour per sentiment. value_counts() orders categories by frequency,
# so colours are looked up by label rather than assigned by position;
# otherwise the colour of a sentiment would change with the data.
color_palette = {'Positive': '#0074D9', 'Negative': '#FF4136', 'Neutral': '#7FDBFF'}
slice_colors = [color_palette.get(label, 'black') for label in sentiment_counts.index]

# Create a Pie trace with one colour per slice, in the same order as the labels
pie_trace = go.Pie(
    labels=sentiment_counts.index,
    values=sentiment_counts.values,
    marker=dict(colors=slice_colors)
)

# Create a layout with a centred title and a vertical legend on the right
layout = go.Layout(
    title={'text': 'Sentiment Distribution', 'x': 0.5},
    legend={'x': 0.9, 'y': 0.5, 'orientation': 'v'}
)

# Create a Figure object
figure = go.Figure(data=[pie_trace], layout=layout)

# Display the figure
figure.show()

In [20]:
import plotly.graph_objects as go
import pandas as pd

# Uses the DataFrame 'df' with the 'Date Flown' and 'Sentiment' columns

# Convert the 'Date Flown' column to datetime type. Values look like
# "June 2023", so the format is given explicitly instead of being guessed.
# Anything that does not match becomes NaT and is reported, rather than
# aborting the cell or being parsed inconsistently.
raw_dates = df['Date Flown']
parsed_dates = pd.to_datetime(raw_dates, format='%B %Y', errors='coerce')
n_unparsed = int((parsed_dates.isna() & raw_dates.notna()).sum())
if n_unparsed:
    print(f"Warning: {n_unparsed} 'Date Flown' values could not be parsed and are excluded")
df['Date Flown'] = parsed_dates

# Group the data by date and sentiment: one row per date, one column per
# sentiment, cells holding the review count (0 where a combination is absent).
# Rows with a missing date are left out by groupby.
grouped_data = df.groupby([df['Date Flown'].dt.date, 'Sentiment']).size().unstack(fill_value=0)

# Define the color palette
color_palette = {'Negative': '#FF4136', 'Positive': '#0074D9', 'Neutral': '#7FDBFF'}

# Create line plots for each sentiment category
fig = go.Figure()

for sentiment in grouped_data.columns:
    fig.add_trace(go.Scatter(
        x=grouped_data.index,
        y=grouped_data[sentiment],
        name=sentiment,
        mode='lines+markers',
        # Unknown labels fall back to black rather than raising a KeyError
        line=dict(width=2, color=color_palette.get(sentiment, 'black')),
        marker=dict(size=8)
    ))

# Create a layout
layout = go.Layout(
    title={'text': 'Sentiment Distribution over Time', 'x': 0.5, 'xanchor': 'center'},
    xaxis=dict(title='Year'),
    yaxis=dict(title='Count'),
    legend={'x': 0.02, 'y': 0.98},
    plot_bgcolor='rgba(0,0,0,0)'  # fully transparent plot background
)

# Update the figure layout
fig.update_layout(layout)

# Display the figure
fig.show()

In [25]:
# Select the 'Preprocessed_Reviews' column from the DataFrame
preprocessed_reviews = df['Preprocessed_Reviews']

# Initialize the sentiment analyzer
sia = SentimentIntensityAnalyzer()

# Desired n-gram length (2 = word pairs)
n = 2

# Count every n-gram across all reviews first ...
all_ngram_counts = Counter(
    ' '.join(gram)
    for review in preprocessed_reviews
    for gram in ngrams(nltk.word_tokenize(review), n)
)

# ... then score each DISTINCT n-gram once and keep those whose VADER
# compound score is negative. The totals equal scoring every occurrence,
# without re-scoring repeated word pairs.
negative_ngram_counts = Counter({
    gram_text: count
    for gram_text, count in all_ngram_counts.items()
    if sia.polarity_scores(gram_text)['compound'] < 0
})

# Get the top 15 most common negative sentiment-bearing n-grams
top_15_ngrams = negative_ngram_counts.most_common(15)

# Extract the n-grams and their counts. Labels are the n-gram text itself,
# with no trailing whitespace.
ngram_labels = [gram_text for gram_text, _ in top_15_ngrams]
ngram_counts = [count for _, count in top_15_ngrams]

# Create a bar trace
bar_trace = go.Bar(
    x=ngram_labels,
    y=ngram_counts,
    marker=dict(color='#0074D9')
)

# Create a layout
layout = go.Layout(
    title=dict(text='Top 15 Negative Reviews Co-occurring Word Pairs', x=0.5),  # x=0.5 centres the title
    xaxis=dict(
        title='Word Pairs',
        tickangle=45,  # Rotate the x-axis labels by 45 degrees
        automargin=True  # Automatically adjust the margins to fit the labels
    ),
    yaxis=dict(title='Count')
)

# Create a Figure object
figure = go.Figure(data=[bar_trace], layout=layout)

# Display the figure
figure.show()
