## Web Scraping

In [None]:
# necessary libraries
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import pandas as pd
import re
from time import sleep

In [None]:
# Schema of the review table.
columns = [
    # fields that get_comment reads from the review header and body
    'Title', 'Author', 'Country', 'Date', 'Verified', 'Comment',
    # remaining fields; presumably these match the row headers of the
    # site's review-stats table (verify against the scraped pages)
    'Type Of Traveller', 'Seat Type', 'Route', 'Date Flown',
    'Seat Comfort', 'Cabin Staff Service', 'Food & Beverages',
    'Inflight Entertainment', 'Ground Service', 'Wifi & Connectivity',
    'Value For Money', 'Recommended',
]

# Start from an empty table that has exactly this schema.
df = pd.DataFrame(columns=columns)
df

In [None]:
# function for getting data from one post
def get_comment(soup):
    """Extract the fields of a single review into a flat dictionary.

    Parameters
    ----------
    soup : BeautifulSoup element
        Parsed markup of one review.

    Returns
    -------
    dict
        Keys ``Title``, ``Author``, ``Country``, ``Date``, ``Verified`` and
        ``Comment``, plus one key per row of the review-stats table, named
        after that row's header text. A stats row maps to its text value
        when it has a ``review-value`` cell, to the number of filled stars
        when it has a ``review-rating-stars`` cell, and to ``None`` when it
        has neither. ``Verified`` is ``True``/``False`` when the comment
        carries a ``<strong>`` marker and ``'Not Specified'`` otherwise.

    Raises
    ------
    AttributeError, TypeError, KeyError
        If one of the header elements (title, author, country, time,
        comment body) is missing from the markup.
    """
    data = {}  # storing in one dictionary

    # Title of the comment
    data["Title"] = soup.find('h2', class_='text_header').text
    # The name of the commenter
    data["Author"] = soup.find('span', attrs={'itemprop': "name"}).text

    # The country of the commenter: it is the element's own text, so only
    # direct strings are joined (child elements are skipped).
    # [1:-1] -> because the country is written in parentheses
    country = soup.find("h3", class_='userStatusWrapper')
    data["Country"] = "".join(country.find_all(string=True, recursive=False)).strip()[1:-1]

    # Date of the comment (taken from the datetime attribute of the time element)
    data["Date"] = soup.find('time')['datetime']

    # The comment container
    text_content = soup.find('div', class_='text_content')

    # Verification marker: absent on some reviews, so test for it explicitly
    # instead of relying on an exception.
    marker = text_content.find('strong')
    if marker is None:
        data['Verified'] = 'Not Specified'
    else:
        data['Verified'] = marker.text.strip() == 'Trip Verified'

    # The comment itself, again without the text of child elements;
    # everything up to and including the first "|" is dropped.
    own_text = "".join(text_content.find_all(string=True, recursive=False))
    data["Comment"] = own_text[own_text.find("|") + 1:].strip()

    # Review stats table: one entry per row, keyed by the row header
    review_stats = soup.find('div', class_='review-stats')
    rows = review_stats.find_all('tr') if review_stats is not None else []
    for row in rows:
        header = row.find('td', class_='review-rating-header')
        if header is None:
            # A row without a header has no key to store its value under.
            continue
        key = header.text

        value_cell = row.find('td', class_='review-value')
        if value_cell is not None:
            data[key] = value_cell.text
            continue

        stars_cell = row.find('td', class_='review-rating-stars')
        if stars_cell is None:
            data[key] = None
            continue

        # Star rating: count the spans whose class list contains 'fill'
        data[key] = sum(
            1
            for span in stars_cell.find_all('span')
            if 'fill' in (span.get('class') or [])
        )
    return data

In [None]:
# Scrape the British Airways review listing page by page.
# NOTE(review): the start page is hard-coded to 355, which looks like a
# resume point from an interrupted run - confirm whether a fresh scrape
# should start at page 1.
page_num = 355
records = []  # one dict per review; appended to `df` in one go at the end

driver = webdriver.Chrome()
try:
    while True:
        url = "https://www.airlinequality.com/airline-reviews/british-airways/page/{}/".format(page_num)
        driver.get(url)
        sleep(2)  # fixed pause before reading the page source
        # NOTE(review): 'html' lets BeautifulSoup pick whichever HTML parser
        # is installed; consider naming one explicitly for reproducibility.
        soup = BeautifulSoup(driver.page_source, 'html')

        comments = soup.find_all('div', class_='body')
        total = soup.find('div', class_='pagination-total')
        if not comments or total is None:
            # Past the last page (or an unexpected page): stop instead of
            # failing on a missing element or looping forever.
            print("Stopping at page {}: no reviews found".format(page_num))
            break

        records.extend(get_comment(com) for com in comments)

        # Total number of reviews, read from the pagination summary text
        num_of_reviews = int(total.text.split(" ")[-2])
        if num_of_reviews <= len(df.index) + len(records):
            break
        page_num += 1
finally:
    # Always release the browser, and keep whatever was scraped so far even
    # if a page failed part-way through.
    driver.quit()
    if records:
        new_rows = pd.DataFrame(records, columns=df.columns)
        df = new_rows if df.empty else pd.concat([df, new_rows], ignore_index=True)
df

In [None]:
# Drop rows that are exact duplicates of an earlier row (existing index labels are kept).
df = df.drop_duplicates()

## RoBERTa Sentiment Analysis

In [None]:
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer, AutoConfig
import numpy as np
from scipy.special import softmax
# Preprocess text (username and link placeholders)
def preprocess(text):
    """Replace user mentions and links in `text` with fixed placeholders.

    The text is split on single spaces. A token that starts with ``@`` and
    is longer than one character becomes ``@user``; a token that starts
    with ``http`` becomes ``http``; every other token is kept unchanged.
    The tokens are joined back with single spaces.
    """
    def _placeholder(token):
        if len(token) > 1 and token.startswith('@'):
            return '@user'
        if token.startswith('http'):
            return 'http'
        return token

    return " ".join(map(_placeholder, text.split(" ")))
# Model identifier shared by the classifier, its config and its tokenizer.
MODEL = "cardiffnlp/twitter-roberta-base-sentiment-latest"

# PT (PyTorch) sequence-classification model
model = AutoModelForSequenceClassification.from_pretrained(MODEL)
#model.save_pretrained(MODEL)

# The config supplies the id -> label mapping; the tokenizer encodes the input text.
config = AutoConfig.from_pretrained(MODEL)
tokenizer = AutoTokenizer.from_pretrained(MODEL)
def sentiment(text):
    """Score `text` with the module-level sentiment classifier.

    The text is passed through `preprocess`, encoded with `tokenizer` as
    PyTorch tensors and fed to `model`; a softmax is applied to the first
    row of the model's first output.

    Returns a dict mapping each label from `config.id2label` to its softmax
    score, inserted from the highest score to the lowest.

    NOTE(review): the tokenizer is called without truncation, so very long
    texts presumably fail inside the model - the calling loop reports such
    rows as "too long".
    """
    tokens = tokenizer(preprocess(text), return_tensors='pt')
    logits = model(**tokens)[0][0].detach().numpy()
    probabilities = softmax(logits)

    # argsort is ascending, so reverse it to visit labels best-first
    best_first = np.argsort(probabilities)[::-1]
    return {config.id2label[idx]: probabilities[idx] for idx in best_first}

In [None]:
# Placeholder values; a row keeps them when its comment cannot be scored.
df['positive'] = ""
df['neutral'] = ""
df['negative'] = ""

# Score every comment and store the three class scores on its row.
for index, row in df.iterrows():
    try:
        results = sentiment(row['Comment'])
        df.at[index, 'positive'] = results['positive']
        df.at[index, 'neutral'] = results['neutral']
        df.at[index, 'negative'] = results['negative']
    except Exception:
        # Best effort: report the row and move on. `index` is formatted
        # into the message (it is not a string, so plain concatenation
        # would itself raise and abort the whole loop).
        print(f"-------------\n{index} -> too long\n ------------")
        continue
    print(index)

In [None]:
# Save the reviews together with their sentiment score columns.
df.to_excel("british_airways_sentiment.xlsx")

## Topic Modeling (BERTopic)

In [None]:
# Data processing
import pandas as pd
import numpy as np

# Text preprocessing
import nltk
# Download the NLTK resources used below: the stopword list and the
# WordNet data (plus the 'omw-1.4' package) for the lemmatizer.
nltk.download('stopwords')
nltk.download('omw-1.4')
nltk.download('wordnet')
# Lemmatizer used later to build the 'review_lemmatized' column
wn = nltk.WordNetLemmatizer()

# Topic model
from bertopic import BERTopic

# Dimension reduction
from umap import UMAP

In [None]:
# Load NLTK's English stopword list and print it for inspection
# (the words are actually removed from the reviews in the next cell)
stopwords = nltk.corpus.stopwords.words('english')
print(f'There are {len(stopwords)} default stopwords. They are {stopwords}')

In [None]:
# A set gives constant-time membership tests; `stopwords` itself stays a
# list for any other cell that uses it.
stopword_set = set(stopwords)

# Remove stopwords (matched case-insensitively)
df['review_without_stopwords'] = df['Comment'].apply(
    lambda x: ' '.join(w for w in x.split() if w.lower() not in stopword_set)
)

# Lemmatization of the remaining words (exact-case stopword matches are
# filtered once more, as before)
df['review_lemmatized'] = df['review_without_stopwords'].apply(
    lambda x: ' '.join(wn.lemmatize(w) for w in x.split() if w not in stopword_set)
)

# Renumber the rows 0..n-1; the previous index is kept as a column.
# NOTE(review): re-running this cell adds another index column each time.
df = df.reset_index()
# Take a look at the data
df.head()

In [None]:
# Initiate UMAP (dimension reduction step handed to BERTopic below).
# random_state is fixed so repeated runs use the same seed.
umap_model = UMAP(n_neighbors=15, 
                  n_components=5, 
                  min_dist=0.0, 
                  metric='cosine', 
                  random_state=100)

# Initiate BERTopic with the UMAP model above.
# NOTE(review): nr_topics=15 presumably caps/reduces the number of topics - confirm against the BERTopic docs.
topic_model = BERTopic(umap_model=umap_model, language="english", calculate_probabilities=True, nr_topics=15)

# Run BERTopic model on the lemmatized reviews; returns the per-document
# topics and probabilities.
topics, probabilities = topic_model.fit_transform(df['review_lemmatized'])

In [None]:
# Show the fitted model's topic overview table
topic_model.get_topic_info()

In [None]:
# Get the topic predictions (a copy of the model's topic list)
topic_prediction = topic_model.topics_[:]

# Save the predictions in the dataframe.
# Assumes topics_ is in the same order as the documents passed to
# fit_transform, i.e. the current row order of df - TODO confirm.
df['topic_prediction'] = topic_prediction

# Take a look at the data
df.head()

In [None]:
df_topic = topic_model.get_topic_info()
# Readable topic label: drop the first '_'-separated piece of 'Name' and
# join the remaining pieces with '-'. Done for the whole column at once
# instead of writing cell by cell inside an iterrows loop.
df_topic['NameClear'] = df_topic['Name'].map(lambda name: "-".join(name.split('_')[1:]))
df_topic

In [None]:
# Attach the topic overview columns to every review: left join of each
# review's predicted topic on the overview table's 'Topic' column.
merged = df.merge(df_topic, how='left', left_on='topic_prediction', right_on="Topic")
merged

In [None]:
# Keep only the reviews whose topic is not -1.
# NOTE(review): -1 is presumably BERTopic's outlier topic - confirm.
merged_final = merged[merged['Topic'] != -1]
merged_final

In [None]:
# Save the reviews with their sentiment scores and topic columns.
merged_final.to_excel("british_airways.xlsx")

## Editing dataset

In [1]:
import pandas as pd
# Reload the saved dataset; the index written by to_excel comes back as
# the 'Unnamed: 0' column.
df = pd.read_excel("british_airways.xlsx")
df

Unnamed: 0,Title,Author,Country,Date,Verified,Comment,Type Of Traveller,Seat Type,Route,Date Flown,...,Ground Service,Wifi & Connectivity,Value For Money,Recommended,positive,neutral,negative,Topic,Representative_Docs,NameClear
0,"""Total garbage""",Cosmin Stefanescu,Romania,2023-06-16,False,"I flew with numerous airlines, but I gotta adm...",Solo Leisure,Economy Class,Bucharest to Dallas via London,June 2023,...,1.0,1.0,1,no,0.007509,0.064647,0.927844,1,['itinerary supposed Las Vegas-Chicago-London-...,flight-ba-customer-told
1,"""so callous and uncaring""",Jamie Gooding,Australia,2023-06-12,True,Flight at 8.40am from DUB to LCY cancelled 115...,Solo Leisure,Economy Class,Dublin to London City,June 2023,...,1.0,,1,no,0.008636,0.141682,0.849682,1,['itinerary supposed Las Vegas-Chicago-London-...,flight-ba-customer-told
2,"""never fly British Airway ever again""",George W Edmonds,United States,2023-06-11,True,The customer service is ugly. Tried calling tw...,Solo Leisure,Economy Class,San Francisco to London,June 2023,...,1.0,,1,no,0.005227,0.056151,0.938622,1,['itinerary supposed Las Vegas-Chicago-London-...,flight-ba-customer-told
3,"""I will never fly British Airways again""",K Pickering,United States,2023-06-08,True,I booked our first out of country trip to take...,Couple Leisure,Economy Class,Seattle to Nice via London,June 2023,...,1.0,,2,no,,,,1,['itinerary supposed Las Vegas-Chicago-London-...,flight-ba-customer-told
4,"""customer service was horrendous""",Benjamin Stevens,United States,2023-06-06,False,"May 3, flew from Seattle to Heathrow then to E...",Business,Economy Class,Seattle to Edinburgh via Heathrow,May 2023,...,2.0,1.0,2,no,0.015913,0.278675,0.705413,1,['itinerary supposed Las Vegas-Chicago-London-...,flight-ba-customer-told
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1606,British Airways customer review,Gerard Ward,United Kingdom,2014-05-18,Not Specified,1st Class LHR-PHX. Totally seamless flight. Gr...,,First Class,,,...,,,4,yes,0.981441,0.015133,0.003426,2,['BA got everything right. Allowed evening che...,good-flight-crew-excellent
1607,British Airways customer review,Steve Adolfo,United Kingdom,2014-05-17,Not Specified,Travelling as a family of 4 (2 adults and 2 yo...,,Economy Class,,,...,,,1,no,0.090073,0.700780,0.209147,1,['itinerary supposed Las Vegas-Chicago-London-...,flight-ba-customer-told
1608,British Airways customer review,D Leston,United Kingdom,2014-05-15,Not Specified,MIA-LHR in World Traveller on a 747-400. After...,,Economy Class,,,...,,,4,yes,0.843742,0.131517,0.024741,2,['BA got everything right. Allowed evening che...,good-flight-crew-excellent
1609,British Airways customer review,R Vincent,United Kingdom,2014-05-15,Not Specified,Flew to Vegas on 8th May and dismayed at the s...,,Economy Class,,,...,,,1,no,0.008728,0.136417,0.854855,0,"[""London Hong Kong premium economy. flying BA ...",class-seat-flight-ba


In [None]:
def _sentiment_label(positive, neutral, negative):
    """Name the class with the highest score.

    Ties go to 'Positive' first, then 'Neutral'. Returns "" when any of
    the three scores is missing, so unscored rows get no label instead of
    falling through to 'Negative'.
    """
    if pd.isna(positive) or pd.isna(neutral) or pd.isna(negative):
        return ""
    if positive >= neutral:
        return 'Positive' if positive >= negative else 'Negative'
    return 'Neutral' if neutral >= negative else 'Negative'


# Label each row from its own three scores. (Iterating the frame directly
# yields column names, and comparing whole columns inside `if` raises, so
# the values are taken row by row.)
df['Sentiment Result'] = [
    _sentiment_label(positive, neutral, negative)
    for positive, neutral, negative in zip(df['positive'], df['neutral'], df['negative'])
]