This notebook contains the code used in part 1 of the British Airways data analysis job simulation on Forage.
Libraries utilized:
    - requests
    - BeautifulSoup
    - pandas
    - matplotlib
    - nltk
    - wordcloud

In [None]:
#!/.venv/bin/python3

import re
from pathlib import Path

import matplotlib.pyplot as plt
import nltk
import pandas as pd
import requests
from bs4 import BeautifulSoup
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from wordcloud import WordCloud, STOPWORDS

Grab review content

In [None]:
base_url = "https://www.airlinequality.com/airline-reviews/british-airways"
pages = 10
page_size = 100
request_timeout = 30  # seconds; without a timeout a stalled connection blocks the cell forever

reviews = []

for i in range(1, pages + 1):

    print(f"Scraping page {i}")

    # Create URL to collect links from paginated data
    url = f"{base_url}/page/{i}/?sortby=post_date%3ADesc&pagesize={page_size}"

    # Collect HTML data from this page; fail loudly on HTTP errors so an
    # error page is never silently parsed as "zero reviews"
    response = requests.get(url, timeout=request_timeout)
    response.raise_for_status()

    # Parse content and keep the text of every review body on the page
    parsed_content = BeautifulSoup(response.content, 'html.parser')
    reviews.extend(
        para.get_text()
        for para in parsed_content.find_all("div", {"class": "text_content"})
    )

    print(f"   ---> {len(reviews)} total reviews")

In [None]:
# Store the scraped reviews in a single-column dataframe and save them to disk
df = pd.DataFrame({"reviews": reviews})

# Make sure the output directory exists; to_csv raises OSError otherwise
Path("data").mkdir(parents=True, exist_ok=True)

# index=False keeps the positional index out of the file, so reading it back
# does not produce a stray "Unnamed: 0" column
df.to_csv("data/BA_reviews.csv", index=False)

# Preview goes last so the notebook actually renders it as the cell output
df.head()

Clean the review strings

In [None]:
raw_df = pd.read_csv("data/BA_reviews.csv")

# Keep only the text after the first '|' of each review (presumably a
# "verified"-style banner precedes it -- confirm against the scraped data).
# split(..., 1)[-1] returns the whole string when no '|' is present, so such
# reviews are kept intact instead of raising ValueError; missing rows are
# dropped because they carry no text to analyse.
cleaned_list = [
    str(raw_review).split('|', 1)[-1].strip()
    for raw_review in raw_df['reviews'].dropna()
]

clean_df = pd.DataFrame({'reviews': cleaned_list})
display(clean_df)

# Join with a space so the last word of one review and the first word of the
# next are not fused into a single bogus token in the word cloud
full_text = " ".join(clean_df['reviews'])

Reviews are now ready for basic analysis. First we generate a word cloud to identify review topics of interest, then we run sentiment analysis.

In [None]:
# Terms that show up in nearly every review of this airline and therefore say
# little about what the reviewers actually care about.
# ("seat", "seats" and "hour" are currently NOT filtered out.)
airline_terms = ["ba", "flight", "british", "airway", "london", "airways", "airline", "heathrow"]
wc_stopwords = set(STOPWORDS).union(airline_terms)

# Build the word cloud from the concatenated review text
cloud = WordCloud(stopwords=wc_stopwords).generate(full_text)

# Render it without axis ticks or frame
fig, ax = plt.subplots()
ax.imshow(cloud, interpolation='bilinear')
ax.axis('off')
plt.show()

In [None]:
# Download only the NLTK resources this notebook uses; the 'all' collection
# pulls in every corpus and model NLTK ships, which is far more than needed.
nltk_resources = [
    'vader_lexicon',  # SentimentIntensityAnalyzer
    'punkt',          # word_tokenize
    'punkt_tab',      # word_tokenize on newer NLTK releases -- TODO confirm against installed version
    'stopwords',      # nltk.corpus.stopwords
    'wordnet',        # WordNetLemmatizer
    'omw-1.4',        # needed alongside wordnet by some NLTK releases -- TODO confirm
]

for resource in nltk_resources:
    # nltk.download returns False instead of raising when a fetch fails
    if not nltk.download(resource, quiet=True):
        print(f"Warning: could not download NLTK resource '{resource}'")

In [None]:
# Initialize the VADER sentiment analyzer once; get_sentiment() below reuses
# this single module-level instance for every review it scores.
analyzer = SentimentIntensityAnalyzer()

# Helper method, used below
def preprocess_text(text):
    """Lowercase, tokenize, remove English stopwords from, and lemmatize ``text``.

    Args:
        text: The review text to normalise.

    Returns:
        The surviving lemmatized tokens joined by single spaces.
    """

    # Load the stopwords once per call as a set, rather than re-reading the
    # list and scanning it linearly for every single token.
    stop_words = set(nltk.corpus.stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    tokens = word_tokenize(text.lower())
    lemmatized_tokens = [
        lemmatizer.lemmatize(token)
        for token in tokens
        if token not in stop_words
    ]

    return ' '.join(lemmatized_tokens)

# Helper method, used below
def get_sentiment(text):
    """Label ``text`` as 1 (positive) or 0 (negative) from its VADER compound score.

    Only a strictly positive compound score yields 1, so a score of exactly
    zero is labelled 0.
    """

    compound = analyzer.polarity_scores(text)['compound']
    if compound > 0:
        return 1
    return 0

In [None]:
# Normalise every review, then attach a binary sentiment label computed from
# the normalised text
clean_df['pp_reviews'] = clean_df['reviews'].map(preprocess_text)
clean_df['sentiment'] = clean_df['pp_reviews'].map(get_sentiment)
display(clean_df)

# Save the analyzed copy to disk
clean_df.to_csv("data/BA_reviews_w_compound.csv")

In [None]:
# Output positive and negative totals. The denominator comes from the data
# rather than a hard-coded 1000, so the figures stay correct when the scrape
# returns a different number of reviews.
total_reviews = len(clean_df)
positive_reviews = int(clean_df['sentiment'].sum())
negative_reviews = total_reviews - positive_reviews
print('{} / {} reviews were deemed positive\n{} / {} thus were deemed negative'.format(
    positive_reviews, total_reviews, negative_reviews, total_reviews))

In [None]:
# Further sentiment analysis on keywords identified by wordcloud
wc_keywords = ['service', 'time', 'food', 'seat', 'staff', 'crew', 'return', 'experience', 'luggage', 'cabin', 'hour']

# For each keyword: how many reviews mention it ('count') and how many of
# those reviews were labelled positive ('sum')
kw_counts = {keyword: {'count': 0, 'sum': 0} for keyword in wc_keywords}

# Look for key words and record score of relevant reviews
for review, sentiment in zip(clean_df['reviews'], clean_df['sentiment']):

    # Compare in lowercase so capitalised mentions ("Food was cold") count too.
    # NOTE: this is still a substring match, so 'time' also matches 'sometimes'.
    review_lower = str(review).lower()

    for keyword in wc_keywords:
        if keyword in review_lower:
            kw_counts[keyword]['count'] += 1
            kw_counts[keyword]['sum'] += int(sentiment)

for kw_topic, values in kw_counts.items():
    # A keyword that never appears has no average; report that instead of
    # dividing by zero
    if values['count'] == 0:
        print('{} score: no matching reviews'.format(kw_topic))
        continue
    print('{} score: {}/{}: {:.2%}'.format(kw_topic, values['sum'], values['count'], values['sum'] / values['count']))

Plot keyword sentiments

In [None]:
# Keywords that matched no review have no average score; leave them out so
# neither the sort key nor the bar heights divide by zero
matched_kw = {kw: stats for kw, stats in kw_counts.items() if stats['count'] > 0}

# Order keywords from most to least positive (share of positive reviews)
sorted_by_sent = dict(sorted(matched_kw.items(), key = lambda item: item[1]['sum'] / item[1]['count'], reverse = True))
avg_scores = [stats['sum'] / stats['count'] for stats in sorted_by_sent.values()]

fig, ax = plt.subplots()
fig.set_figwidth(15)
bar_container = ax.bar(list(sorted_by_sent.keys()), avg_scores)
ax.set(ylabel='Average Score', title='Review Sentiment Score by Keyword', ylim=(0, 1))
# Annotate each bar with its score as a percentage
ax.bar_label(bar_container, fmt='{:.2%}')

plt.show()