In [1]:
import pandas as pd
import numpy as np
import gensim
import spacy
from sklearn.decomposition import LatentDirichletAllocation as LDA

In [2]:
# load data
df = pd.read_csv('data/BA_reviews_sentiments.csv', index_col=0)

# Preprocessing

1. Standardize some BA specific language: e.g. conver 'Club World' to 'Business Class' etc.

1. Tokenize: Convert each string to a list of tokens (words), removing accents and lower casing all words.

2. Remove stopwords: Remove the standard stop words, plus some extra for this particular use case.

3. Lemmatization: Convert each token to its lemma/dictionary form.

4. Rejoin tokens into strings.

In [3]:
# BA-specific standardization

df['reviews_mod'] = df['reviews'].str.lower()\
    .str.replace('world traveller plus', 'premium economy', regex=False)\
    .str.replace('world traveller', 'economy', regex=False)\
    .str.replace('club europe', 'business class', regex=False)\
    .str.replace('club world', 'business class', regex=False)\
    .str.replace('ife', 'entertainment', regex=False)    

In [4]:
# tokenization

def tokenize(string):
    return gensim.utils.simple_preprocess(string, deacc=True)

df['tokens'] = df.reviews_mod.map(tokenize)

In [5]:
# get stopwords

nlp = spacy.load('en_core_web_sm')
stop_words = nlp.Defaults.stop_words

# add custom stop words
stop_words.add('british')
stop_words.add('airways')

# remove stop words

def remove_stopwords(tokens):
    return [word for word in tokens if word not in stop_words]

df['tokens2'] = df.tokens.map(remove_stopwords)

In [6]:
# convert each token to lemma

def lemmatize(tokens):
    doc = nlp(" ".join(tokens))
    return [token.lemma_ for token in doc]

df['tokens3'] = df.tokens2.map(lemmatize)

In [7]:
# join tokens together to get a single processed string

df['proc_reviews'] = df['tokens3'].str.join(' ')

# Term-frequency matrix

Use a `CountVectorizer` to get the frequency of each token in each review.

In [8]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(
    max_df = 0.5,
    min_df=10,
    max_features=1000,
    ngram_range=(1,3),
    stop_words='english'
)

tf_matrix = vectorizer.fit_transform(df.proc_reviews)

# Build topic model

## Determine number of topics

We run the LDA model for various number of possible topics and measure the quality of topic assignment by perplexity and log-likelihood. Using these, we determine the optimum number of topics.

In [9]:
def build_lda(num_topics, tf_matrix):
    model = LDA(n_components=num_topics)
    model.fit(tf_matrix)
    return model

def evaluate_lda_model(model, tf_matrix):
    return model.perplexity(tf_matrix), model.score(tf_matrix)

In [10]:
num_topics = range(2,11)
for n in num_topics:
    model = build_lda(n, tf_matrix)
    perp, loglike = evaluate_lda_model(model, tf_matrix)
    print(f'With {n} topics, perplexity is: {perp:.2f} | log-likelihood is {loglike:.2f}.')

With 2 topics, perplexity is: 539.21 | log-likelihood is -1220961.39.
With 3 topics, perplexity is: 532.12 | log-likelihood is -1218392.08.
With 4 topics, perplexity is: 528.28 | log-likelihood is -1216983.09.
With 5 topics, perplexity is: 535.44 | log-likelihood is -1219596.52.
With 6 topics, perplexity is: 526.02 | log-likelihood is -1216153.37.
With 7 topics, perplexity is: 531.98 | log-likelihood is -1218338.34.
With 8 topics, perplexity is: 531.18 | log-likelihood is -1218045.94.
With 9 topics, perplexity is: 535.74 | log-likelihood is -1219707.97.
With 10 topics, perplexity is: 538.81 | log-likelihood is -1220814.70.


## Identify the topics - part 1

We use the `pyLDAvis` tool to visualize the topics, and the words appearing in each topic. This helps us identify the topics.

In [11]:
import pyLDAvis
import pyLDAvis.sklearn
pyLDAvis.enable_notebook()

  from imp import reload


In [12]:
num_topics = 6

model = build_lda(num_topics, tf_matrix)

perp, loglike = evaluate_lda_model(model, tf_matrix)
print(f'Perplexity is: {perp:.2f} | log-likelihood is {loglike:.2f}.')

Perplexity is: 528.33 | log-likelihood is -1217001.52.


In [13]:
pyLDAvis.sklearn.prepare(model, tf_matrix, vectorizer)

  default_term_info = default_term_info.sort_values(


## Identify topics - part 2

We view a sample of reviews from each topic.

In [28]:
topics_assigned = model.transform(tf_matrix)
df_topics = pd.DataFrame(topics_assigned, columns=range(1,num_topics+1))
df_merged = pd.merge(
    left=df,
    right=df_topics,
    how='left',
    left_index=True,
    right_index=True
)

In [29]:
for topic in range(1, num_topics+1):
    print(f'Topic {topic}:')
    print(f'Count: {df_merged[df_merged[topic]>0.7].shape[0]}')
    print(f'Average sentiment: {df_merged[df_merged[topic]>0.7]["Sentiment"].mean()}')
    print(f'        Lower quartile: {df_merged[df_merged[topic]>0.7]["Sentiment"].quantile(0.25)}')
    print(f'        Upper quartile: {df_merged[df_merged[topic]>0.7]["Sentiment"].quantile(0.75)}')



Topic 1:
Count: 378
Average sentiment: 0.813143210901746
        Lower quartile: 0.9630305375
        Upper quartile: 0.9996266949999999
Topic 2:
Count: 109
Average sentiment: 0.10565033447073394
        Lower quartile: 0.0006546779
        Upper quartile: 0.010647104
Topic 3:
Count: 374
Average sentiment: 0.028485697181871657
        Lower quartile: 0.0003745331025
        Upper quartile: 0.001160410975
Topic 4:
Count: 288
Average sentiment: 0.11012531239281251
        Lower quartile: 0.00047467214249999996
        Upper quartile: 0.00571839545
Topic 5:
Count: 33
Average sentiment: 0.2468601199939394
        Lower quartile: 0.0007156654
        Upper quartile: 0.31908664
Topic 6:
Count: 135
Average sentiment: 0.17506792356103704
        Lower quartile: 0.00067832223
        Upper quartile: 0.024006553


In [32]:
for topic in range(1, num_topics+1):
    print('-'*20+'\nTopic ', topic)
    print('-'*20)
    for x in df_merged[df_merged[topic]>0.5]['reviews'].sample(5):
        print(x)
        print('-'*20)

--------------------
Topic  1
--------------------
A380 to Hong Kong WT+. Wonderful plane slept for a number of hours (which for me is all but unheard of). Food much better than I was expecting considering that WT+ is supposed to get the same food as WT. Plane left on time and arrived early. Crew friendly - I had little call to ask them for much but any request was quickly attended to. Return from Bangkok incoming flight (and hence departure from BKK) delayed to ATC in UK BA provided voucher which was more than enough for a good sized breakfast. 777 nowhere near as nice as the A380 noisy seats less comfortable and I got barely any sleep on this 13h flight. FAs not as attentive as going out but no real complaints. Poor choice of entertainment on very small screen. Overall quite good and fair value for money in WT+.
--------------------
Cape Town to London. I can report a positive experience on our flight. From the moment we boarded, cabin crew were polite and efficient, and provided a h

# Asign topic names

In [27]:
df_merged = df_merged.rename(columns = {1:'positive', 2:'budget_airlines', 3:'customer_service',4:'other_negative',5:'bags_crew_seats',6:'business_class'})

In [22]:
df_merged[['reviews', 'Verified', 'Sentiment','positive', 'budget_airlines',
       'customer_service', 'other_negative', 'bags_crew_seats',
       'business_class']].to_csv('data/BA_reviews_topics.csv')