In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


In [60]:
data = pd.read_csv("data/BA_reviews.csv")

In [61]:
data.head(4)

Unnamed: 0.1,Unnamed: 0,reviews
0,0,Not Verified | Happy with the flight crew. Ha...
1,1,✅ Trip Verified | Horrible service from boar...
2,2,Not Verified | My wife and I are very disappo...
3,3,Not Verified | We flew BA between Heathrow an...


In [62]:
data.drop(columns='Unnamed: 0', inplace=True)

In [63]:
data.head()

Unnamed: 0,reviews
0,Not Verified | Happy with the flight crew. Ha...
1,✅ Trip Verified | Horrible service from boar...
2,Not Verified | My wife and I are very disappo...
3,Not Verified | We flew BA between Heathrow an...
4,Not Verified | Absolutely disgusted with BA. ...


In [64]:
# clean the reviews column, validation from the text
data.reviews = data.reviews.str.split("|", expand=True).get(1)

In [65]:
data.head()

Unnamed: 0,reviews
0,Happy with the flight crew. Hadn't been on a...
1,Horrible service from boarding to landing. ...
2,My wife and I are very disappointed with fly...
3,We flew BA between Heathrow and Berlin one w...
4,Absolutely disgusted with BA. Our flights we...


##### Preprocess the review column for natural language processing


In [19]:
import nltk 
from nltk.corpus import stopwords
import re 


In [66]:
# download the stopwords 
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('punkt_tab')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package punkt to /home/ksilas/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/ksilas/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /home/ksilas/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [67]:
# text preprocessing function
def preprocess(text):
    # Remove special characters and digits
    if not isinstance(text, str):
        return ""
    text = re.sub(r'\W', ' ', text)
    text = re.sub(r'\s+[a-zA-Z]\s+', ' ', text)  # Remove single characters
    text = re.sub(r'\^[a-zA-Z]\s+', ' ', text)  # Remove single characters from start
    text = re.sub(r'\s+', ' ', text, flags=re.I)  # Replace multiple spaces with a single space
    text = text.lower()  # Lowercase text
    tokens = nltk.word_tokenize(text)  # Tokenize
    tokens = [word for word in tokens if word not in stop_words]  # Remove stopwords
    return tokens

In [68]:
# apply the text preprocessing function to the reviews dataset
data['processed_reviews'] = data.reviews.apply(preprocess)
print(data['processed_reviews'])

0       [happy, flight, crew, plane, 20, years, stress...
1       [horrible, service, boarding, landing, flew, l...
2       [wife, disappointed, flying, british, airways,...
3       [flew, ba, heathrow, berlin, one, way, connect...
4       [absolutely, disgusted, ba, flights, cancelled...
                              ...                        
1995    [los, angeles, london, heathrow, starting, cha...
1996    [gatwick, barcelona, unimpressed, new, board, ...
1997    [london, heathrow, rio, de, janeiro, 45, minut...
1998    [london, heathrow, new, york, paid, 250, pre, ...
1999    [flew, british, airways, economy, tampa, athen...
Name: processed_reviews, Length: 2000, dtype: object


In [102]:
data['processed_reviews'] = data['processed_reviews'].apply(lambda x: x if isinstance(x, list) else [])

In [103]:
print(data.processed_reviews.apply(type))

0       <class 'list'>
1       <class 'list'>
2       <class 'list'>
3       <class 'list'>
4       <class 'list'>
             ...      
1995    <class 'list'>
1996    <class 'list'>
1997    <class 'list'>
1998    <class 'list'>
1999    <class 'list'>
Name: processed_reviews, Length: 2000, dtype: object


In [104]:
token_arrays = data['processed_reviews'].tolist()

#### Create a Dictionary and Corpus
##### create a dictionary and corpus from the preprocessed text data.



In [34]:
from gensim import corpora 

In [106]:
# Create dictionary and corpus
dictionary = corpora.Dictionary(token_arrays)
corpus = [dictionary.doc2bow(review) for review in data['processed_reviews']]

In [107]:
print(dictionary)

Dictionary<10837 unique tokens: ['20', 'airways', 'back', 'british', 'comfortable']...>


#### Train LDA Model

In [114]:
from gensim.models.ldamodel import LdaModel


In [115]:
# Train LDA model
lda_model = LdaModel(corpus=corpus, id2word=dictionary, num_topics=3, random_state=100, update_every=1, passes=10, alpha='auto')

In [121]:
# Print topics
for idx, topic in lda_model.print_topics(-1):
    print(f"Topic: {idx}, Words: {topic}")

Topic: 0, Words: 0.024*"flight" + 0.012*"ba" + 0.010*"british" + 0.010*"airways" + 0.009*"london" + 0.007*"service" + 0.007*"us" + 0.007*"get" + 0.007*"customer" + 0.006*"one"
Topic: 1, Words: 0.012*"class" + 0.010*"business" + 0.010*"ba" + 0.008*"flight" + 0.008*"service" + 0.006*"first" + 0.005*"good" + 0.005*"experience" + 0.005*"london" + 0.005*"seat"
Topic: 2, Words: 0.022*"flight" + 0.017*"ba" + 0.009*"service" + 0.009*"food" + 0.008*"crew" + 0.008*"seat" + 0.007*"london" + 0.007*"cabin" + 0.007*"time" + 0.007*"good"


In [117]:
# Get the topic distribution for each document (review)
for i, row in enumerate(lda_model[corpus]):
    print(f"Review {i+1}: {row}")


Review 1: [(0, 0.3668477), (2, 0.6303414)]
Review 2: [(2, 0.9930403)]
Review 3: [(0, 0.9817645), (2, 0.015543697)]
Review 4: [(2, 0.99783635)]
Review 5: [(0, 0.988964)]
Review 6: [(2, 0.9979297)]
Review 7: [(0, 0.73616606), (2, 0.26331264)]
Review 8: [(2, 0.9985653)]
Review 9: [(1, 0.9895244)]
Review 10: [(0, 0.5786351), (2, 0.42042664)]
Review 11: [(2, 0.9959714)]
Review 12: [(0, 0.89174145), (1, 0.10676103)]
Review 13: [(0, 0.7828107), (2, 0.21656607)]
Review 14: [(0, 0.68784416), (2, 0.31001022)]
Review 15: [(2, 0.9941119)]
Review 16: [(0, 0.78141546), (2, 0.21534134)]
Review 17: [(0, 0.9872739), (2, 0.010780097)]
Review 18: [(2, 0.99110115)]
Review 19: [(0, 0.59551877), (1, 0.39818922)]
Review 20: [(1, 0.97577924), (2, 0.02334016)]
Review 21: [(1, 0.89909345), (2, 0.09707926)]
Review 22: [(0, 0.99665445)]
Review 23: [(0, 0.99285775)]
Review 24: [(2, 0.9921935)]
Review 25: [(2, 0.9979082)]
Review 26: [(0, 0.92712826), (2, 0.07213191)]
Review 27: [(0, 0.99404216)]
Review 28: [(0, 0.7

In [127]:
import pyLDAvis.gensim_models

# Visualize the topics
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus,dictionary=dictionary)
pyLDAvis.show(vis, local=False)


TypeError: drop() takes from 1 to 2 positional arguments but 3 were given