In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


In [2]:
# Load the reviews CSV into a DataFrame; the path is relative to the
# directory the notebook kernel is running in.
data = pd.read_csv("data/BA_reviews.csv")

In [None]:
data.head(4)

In [3]:
# Remove the 'Unnamed: 0' column (presumably a row index written out by an
# earlier to_csv -- verify against the file). Rebinding `data` instead of using
# inplace=True, together with errors='ignore', keeps this cell safe to re-run:
# the in-place version raised KeyError on a second execution because the
# column was already gone.
data = data.drop(columns='Unnamed: 0', errors='ignore')

In [None]:
data.head()

In [4]:
# Strip the leading verification tag from each review, i.e. everything up to
# and including the first "|".
# - n=1 splits only once, so a "|" inside the review body no longer truncates
#   the text.
# - .str[-1] takes the last piece, so a review without any "|" is kept as-is
#   instead of becoming a missing value (the previous expand=True/.get(1) form
#   also set the whole column to None when the cell was executed twice).
data['reviews'] = data['reviews'].str.split("|", n=1).str[-1]

In [None]:
data.head()

##### Preprocess the review column for natural language processing


In [5]:
import nltk 
from nltk.corpus import stopwords
import re 


In [6]:
# Fetch the NLTK resources this notebook relies on (tokenizer data and the
# stopword lists), in the same order as before.
for resource in ('punkt', 'stopwords', 'punkt_tab'):
    nltk.download(resource)

# Keep the English stopwords in a set for fast membership checks.
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package punkt to /home/ksilas/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/ksilas/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /home/ksilas/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [7]:
# text preprocessing function
def preprocess(text):
    """Normalise one review and split it into lowercase, stopword-free tokens.

    Parameters
    ----------
    text : object
        Raw review text. Any value that is not a ``str`` (for example a
        missing review) is treated as an empty review.

    Returns
    -------
    list of str
        The remaining tokens in their original order. An empty list is
        returned for non-string input, so the return type is always a list.
    """
    if not isinstance(text, str):
        # Return an empty token list (not "") so every row has the same type.
        return []

    # Replace every non-word character (punctuation, symbols) with a space.
    # Digits and underscores count as word characters and are kept.
    text = re.sub(r'\W', ' ', text)
    # Drop stand-alone single letters surrounded by whitespace.
    text = re.sub(r'\s+[a-zA-Z]\s+', ' ', text)
    # Drop a single letter at the very start of the text. The caret must be
    # unescaped to act as an anchor; r'\^' matched a literal "^", which can
    # never be present after the \W substitution above.
    text = re.sub(r'^[a-zA-Z]\s+', ' ', text)
    # Collapse runs of whitespace into one space.
    text = re.sub(r'\s+', ' ', text)
    text = text.lower()

    tokens = nltk.word_tokenize(text)
    # Filter against the module-level stopword set.
    return [word for word in tokens if word not in stop_words]

In [8]:
# Tokenise every review with preprocess() and store the token lists in a new
# column next to the raw text, then show the result.
data['processed_reviews'] = data['reviews'].apply(preprocess)
print(data['processed_reviews'])

0       [happy, flight, crew, plane, 20, years, stress...
1       [horrible, service, boarding, landing, flew, l...
2       [wife, disappointed, flying, british, airways,...
3       [flew, ba, heathrow, berlin, one, way, connect...
4       [absolutely, disgusted, ba, flights, cancelled...
                              ...                        
1995    [los, angeles, london, heathrow, starting, cha...
1996    [gatwick, barcelona, unimpressed, new, board, ...
1997    [london, heathrow, rio, de, janeiro, 45, minut...
1998    [london, heathrow, new, york, paid, 250, pre, ...
1999    [flew, british, airways, economy, tampa, athen...
Name: processed_reviews, Length: 2000, dtype: object


In [9]:
# Guarantee that every entry is a list: anything else is replaced by an
# empty token list.
data['processed_reviews'] = data['processed_reviews'].apply(
    lambda tokens: [] if not isinstance(tokens, list) else tokens
)

In [None]:
print(data.processed_reviews.apply(type))

In [10]:
# Plain Python list of the per-review token lists.
token_arrays = list(data['processed_reviews'])

#### Create a Dictionary and Corpus
Build a dictionary (token → integer id) and a bag-of-words corpus from the preprocessed review text.



In [11]:
from gensim import corpora 

In [12]:
# Build the token dictionary from the token lists, then convert each
# review's tokens to its bag-of-words representation via doc2bow.
dictionary = corpora.Dictionary(token_arrays)
corpus = list(map(dictionary.doc2bow, data['processed_reviews']))

In [None]:
print(dictionary)

#### Train LDA Model

In [13]:
from gensim.models.ldamodel import LdaModel


In [14]:
# Fit a 3-topic LDA model on the bag-of-words corpus. random_state is fixed
# so repeated runs start from the same seed.
lda_model = LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=3,
    random_state=100,
    update_every=1,
    passes=10,
    alpha='auto',
)

In [15]:
# Show every topic (num_topics=-1) with its highest-weighted words.
for idx, topic in lda_model.print_topics(num_topics=-1):
    print(f"Topic: {idx}, Words: {topic}")

Topic: 0, Words: 0.024*"flight" + 0.012*"ba" + 0.010*"british" + 0.010*"airways" + 0.009*"london" + 0.007*"service" + 0.007*"us" + 0.007*"get" + 0.007*"customer" + 0.006*"one"
Topic: 1, Words: 0.012*"class" + 0.010*"business" + 0.010*"ba" + 0.008*"flight" + 0.008*"service" + 0.006*"first" + 0.005*"good" + 0.005*"experience" + 0.005*"london" + 0.005*"seat"
Topic: 2, Words: 0.022*"flight" + 0.017*"ba" + 0.009*"service" + 0.009*"food" + 0.008*"crew" + 0.008*"seat" + 0.007*"london" + 0.007*"cabin" + 0.007*"time" + 0.007*"good"


In [None]:
# Get the topic distribution for each document (review).
# Indexing the model with the corpus yields one result per review; this loop
# prints one line for every review (numbered from 1), so the output is as long
# as the corpus.
for i, row in enumerate(lda_model[corpus]):
    print(f"Review {i+1}: {row}")

In [16]:
import pyLDAvis.gensim_models

# Visualize the topics with pyLDAvis inside the notebook.
# NOTE(review): the saved output of this cell is
#   TypeError: drop() takes from 1 to 2 positional arguments but 3 were given
# This looks like an incompatibility between the installed pyLDAvis and a
# newer pandas (DataFrame.drop no longer accepting a positional `axis`) --
# TODO confirm, and if so upgrade pyLDAvis / pin compatible versions so the
# notebook survives Restart & Run All.
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus,dictionary=lda_model.id2word)
pyLDAvis.display(vis)


TypeError: drop() takes from 1 to 2 positional arguments but 3 were given