### Import necessary libraries

In [69]:
import pandas as pd
import re
import nltk
from sklearn.feature_extraction.text import CountVectorizer
import os


### Download necessary NLTK data files


In [70]:
venv_path = os.path.join(os.environ['VIRTUAL_ENV'], 'nltk_data')
nltk.data.path.append(venv_path)


In [81]:
nltk.download('stopwords', download_dir=venv_path)
nltk.download('punkt_tab', download_dir=venv_path)
nltk.download('wordnet', download_dir=venv_path)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/apple/PROJECTS/github/skytrax-
[nltk_data]     reviews/skytrax_venv/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/apple/PROJECTS/github/skytrax-
[nltk_data]     reviews/skytrax_venv/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/apple/PROJECTS/github/skytrax-
[nltk_data]     reviews/skytrax_venv/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

### Access NLTK components directly through nltk

In [82]:
stopwords = nltk.corpus.stopwords.words('english')
word_tokenize = nltk.word_tokenize
lemmatizer = nltk.WordNetLemmatizer()

### Load dataset

In [73]:
df = pd.read_csv("../data/raw_data/british_airways_raw_reviews.csv")


In [74]:
df.head()

Unnamed: 0,title,author,date,content,type_of_traveller,seat_type,route,date_flown,rating,recommended
0,"""spent two hours trying to make contact with BA""",Paul Mercer,2024-10-25,✅ Trip Verified | On arriving at Mexico Ai...,Business,Business Class,Mexico City to London Heathrow,October 2024,1,no
1,"""using another airline for future travel""",M Stansfield,2024-10-24,✅ Trip Verified | I have flown British Air...,Solo Leisure,Business Class,Paris to Boston via London,July 2024,1,no
2,"""oversold tickets on our flight""",Claude Cahn,2024-10-22,Not Verified | We bought tickets for a Geneva...,Family Leisure,Economy Class,Geneva to London,September 2024,1,no
3,“Appalling service”,Peter Mountford,2024-10-14,✅ Trip Verified | Appalling service with f...,Business,Business Class,Johannesburg to London,October 2024,1,no
4,“BA’s petty penny pinching ”,Paul Mercer,2024-10-12,✅ Trip Verified | British Airways charge you...,Business,Business Class,London to Mexico City,October 2024,6,yes


### Remove unnecessary text from the 'content' column

In [75]:
def clean_text(text):
    # Remove both '✅ Trip Verified |' and 'Not Verified |' at the start of the text
    text = re.sub(r"(✅\s*Trip\s*Verified\s*\|\s*|Not\s*Verified\s*\|\s*)", "", text, flags=re.IGNORECASE)
    # Remove any remaining punctuation and extra whitespace
    text = re.sub(r"[^\w\s]", " ", text)  # Keep only word characters and whitespace
    text = re.sub(r"\s+", " ", text)  # Replace multiple spaces with a single space
    return text.strip()


### Make a copy to rpeserve the main data frame

In [76]:
df_clean = df.copy()

### Apply the cleaning function to the 'content' column


In [77]:
df_clean['content_cleaned'] = df['content'].apply(clean_text)

In [78]:
df_clean[['content', 'content_cleaned']].head()


Unnamed: 0,content,content_cleaned
0,✅ Trip Verified | On arriving at Mexico Ai...,On arriving at Mexico Airport we were told tha...
1,✅ Trip Verified | I have flown British Air...,I have flown British Airways for many years an...
2,Not Verified | We bought tickets for a Geneva...,We bought tickets for a Geneva London flight b...
3,✅ Trip Verified | Appalling service with f...,Appalling service with failing defective fleet...
4,✅ Trip Verified | British Airways charge you...,British Airways charge you for the pleasure of...


### Tokenize the text, remove stopwords, and lemmatize words

In [79]:
def preprocess_text(text):
    words = word_tokenize(text.lower())  # Tokenize and lowercase
    words = [lemmatizer.lemmatize(word) for word in words if word not in stopwords]  # Lemmatize & remove stopwords
    return " ".join(words)

### preprocess the 'content' column


In [83]:
df_clean['content_preprocessed'] = df_clean['content_cleaned'].apply(preprocess_text)

In [88]:
df_clean[['content_cleaned', 'content_preprocessed']].sample(2)

Unnamed: 0,content_cleaned,content_preprocessed
635,I used British Airways for the first time and ...,used british airway first time admit impressed...
252,My family flew from Washington to London on a ...,family flew washington london british airway a...
