### Import necessary libraries

In [20]:
import pandas as pd
import re
import nltk
import os
from loguru import logger

### Configure Loguru logger

In [21]:
# Re-running this cell would otherwise register the same file sink again and
# every message would be written to the log once per registration. Remove the
# sink added by a previous run of this cell before adding it anew.
_previous_sink_id = globals().get("_file_sink_id")
if _previous_sink_id is not None:
    try:
        logger.remove(_previous_sink_id)
    except ValueError:
        # The handler was already removed elsewhere; nothing to clean up.
        pass

# File sink: INFO and above, rotated at 5 MB, rotated files kept for 10 days.
_file_sink_id = logger.add(
    "../logs/data_cleaning.log",
    rotation="5 MB",
    retention="10 days",
    level="INFO",
    enqueue=True,
    backtrace=True,
    diagnose=True
)


3

### Download necessary NLTK data files


In [2]:
# Directory where NLTK resources are stored and looked up.
global_nltk_path = os.path.expanduser('~/nltk_data')
# Guard the append so re-running the cell does not add duplicate search entries.
if global_nltk_path not in nltk.data.path:
    nltk.data.path.append(global_nltk_path)


In [3]:
# Resources needed below: stopword list, tokenizer models, and WordNet for lemmatization.
nltk_packages = ('stopwords', 'punkt_tab', 'wordnet')

# nltk.download reports failure through its return value rather than raising,
# so collect the packages that could not be fetched and surface them in the log.
failed_packages = [
    package
    for package in nltk_packages
    if not nltk.download(package, download_dir=global_nltk_path)
]
if failed_packages:
    logger.warning(f"Could not download NLTK packages: {failed_packages}")

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/aliassaad/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/aliassaad/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/aliassaad/nltk_data...


True

### Access NLTK components directly through nltk

In [3]:
# A set gives constant-time membership checks; the stopword filter runs once
# per token over every review, so a list lookup would be needlessly slow.
stopwords = set(nltk.corpus.stopwords.words('english'))
word_tokenize = nltk.word_tokenize
lemmatizer = nltk.WordNetLemmatizer()

### Load dataset

In [4]:
# Read the raw reviews into a DataFrame.
raw_reviews_path = "../data/raw/british_airways_raw_reviews.csv"
df = pd.read_csv(raw_reviews_path)


In [5]:
# Preview the first five raw rows.
df.head(n=5)

Unnamed: 0,title,author,date,content,type_of_traveller,seat_type,route,date_flown,rating,recommended
0,"""Who can trust BA to travel2",J C Albrecht,2024-10-31,Not Verified | The flight scheduled at 1840 ...,Solo Leisure,Economy Class,London to Istanbul,October 2024,1,no
1,"""just another poor airline""",Dennis Teifeld,2024-10-31,✅ Trip Verified | I have been flying BA fo...,Couple Leisure,Business Class,San Francisco to Barcelona via London,October 2024,5,no
2,"""spent two hours trying to make contact with BA""",Paul Mercer,2024-10-25,✅ Trip Verified | On arriving at Mexico Ai...,Business,Business Class,Mexico City to London Heathrow,October 2024,1,no
3,"""using another airline for future travel""",M Stansfield,2024-10-24,✅ Trip Verified | I have flown British Air...,Solo Leisure,Business Class,Paris to Boston via London,July 2024,1,no
4,"""oversold tickets on our flight""",Claude Cahn,2024-10-22,Not Verified | We bought tickets for a Geneva...,Family Leisure,Economy Class,Geneva to London,September 2024,1,no


### Remove unnecessary text from the 'content' column

In [6]:
def clean_text(text):
    """
    Strip the verification banner and punctuation from a raw review.

    Parameters
    ----------
    text : str
        Raw review text. Any non-string value (for example the float NaN that
        pandas produces for an empty CSV cell) is treated as an empty review.

    Returns
    -------
    str
        The review with '✅ Trip Verified |' / 'Not Verified |' banners removed,
        every non-word, non-whitespace character replaced by a space, runs of
        whitespace collapsed to one space, and surrounding whitespace trimmed.
        Returns "" for non-string input.
    """
    # re.sub raises TypeError on non-strings, so missing values are mapped to "".
    if not isinstance(text, str):
        return ""
    # Remove the verification banner (case-insensitive). The pattern is not
    # anchored, so a banner is removed wherever it occurs in the text.
    text = re.sub(r"(✅\s*Trip\s*Verified\s*\|\s*|Not\s*Verified\s*\|\s*)", "", text, flags=re.IGNORECASE)
    # Replace punctuation/symbols with spaces so adjacent words stay separated.
    text = re.sub(r"[^\w\s]", " ", text)
    # Collapse any run of whitespace into a single space.
    text = re.sub(r"\s+", " ", text)
    return text.strip()


### Make a copy to preserve the main data frame

In [7]:
# Work on an independent deep copy so the raw frame `df` stays untouched.
df_clean = df.copy(deep=True)

### Apply the cleaning function to the 'content' column


In [8]:
# Clean every review element-wise and store the result alongside the raw text.
cleaned_reviews = df['content'].map(clean_text)
df_clean['content_cleaned'] = cleaned_reviews

In [9]:
# Compare raw and cleaned text side by side for the first five reviews.
df_clean.loc[:, ['content', 'content_cleaned']].head(n=5)


Unnamed: 0,content,content_cleaned
0,Not Verified | The flight scheduled at 1840 ...,The flight scheduled at 1840 left 2hours 40 mi...
1,✅ Trip Verified | I have been flying BA fo...,I have been flying BA for over 15 years I was ...
2,✅ Trip Verified | On arriving at Mexico Ai...,On arriving at Mexico Airport we were told tha...
3,✅ Trip Verified | I have flown British Air...,I have flown British Airways for many years an...
4,Not Verified | We bought tickets for a Geneva...,We bought tickets for a Geneva London flight b...


### Tokenize the text, remove stopwords, and lemmatize words

In [10]:
def process_text(text):
    """
    Tokenize, filter and lemmatize a cleaned review.

    The text is lower-cased and tokenized; tokens found in `stopwords` are
    discarded and the remaining ones are lemmatized.

    Returns a tuple `(processed_text, tokens)` where `tokens` is the list of
    lemmatized tokens and `processed_text` is those tokens joined by single
    spaces. If anything raises, the error is logged and `("", [])` is returned.
    """
    try:
        kept = []
        for token in word_tokenize(text.lower()):
            # Stopwords are filtered before lemmatization.
            if token in stopwords:
                continue
            kept.append(lemmatizer.lemmatize(token))
        return " ".join(kept), kept
    except Exception as e:
        logger.error(f"Error processing text: {e}")
        return "", []


### Preprocess the 'content_cleaned' column


In [12]:
# Run process_text once per review. Building a pd.Series for every row (as an
# apply + pd.Series expansion does) is very slow, so collect the
# (processed_text, tokens) pairs in a plain list and build both columns from it.
processed_pairs = [process_text(text) for text in df_clean['content_cleaned']]

# Explicit object-dtype Series aligned on the frame's index keep each token
# list as a single cell value.
df_clean['content_processed'] = pd.Series(
    [processed for processed, _ in processed_pairs],
    index=df_clean.index,
    dtype=object,
)
df_clean['tokens'] = pd.Series(
    [tokens for _, tokens in processed_pairs],
    index=df_clean.index,
    dtype=object,
)

In [13]:
# Spot-check two reviews; a fixed seed keeps the preview reproducible across runs.
df_clean[['content_cleaned', 'content_processed']].sample(2, random_state=42)

Unnamed: 0,content_cleaned,content_processed
736,Dubai to London on 5th December Flight was ok ...,dubai london 5th december flight ok seat comfo...
1176,Gatwick to Fort Lauderdale Charging to choose ...,gatwick fort lauderdale charging choose seat e...


In [14]:
# List the columns now present, including the ones added by the cleaning steps.
df_clean.columns

Index(['title', 'author', 'date', 'content', 'type_of_traveller', 'seat_type',
       'route', 'date_flown', 'rating', 'recommended', 'content_cleaned',
       'content_processed', 'tokens'],
      dtype='object')

In [15]:
initial_count = len(df_clean)

# Drop rows where either derived column is actually missing (NaN/None).
df_clean = df_clean.dropna(subset=['content_processed', 'tokens'])

# process_text signals failure with an empty string and an empty list rather
# than NaN, so dropna alone never removes those rows. Filter out reviews whose
# processed text is empty as well.
has_processed_content = df_clean['content_processed'].str.strip().ne("")
df_clean = df_clean[has_processed_content].copy()

final_count = len(df_clean)
logger.info(f"Dropped {initial_count - final_count} reviews due to missing processed content.")

[32m2024-11-01 22:57:36.850[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m4[0m - [1mDropped 0 reviews due to missing processed content.[0m


In [None]:
# Optional: drop the raw and intermediate text columns once they are no longer needed.
# df_clean.drop(columns=['content', 'content_cleaned'], inplace=True)

### Convert tokens list to space-separated string for CSV compatibility

In [16]:
# Lists do not round-trip through CSV, so store each token list as one
# space-separated string.
joined_tokens = df_clean['tokens'].map(' '.join)
df_clean['tokens_str'] = joined_tokens

### Save the processed data

In [17]:
# Create the output directory if needed (no error if it already exists).
output_directory = "../data/processed"
os.makedirs(output_directory, exist_ok=True)

# Write only the processed text and its token string, without the index column.
output_path = f"{output_directory}/british_airways_processed_reviews.csv"
columns_to_save = ['content_processed', 'tokens_str']
df_clean[columns_to_save].to_csv(output_path, index=False)