In [1]:
# Import Required Libraries
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re

In [2]:
# Function to extract review details
# Purpose: To extract specific details (date, header, rating, content, etc.) from the `review` object
# `review` is an HTML element from the web page
def extract_review(review):
    review_dict = {}

    # Extract the date
    try:
        review_dict["date"] = review.find("meta").get("content")
    except AttributeError:
        review_dict["date"] = None

    # Extract the header
    header = review.find("h2", {"class": "text_header"})
    review_dict["header"] = header.get_text() if header else None

    # Extract the rating
    rating = review.find("div", {"class": "rating-10"})
    review_dict["rating"] = None  
    if rating:
        span = rating.find("span")
        if span:
            review_dict["rating"] = span.get_text()

    # Extract the content
    content = review.find("div", {"class": "text_content"})
    review_dict["content"] = content.get_text().strip() if content else None

    # Extract additional ratings from the table
    table = review.find("table", {"class": "review-ratings"})
    if table:
        data = table.find_all("td")

        # Separate the category names (keys) and their ratings (values)
        keys = data[::2]
        values = data[1::2]

        for key, value in zip(keys, values):
            key_text = key.get_text()
            star_value = None

            try:
                star_value = value.find_all("span", {"class": "star fill"})[-1]
            except IndexError:
                pass

            if star_value:
                review_dict[key_text] = star_value.get_text()
            else:
                review_dict[key_text] = value.get_text() if value else None

    return review_dict


In [3]:
import re

# Function to get total number of pages
def get_total_pages(url):
    response = requests.get(url)
    content = BeautifulSoup(response.content, "html.parser")
    
    # Find the total number of pages by looking for a page navigation element
    pagination = content.find("div", {"class": "pagination-total"})
    if pagination:
        total_pages_text = pagination.get_text(strip=True)
        # Extract only numbers from the text using regular expressions
        total_pages = re.findall(r'\d+', total_pages_text)
        if total_pages:
            return int(total_pages[-1])  # Return the last number (total pages)
    return 1

In [4]:
# Function to scrape all pages
def get_all_reviews(airline_url):
    # List to store all reviews
    reviews_list = []
    total_pages = get_total_pages(airline_url)  # Get the total number of pages
    print(f"Total pages found: {total_pages}")

    for page in range(1, total_pages + 1):
        # Request each page's URL
        response = requests.get(f"{airline_url}page/{page}/")
        content = BeautifulSoup(response.content, "html.parser")
        
        # Find all review articles
        content_reviews = content.find_all("article", class_=lambda value: value and value.startswith("review-"))

        if not content_reviews:  # Stop if no more reviews
            break

        # Loop through reviews and extract data, add it to the list
        for review in content_reviews:
            review_data = extract_review(review)
            reviews_list.append(review_data)

        print(f"Scraped page {page} of {total_pages}")

    # Convert list of dictionaries to pandas DataFrame
    reviews_df = pd.DataFrame(reviews_list)

    # Remove rows where header or rating is missing, to improve quality of data
    reviews_df = reviews_df.dropna(subset=['header', 'rating'])

    return reviews_df


In [5]:
# Define the URL for Air New Zealand reviews
airline_url = "https://www.airlinequality.com/airline-reviews/air-new-zealand/"

# Scrape all reviews across pages
reviews_df = get_all_reviews(airline_url)

# Display the DataFrame
print(reviews_df)

Total pages found: 868
Scraped page 1 of 868
Scraped page 2 of 868
Scraped page 3 of 868
Scraped page 4 of 868
Scraped page 5 of 868
Scraped page 6 of 868
Scraped page 7 of 868
Scraped page 8 of 868
Scraped page 9 of 868
Scraped page 10 of 868
Scraped page 11 of 868
Scraped page 12 of 868
Scraped page 13 of 868
Scraped page 14 of 868
Scraped page 15 of 868
Scraped page 16 of 868
Scraped page 17 of 868
Scraped page 18 of 868
Scraped page 19 of 868
Scraped page 20 of 868
Scraped page 21 of 868
Scraped page 22 of 868
Scraped page 23 of 868
Scraped page 24 of 868
Scraped page 25 of 868
Scraped page 26 of 868
Scraped page 27 of 868
Scraped page 28 of 868
Scraped page 29 of 868
Scraped page 30 of 868
Scraped page 31 of 868
Scraped page 32 of 868
Scraped page 33 of 868
Scraped page 34 of 868
Scraped page 35 of 868
Scraped page 36 of 868
Scraped page 37 of 868
Scraped page 38 of 868
Scraped page 39 of 868
Scraped page 40 of 868
Scraped page 41 of 868
Scraped page 42 of 868
Scraped page 43 of 8

In [6]:
# Save the scraped reviews to a CSV file
reviews_df.to_csv("../dataset/air_new_zealand_reviews.csv", index=False)
reviews_df

Unnamed: 0,date,header,rating,content,Type Of Traveller,Seat Type,Route,Date Flown,Seat Comfort,Cabin Staff Service,Food & Beverages,Inflight Entertainment,Ground Service,Wifi & Connectivity,Value For Money,Recommended,Aircraft
0,2024-09-15,"""Service was so good""",10,Not Verified | Such a lovely experience! Serv...,Business,Economy Class,Melbourne to Tokyo,August 2024,5,5,5,5,5,5,5,yes,
1,2024-09-12,“a highly disappointing experience”,4,Not Verified | I recently had a highly disappo...,Solo Leisure,Economy Class,Raratonga to Auckland via Taipei,August 2024,3,4,3,3,1,4,1,no,
2,2024-08-29,“the legroom was huge”,9,Not Verified | I was honestly quite surprised ...,Family Leisure,Economy Class,Auckland to Queenstown,December 2023,5,5,5,,5,4,5,yes,A320
3,2024-08-19,"""getting worse by the day""",3,Not Verified | Air New Zealand domestic servi...,Business,Economy Class,Auckland to Napier,August 2024,4,5,,,5,,1,no,
4,2024-08-13,"""never fly Air New Zealand again""",1,✅ Trip Verified | Air New Zealand bumped us ...,Family Leisure,Economy Class,Napier to Auckland,June 2024,3,4,,,1,,1,no,ATR-72
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
948,2013-05-08,Air New Zealand customer review,9,Flew Premium Economy from LAX to AKL on a 777-...,,Premium Economy,,,4,5,5,5,,,5,yes,
949,2013-05-08,Air New Zealand customer review,3,Leg room on Internal flights is getting smalle...,,Economy Class,,,1,4,3,,,,3,no,
950,2013-05-06,Air New Zealand customer review,5,Adelaide to Auckland April 28 2013. Usual incr...,,Economy Class,,,3,3,4,1,,,3,no,
951,2013-05-01,Air New Zealand customer review,5,B747-400 Premium Economy SFO-AKL B767 Works AK...,,Premium Economy,,,3,3,4,3,,,2,no,


In [7]:
# Load the scraped CSV file
airnz_reviews = "../dataset/air_new_zealand_reviews.csv"
reviews_df = pd.read_csv(airnz_reviews)

reviews_df

Unnamed: 0,date,header,rating,content,Type Of Traveller,Seat Type,Route,Date Flown,Seat Comfort,Cabin Staff Service,Food & Beverages,Inflight Entertainment,Ground Service,Wifi & Connectivity,Value For Money,Recommended,Aircraft
0,2024-09-15,"""Service was so good""",10,Not Verified | Such a lovely experience! Serv...,Business,Economy Class,Melbourne to Tokyo,August 2024,5.0,5.0,5.0,5.0,5.0,5.0,5.0,yes,
1,2024-09-12,“a highly disappointing experience”,4,Not Verified | I recently had a highly disappo...,Solo Leisure,Economy Class,Raratonga to Auckland via Taipei,August 2024,3.0,4.0,3.0,3.0,1.0,4.0,1.0,no,
2,2024-08-29,“the legroom was huge”,9,Not Verified | I was honestly quite surprised ...,Family Leisure,Economy Class,Auckland to Queenstown,December 2023,5.0,5.0,5.0,,5.0,4.0,5.0,yes,A320
3,2024-08-19,"""getting worse by the day""",3,Not Verified | Air New Zealand domestic servi...,Business,Economy Class,Auckland to Napier,August 2024,4.0,5.0,,,5.0,,1.0,no,
4,2024-08-13,"""never fly Air New Zealand again""",1,✅ Trip Verified | Air New Zealand bumped us ...,Family Leisure,Economy Class,Napier to Auckland,June 2024,3.0,4.0,,,1.0,,1.0,no,ATR-72
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
835,2013-05-08,Air New Zealand customer review,9,Flew Premium Economy from LAX to AKL on a 777-...,,Premium Economy,,,4.0,5.0,5.0,5.0,,,5.0,yes,
836,2013-05-08,Air New Zealand customer review,3,Leg room on Internal flights is getting smalle...,,Economy Class,,,1.0,4.0,3.0,,,,3.0,no,
837,2013-05-06,Air New Zealand customer review,5,Adelaide to Auckland April 28 2013. Usual incr...,,Economy Class,,,3.0,3.0,4.0,1.0,,,3.0,no,
838,2013-05-01,Air New Zealand customer review,5,B747-400 Premium Economy SFO-AKL B767 Works AK...,,Premium Economy,,,3.0,3.0,4.0,3.0,,,2.0,no,


In [8]:
# Format headers: convert to lowercase and replace spaces with underscores
new_columns = {}

for col in reviews_df.columns:
    # Convert the column name to lowercase
    clean_col = col.lower()
    
    # Replace any spaces with underscores
    clean_col = re.sub(r"\s+", '_', clean_col)
    
    # Add the cleaned column name to the new_columns dictionary
    new_columns[col] = clean_col

# Use the dictionary to rename the columns in the dataframe
reviews_df = reviews_df.rename(columns=new_columns)

reviews_df

Unnamed: 0,date,header,rating,content,type_of_traveller,seat_type,route,date_flown,seat_comfort,cabin_staff_service,food_&_beverages,inflight_entertainment,ground_service,wifi_&_connectivity,value_for_money,recommended,aircraft
0,2024-09-15,"""Service was so good""",10,Not Verified | Such a lovely experience! Serv...,Business,Economy Class,Melbourne to Tokyo,August 2024,5.0,5.0,5.0,5.0,5.0,5.0,5.0,yes,
1,2024-09-12,“a highly disappointing experience”,4,Not Verified | I recently had a highly disappo...,Solo Leisure,Economy Class,Raratonga to Auckland via Taipei,August 2024,3.0,4.0,3.0,3.0,1.0,4.0,1.0,no,
2,2024-08-29,“the legroom was huge”,9,Not Verified | I was honestly quite surprised ...,Family Leisure,Economy Class,Auckland to Queenstown,December 2023,5.0,5.0,5.0,,5.0,4.0,5.0,yes,A320
3,2024-08-19,"""getting worse by the day""",3,Not Verified | Air New Zealand domestic servi...,Business,Economy Class,Auckland to Napier,August 2024,4.0,5.0,,,5.0,,1.0,no,
4,2024-08-13,"""never fly Air New Zealand again""",1,✅ Trip Verified | Air New Zealand bumped us ...,Family Leisure,Economy Class,Napier to Auckland,June 2024,3.0,4.0,,,1.0,,1.0,no,ATR-72
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
835,2013-05-08,Air New Zealand customer review,9,Flew Premium Economy from LAX to AKL on a 777-...,,Premium Economy,,,4.0,5.0,5.0,5.0,,,5.0,yes,
836,2013-05-08,Air New Zealand customer review,3,Leg room on Internal flights is getting smalle...,,Economy Class,,,1.0,4.0,3.0,,,,3.0,no,
837,2013-05-06,Air New Zealand customer review,5,Adelaide to Auckland April 28 2013. Usual incr...,,Economy Class,,,3.0,3.0,4.0,1.0,,,3.0,no,
838,2013-05-01,Air New Zealand customer review,5,B747-400 Premium Economy SFO-AKL B767 Works AK...,,Premium Economy,,,3.0,3.0,4.0,3.0,,,2.0,no,


In [9]:
# Check for duplicates using 'header' or 'date'
# Remove duplicates based on 'header' and 'date' to ensure uniqueness
reviews_df = reviews_df.drop_duplicates(subset=['header', 'date'])

reviews_df

Unnamed: 0,date,header,rating,content,type_of_traveller,seat_type,route,date_flown,seat_comfort,cabin_staff_service,food_&_beverages,inflight_entertainment,ground_service,wifi_&_connectivity,value_for_money,recommended,aircraft
0,2024-09-15,"""Service was so good""",10,Not Verified | Such a lovely experience! Serv...,Business,Economy Class,Melbourne to Tokyo,August 2024,5.0,5.0,5.0,5.0,5.0,5.0,5.0,yes,
1,2024-09-12,“a highly disappointing experience”,4,Not Verified | I recently had a highly disappo...,Solo Leisure,Economy Class,Raratonga to Auckland via Taipei,August 2024,3.0,4.0,3.0,3.0,1.0,4.0,1.0,no,
2,2024-08-29,“the legroom was huge”,9,Not Verified | I was honestly quite surprised ...,Family Leisure,Economy Class,Auckland to Queenstown,December 2023,5.0,5.0,5.0,,5.0,4.0,5.0,yes,A320
3,2024-08-19,"""getting worse by the day""",3,Not Verified | Air New Zealand domestic servi...,Business,Economy Class,Auckland to Napier,August 2024,4.0,5.0,,,5.0,,1.0,no,
4,2024-08-13,"""never fly Air New Zealand again""",1,✅ Trip Verified | Air New Zealand bumped us ...,Family Leisure,Economy Class,Napier to Auckland,June 2024,3.0,4.0,,,1.0,,1.0,no,ATR-72
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
832,2013-05-12,Air New Zealand customer review,3,I have flown with Air New Zealand for several ...,,Economy Class,,,3.0,2.0,3.0,4.0,,,2.0,no,
833,2013-05-10,Air New Zealand customer review,10,Flew Wellington-San Francisco (via Auckland). ...,,Premium Economy,,,5.0,5.0,5.0,5.0,,,4.0,yes,
835,2013-05-08,Air New Zealand customer review,9,Flew Premium Economy from LAX to AKL on a 777-...,,Premium Economy,,,4.0,5.0,5.0,5.0,,,5.0,yes,
837,2013-05-06,Air New Zealand customer review,5,Adelaide to Auckland April 28 2013. Usual incr...,,Economy Class,,,3.0,3.0,4.0,1.0,,,3.0,no,


In [10]:
# Function to safely split the 'content' column into 'verified_status' and 'review_content'
def split_content(row):
    # Check if '|' is in the content to avoid IndexError
    if '|' in row['content']:
        parts = row['content'].split('|')
        # Check if 'Trip Verified' is present in the first part of the split
        if 'Trip Verified' in parts[0]:
            return 'Verified', parts[1].strip()  # Verified status and review content
        else:
            return 'Not Verified', parts[1].strip()  # Not Verified status and review content
    else:
        # If no '|' is found, assume the entire content is the review content, without verification status
        return 'Unknown', row['content'].strip()

In [11]:
# Apply the function to split 'content' into two new columns
reviews_df[['verified_status', 'review_content']] = reviews_df.apply(lambda row: split_content(row), axis=1, result_type='expand')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  reviews_df[['verified_status', 'review_content']] = reviews_df.apply(lambda row: split_content(row), axis=1, result_type='expand')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  reviews_df[['verified_status', 'review_content']] = reviews_df.apply(lambda row: split_content(row), axis=1, result_type='expand')


In [12]:
# Drop the original 'content' column if no longer needed
reviews_df = reviews_df.drop(columns=['content'])

In [13]:
reviews_df.head()

Unnamed: 0,date,header,rating,type_of_traveller,seat_type,route,date_flown,seat_comfort,cabin_staff_service,food_&_beverages,inflight_entertainment,ground_service,wifi_&_connectivity,value_for_money,recommended,aircraft,verified_status,review_content
0,2024-09-15,"""Service was so good""",10,Business,Economy Class,Melbourne to Tokyo,August 2024,5.0,5.0,5.0,5.0,5.0,5.0,5.0,yes,,Not Verified,Such a lovely experience! Service was so good ...
1,2024-09-12,“a highly disappointing experience”,4,Solo Leisure,Economy Class,Raratonga to Auckland via Taipei,August 2024,3.0,4.0,3.0,3.0,1.0,4.0,1.0,no,,Not Verified,I recently had a highly disappointing experien...
2,2024-08-29,“the legroom was huge”,9,Family Leisure,Economy Class,Auckland to Queenstown,December 2023,5.0,5.0,5.0,,5.0,4.0,5.0,yes,A320,Not Verified,I was honestly quite surprised about Air New Z...
3,2024-08-19,"""getting worse by the day""",3,Business,Economy Class,Auckland to Napier,August 2024,4.0,5.0,,,5.0,,1.0,no,,Not Verified,Air New Zealand domestic service is getting wo...
4,2024-08-13,"""never fly Air New Zealand again""",1,Family Leisure,Economy Class,Napier to Auckland,June 2024,3.0,4.0,,,1.0,,1.0,no,ATR-72,Verified,Air New Zealand bumped us off our flight on th...


In [14]:
# Convert 'verified_status' column to boolean based on whether it contains "verified"
reviews_df["verified_status"] = reviews_df["verified_status"].str.lower() == "verified"

reviews_df.head()

Unnamed: 0,date,header,rating,type_of_traveller,seat_type,route,date_flown,seat_comfort,cabin_staff_service,food_&_beverages,inflight_entertainment,ground_service,wifi_&_connectivity,value_for_money,recommended,aircraft,verified_status,review_content
0,2024-09-15,"""Service was so good""",10,Business,Economy Class,Melbourne to Tokyo,August 2024,5.0,5.0,5.0,5.0,5.0,5.0,5.0,yes,,False,Such a lovely experience! Service was so good ...
1,2024-09-12,“a highly disappointing experience”,4,Solo Leisure,Economy Class,Raratonga to Auckland via Taipei,August 2024,3.0,4.0,3.0,3.0,1.0,4.0,1.0,no,,False,I recently had a highly disappointing experien...
2,2024-08-29,“the legroom was huge”,9,Family Leisure,Economy Class,Auckland to Queenstown,December 2023,5.0,5.0,5.0,,5.0,4.0,5.0,yes,A320,False,I was honestly quite surprised about Air New Z...
3,2024-08-19,"""getting worse by the day""",3,Business,Economy Class,Auckland to Napier,August 2024,4.0,5.0,,,5.0,,1.0,no,,False,Air New Zealand domestic service is getting wo...
4,2024-08-13,"""never fly Air New Zealand again""",1,Family Leisure,Economy Class,Napier to Auckland,June 2024,3.0,4.0,,,1.0,,1.0,no,ATR-72,True,Air New Zealand bumped us off our flight on th...


In [15]:
# Convert 'recommended' column to boolean based on whether it contains "yes"
reviews_df["recommended"] = reviews_df["recommended"].str.lower() == "yes"

reviews_df.head()

Unnamed: 0,date,header,rating,type_of_traveller,seat_type,route,date_flown,seat_comfort,cabin_staff_service,food_&_beverages,inflight_entertainment,ground_service,wifi_&_connectivity,value_for_money,recommended,aircraft,verified_status,review_content
0,2024-09-15,"""Service was so good""",10,Business,Economy Class,Melbourne to Tokyo,August 2024,5.0,5.0,5.0,5.0,5.0,5.0,5.0,True,,False,Such a lovely experience! Service was so good ...
1,2024-09-12,“a highly disappointing experience”,4,Solo Leisure,Economy Class,Raratonga to Auckland via Taipei,August 2024,3.0,4.0,3.0,3.0,1.0,4.0,1.0,False,,False,I recently had a highly disappointing experien...
2,2024-08-29,“the legroom was huge”,9,Family Leisure,Economy Class,Auckland to Queenstown,December 2023,5.0,5.0,5.0,,5.0,4.0,5.0,True,A320,False,I was honestly quite surprised about Air New Z...
3,2024-08-19,"""getting worse by the day""",3,Business,Economy Class,Auckland to Napier,August 2024,4.0,5.0,,,5.0,,1.0,False,,False,Air New Zealand domestic service is getting wo...
4,2024-08-13,"""never fly Air New Zealand again""",1,Family Leisure,Economy Class,Napier to Auckland,June 2024,3.0,4.0,,,1.0,,1.0,False,ATR-72,True,Air New Zealand bumped us off our flight on th...


In [16]:
# Strip both straight and curly quotes from the 'header' column
reviews_df["header"] = reviews_df["header"].str.strip('"“”')

reviews_df.head()

Unnamed: 0,date,header,rating,type_of_traveller,seat_type,route,date_flown,seat_comfort,cabin_staff_service,food_&_beverages,inflight_entertainment,ground_service,wifi_&_connectivity,value_for_money,recommended,aircraft,verified_status,review_content
0,2024-09-15,Service was so good,10,Business,Economy Class,Melbourne to Tokyo,August 2024,5.0,5.0,5.0,5.0,5.0,5.0,5.0,True,,False,Such a lovely experience! Service was so good ...
1,2024-09-12,a highly disappointing experience,4,Solo Leisure,Economy Class,Raratonga to Auckland via Taipei,August 2024,3.0,4.0,3.0,3.0,1.0,4.0,1.0,False,,False,I recently had a highly disappointing experien...
2,2024-08-29,the legroom was huge,9,Family Leisure,Economy Class,Auckland to Queenstown,December 2023,5.0,5.0,5.0,,5.0,4.0,5.0,True,A320,False,I was honestly quite surprised about Air New Z...
3,2024-08-19,getting worse by the day,3,Business,Economy Class,Auckland to Napier,August 2024,4.0,5.0,,,5.0,,1.0,False,,False,Air New Zealand domestic service is getting wo...
4,2024-08-13,never fly Air New Zealand again,1,Family Leisure,Economy Class,Napier to Auckland,June 2024,3.0,4.0,,,1.0,,1.0,False,ATR-72,True,Air New Zealand bumped us off our flight on th...


In [17]:
# Ensure reviews_df is the DataFrame
# Identify the numeric columns
numeric_columns = reviews_df.select_dtypes(include=['float64', 'int64']).columns

# Replace missing values (NaN) in numeric columns with -1
reviews_df[numeric_columns] = reviews_df[numeric_columns].fillna(-1)


In [18]:
reviews_df.head()

Unnamed: 0,date,header,rating,type_of_traveller,seat_type,route,date_flown,seat_comfort,cabin_staff_service,food_&_beverages,inflight_entertainment,ground_service,wifi_&_connectivity,value_for_money,recommended,aircraft,verified_status,review_content
0,2024-09-15,Service was so good,10,Business,Economy Class,Melbourne to Tokyo,August 2024,5.0,5.0,5.0,5.0,5.0,5.0,5.0,True,,False,Such a lovely experience! Service was so good ...
1,2024-09-12,a highly disappointing experience,4,Solo Leisure,Economy Class,Raratonga to Auckland via Taipei,August 2024,3.0,4.0,3.0,3.0,1.0,4.0,1.0,False,,False,I recently had a highly disappointing experien...
2,2024-08-29,the legroom was huge,9,Family Leisure,Economy Class,Auckland to Queenstown,December 2023,5.0,5.0,5.0,-1.0,5.0,4.0,5.0,True,A320,False,I was honestly quite surprised about Air New Z...
3,2024-08-19,getting worse by the day,3,Business,Economy Class,Auckland to Napier,August 2024,4.0,5.0,-1.0,-1.0,5.0,-1.0,1.0,False,,False,Air New Zealand domestic service is getting wo...
4,2024-08-13,never fly Air New Zealand again,1,Family Leisure,Economy Class,Napier to Auckland,June 2024,3.0,4.0,-1.0,-1.0,1.0,-1.0,1.0,False,ATR-72,True,Air New Zealand bumped us off our flight on th...


In [19]:
# Identify the categorical columns
categorical_columns = reviews_df.select_dtypes(include=['object']).columns

# Strip leading and trailing spaces from all categorical columns
reviews_df[categorical_columns] = reviews_df[categorical_columns].apply(lambda x: x.str.strip())

reviews_df[categorical_columns] = reviews_df[categorical_columns].fillna("Not informed")


In [20]:
reviews_df.head()

Unnamed: 0,date,header,rating,type_of_traveller,seat_type,route,date_flown,seat_comfort,cabin_staff_service,food_&_beverages,inflight_entertainment,ground_service,wifi_&_connectivity,value_for_money,recommended,aircraft,verified_status,review_content
0,2024-09-15,Service was so good,10,Business,Economy Class,Melbourne to Tokyo,August 2024,5.0,5.0,5.0,5.0,5.0,5.0,5.0,True,Not informed,False,Such a lovely experience! Service was so good ...
1,2024-09-12,a highly disappointing experience,4,Solo Leisure,Economy Class,Raratonga to Auckland via Taipei,August 2024,3.0,4.0,3.0,3.0,1.0,4.0,1.0,False,Not informed,False,I recently had a highly disappointing experien...
2,2024-08-29,the legroom was huge,9,Family Leisure,Economy Class,Auckland to Queenstown,December 2023,5.0,5.0,5.0,-1.0,5.0,4.0,5.0,True,A320,False,I was honestly quite surprised about Air New Z...
3,2024-08-19,getting worse by the day,3,Business,Economy Class,Auckland to Napier,August 2024,4.0,5.0,-1.0,-1.0,5.0,-1.0,1.0,False,Not informed,False,Air New Zealand domestic service is getting wo...
4,2024-08-13,never fly Air New Zealand again,1,Family Leisure,Economy Class,Napier to Auckland,June 2024,3.0,4.0,-1.0,-1.0,1.0,-1.0,1.0,False,ATR-72,True,Air New Zealand bumped us off our flight on th...


In [21]:
# Save the cleaned reviews to a CSV file
reviews_df.to_csv("../dataset/air_nz_cleaned_data.csv", index=False)
reviews_df

Unnamed: 0,date,header,rating,type_of_traveller,seat_type,route,date_flown,seat_comfort,cabin_staff_service,food_&_beverages,inflight_entertainment,ground_service,wifi_&_connectivity,value_for_money,recommended,aircraft,verified_status,review_content
0,2024-09-15,Service was so good,10,Business,Economy Class,Melbourne to Tokyo,August 2024,5.0,5.0,5.0,5.0,5.0,5.0,5.0,True,Not informed,False,Such a lovely experience! Service was so good ...
1,2024-09-12,a highly disappointing experience,4,Solo Leisure,Economy Class,Raratonga to Auckland via Taipei,August 2024,3.0,4.0,3.0,3.0,1.0,4.0,1.0,False,Not informed,False,I recently had a highly disappointing experien...
2,2024-08-29,the legroom was huge,9,Family Leisure,Economy Class,Auckland to Queenstown,December 2023,5.0,5.0,5.0,-1.0,5.0,4.0,5.0,True,A320,False,I was honestly quite surprised about Air New Z...
3,2024-08-19,getting worse by the day,3,Business,Economy Class,Auckland to Napier,August 2024,4.0,5.0,-1.0,-1.0,5.0,-1.0,1.0,False,Not informed,False,Air New Zealand domestic service is getting wo...
4,2024-08-13,never fly Air New Zealand again,1,Family Leisure,Economy Class,Napier to Auckland,June 2024,3.0,4.0,-1.0,-1.0,1.0,-1.0,1.0,False,ATR-72,True,Air New Zealand bumped us off our flight on th...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
832,2013-05-12,Air New Zealand customer review,3,Not informed,Economy Class,Not informed,Not informed,3.0,2.0,3.0,4.0,-1.0,-1.0,2.0,False,Not informed,False,I have flown with Air New Zealand for several ...
833,2013-05-10,Air New Zealand customer review,10,Not informed,Premium Economy,Not informed,Not informed,5.0,5.0,5.0,5.0,-1.0,-1.0,4.0,True,Not informed,False,Flew Wellington-San Francisco (via Auckland). ...
835,2013-05-08,Air New Zealand customer review,9,Not informed,Premium Economy,Not informed,Not informed,4.0,5.0,5.0,5.0,-1.0,-1.0,5.0,True,Not informed,False,Flew Premium Economy from LAX to AKL on a 777-...
837,2013-05-06,Air New Zealand customer review,5,Not informed,Economy Class,Not informed,Not informed,3.0,3.0,4.0,1.0,-1.0,-1.0,3.0,False,Not informed,False,Adelaide to Auckland April 28 2013. Usual incr...
