In [1]:
#imports

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

#regex
import re

In [4]:
#create a dataframe from csv file

cwd = os.getcwd()

df = pd.read_csv(cwd+"../../BA_reviews.csv", index_col=0)

In [5]:
df.head(10)

Unnamed: 0,reviews,stars,date,country
0,Not Verified | My wife and I are very disappo...,5.0,13th September 2024,United States
1,Not Verified | We flew BA between Heathrow an...,1.0,13th September 2024,Australia
2,Not Verified | Absolutely disgusted with BA. ...,8.0,13th September 2024,United Kingdom
3,Not Verified | Took a trip to Nashville with m...,1.0,11th September 2024,United Kingdom
4,Not Verified | A nightmare journey courtesy o...,8.0,8th September 2024,United Kingdom
5,✅ Trip Verified | Absolutely atrocious. LHR-OR...,1.0,6th September 2024,United Kingdom
6,✅ Trip Verified | As someone who flies relentl...,1.0,2nd September 2024,United Kingdom
7,✅ Trip Verified | Flew with British Airways ...,4.0,1st September 2024,United Kingdom
8,✅ Trip Verified | Straightforward check in T...,2.0,30th August 2024,United Kingdom
9,Not Verified | I am beyond upset and disgusted...,8.0,28th August 2024,United Kingdom


In [6]:
#We will also create a column which mentions if the user is verified or not.

df['verified'] = df.reviews.str.contains("Trip Verified")
df['verified']

0       False
1       False
2       False
3       False
4       False
        ...  
3856    False
3857    False
3858    False
3859    False
3860    False
Name: verified, Length: 3861, dtype: bool

Cleaning Reviews
We will extract the column of reviews into a separate dataframe and clean it for semantic analysis

In [8]:
#for lemmatization of words we will use nltk library
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('wordnet')
lemma = WordNetLemmatizer()


reviews_data = df.reviews.str.strip("✅ Trip Verified |")

#create an empty list to collect cleaned data corpus
corpus =[]

#loop through each review, remove punctuations, small case it, join it and add it to corpus
for rev in reviews_data:
    rev = re.sub('[^a-zA-Z]',' ', rev)
    rev = rev.lower()
    rev = rev.split()
    rev = [lemma.lemmatize(word) for word in rev if word not in set(stopwords.words("english"))]
    rev = " ".join(rev)
    corpus.append(rev)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...


In [9]:
# add the corpus to the original dataframe

df['corpus'] = corpus

In [10]:
df.head(10)

Unnamed: 0,reviews,stars,date,country,verified,corpus
0,Not Verified | My wife and I are very disappo...,5.0,13th September 2024,United States,False,verified wife disappointed flying british airw...
1,Not Verified | We flew BA between Heathrow an...,1.0,13th September 2024,Australia,False,verified flew ba heathrow berlin one way conne...
2,Not Verified | Absolutely disgusted with BA. ...,8.0,13th September 2024,United Kingdom,False,verified absolutely disgusted ba flight cancel...
3,Not Verified | Took a trip to Nashville with m...,1.0,11th September 2024,United Kingdom,False,verified took trip nashville wife leisure brea...
4,Not Verified | A nightmare journey courtesy o...,8.0,8th September 2024,United Kingdom,False,verified nightmare journey courtesy british ai...
5,✅ Trip Verified | Absolutely atrocious. LHR-OR...,1.0,6th September 2024,United Kingdom,True,absolutely atrocious lhr ord lhr round trip br...
6,✅ Trip Verified | As someone who flies relentl...,1.0,2nd September 2024,United Kingdom,True,someone fly relentlessly british airway busine...
7,✅ Trip Verified | Flew with British Airways ...,4.0,1st September 2024,United Kingdom,True,flew british airway club europe saturday st au...
8,✅ Trip Verified | Straightforward check in T...,2.0,30th August 2024,United Kingdom,True,straightforward check new site club check work...
9,Not Verified | I am beyond upset and disgusted...,8.0,28th August 2024,United Kingdom,False,verified beyond upset disgusted disregard lack...


In [11]:
#Cleaning/Fromat date
df.dtypes

reviews      object
stars       float64
date         object
country      object
verified       bool
corpus       object
dtype: object

In [13]:
import re
import pandas as pd

# Function to remove ordinal suffixes
def clean_date(date_str):
    # Remove 'st', 'nd', 'rd', 'th' from the day part of the date
    return re.sub(r'(\d+)(st|nd|rd|th)', r'\1', date_str)

# Apply the cleaning function to the date column
df['date'] = df['date'].apply(clean_date)

# Convert the cleaned date column to datetime format
df['date'] = pd.to_datetime(df['date'], dayfirst=True)

# Check the result
df.head()


Unnamed: 0,reviews,stars,date,country,verified,corpus
0,Not Verified | My wife and I are very disappo...,5.0,2024-09-13,United States,False,verified wife disappointed flying british airw...
1,Not Verified | We flew BA between Heathrow an...,1.0,2024-09-13,Australia,False,verified flew ba heathrow berlin one way conne...
2,Not Verified | Absolutely disgusted with BA. ...,8.0,2024-09-13,United Kingdom,False,verified absolutely disgusted ba flight cancel...
3,Not Verified | Took a trip to Nashville with m...,1.0,2024-09-11,United Kingdom,False,verified took trip nashville wife leisure brea...
4,Not Verified | A nightmare journey courtesy o...,8.0,2024-09-08,United Kingdom,False,verified nightmare journey courtesy british ai...


In [14]:
df.date.head()

0   2024-09-13
1   2024-09-13
2   2024-09-13
3   2024-09-11
4   2024-09-08
Name: date, dtype: datetime64[ns]

In [15]:
#Cleaning ratings with stars
#check for unique values
df.stars.unique()

array([ 5.,  1.,  8.,  4.,  2.,  9., 10.,  3.,  6.,  7., nan])

In [17]:
# remove the \t and \n from the ratings
# Convert the 'stars' column to string and then strip unwanted characters
df['stars'] = df['stars'].astype(str).str.strip("\n\t")

# Check if it worked
df.head()


Unnamed: 0,reviews,stars,date,country,verified,corpus
0,Not Verified | My wife and I are very disappo...,5.0,2024-09-13,United States,False,verified wife disappointed flying british airw...
1,Not Verified | We flew BA between Heathrow an...,1.0,2024-09-13,Australia,False,verified flew ba heathrow berlin one way conne...
2,Not Verified | Absolutely disgusted with BA. ...,8.0,2024-09-13,United Kingdom,False,verified absolutely disgusted ba flight cancel...
3,Not Verified | Took a trip to Nashville with m...,1.0,2024-09-11,United Kingdom,False,verified took trip nashville wife leisure brea...
4,Not Verified | A nightmare journey courtesy o...,8.0,2024-09-08,United Kingdom,False,verified nightmare journey courtesy british ai...


In [18]:
df.stars.value_counts()

stars
1.0     921
2.0     438
3.0     425
8.0     381
10.0    330
9.0     318
7.0     315
5.0     276
4.0     254
6.0     198
nan       5
Name: count, dtype: int64

In [19]:
# drop the rows where the value of ratings is None
df.drop(df[df.stars == "None"].index, axis=0, inplace=True)

In [20]:
#check the unique values again
df.stars.unique()

array(['5.0', '1.0', '8.0', '4.0', '2.0', '9.0', '10.0', '3.0', '6.0',
       '7.0', 'nan'], dtype=object)

In [21]:
#Check for null Values
df.isnull().value_counts()

reviews  stars  date   country  verified  corpus
False    False  False  False    False     False     3859
                       True     False     False        2
Name: count, dtype: int64

In [22]:
df.country.isnull().value_counts()

country
False    3859
True        2
Name: count, dtype: int64

In [23]:
#drop the rows using index where the country value is null
df.drop(df[df.country.isnull() == True].index, axis=0, inplace=True)

In [24]:
df.shape


(3859, 6)

In [25]:
#resetting the index
df.reset_index(drop=True)

Unnamed: 0,reviews,stars,date,country,verified,corpus
0,Not Verified | My wife and I are very disappo...,5.0,2024-09-13,United States,False,verified wife disappointed flying british airw...
1,Not Verified | We flew BA between Heathrow an...,1.0,2024-09-13,Australia,False,verified flew ba heathrow berlin one way conne...
2,Not Verified | Absolutely disgusted with BA. ...,8.0,2024-09-13,United Kingdom,False,verified absolutely disgusted ba flight cancel...
3,Not Verified | Took a trip to Nashville with m...,1.0,2024-09-11,United Kingdom,False,verified took trip nashville wife leisure brea...
4,Not Verified | A nightmare journey courtesy o...,8.0,2024-09-08,United Kingdom,False,verified nightmare journey courtesy british ai...
...,...,...,...,...,...,...
3854,LHR-JFK-LAX-LHR. Check in was ok apart from be...,3.0,2012-08-29,United Kingdom,False,lhr jfk lax lhr check ok apart snapped early c...
3855,LHR to HAM. Purser addresses all club passenge...,9.0,2012-08-28,United Kingdom,False,lhr ham purser address club passenger name boa...
3856,My son who had worked for British Airways urge...,8.0,2011-10-12,United Kingdom,False,son worked british airway urged fly british ai...
3857,London City-New York JFK via Shannon on A318 b...,2.0,2011-10-11,United States,False,london city new york jfk via shannon really ni...


In [26]:
# export the cleaned data

df.to_csv(cwd + "/cleaned-BA-reviews.csv")