In [4]:
## Data Cleaning
#imports

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

#The "os" module allows Python programs to access the file system, 
#perform file operations, and obtain information about the system environment.

# regex
# The "re" module provides support for regular expressions (regex);
# it is used below to strip non-letter characters from the review text.

import re

In [3]:
# Load the reviews CSV that sits in the current working directory

cwd = os.getcwd()
csv_path = cwd + "/BA_reviews.csv"

# index_col=0 makes the first CSV column the DataFrame index
df = pd.read_csv(csv_path, index_col=0)


In [114]:
# Preview the first five rows of the freshly loaded frame
df.head(5)

Unnamed: 0,reviews,stars,date,country
0,✅ Trip Verified | This was my first time flyin...,1,28th January 2023,United Kingdom
1,✅ Trip Verified | Lots of cancellations and d...,9,24th January 2023,Canada
2,✅ Trip Verified | BA 242 on the 6/2/23. Boardi...,10,24th January 2023,United Kingdom
3,✅ Trip Verified | Not only my first flight in...,10,23rd January 2023,Spain
4,✅ Trip Verified | My husband and myself were ...,10,21st January 2023,United Kingdom


In [115]:
# Flag each review whose text contains the "Trip Verified" marker
has_verified_marker = df["reviews"].str.contains("Trip Verified")
df["verified"] = has_verified_marker

In [116]:
# Display the new boolean column
df.verified


0        True
1        True
2        True
3        True
4        True
        ...  
3466    False
3467    False
3468    False
3469    False
3470    False
Name: verified, Length: 3471, dtype: bool

In [5]:
# Build a cleaned text corpus from the reviews: drop the verification prefix,
# keep letters only, lower-case, remove English stop words and lemmatize.
# NOTE(review): no nltk.download(...) call is visible in this notebook; the
# required NLTK corpora are presumably already installed locally - confirm.
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
lemma = WordNetLemmatizer()

# Remove the literal "✅ Trip Verified |" prefix. str.strip() is the wrong tool
# for this: it treats its argument as a *set of characters* and removes any of
# them from both ends, so "... | This was ..." lost its leading "T" and reviews
# ending in letters such as "e", "d" or "r" lost trailing characters.
reviews_data = (
    df.reviews
    .str.replace(r"^\s*✅?\s*Trip Verified\s*\|\s*", "", regex=True)
    .str.strip()
)

# Build the stop-word set once; the original rebuilt it for every single word.
stop_words = set(stopwords.words("english"))

# Cleaned documents, one entry per review, in the same order as df.reviews
corpus = []

# For each review: replace non-letters with spaces, lower-case, tokenize,
# drop stop words, lemmatize the remaining tokens and re-join them.
for rev in reviews_data:
    words = re.sub('[^a-zA-Z]', ' ', rev).lower().split()
    kept = [lemma.lemmatize(word) for word in words if word not in stop_words]
    corpus.append(" ".join(kept))

Unnamed: 0,reviews
0,✅ Trip Verified | This was my first time flyin...
1,✅ Trip Verified | Lots of cancellations and d...
2,✅ Trip Verified | BA 242 on the 6/2/23. Boardi...
3,✅ Trip Verified | Not only my first flight in...
4,✅ Trip Verified | My husband and myself were ...


0      True
1      True
2      True
3      True
4      True
       ... 
995    True
996    True
997    True
998    True
999    True
Name: verified, Length: 1000, dtype: bool

[nltk_data] Downloading package omw-1.4 to /Users/efz/nltk_data...


True

In [118]:
# Attach the cleaned text as a new "corpus" column, then preview the result

df["corpus"] = corpus
df.head(5)


Unnamed: 0,reviews,stars,date,country,verified,corpus
0,✅ Trip Verified | This was my first time flyin...,1,28th January 2023,United Kingdom,True,first time flying ba pleasantly surprised isla...
1,✅ Trip Verified | Lots of cancellations and d...,9,24th January 2023,Canada,True,lot cancellation delay one apologized edinburg...
2,✅ Trip Verified | BA 242 on the 6/2/23. Boardi...,10,24th January 2023,United Kingdom,True,ba boarding delayed due late arrival incoming ...
3,✅ Trip Verified | Not only my first flight in...,10,23rd January 2023,Spain,True,first flight year also first time back england...
4,✅ Trip Verified | My husband and myself were ...,10,21st January 2023,United Kingdom,True,husband flying madrid rd february legal matter...


In [119]:
# Clean and format the date column.
# Start by listing the dtype of every column to see which ones need converting.
df.dtypes

reviews     object
stars       object
date        object
country     object
verified      bool
corpus      object
dtype: object

In [121]:
# Parse the date strings into pandas datetime values and preview the result

parsed_dates = pd.to_datetime(df["date"])
df["date"] = parsed_dates
df["date"].head()

0   2023-01-28
1   2023-01-24
2   2023-01-24
3   2023-01-23
4   2023-01-21
Name: date, dtype: datetime64[ns]

In [122]:
# Clean the star ratings
# Look at the distinct raw values (not displayed: this is not the cell's last expression)
df["stars"].unique()
# Trim leading/trailing newline and tab characters from every rating.
# str.strip treats its argument as a set of characters, so repeating "\t" has no extra effect.
raw_stars = df["stars"]
df["stars"] = raw_stars.str.strip("\n\t\t\t\t\t\t\t\t\t\t\t\t\t")


In [124]:
# Count how often each rating value occurs after trimming
df["stars"].value_counts()

1       748
2       390
3       386
8       350
10      314
7       302
9       299
5       260
4       233
6       184
None      5
Name: stars, dtype: int64

In [125]:
# Remove every row whose rating is the placeholder string "None"
is_missing_rating = df["stars"] == "None"
df.drop(index=df.loc[is_missing_rating].index, inplace=True)


In [126]:
# Re-inspect the distinct rating values now that the "None" rows are gone
df["stars"].unique()

array(['1', '9', '10', '2', '4', '3', '5', '8', '6', '7'], dtype=object)

In [127]:
# Look for missing values: count each distinct per-row pattern of null flags
null_flags = df.isna()
null_flags.value_counts()

reviews  stars  date   country  verified  corpus
False    False  False  False    False     False     3464
                       True     False     False        2
dtype: int64

In [128]:
# Count rows with and without a country value
df["country"].isna().value_counts()

False    3464
True        2
Name: country, dtype: int64

In [129]:
# Discard the rows that have no country value, addressing them by index label
missing_country = df["country"].isna()
df.drop(index=df.loc[missing_country].index, inplace=True)

In [130]:
# Show the (rows, columns) shape of df after the row drops above
df.shape

(3464, 6)

In [109]:
# NOTE(review): duplicate of the previous cell. Its recorded output and execution
# count (In [109]) come from an earlier run, so the stored result is stale -
# re-run or delete this cell.
df.shape

(1000, 3)

In [131]:
# Reset the index so the row labels are consecutive again after the row drops.
# DataFrame.reset_index returns a new frame instead of modifying df, so the
# result must be assigned back; previously it was only displayed and df (and
# therefore the exported CSV) kept the old, gapped labels.
df = df.reset_index(drop=True)
df

Unnamed: 0,reviews,stars,date,country,verified,corpus
0,✅ Trip Verified | This was my first time flyin...,1,2023-01-28,United Kingdom,True,first time flying ba pleasantly surprised isla...
1,✅ Trip Verified | Lots of cancellations and d...,9,2023-01-24,Canada,True,lot cancellation delay one apologized edinburg...
2,✅ Trip Verified | BA 242 on the 6/2/23. Boardi...,10,2023-01-24,United Kingdom,True,ba boarding delayed due late arrival incoming ...
3,✅ Trip Verified | Not only my first flight in...,10,2023-01-23,Spain,True,first flight year also first time back england...
4,✅ Trip Verified | My husband and myself were ...,10,2023-01-21,United Kingdom,True,husband flying madrid rd february legal matter...
...,...,...,...,...,...,...
3459,Business LHR to BKK. 747-400. First try back w...,7,2022-11-07,United Kingdom,False,business lhr bkk first try back ba year flown ...
3460,This was a bmi Regional operated flight on a R...,1,2022-11-05,United Kingdom,False,bmi regional operated flight rj manchester hea...
3461,LHR-HKG on Boeing 747 - 23/08/12. Much has bee...,10,2022-10-31,United Kingdom,False,lhr hkg boeing much written tired old fleet go...
3462,LHR to HAM. Purser addresses all club passenge...,8,2022-10-31,United Kingdom,False,lhr ham purser address club passenger name boa...


In [132]:
# Write the cleaned DataFrame to a CSV file in the working directory

output_path = cwd + "/cleaned-BA-reviews.csv"
df.to_csv(output_path)