<div style="background-color: #1a1a2e; padding: 15px; border-radius: 20px; border: 1px solid #16213e; max-width: 700px; margin: auto; text-align: center;">
    <h2 style="font-family: 'Helvetica Neue', Arial, sans-serif; color: #e94560; font-size: 30px; margin-bottom: 12px; font-weight: bold;">
        Data Cleaning
    </h2>
</div>

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import re

In [2]:
import nltk
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to C:\Users\Darrin
[nltk_data]     DeYoung\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Darrin
[nltk_data]     DeYoung\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
df = pd.read_csv('Emirates_reviews.csv')
df.head()

Unnamed: 0,reviews,stars,date,country
0,✅ Trip Verified | I flew a return trip DXB-L...,5,12th August 2024,United Arab Emirates
1,✅ Trip Verified | We were 2 people travellin...,10,11th August 2024,India
2,"Not Verified | First time flying Emirates, w...",4,29th July 2024,Netherlands
3,✅ Trip Verified | Ground Staff and Service Fa...,5,18th July 2024,Singapore
4,Not Verified | The staff are friendly and the...,1,13th July 2024,Hong Kong


In [4]:
df.shape

(2000, 4)

In [5]:
df['status'] = df.reviews.str.contains("Trip Verified")

In [6]:
df.head()

Unnamed: 0,reviews,stars,date,country,status
0,✅ Trip Verified | I flew a return trip DXB-L...,5,12th August 2024,United Arab Emirates,True
1,✅ Trip Verified | We were 2 people travellin...,10,11th August 2024,India,True
2,"Not Verified | First time flying Emirates, w...",4,29th July 2024,Netherlands,False
3,✅ Trip Verified | Ground Staff and Service Fa...,5,18th July 2024,Singapore,True
4,Not Verified | The staff are friendly and the...,1,13th July 2024,Hong Kong,False


In [7]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

reviews_data = df.reviews.str.strip("✅ Trip Verified |")

corpus = []
def preprocess_text(text):
    text = re.sub('[^a-zA-Z]',' ', text)
    text = text.lower()
    text = text.split()
    text = [lemmatizer.lemmatize(word) for word in text if word not in stop_words]
    text = " ".join(text)
    corpus.append(text)

In [8]:
# Apply Preprocessing function to the review_data
reviews_data.apply(preprocess_text)

0       None
1       None
2       None
3       None
4       None
        ... 
1995    None
1996    None
1997    None
1998    None
1999    None
Name: reviews, Length: 2000, dtype: object

In [9]:
# add the corpus to the original dataframe
df['corpus'] = corpus

In [10]:
df.head()

Unnamed: 0,reviews,stars,date,country,status,corpus
0,✅ Trip Verified | I flew a return trip DXB-L...,5,12th August 2024,United Arab Emirates,True,flew return trip dxb li dxb used mile upgrade ...
1,✅ Trip Verified | We were 2 people travellin...,10,11th August 2024,India,True,people travelling together dubai boston sector...
2,"Not Verified | First time flying Emirates, w...",4,29th July 2024,Netherlands,False,verified first time flying emirate expecting l...
3,✅ Trip Verified | Ground Staff and Service Fa...,5,18th July 2024,Singapore,True,ground staff service failure flight ek careful...
4,Not Verified | The staff are friendly and the...,1,13th July 2024,Hong Kong,False,verified staff friendly seat comfortable air t...


### Checking Null Values

In [11]:
df.isnull().sum()

reviews    0
stars      0
date       0
country    0
status     0
corpus     0
dtype: int64

### Checking Dtypes

In [12]:
df.dtypes

reviews    object
stars       int64
date       object
country    object
status       bool
corpus     object
dtype: object

#### date column contains dates in a non-standard format like 2nd,3rd,1st..
#### So will remove ordinal suffixes (st, nd, rd, th) from the dates

In [13]:
def remove_ordinal_suffixes(date_str):
    return re.sub(r'(\d+)(st|nd|rd|th)', r'\1', date_str)

df['date'] = df['date'].apply(remove_ordinal_suffixes)

In [14]:
# convert the date to datetime format
df['date'] = pd.to_datetime(df['date'])

In [15]:
df['date'].head(3)

0   2024-08-12
1   2024-08-11
2   2024-07-29
Name: date, dtype: datetime64[ns]

In [16]:
df['stars'].unique()

array([ 5, 10,  4,  1,  2,  8,  3,  9,  7,  6], dtype=int64)

In [17]:
df['stars'].value_counts()

stars
1     407
5     326
10    244
2     203
9     184
3     150
8     148
4     129
7     112
6      97
Name: count, dtype: int64

In [18]:
df.reset_index(drop=True)

Unnamed: 0,reviews,stars,date,country,status,corpus
0,✅ Trip Verified | I flew a return trip DXB-L...,5,2024-08-12,United Arab Emirates,True,flew return trip dxb li dxb used mile upgrade ...
1,✅ Trip Verified | We were 2 people travellin...,10,2024-08-11,India,True,people travelling together dubai boston sector...
2,"Not Verified | First time flying Emirates, w...",4,2024-07-29,Netherlands,False,verified first time flying emirate expecting l...
3,✅ Trip Verified | Ground Staff and Service Fa...,5,2024-07-18,Singapore,True,ground staff service failure flight ek careful...
4,Not Verified | The staff are friendly and the...,1,2024-07-13,Hong Kong,False,verified staff friendly seat comfortable air t...
...,...,...,...,...,...,...
1995,ICN-DXB and DXB-LIS between October 5/6 on eco...,1,2014-10-13,Portugal,False,icn dxb dxb li october economy check smooth si...
1996,Just flew return BNE to DXB on A380 and then o...,1,2014-10-12,Australia,False,flew return bne dxb onto vce b agree many said...
1997,Flew on the EK 38 on September from Birmingham...,4,2014-10-12,United Kingdom,False,flew ek september birmingham dubai first time ...
1998,My flight was a true class act. Flew First Cla...,8,2014-10-12,United States,False,flight true class act flew first class dfw dxb...


In [19]:
# Save processed reviews
df.to_csv("Emirates_processed_reviews.csv", index=None)

### Saving objects

In [20]:
import pickle

In [21]:
pickle.dump(lemmatizer, open('../SavedModels/lemmatizer.pkl', 'wb'))