In [68]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords

In [93]:
# nltk variables

nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

extra_stopwords = {'|', '-', 'verified', 'review', 'via', 'flight.', 'flight', ' trip verified | '}
all_stopwords = stop_words.union(extra_stopwords)


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/bekezhanissabek/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [86]:
## All methods

def clean_text(text):
    text = re.sub(r'[^a-zA-Z\s]', '', text) 
    text = text.strip()
    return text
    
def remove_stopwords(text):
    words = text.split()
    filtered_text = [word for word in words if word not in all_stopwords]
    return ' '.join(filtered_text)

def remove_special_chars(text):
    return re.sub(r'[^\x00-\x7F]+', '', text)


In [71]:
data = pd.read_excel('../data/raw/capstone_airline_reviews3.xlsx')

In [72]:
data.head()

Unnamed: 0,airline,overall,author,review_date,customer_review,aircraft,traveller_type,cabin,route,date_flown,seat_comfort,cabin_service,food_bev,entertainment,ground_service,value_for_money,recommended
0,,,,,,,,,,,,,,,,,
1,Turkish Airlines,7.0,Christopher Hackley,8th May 2019,âœ… Trip Verified | London to Izmir via Istanb...,,Business,Economy Class,London to Izmir via Istanbul,2019-05-01 00:00:00,4.0,5.0,4.0,4.0,2.0,4.0,yes
2,,,,,,,,,,,,,,,,,
3,Turkish Airlines,2.0,Adriana Pisoi,7th May 2019,âœ… Trip Verified | Istanbul to Bucharest. We ...,,Family Leisure,Economy Class,Istanbul to Bucharest,2019-05-01 00:00:00,4.0,1.0,1.0,1.0,1.0,1.0,no
4,,,,,,,,,,,,,,,,,


In [73]:
data.shape

(131895, 17)

In [74]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 131895 entries, 0 to 131894
Data columns (total 17 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   airline          65947 non-null  object 
 1   overall          64017 non-null  float64
 2   author           65947 non-null  object 
 3   review_date      65947 non-null  object 
 4   customer_review  65947 non-null  object 
 5   aircraft         19718 non-null  object 
 6   traveller_type   39755 non-null  object 
 7   cabin            63303 non-null  object 
 8   route            39726 non-null  object 
 9   date_flown       39633 non-null  object 
 10  seat_comfort     60681 non-null  float64
 11  cabin_service    60715 non-null  float64
 12  food_bev         52608 non-null  float64
 13  entertainment    44193 non-null  float64
 14  ground_service   39358 non-null  float64
 15  value_for_money  63975 non-null  float64
 16  recommended      64440 non-null  object 
dtypes: float64

In [75]:
data.describe()

Unnamed: 0,overall,seat_comfort,cabin_service,food_bev,entertainment,ground_service,value_for_money
count,64017.0,60681.0,60715.0,52608.0,44193.0,39358.0,63975.0
mean,5.14543,2.95216,3.191814,2.90817,2.863372,2.69282,2.943962
std,3.477532,1.441362,1.565789,1.481893,1.507262,1.612215,1.58737
min,1.0,1.0,1.0,1.0,1.0,1.0,1.0
25%,1.0,1.0,2.0,1.0,1.0,1.0,1.0
50%,5.0,3.0,3.0,3.0,3.0,3.0,3.0
75%,9.0,4.0,5.0,4.0,4.0,4.0,4.0
max,10.0,5.0,5.0,5.0,5.0,5.0,5.0


<h2>Data cleaning</h2>

In [76]:
print(f"Count of null data:\n{data.isnull().sum()}")

Count of null data:
airline             65948
overall             67878
author              65948
review_date         65948
customer_review     65948
aircraft           112177
traveller_type      92140
cabin               68592
route               92169
date_flown          92262
seat_comfort        71214
cabin_service       71180
food_bev            79287
entertainment       87702
ground_service      92537
value_for_money     67920
recommended         67455
dtype: int64


In [77]:
print(f"Count of duplicated data: \n{data.duplicated().sum()}")

Count of duplicated data: 
70711


In [78]:
data.dropna(inplace=True)
data.drop_duplicates(inplace=True)

data.isnull().sum()

airline            0
overall            0
author             0
review_date        0
customer_review    0
aircraft           0
traveller_type     0
cabin              0
route              0
date_flown         0
seat_comfort       0
cabin_service      0
food_bev           0
entertainment      0
ground_service     0
value_for_money    0
recommended        0
dtype: int64

In [79]:
data.duplicated().sum()

0

In [80]:
data.shape

(13189, 17)

In [81]:
data['customer_review'] = data['customer_review'].str.lower()  # Customer review to lower register
data['customer_review'].head(5)[7]

'âœ… trip verified | flew on turkish airlines iad-ist-khi and return khi-ist-iad. turkish airlines has consistently maintained its quality since i first flew with them in 2007. the flights leave on time, the catering is excellent, the inflight entertainment is extensive and the interface easy to use, and the cabin crew is excellent. interesting though the a330 on the khi-ist route and return seemed to have more leg room and was newer than the a330 on the iad-ist route which was showing its age. the a330 on the iad-ist route had a slow responding interface for the inflight entertainment and a broken table on the return flight. but turkish airlines will be replacing the a330 on its flight to iad with the 787 sometime in the summer. turkish food was served on the return leg which i personally like, and i saw the cabin staff helping elderly passengers walk to the lavatory which was nice. overall another wonderful experience with turkish airlines.'

In [94]:
data['cleaned_review'] = data['customer_review'].apply(clean_text) # remove symbols from text
data['cleaned_review'] = data['customer_review'].apply(remove_stopwords)  # remove stop words from text
data['cleaned_review'] = data['customer_review'].apply(remove_special_chars)  # remove special symbols from text

In [95]:
data['cleaned_review'][7]

' trip verified | flew on turkish airlines iad-ist-khi and return khi-ist-iad. turkish airlines has consistently maintained its quality since i first flew with them in 2007. the flights leave on time, the catering is excellent, the inflight entertainment is extensive and the interface easy to use, and the cabin crew is excellent. interesting though the a330 on the khi-ist route and return seemed to have more leg room and was newer than the a330 on the iad-ist route which was showing its age. the a330 on the iad-ist route had a slow responding interface for the inflight entertainment and a broken table on the return flight. but turkish airlines will be replacing the a330 on its flight to iad with the 787 sometime in the summer. turkish food was served on the return leg which i personally like, and i saw the cabin staff helping elderly passengers walk to the lavatory which was nice. overall another wonderful experience with turkish airlines.'

In [97]:
data.to_csv('../data/processed/cleaned_data.csv', index=False)

In [90]:
data

Unnamed: 0,airline,overall,author,review_date,customer_review,aircraft,traveller_type,cabin,route,date_flown,seat_comfort,cabin_service,food_bev,entertainment,ground_service,value_for_money,recommended,cleaned_review
7,Turkish Airlines,10.0,Zeshan Shah,6th May 2019,âœ… trip verified | flew on turkish airlines i...,A330,Solo Leisure,Economy Class,Washington Dulles to Karachi,April 2019,4.0,5.0,5.0,5.0,5.0,5.0,yes,trip verified | flew on turkish airlines iad-...
15,Turkish Airlines,2.0,S Gonser,29th April 2019,âœ… trip verified | basel to cape town via ist...,Boeing 737-800 / A330-300,Solo Leisure,Economy Class,Basel to Cape Town via Istanbul,April 2019,3.0,3.0,2.0,3.0,1.0,2.0,no,trip verified | basel to cape town via istanb...
17,Turkish Airlines,6.0,Sami Osman,29th April 2019,not verified | abu dhabi to luxembourg via ist...,A320 / Boeing 737,Solo Leisure,Economy Class,Abu Dhabi to Luxembourg via Istanbul,April 2019,2.0,3.0,3.0,3.0,3.0,3.0,yes,not verified | abu dhabi to luxembourg via ist...
19,Turkish Airlines,1.0,Norka Idalia Orlando,28th April 2019,âœ… trip verified | the experience with turkis...,A320 / A330,Solo Leisure,Economy Class,Venice to Boston via Istanbul,February 2019,1.0,1.0,1.0,1.0,1.0,1.0,no,trip verified | the experience with turkish a...
29,Turkish Airlines,2.0,Trevor Khurana,24th April 2019,âœ… trip verified | houston to kiev via istanb...,Boeing 777-300,Solo Leisure,Economy Class,Houston to Kiev via Istanbul,March 2019,1.0,3.0,2.0,2.0,1.0,1.0,no,trip verified | houston to kiev via istanbul....
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
131676,Ukraine International,6.0,A Dunduras,10th October 2015,ukraine international it isn't for everyone. i...,Boeing 767,Solo Leisure,Economy Class,JFK to VNO via KBP,September 2015,2.0,4.0,2.0,1.0,4.0,5.0,yes,ukraine international it isn't for everyone. i...
131680,Ukraine International,7.0,Filip Badziak,6th October 2015,"warsaw to kiev the plane was on time, kiev air...",Boeing 767,Couple Leisure,Economy Class,WAW to BKK via KBP,January 2015,2.0,2.0,2.0,1.0,2.0,5.0,yes,"warsaw to kiev the plane was on time, kiev air..."
131696,Ukraine International,1.0,Nataliya Vasylkevych,6th August 2015,never flying them again and discourage anyone ...,Boeing,Family Leisure,Economy Class,New York to Lviv,July 2015,1.0,1.0,1.0,1.0,1.0,1.0,no,never flying them again and discourage anyone ...
131702,Ukraine International,10.0,Oleksii Maksimov,20th July 2015,clean and comfortable cabin. we were offered a...,Boeing 737,Couple Leisure,Economy Class,KBP to RIX,July 2015,5.0,5.0,5.0,4.0,4.0,4.0,yes,clean and comfortable cabin. we were offered a...
