In [56]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords

In [64]:
# nltk variables

nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/bekezhanissabek/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [63]:
## All methods

def clean_text(text):
    text = re.sub(r'[^a-zA-Z\s]', '', text) 
    text = text.strip()
    return text
    
def remove_stopwords(text):
    words = text.split()
    filtered_text = [word for word in words if word not in stop_words]
    return ' '.join(filtered_text)

def remove_special_chars(text):
    return re.sub(r'[^\x00-\x7F]+', '', text)


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/bekezhanissabek/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [30]:
data = pd.read_excel('../data/raw/capstone_airline_reviews3.xlsx')

In [31]:
data.head()

Unnamed: 0,airline,overall,author,review_date,customer_review,aircraft,traveller_type,cabin,route,date_flown,seat_comfort,cabin_service,food_bev,entertainment,ground_service,value_for_money,recommended
0,,,,,,,,,,,,,,,,,
1,Turkish Airlines,7.0,Christopher Hackley,8th May 2019,âœ… Trip Verified | London to Izmir via Istanb...,,Business,Economy Class,London to Izmir via Istanbul,2019-05-01 00:00:00,4.0,5.0,4.0,4.0,2.0,4.0,yes
2,,,,,,,,,,,,,,,,,
3,Turkish Airlines,2.0,Adriana Pisoi,7th May 2019,âœ… Trip Verified | Istanbul to Bucharest. We ...,,Family Leisure,Economy Class,Istanbul to Bucharest,2019-05-01 00:00:00,4.0,1.0,1.0,1.0,1.0,1.0,no
4,,,,,,,,,,,,,,,,,


In [32]:
data.shape

(131895, 17)

In [33]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 131895 entries, 0 to 131894
Data columns (total 17 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   airline          65947 non-null  object 
 1   overall          64017 non-null  float64
 2   author           65947 non-null  object 
 3   review_date      65947 non-null  object 
 4   customer_review  65947 non-null  object 
 5   aircraft         19718 non-null  object 
 6   traveller_type   39755 non-null  object 
 7   cabin            63303 non-null  object 
 8   route            39726 non-null  object 
 9   date_flown       39633 non-null  object 
 10  seat_comfort     60681 non-null  float64
 11  cabin_service    60715 non-null  float64
 12  food_bev         52608 non-null  float64
 13  entertainment    44193 non-null  float64
 14  ground_service   39358 non-null  float64
 15  value_for_money  63975 non-null  float64
 16  recommended      64440 non-null  object 
dtypes: float64

In [34]:
data.describe()

Unnamed: 0,overall,seat_comfort,cabin_service,food_bev,entertainment,ground_service,value_for_money
count,64017.0,60681.0,60715.0,52608.0,44193.0,39358.0,63975.0
mean,5.14543,2.95216,3.191814,2.90817,2.863372,2.69282,2.943962
std,3.477532,1.441362,1.565789,1.481893,1.507262,1.612215,1.58737
min,1.0,1.0,1.0,1.0,1.0,1.0,1.0
25%,1.0,1.0,2.0,1.0,1.0,1.0,1.0
50%,5.0,3.0,3.0,3.0,3.0,3.0,3.0
75%,9.0,4.0,5.0,4.0,4.0,4.0,4.0
max,10.0,5.0,5.0,5.0,5.0,5.0,5.0


<h2>Data cleaning</h2>

In [35]:
print(f"Count of null data:\n{data.isnull().sum()}")

Count of null data:
airline             65948
overall             67878
author              65948
review_date         65948
customer_review     65948
aircraft           112177
traveller_type      92140
cabin               68592
route               92169
date_flown          92262
seat_comfort        71214
cabin_service       71180
food_bev            79287
entertainment       87702
ground_service      92537
value_for_money     67920
recommended         67455
dtype: int64


In [36]:
print(f"Count of duplicated data: \n{data.duplicated().sum()}")

Count of duplicated data: 
70711


In [37]:
data.dropna(inplace=True)
data.drop_duplicates(inplace=True)

data.isnull().sum()

airline            0
overall            0
author             0
review_date        0
customer_review    0
aircraft           0
traveller_type     0
cabin              0
route              0
date_flown         0
seat_comfort       0
cabin_service      0
food_bev           0
entertainment      0
ground_service     0
value_for_money    0
recommended        0
dtype: int64

In [38]:
data.duplicated().sum()

0

In [39]:
data.shape

(13189, 17)

In [54]:
data['customer_review'] = data['customer_review'].str.lower()  # Customer review to lower register
data['customer_review'].head(5)[7]

'âœ… trip verified | flew on turkish airlines iad-ist-khi and return khi-ist-iad. turkish airlines has consistently maintained its quality since i first flew with them in 2007. the flights leave on time, the catering is excellent, the inflight entertainment is extensive and the interface easy to use, and the cabin crew is excellent. interesting though the a330 on the khi-ist route and return seemed to have more leg room and was newer than the a330 on the iad-ist route which was showing its age. the a330 on the iad-ist route had a slow responding interface for the inflight entertainment and a broken table on the return flight. but turkish airlines will be replacing the a330 on its flight to iad with the 787 sometime in the summer. turkish food was served on the return leg which i personally like, and i saw the cabin staff helping elderly passengers walk to the lavatory which was nice. overall another wonderful experience with turkish airlines.'

In [65]:
data['cleaned_review'] = data['customer_review'].apply(clean_text) # remove symbols from text
data['cleaned_review'] = data['customer_review'].apply(remove_stopwords)  # remove stop words from text
data['cleaned_review'] = data['customer_review'].apply(remove_special_chars)  # remove special symbols from text

In [66]:
data['cleaned_review'][7]

' trip verified | flew on turkish airlines iad-ist-khi and return khi-ist-iad. turkish airlines has consistently maintained its quality since i first flew with them in 2007. the flights leave on time, the catering is excellent, the inflight entertainment is extensive and the interface easy to use, and the cabin crew is excellent. interesting though the a330 on the khi-ist route and return seemed to have more leg room and was newer than the a330 on the iad-ist route which was showing its age. the a330 on the iad-ist route had a slow responding interface for the inflight entertainment and a broken table on the return flight. but turkish airlines will be replacing the a330 on its flight to iad with the 787 sometime in the summer. turkish food was served on the return leg which i personally like, and i saw the cabin staff helping elderly passengers walk to the lavatory which was nice. overall another wonderful experience with turkish airlines.'

In [67]:
data.to_csv('../data/processed/cleaned_data.csv', index=False)