In [4]:
pip install ipykernel --upgrade


Collecting ipykernel
  Using cached ipykernel-6.25.0-py3-none-any.whl (153 kB)
Collecting comm>=0.1.1
  Using cached comm-0.1.4-py3-none-any.whl (6.6 kB)
Collecting jupyter-core!=5.0.*,>=4.12
  Using cached jupyter_core-5.3.1-py3-none-any.whl (93 kB)
Installing collected packages: jupyter-core, comm, ipykernel
  Attempting uninstall: jupyter-core
    Found existing installation: jupyter_core 4.11.1
    Uninstalling jupyter_core-4.11.1:
      Successfully uninstalled jupyter_core-4.11.1
  Attempting uninstall: ipykernel
    Found existing installation: ipykernel 6.15.2
    Uninstalling ipykernel-6.15.2:
      Successfully uninstalled ipykernel-6.15.2
Successfully installed comm-0.1.4 ipykernel-6.25.0 jupyter-core-5.3.1


ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
conda-repo-cli 1.0.20 requires clyent==1.2.1, but you have clyent 1.2.2 which is incompatible.
conda-repo-cli 1.0.20 requires nbformat==5.4.0, but you have nbformat 5.5.0 which is incompatible.


# Data Cleaning 

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

import re


In [None]:
cwd = os.getcwd()
df= pd.read_csv(cwd+"/BA_reviews.csv",index_col=0)


In [7]:
df.head()

Unnamed: 0,reviews,stars,date,country
0,✅ Trip Verified | Late boarding led to a one ...,5,5th August 2023,(United Kingdom)
1,✅ Trip Verified | As usual the flight is delay...,1,4th August 2023,(United Kingdom)
2,✅ Trip Verified | I had the most fantastic BA...,1,1st August 2023,(Hong Kong)
3,✅ Trip Verified | Couldn’t book in online. Ar...,10,31st July 2023,(United Kingdom)
4,✅ Trip Verified | London Heathrow to Mumbai in...,1,31st July 2023,(Iceland)


In [8]:
df['verified'] = df.reviews.str.contains("Trip Verified")

In [9]:
df['verified']

0       True
1       True
2       True
3       True
4       True
        ... 
4195    True
4196    True
4197    True
4198    True
4199    True
Name: verified, Length: 4200, dtype: bool

# cleaning reviews

In [10]:
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Error loading stopwords: <urlopen error [WinError 10060] A
[nltk_data]     connection attempt failed because the connected party
[nltk_data]     did not properly respond after a period of time, or
[nltk_data]     established connection failed because connected host
[nltk_data]     has failed to respond>
[nltk_data] Error loading wordnet: <urlopen error [WinError 10060] A
[nltk_data]     connection attempt failed because the connected party
[nltk_data]     did not properly respond after a period of time, or
[nltk_data]     established connection failed because connected host
[nltk_data]     has failed to respond>
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\acer\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [11]:
import math

In [12]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
lemma = WordNetLemmatizer()

reviews_data = df.reviews.str.strip("Trip verified |")

corpus =[]

for rev in reviews_data:
    rev = re.sub('[^a-zA-Z]',' ',rev)
    rev = rev.lower()
    rev = rev.split()
    rev = [lemma.lemmatize(word) for word in rev if word not in set(stopwords.words("english"))]
    rev = " ".join(rev)
    corpus.append(rev)

In [13]:
df['corpus']= corpus 


In [None]:
df.head()

# cleaning/format data

In [15]:
df.dtypes

reviews     object
stars        int64
date        object
country     object
verified      bool
corpus      object
dtype: object

In [16]:
df.date = pd.to_datetime(df.date)

In [17]:
df.date.head()

0   2023-08-05
1   2023-08-04
2   2023-08-01
3   2023-07-31
4   2023-07-31
Name: date, dtype: datetime64[ns]

# cleaning ratings with stars 

In [18]:
df.stars.unique()

array([ 5,  1, 10,  9,  6,  2,  4,  8,  3,  7], dtype=int64)

In [19]:
df.stars.value_counts()

1     1350
2      482
10     408
3      385
5      338
9      329
8      296
4      241
7      217
6      154
Name: stars, dtype: int64

# check for null values 

In [20]:
df.isnull().value_counts()

reviews  stars  date   country  verified  corpus
False    False  False  False    False     False     4200
dtype: int64

In [21]:
df.country.isnull().value_counts()

False    4200
Name: country, dtype: int64

In [22]:
df.drop(df[df.country.isnull()==True].index, axis=0,inplace= True)

In [23]:
df.shape

(4200, 6)

In [24]:
df.reset_index(drop=True)

Unnamed: 0,reviews,stars,date,country,verified,corpus
0,✅ Trip Verified | Late boarding led to a one ...,5,2023-08-05,(United Kingdom),True,trip verified late boarding led one hour fligh...
1,✅ Trip Verified | As usual the flight is delay...,1,2023-08-04,(United Kingdom),True,trip verified usual flight delayed ba try blam...
2,✅ Trip Verified | I had the most fantastic BA...,1,2023-08-01,(Hong Kong),True,trip verified fantastic ba flight today cabin ...
3,✅ Trip Verified | Couldn’t book in online. Ar...,10,2023-07-31,(United Kingdom),True,trip verified book online arrived check find b...
4,✅ Trip Verified | London Heathrow to Mumbai in...,1,2023-07-31,(Iceland),True,trip verified london heathrow mumbai boeing bu...
...,...,...,...,...,...,...
4195,✅ Trip Verified | Flew London Gatwick to Tiran...,2,2018-07-30,(United Kingdom),True,trip verified flew london gatwick tirana rinas...
4196,✅ Trip Verified | London Heathrow to Murcia a...,3,2018-07-29,(China),True,trip verified london heathrow murcia back prob...
4197,✅ Trip Verified | London to Amsterdam. BA's s...,7,2018-07-28,(United Kingdom),True,trip verified london amsterdam ba service gone...
4198,✅ Trip Verified | Aberdeen to Abu Dhabi via L...,5,2018-07-27,(United Kingdom),True,trip verified aberdeen abu dhabi via london de...


In [25]:
df.to_csv(cwd + "/cleaned-BA-reviews.csv")


In [26]:
df.to_csv(cwd+ "/cleaned-

SyntaxError: EOL while scanning string literal (4140664488.py, line 1)