In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


In [13]:
data = pd.read_csv("data/BA_reviews.csv")

In [14]:
data.head(4)

Unnamed: 0.1,Unnamed: 0,reviews
0,0,Not Verified | Happy with the flight crew. Ha...
1,1,✅ Trip Verified | Horrible service from boar...
2,2,Not Verified | My wife and I are very disappo...
3,3,Not Verified | We flew BA between Heathrow an...


In [15]:
data.drop(columns='Unnamed: 0', inplace=True)

In [16]:
data.head()

Unnamed: 0,reviews
0,Not Verified | Happy with the flight crew. Ha...
1,✅ Trip Verified | Horrible service from boar...
2,Not Verified | My wife and I are very disappo...
3,Not Verified | We flew BA between Heathrow an...
4,Not Verified | Absolutely disgusted with BA. ...


In [None]:
data.info()

In [37]:
data.drop_duplicates(inplace=True)

In [None]:
data.duplicated().sum() 

In [17]:
# clean the reviews column, validation from the text

data.reviews = data.reviews.str.split("|", expand=True).get(1)

In [18]:
data.head()

Unnamed: 0,reviews
0,Happy with the flight crew. Hadn't been on a...
1,Horrible service from boarding to landing. ...
2,My wife and I are very disappointed with fly...
3,We flew BA between Heathrow and Berlin one w...
4,Absolutely disgusted with BA. Our flights we...


##### Preprocess the review column for natural language processing


In [19]:
import nltk 
from nltk.corpus import stopwords
import re 


In [25]:
# download the stopwords 
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('punkt_tab')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package punkt to /home/ksilas/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/ksilas/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /home/ksilas/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


In [30]:
# text preprocessing function
def preprocess(text):
    # Remove special characters and digits
    if not isinstance(text, str):
        return ""
    text = re.sub(r'\W', ' ', text)
    text = re.sub(r'\s+[a-zA-Z]\s+', ' ', text)  # Remove single characters
    text = re.sub(r'\^[a-zA-Z]\s+', ' ', text)  # Remove single characters from start
    text = re.sub(r'\s+', ' ', text, flags=re.I)  # Replace multiple spaces with a single space
    text = text.lower()  # Lowercase text
    tokens = nltk.word_tokenize(text)  # Tokenize
    tokens = [word for word in tokens if word not in stop_words]  # Remove stopwords
    return tokens

In [31]:
# apply the text preprocessing function to the reviews dataset
data['processed_reviews'] = data.reviews.apply(preprocess)
print(data['processed_reviews'][])

0       [happy, flight, crew, plane, 20, years, stress...
1       [horrible, service, boarding, landing, flew, l...
2       [wife, disappointed, flying, british, airways,...
3       [flew, ba, heathrow, berlin, one, way, connect...
4       [absolutely, disgusted, ba, flights, cancelled...
                              ...                        
1995    [los, angeles, london, heathrow, starting, cha...
1996    [gatwick, barcelona, unimpressed, new, board, ...
1997    [london, heathrow, rio, de, janeiro, 45, minut...
1998    [london, heathrow, new, york, paid, 250, pre, ...
1999    [flew, british, airways, economy, tampa, athen...
Name: processed_reviews, Length: 2000, dtype: object


In [33]:
data.head()

Unnamed: 0,reviews,processed_reviews
0,Happy with the flight crew. Hadn't been on a...,"[happy, flight, crew, plane, 20, years, stress..."
1,Horrible service from boarding to landing. ...,"[horrible, service, boarding, landing, flew, l..."
2,My wife and I are very disappointed with fly...,"[wife, disappointed, flying, british, airways,..."
3,We flew BA between Heathrow and Berlin one w...,"[flew, ba, heathrow, berlin, one, way, connect..."
4,Absolutely disgusted with BA. Our flights we...,"[absolutely, disgusted, ba, flights, cancelled..."


#### Create a Dictionary and Corpus
##### create a dictionary and corpus from the preprocessed text data.



In [34]:
from gensim import corpora 

In [35]:
# Create dictionary and corpus
dictionary = corpora.Dictionary(data['processed_reviews'])
corpus = [dictionary.doc2bow(review) for review in data['processed_reviews']]

TypeError: doc2bow expects an array of unicode tokens on input, not a single string