In [None]:
"""The goal of this notebook is to process the booking.com reviews to the desired format. It includes:
1. Tokenising reviews to sentences. 
2. Removing Named entities from the sentences. 
3. Removing adjectives, adverbs, and other sentiment words from sentences. 
4. Removing stop words from sentences. 
This is only for the english data."""

In [1]:
#install desired packages
# !pip install nltk -q
# !pip install spacy -q
# !pip install numpy -q
# !pip install pandas -q
# nltk.download('all')
# spacy.load('en_core_web_sm')
# import ssl
# ssl._create_default_https_context = ssl._create_unverified_context

In [17]:
#imports
import nltk
import spacy
import numpy as np
import pandas as pd
from tqdm import tqdm
import spacy
import string

In [13]:
df = pd.read_csv('./booking_reviews.csv')
df = df.fillna("")
#get the reviews
reviews = []
for i in tqdm(range(len(df))):
  neg_review = df.iloc[i].neg_review
  pos_review = df.iloc[i].pos_review
  if neg_review != "" and pos_review!= "":
    reviews.append(neg_review + " " + pos_review)


100%|██████████| 100000/100000 [00:25<00:00, 3848.26it/s]


In [14]:
reviews[:10]

["I think breakfast could be improved. I have never stayed before at Hotel Hetman and I must admit it was a great discovery.  The hotel is located in the Praga neighborhood of Warsaw, which was once run down, but now is enjoying a renaissance, fueled by the new metro line; the Wilenska stop which is literally right next to the hotel.  Another option of traveling to central Warsaw is by tram, which is also conveniently close.  Another advantage of Hetman is the proximity to Le Cedre, Warsaw's leading Lebanese restaurant for over 20 years.",
 'Some cleaning issues but these were resolved immediately. There were a few maintenance issues. Great views, comfy beds, lovely hot tub.',
 "Breakfast was a bit disappointing, as there wasn't much choice. Probably because of the weird times we are living now, but still other options could have been offered The staff is really friendly and gives you tips about the island.Our room was amazing, with a great view!The location is also good, as you are ju

In [32]:
#sentence tokenization
sentences = []
for r in tqdm(reviews):
    sentences.extend(nltk.tokenize.sent_tokenize(r))

100%|██████████| 42065/42065 [00:04<00:00, 9998.06it/s] 


In [34]:
#named entity removal using spacy
nlp = spacy.load("en_core_web_sm")
for i in tqdm(range(len(sentences))):
  s = sentences[i]
  doc = nlp(s)
  named_entities = [ent.text for ent in doc.ents]
  s = nltk.word_tokenize(s)
  s = [word for word in s if word not in named_entities]
  sentences[i] = " ".join(s)
  

100%|██████████| 148114/148114 [16:14<00:00, 152.04it/s]


In [38]:
#sentiment words removal
sentiment_tags = ['JJ','JJR','JJS', 'RBR' ,'RBS', 'CD']
for i in tqdm(range(len(sentences))):
    s = sentences[i].split()
    pos_tagged = nltk.pos_tag(s)
    sentiment_words = [t[0] for t in pos_tagged if t[1] in sentiment_tags]
    s = [word for word in s if word not in sentiment_words]
    sentences[i] = " ".join(s)


In [43]:
#stop word removal -> technically in top2vec, no need to remove
# stopwords = nltk.corpus.stopwords.words("english")
# for i in tqdm(range(len(sentences))):
#   s = sentences[i].split()
#   s = [word for word in s if word not in stopwords]
#   sentences[i] = " ".join(s)
sentences = [s.lower() for s in sentences]

In [50]:
#lemmatization/stemming
import spacy
nlp = spacy.load("en_core_web_sm")

for i in tqdm(range(len(sentences))):
    s = nlp(sentences[i])
    lemmatised_s = []
    for token in s:
        lemmatised_s.append(token.lemma_)
    sentences[i] = " ".join(lemmatised_s)
#test
sentences[13]

100%|██████████| 148114/148114 [18:16<00:00, 135.12it/s]


'I ’ m pleased we choose the side as it be shelter ( and believe I , it can get ! )'

In [52]:
#get list of punctuations and remove the word if it is a punctuation
import string
punctuations = string.punctuation
for i in tqdm(range(len(sentences))):
    s = sentences[i]
    tokens = s.split()
    tokens = [t for t in tokens if t not in punctuations]
    sentences[i] = " ".join(tokens)

100%|██████████| 148114/148114 [00:00<00:00, 200088.94it/s]


In [54]:
#Spell Correction
#Since training our own model, no need to do spell correction
#But there should be some cut off.  
sentences[1000]   

'same of the staff be lack training'

In [55]:
len(sentences)

148114

In [61]:
#save sentences with pickle
with open("booking_sentences_processed.txt", 'w') as f:
    for s in sentences:
        f.writelines(s)
        f.write('\n')

In [26]:
def text_processing_pipeline(text):
    """text is list of texts"""

    #sentence tokenization
    # sentences = []
    # print("sentence tokenization...")
    # for r in tqdm(text):
    #     sentences.extend(nltk.tokenize.sent_tokenize(r))

    sentences = text

    #NER
    print("named entity removal...")
    nlp = spacy.load("en_core_web_sm")
    for i in tqdm(range(len(sentences))):
        s = sentences[i]
        doc = nlp(s)
        named_entities = [ent.text for ent in doc.ents]
        s = nltk.word_tokenize(s)
        s = [word for word in s if word not in named_entities]
        sentences[i] = " ".join(s)
    
    #sentiment words removal
    print("Removal of adjective words...")
    sentiment_tags = ['JJ','JJR','JJS', 'RBR' ,'RBS', 'CD']
    for i in tqdm(range(len(sentences))):
        s = sentences[i].split()
        pos_tagged = nltk.pos_tag(s)
        sentiment_words = [t[0] for t in pos_tagged if t[1] in sentiment_tags]
        s = [word for word in s if word not in sentiment_words]
        sentences[i] = " ".join(s)

    #case desensitise
    sentences = [s.lower() for s in sentences]

    #lemmatization/stemming
    nlp = spacy.load("en_core_web_sm")
    print("lemmatization...")
    for i in tqdm(range(len(sentences))):
        s = nlp(sentences[i])
        lemmatised_s = []
        for token in s:
            lemmatised_s.append(token.lemma_)
        sentences[i] = " ".join(lemmatised_s)

    #get list of punctuations and remove the word if it is a punctuation
    print("punctuation_removal...")
    punctuations = string.punctuation
    for i in tqdm(range(len(sentences))):
        s = sentences[i]
        tokens = s.split()
        tokens = [t for t in tokens if t not in punctuations]
        sentences[i] = " ".join(tokens)
    
    return sentences

In [27]:
#load sts dataset for evaluation
with open('sts-mt.csv', 'r') as f:
  sts_lines = f.readlines() 

sts_df = []
for l in sts_lines:
  sts_df.append(l[:-1].split('\t'))

sentences1, sentences2, scores = [], [], []
for l in sts_df:
  scores.append(l[3])
  sentences1.append(l[4])
  sentences2.append(l[5])
#scores are from 0 to 5. So the following line normalizes the score
scores = [float(s)/5.0 for s in scores]

In [28]:
sentences1 = text_processing_pipeline(sentences1)
sentences2 = text_processing_pipeline(sentences2)

named entity removal...
100%|██████████| 1836/1836 [00:31<00:00, 58.77it/s]
  3%|▎         | 61/1836 [00:00<00:02, 608.32it/s]Removal of adjective words...
100%|██████████| 1836/1836 [00:03<00:00, 488.05it/s]
  0%|          | 8/1836 [00:00<00:26, 67.80it/s]lemmatization...
100%|██████████| 1836/1836 [00:29<00:00, 61.51it/s]
100%|██████████| 1836/1836 [00:00<00:00, 97134.70it/s]
punctuation_removal...
named entity removal...
100%|██████████| 1836/1836 [00:34<00:00, 52.79it/s]
  4%|▎         | 66/1836 [00:00<00:02, 655.12it/s]Removal of adjective words...
100%|██████████| 1836/1836 [00:04<00:00, 439.87it/s]
  0%|          | 6/1836 [00:00<00:34, 52.64it/s]lemmatization...
100%|██████████| 1836/1836 [00:30<00:00, 61.06it/s]
100%|██████████| 1836/1836 [00:00<00:00, 90952.22it/s]punctuation_removal...



In [30]:
#save the files
# os.mkdir('sts_processed')

with open('sts_processed/sentences1.txt', 'w') as f:
    for s in sentences1:
        f.write(s)
        f.write('\n')
with open('sts_processed/sentences2.txt', 'w') as f:
    for s in sentences2:
        f.write(s)
        f.write('\n')
with open('sts_processed/scores.txt', 'w') as f:
    for score in scores:
        f.write(str(score))
        f.write('\n')

In [31]:
with open('sts_processed/sentences1.txt', 'r') as f:
    temp = f.readlines()
len(temp)


1836