In [1]:
"""The goal of this notebook is to process the booking.com reviews to the desired format. It includes:
1. Tokenising reviews to sentences. 
2. Removing Named entities from the sentences. 
3. Removing adjectives, adverbs, and other sentiment words from sentences. 
4. Removing stop words from sentences. 
This is only for the english data."""

'The goal of this notebook is to process the booking.com reviews to the desired format. It includes:\n1. Tokenising reviews to sentences. \n2. Removing Named entities from the sentences. \n3. Removing adjectives, adverbs, and other sentiment words from sentences. \n4. Removing stop words from sentences. \nThis is only for the english data.'

In [1]:
#imports
import nltk
import spacy
import numpy as np
import pandas as pd
from tqdm import tqdm
import spacy
import string
import pandas as pd

In [2]:

def text_processing_pipeline(text):
    """text is list of texts"""

    #sentence tokenization
    # sentences = []
    # print("sentence tokenization...")
    # for r in tqdm(text):
    #     sentences.extend(nltk.tokenize.sent_tokenize(r))

    sentences = text

    #NER
    print("named entity removal...")
    nlp = spacy.load("en_core_web_sm")
    for i in tqdm(range(len(sentences))):
        s = sentences[i]
        doc = nlp(s)
        named_entities = [ent.text for ent in doc.ents]
        s = nltk.word_tokenize(s)
        s = [word for word in s if word not in named_entities]
        sentences[i] = " ".join(s)
    
    #sentiment words removal
    print("Removal of adjective words...")
    sentiment_tags = ['JJ','JJR','JJS', 'RBR' ,'RBS', 'CD']
    for i in tqdm(range(len(sentences))):
        s = sentences[i].split()
        pos_tagged = nltk.pos_tag(s)
        sentiment_words = [t[0] for t in pos_tagged if t[1] in sentiment_tags]
        s = [word for word in s if word not in sentiment_words]
        sentences[i] = " ".join(s)

    #case desensitise
    sentences = [s.lower() for s in sentences]

    #lemmatization/stemming
    nlp = spacy.load("en_core_web_sm")
    print("lemmatization...")
    for i in tqdm(range(len(sentences))):
        s = nlp(sentences[i])
        lemmatised_s = []
        for token in s:
            lemmatised_s.append(token.lemma_)
        sentences[i] = " ".join(lemmatised_s)

    #get list of punctuations and remove the word if it is a punctuation
    print("punctuation_removal...")
    punctuations = string.punctuation
    for i in tqdm(range(len(sentences))):
        s = sentences[i]
        tokens = s.split()
        tokens = [t for t in tokens if t not in punctuations]
        sentences[i] = " ".join(tokens)
    
    return sentences

In [6]:
df = pd.read_csv('./data/booking_reviews.csv')
df = df.fillna("")
#get the reviews
reviews = []
for i in tqdm(range(len(df))):
  neg_review = df.iloc[i].neg_review
  pos_review = df.iloc[i].pos_review
  if neg_review != "" and pos_review!= "":
    reviews.append(neg_review + " " + pos_review)

100%|██████████| 100000/100000 [00:27<00:00, 3624.73it/s]


In [7]:
#sentence tokenization
sentences = []
for r in tqdm(reviews):
    sentences.extend(nltk.tokenize.sent_tokenize(r))

100%|██████████| 42065/42065 [00:04<00:00, 9556.90it/s]


In [8]:
#save sentences with pickle
with open("data/booking_sentences.txt", 'w') as f:
    for s in sentences:
        f.writelines(s)
        f.write('\n')