# Libraries

In [1]:
import sys
# Add the project-local module directory to the import path so the cells
# below can import from it (presumably where `yelpquery` lives — confirm).
# NOTE(review): the path is relative, so it depends on the kernel's working directory.
pathModulesES = '../sauceforyall/'
sys.path.append(pathModulesES)

**Elasticsearch Query**

In [2]:
from yelpquery import YelpQuery
from pandasticsearch import Select
# Single query helper instance reused by the data-loading cells
# (ye.getRangeTerm / ye.getResultScrolling).
ye = YelpQuery()

**Machine Learning**

In [6]:
import pandas as pd
import numpy as np
import nltk
import pickle

In [7]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

**Visualize**

In [8]:
import matplotlib.pyplot as plt
import seaborn as sns

**Others**

In [9]:
import re
import string

**Index name**

In [10]:
# Index name patterns, one per Yelp entity. The trailing "*" is presumably a
# wildcard matching every index with that prefix — confirm against the cluster.
# In this part of the notebook only `index_review` is queried.
index_business = "yelp-business*"
index_review = "yelp-review*"
index_tip = "yelp-tip*"
index_user = "yelp-user*"

# 1. Data Pre-Processing

## 1.1 Load data

Retrieve all reviews dated from 2016-01-01 through 2019-12-31 (four calendar years); restricting the date range keeps the data volume manageable.

In [11]:
# First page of the date-range query over the review index; the result is
# handed to ye.getResultScrolling in the next cell.
# NOTE(review): `size` is presumably the per-page batch size — confirm in YelpQuery.
firstReview = ye.getRangeTerm(
    index=index_review,
    term="date",
    beginDate="2016-01-01T00:00:00.000",
    endDate="2019-12-31T23:59:59.000",
    size=2500,
)

In [12]:
# Fetch the full result set starting from the first page. Later cells treat
# `df_review` as a pandas DataFrame with a "text" column (presumably one row
# per review — confirm against YelpQuery.getResultScrolling).
df_review = ye.getResultScrolling(firstReview)

## 1.2 Text Cleaning

### 1.2.1 Remove Carriage Return and Newline Characters

In [13]:
# A run of line breaks (LF, CR or CRLF) plus any whitespace touching it.
# Raw string: "\s" in a normal string literal is an invalid escape sequence.
_LINE_BREAK_RUN = re.compile(r'\s*[\r\n]+\s*')


def remove_carriage(text):
    """Replace every line break, and the whitespace around it, with one space.

    Handles "\\n", "\\r\\n" and a lone "\\r" alike, so text from any platform
    ends up on a single line.

    Args:
        text: The string to flatten.

    Returns:
        ``text`` with each line-break run collapsed to a single space.
        Whitespace that does not touch a line break is left untouched.
    """
    return _LINE_BREAK_RUN.sub(' ', text)

# One or more consecutive whitespace characters.
# Raw string: "\s" in a normal string literal is an invalid escape sequence.
_WHITESPACE_RUN = re.compile(r'\s+')


def remove_spaces(text):
    """Collapse each run of whitespace into a single space.

    Args:
        text: The string to normalise.

    Returns:
        ``text`` with every whitespace run (spaces, tabs, line breaks)
        replaced by one space. Leading and trailing whitespace is collapsed
        to a single space, not stripped.
    """
    return _WHITESPACE_RUN.sub(' ', text)

### 1.2.2 Remove Special Characters

In [14]:
# Symbols replaced by a space:  ! " # $ % & ( ) / < = > @ [ ] ^ _ ` { | } ~
# Written as a raw string with "[" and "]" escaped, which avoids the
# invalid-escape warning and the "possible nested set" FutureWarning that the
# unescaped form triggers.
# NOTE(review): backslash is not part of the set, and neither are
# ' * + , - . : ; ? — sentence punctuation and apostrophes are kept.
_SPECIAL_CHARACTERS = re.compile(r'[!"#$%&()/<=>@\[\]^_`{|}~]')


def remove_weird_character(text):
    """Replace each special symbol in ``text`` with a single space.

    Args:
        text: The string to clean.

    Returns:
        ``text`` with every character from the symbol set above replaced by
        a space. Replacement is one-for-one, so the length is unchanged and
        consecutive symbols yield consecutive spaces.
    """
    return _SPECIAL_CHARACTERS.sub(' ', text)

### 1.2.3 Expand Contractions

In [15]:
# Irregular contractions whose stem cannot be recovered by a generic suffix
# rule. They are matched case-insensitively at a word boundary so that
# "Won't" / "Can't" / "Ain't" are expanded here instead of being mangled into
# "Wo not" / "Ca not" / "Ai not" by the generic n't rule below.
_IRREGULAR_CONTRACTIONS = [
    (re.compile(r"\bwon't", re.IGNORECASE), 'will not'),
    (re.compile(r"\bcan't", re.IGNORECASE), 'cannot'),
    (re.compile(r"\bi'm", re.IGNORECASE), 'i am'),
    (re.compile(r"\bain't", re.IGNORECASE), 'is not'),
]

# Generic suffix rules, applied in this order after the irregular forms.
# Replacement templates are raw strings: "\g" is not a valid escape in a
# normal string literal.
_SUFFIX_CONTRACTIONS = [
    (re.compile(r"(\w+)'ll"), r'\g<1> will'),
    (re.compile(r"(\w+)n't"), r'\g<1> not'),
    (re.compile(r"(\w+)'ve"), r'\g<1> have'),
    (re.compile(r"(\w+)'s"), r'\g<1> is'),
    (re.compile(r"(\w+)'re"), r'\g<1> are'),
    (re.compile(r"(\w+)'d"), r'\g<1> would'),
]


def _keep_initial_case(expansion):
    """Build a replacement callback that capitalises ``expansion`` when the match was."""
    def _replace(match):
        if match.group(0)[0].isupper():
            return expansion[0].upper() + expansion[1:]
        return expansion
    return _replace


def remove_contractions(text):
    """Expand English contractions written with an ASCII apostrophe.

    Irregular forms (won't, can't, i'm, ain't) are expanded first, keeping the
    capitalisation of their first letter; the generic suffix rules follow.

    Limitations of the rules as written: every ``'s`` becomes " is"
    (possessives included) and every ``'d`` becomes " would".

    Args:
        text: The string to process.

    Returns:
        ``text`` with the recognised contractions expanded.
    """
    for pattern, expansion in _IRREGULAR_CONTRACTIONS:
        text = pattern.sub(_keep_initial_case(expansion), text)
    for pattern, template in _SUFFIX_CONTRACTIONS:
        text = pattern.sub(template, text)
    return text

### 1.2.4 Remove Repeated Characters

In [16]:
import nltk

# Fetch the NLTK data packages used by the helpers below (WordNet lookups,
# tokenisation, POS tagging). Packages already on disk are reported as
# up-to-date in the cell output.
for nltk_resource in ('wordnet', 'punkt', 'averaged_perceptron_tagger'):
    nltk.download(nltk_resource)

from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package wordnet to
[nltk_data]     /home/hongphuc95/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /home/hongphuc95/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/hongphuc95/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [17]:
# A doubled word character with whatever word characters surround it.
# Because the leading group is greedy, the pair closest to the end of the
# word is the one that matches.
_DOUBLED_CHAR = re.compile(r'(\w*)(\w)\2(\w*)')


def remove_repeated(word):
    """Shorten exaggerated spellings ("looooove") until WordNet knows the word.

    One character is dropped from a doubled pair per pass. The loop stops as
    soon as the current spelling has at least one WordNet synset, or when no
    doubled character is left. Written as a loop rather than recursion so
    that very long runs cannot exhaust the interpreter's recursion limit.

    Args:
        word: A single token.

    Returns:
        The first spelling found in WordNet, or the fully de-duplicated
        spelling if none is found.
    """
    while not wordnet.synsets(word):
        shortened = _DOUBLED_CHAR.sub(r'\1\2\3', word)
        if shortened == word:
            # Nothing left to squeeze out.
            break
        word = shortened
    return word

### 1.2.5 Check Spelling

In [18]:
# TODO: implement real spelling correction.
def spelling(word):
    """Placeholder for spelling correction; currently returns ``word`` unchanged.

    Returning the input instead of ``None`` keeps the function safe to plug
    into a token pipeline before the real implementation exists.

    Args:
        word: A single token.

    Returns:
        The same token, unmodified.
    """
    return word

### 1.2.6 Lemmatization

In [19]:
def get_wordnet_pos(word):
    """Return the WordNet part-of-speech constant for a single word.

    The word is tagged on its own (a one-token list passed to
    ``nltk.pos_tag``), and the first letter of the resulting tag selects
    adjective, verb, adverb or noun. Any other tag falls back to noun.
    """
    penn_tag = nltk.pos_tag([word])[0][1]
    initial = penn_tag[0].upper()

    if initial == "J":
        return wordnet.ADJ
    if initial == "V":
        return wordnet.VERB
    if initial == "R":
        return wordnet.ADV
    # "N" and every unmapped tag are treated as nouns.
    return wordnet.NOUN

def lemmatize_sentence(document):
    """Lemmatize every token of ``document`` and rebuild it as one string.

    The document is split into sentences, each sentence into tokens. Every
    token goes through ``remove_repeated`` and is then lemmatized with the
    POS returned by ``get_wordnet_pos`` for that token alone. Tokens are
    re-joined with spaces, except that tokens starting with an apostrophe and
    tokens found in ``string.punctuation`` attach directly to the preceding
    token. Sentences are joined with a single space.
    """
    wn_lemmatizer = WordNetLemmatizer()
    rebuilt_sentences = []

    for raw_sentence in nltk.sent_tokenize(document):
        pieces = []
        for token in nltk.word_tokenize(raw_sentence):
            token = remove_repeated(token)
            # The spelling() hook is not applied here.
            lemma = wn_lemmatizer.lemmatize(token, get_wordnet_pos(token))
            attaches_left = lemma.startswith("'") or lemma in string.punctuation
            pieces.append(lemma if attaches_left else " " + lemma)
        rebuilt_sentences.append("".join(pieces).strip())

    return " ".join(rebuilt_sentences)

## 1.3 Text Processing

**Example**

In [20]:
# Flatten line breaks in every review text.
df_review["text"] = df_review["text"].apply(remove_carriage)

In [21]:
# Replace special symbols with spaces in every review text.
df_review["text"] = df_review["text"].apply(remove_weird_character)

In [22]:
# Collapse the whitespace runs left behind by the previous steps.
df_review["text"] = df_review["text"].apply(remove_spaces)

In [23]:
# Expand contractions ("don't" -> "do not") in every review text.
df_review["text"] = df_review["text"].apply(remove_contractions)

In [24]:
# Lemmatize every review text, token by token.
df_review["text"] = df_review["text"].apply(lemmatize_sentence)

In [25]:
# Columns that look like index/ingest metadata rather than review content
# (names suggest Elasticsearch/Logstash bookkeeping — confirm).
METADATA_COLUMNS = ["_index", "_type", "_id", "@timestamp", "@version", "_score"]

# Drop them in one call instead of six successive full-frame copies.
# errors="ignore" keeps the cell re-runnable: a second run no longer raises
# KeyError because the columns are already gone.
df_review = df_review.drop(columns=METADATA_COLUMNS, errors="ignore")

In [24]:
# Write the cleaned reviews as line-delimited JSON, one record per line.
df_review.to_json(
    "./review_cleaned_2016_2019.json",
    orient="records",
    lines=True,
)