In [None]:
import os
import time
import datetime
from google.colab import drive
import pandas as pd
import seaborn as sns
import numpy as np
import random
import matplotlib.pyplot as plt
% matplotlib inline
import nltk
nltk.download('punkt')
from nltk.corpus import wordnet
import re
import spacy
from spacy_readability import Readability
# pip install spacy-readability

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
# Mount Google Drive at /content/drive; the cells below read from and write to
# "/content/drive/My Drive/NLP/".
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Data Preprocessing

In [None]:
# `review` collects every stage of the pipeline under a descriptive key.
# 'raw'      -> the full CSV, ordered by review text.
# 'filtered' -> the distinct texts of the 5-star reviews only.
review = {
    'raw': pd.read_csv("/content/drive/My Drive/NLP/yelp_restaurant_only_review.csv").sort_values("text"),
}
review['filtered'] = (
    review['raw']
    .loc[review['raw'].stars == 5.0, "text"]
    .drop_duplicates()
)

In [None]:
# Turn newlines and the exotic Unicode space characters into a plain ASCII space,
# then keep the result as a one-column ("text") DataFrame.
review['preprocessed'] = (
    review['filtered']
    .apply(lambda x: x.translate({
        ord("\n"): " ",
        ord(u'\xa0'): u' ',
        ord(u'\u2006'): u' ',
        ord(u'\u2009'): u' ',
        ord(u'\u3000'): u' ',
    }))
    .to_frame()
)

In [None]:
# Common preprocessing applied to every review

def sentence_preprocess(x):
    """Normalise one raw review string.

    Steps, in order:
      1. Keep only the pieces matched by the "english" pattern (ASCII letters,
         digits, whitespace and a fixed punctuation set). Anything else is
         dropped; note that the apostrophe is in none of the character classes,
         so "don't" comes out as "dont".
      2. Collapse runs of the same punctuation character ("!!!" -> "!").
      3. Lower-case the text, split it into sentences and capitalise each one.
      4. Upper-case a stand-alone "i".
      5. Replace every punctuation character other than . , ? ' ! _ $ @ with a
         space and squeeze repeated spaces.

    Parameters
    ----------
    x : str
        Raw review text.

    Returns
    -------
    str
        The cleaned text ("" when nothing in `x` matches the pattern).
    """
    # Imported locally: nothing in the notebook's import cell brings
    # `punctuation` into scope, so a fresh kernel would raise NameError.
    from string import punctuation

    #keep english review
    pattern = re.compile(r'[A-Za-z0-9]*\s+[<=>#$%&?.!,"{}()]*[A-Za-z0-9]*[<=>#$%&?.!,"{}()]*')
    step_1 = ''.join(pattern.findall(x)).strip()

    #remove duplicate punctuation
    # A back-reference collapses a run of one punctuation mark to a single copy;
    # repeated letters, digits and spaces are left untouched.
    step_2 = re.sub(r'([' + re.escape(punctuation) + r'])\1+', r'\1', step_1)

    #lower, then capitalize by sentence
    sentences = nltk.sent_tokenize(step_2.lower())
    step_3 = ' '.join(sentence.capitalize() for sentence in sentences).strip()

    # replace single i to I
    # Look-arounds check the neighbouring delimiter without consuming it, so
    # "and i." becomes "and I." (the period used to be overwritten by a space).
    step_4 = re.sub(r"(?<=[.?!_$@'\s])i(?=[.?!_$@'\s])", 'I', step_3)

    #remove unwanted punctuations
    remove_punc = set(punctuation) - set([".", ",", "?", "'", "!", "_", "$", "@"])
    step_4 = step_4.translate({ord(mark): " " for mark in remove_punc})

    #remove duplicate space
    step_5 = re.sub(' +', ' ', step_4)
    return step_5

In [None]:
# Clean every review, keep only those whose cleaned length is strictly between
# 60 and 600 characters, and renumber the surviving rows from 0.
review['preprocessed1'] = (
    review['preprocessed']["text"]
    .apply(sentence_preprocess)
    .loc[lambda cleaned: (cleaned.str.len() < 600) & (cleaned.str.len() > 60)]
    .reset_index(drop=True)
)
review['preprocessed1']

In [None]:
# Expand common contractions and abbreviations back to their full form
# Irregular contractions (with and without apostrophe) that must match the whole
# token. They are compiled case-insensitively because the sentence capitaliser
# produces forms such as "Dont" / "Cant" at the start of a sentence.
_WHOLE_WORD_EXPANSIONS = [
  (r"^won't$", 'will not'),
  (r"^wont$", 'will not'),
  (r"^can't$", 'cannot'),
  (r"^cant$", 'cannot'),
  (r"^didnt$", 'did not'),
  (r"^dont$", 'do not'),
  (r"^doesnt$", 'does not'),
  (r"^i'm$", 'I am'),
  (r"^im$", 'I am'),      # also covers "Im"
  (r"^ive$", 'I have'),   # also covers "Ive"
  (r"^ain't$", 'is not'),
  (r"^aint$", 'is not')]

# Generic "<word>'<suffix>" rules; \g<1> re-inserts the stem unchanged.
_SUFFIX_EXPANSIONS = [
  (r"(\w+)'ll", r'\g<1> will'),
  (r"(\w+)n't", r'\g<1> not'),
  (r"(\w+)'ve", r'\g<1> have'),
  (r"(\w+)'s", r'\g<1> is'),
  (r"(\w+)'re", r'\g<1> are'),
  (r"(\w+)'d", r'\g<1> would')]

# Compiled once, in priority order (the first rule that matches wins).
_ABBREVIATION_RULES = (
  [(re.compile(regex, re.IGNORECASE), repl) for (regex, repl) in _WHOLE_WORD_EXPANSIONS]
  + [(re.compile(regex), repl) for (regex, repl) in _SUFFIX_EXPANSIONS])


def abbreviation_return(word):
  """Expand a single contracted token into its full form.

  Parameters
  ----------
  word : str
      One token, e.g. "can't", "dont" or "they'll".

  Returns
  -------
  str
      The expansion given by the first matching rule ("cannot", "do not",
      "they will"), or `word` unchanged when no rule matches. When the token
      starts with an upper-case letter the expansion does too
      ("Dont" -> "Do not").
  """
  for (pattern, repl) in _ABBREVIATION_RULES:
      if pattern.search(word):
        expanded = pattern.sub(repl, word)
        # Keep sentence-initial capitalisation for the whole-word rules, whose
        # replacement text is written in lower case.
        if word[:1].isupper() and expanded[:1].islower():
          expanded = expanded[0].upper() + expanded[1:]
        return expanded
  return word

def recovery(x):
  """Tokenise `x`, expand each token with `abbreviation_return`, and return
  the tokens re-joined with single spaces (outer whitespace stripped)."""
  expanded_tokens = []
  for token in nltk.word_tokenize(x):
    expanded_tokens.append(str(abbreviation_return(token)))
  return ' '.join(expanded_tokens).strip()

# Stage 2: contraction expansion applied to every cleaned review.
review['preprocessed2'] = review['preprocessed1'].apply(recovery)

In [None]:
# Shorten elongated words such as "sooooooo" or "gooooooood"
def Remove_repeat_word(sentence):
  """Shrink elongated words ("sooooo" -> "so", "gooooood" -> "good").

  Each token that contains a doubled character and is unknown to WordNet has
  one character of a doubled pair removed, repeatedly, until the token either
  has no doubled character left or becomes a word WordNet knows.

  Parameters
  ----------
  sentence : str
      Text to process; it is split with `nltk.word_tokenize`.

  Returns
  -------
  str
      The tokens joined with single spaces.
  """
  # NOTE(review): `wordnet.synsets` needs the WordNet corpus, while the import
  # cell only downloads 'punkt' - confirm the corpus is available at runtime.
  words = nltk.word_tokenize(sentence)
  pattern = re.compile(r'^(\w*)(\w)\2(\w*)$')
  repl = r'\1\2\3'
  for i in range(len(words)):
    # `and` short-circuits: WordNet is only queried for tokens that really
    # contain a doubled character (the former `&` evaluated both sides for
    # every token).
    while pattern.search(words[i]) and not wordnet.synsets(words[i]):
      words[i] = pattern.sub(repl, words[i])
  return ' '.join(map(str, words))

# Stage 3: elongated-word clean-up applied to every expanded review.
review['preprocessed3'] = review['preprocessed2'].apply(Remove_repeat_word)

In [None]:

# Build the spaCy pipeline with the readability component as its last stage.
# NOTE(review): the 'en' shortcut and passing a component object to `add_pipe`
# look like spaCy 2.x-style calls - confirm the runtime pins a matching version.
nlp = spacy.load('en')
read = Readability()
nlp.add_pipe(read, last=True)

# One record per review: the text and its Automated Readability Index.
# The per-row mapping is called `record`, not `dict`, so the builtin `dict`
# stays usable in later cells.
SCORE_COLUMNS = ["text", "automated_readability_index"]
records = []
for text in review['preprocessed3']:
  doc = nlp(text)
  record = {"text": text,
            "automated_readability_index": doc._.automated_readability_index}
  records.append(record)
# Explicit columns keep the frame's schema stable even when there are no reviews.
scores = pd.DataFrame(records, columns=SCORE_COLUMNS)


In [None]:
# Persist the per-review readability scores to the mounted Drive, in the same
# NLP folder the input CSV was read from.
scores.to_csv("/content/drive/My Drive/NLP/review_with_scores.csv")