In [1]:
import re
import string
import tkinter

import altair as alt
import matplotlib
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from PIL import Image
from textblob import TextBlob
from wordcloud import WordCloud, STOPWORDS

# Plotting configuration for the notebook.
# Render Altair charts through the 'mimetype' renderer.
alt.renderers.enable('mimetype')
# Remove Altair's cap on the number of rows a chart may embed.
alt.data_transformers.disable_max_rows()
# Select the Tk-based matplotlib backend.
# NOTE(review): this runs after pyplot was imported and presumably needs a
# working Tk display; confirm it is intended for this notebook environment.
matplotlib.use('TkAgg')

In [2]:
# Load the raw review table and keep the free-text column that every later
# cleaning step operates on.
disney_df = pd.read_csv("../data/raw/DisneylandReviews.csv")
reviews = disney_df['Review_Text']

In [3]:
# Preview the review text before any cleaning is applied.
reviews  

0        If you've ever been to Disneyland anywhere you...
1        Its been a while since d last time we visit HK...
2        Thanks God it wasn   t too hot or too humid wh...
3        HK Disneyland is a great compact park. Unfortu...
4        the location is not in the city, took around 1...
                               ...                        
42651    i went to disneyland paris in july 03 and thou...
42652    2 adults and 1 child of 11 visited Disneyland ...
42653    My eleven year old daughter and myself went to...
42654    This hotel, part of the Disneyland Paris compl...
42655    I went to the Disneyparis resort, in 1996, wit...
Name: Review_Text, Length: 42656, dtype: object

In [4]:
def preprocess(text):
    """Lower-case a review and normalise Hong Kong spellings and negations.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    str
        The lower-cased text in which a standalone ``hk`` and any
        ``hong kong`` are collapsed to the single token ``hongkong`` and
        ``n't`` contractions are expanded to ``not``.
    """
    text = text.lower()

    # Only rewrite "hk" when it is not part of a longer word; a plain
    # substring replace also mangles words that merely contain those two
    # letters (e.g. "babushka" -> "babushongkonga").
    text = re.sub(r'(?<![a-z])hk(?![a-z])', 'hongkong', text)
    text = re.sub(r'hong\s+kong', 'hongkong', text)

    # Accept both the straight and the typographic apostrophe.
    apostrophe = "['\u2019]"

    # Irregular contractions go first: the generic rule below would turn
    # "can't" into "ca not" and "won't" into "wo not".
    irregular = (
        ('can', 'can not'),
        ('won', 'will not'),
        ('shan', 'shall not'),
    )
    for stem, expansion in irregular:
        text = re.sub(r'\b' + stem + apostrophe + r't\b', expansion, text)

    text = re.sub('n' + apostrophe + 't', ' not', text)
    return text

In [5]:
# Normalise case, Hong Kong spellings and negations in every review.
reviews = reviews.apply(preprocess)
reviews

0        if you've ever been to disneyland anywhere you...
1        its been a while since d last time we visit ho...
2        thanks god it wasn   t too hot or too humid wh...
3        hongkong disneyland is a great compact park. u...
4        the location is not in the city, took around 1...
                               ...                        
42651    i went to disneyland paris in july 03 and thou...
42652    2 adults and 1 child of 11 visited disneyland ...
42653    my eleven year old daughter and myself went to...
42654    this hotel, part of the disneyland paris compl...
42655    i went to the disneyparis resort, in 1996, wit...
Name: Review_Text, Length: 42656, dtype: object

In [6]:
# Translation table that deletes every character in ``string.punctuation``.
# Built once so each call is a single pass over the text instead of a
# per-character Python loop.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def remove_punc(text):
    """Return ``text`` with all ASCII punctuation characters deleted.

    Punctuation is removed, not replaced by a space, so ``"you've"`` becomes
    ``"youve"``. Characters outside ``string.punctuation`` are left untouched.

    Parameters
    ----------
    text : str
        Text to clean.

    Returns
    -------
    str
        The text without punctuation characters.
    """
    return text.translate(_PUNCTUATION_TABLE)

In [7]:
# Strip punctuation from every review.
reviews = reviews.apply(remove_punc)
reviews

0        if youve ever been to disneyland anywhere youl...
1        its been a while since d last time we visit ho...
2        thanks god it wasn   t too hot or too humid wh...
3        hongkong disneyland is a great compact park un...
4        the location is not in the city took around 1 ...
                               ...                        
42651    i went to disneyland paris in july 03 and thou...
42652    2 adults and 1 child of 11 visited disneyland ...
42653    my eleven year old daughter and myself went to...
42654    this hotel part of the disneyland paris comple...
42655    i went to the disneyparis resort in 1996 with ...
Name: Review_Text, Length: 42656, dtype: object

In [8]:
def remove_digit(text):
    """Return ``text`` with every digit character deleted.

    Digits are dropped rather than replaced, so the whitespace around a
    removed number is left exactly as it was.
    """
    non_digits = filter(lambda ch: not ch.isdigit(), text)
    return ''.join(non_digits)

In [9]:
# Drop digit characters from every review.
reviews = reviews.apply(remove_digit)
reviews

0        if youve ever been to disneyland anywhere youl...
1        its been a while since d last time we visit ho...
2        thanks god it wasn   t too hot or too humid wh...
3        hongkong disneyland is a great compact park un...
4        the location is not in the city took around  h...
                               ...                        
42651    i went to disneyland paris in july  and though...
42652     adults and  child of  visited disneyland pari...
42653    my eleven year old daughter and myself went to...
42654    this hotel part of the disneyland paris comple...
42655    i went to the disneyparis resort in  with a sm...
Name: Review_Text, Length: 42656, dtype: object

In [10]:
# Tokenizer built on the regex \w+, i.e. tokens are runs of word characters.
tokenizer = RegexpTokenizer(r'\w+')

In [11]:
# Lower-case each review, then split it into a list of word tokens.
reviews_tokenized = reviews.apply(str.lower).apply(tokenizer.tokenize)
reviews_tokenized

0        [if, youve, ever, been, to, disneyland, anywhe...
1        [its, been, a, while, since, d, last, time, we...
2        [thanks, god, it, wasn, t, too, hot, or, too, ...
3        [hongkong, disneyland, is, a, great, compact, ...
4        [the, location, is, not, in, the, city, took, ...
                               ...                        
42651    [i, went, to, disneyland, paris, in, july, and...
42652    [adults, and, child, of, visited, disneyland, ...
42653    [my, eleven, year, old, daughter, and, myself,...
42654    [this, hotel, part, of, the, disneyland, paris...
42655    [i, went, to, the, disneyparis, resort, in, wi...
Name: Review_Text, Length: 42656, dtype: object

In [12]:
# NLTK's English stopwords, passed through remove_punc so they are compared in
# the same punctuation-free form as the review text.
nltk_stopwords = {remove_punc(word) for word in stopwords.words("english")}

In [13]:
# Additional stopword list; the first tab-separated column of the file is
# used. NA detection and type inference are switched off so that entries such
# as "null" or "nan" stay literal strings instead of being parsed as missing
# values and silently dropping out of the stopword set.
stopwords_700 = pd.read_csv(
    '../EDA/stopwords.txt',
    header=None,
    delimiter="\t",
    dtype=str,
    keep_default_na=False,
)
stopwords_700 = set(stopwords_700[0])

In [14]:
# Domain-specific words that appear in nearly every review and carry no signal.
# 'disney world' contains a space, so it can only match a token that itself
# contains one; the \w+ tokenizer never produces such a token.
CUSTOM_STOPWORDS = frozenset({
    'disney', 'disneyland', 'land', 'park', 'parks', 'world', 'disneyworld',
    'disney world', 'one',
})


def remove_stopwords(text, stop_words=None):
    """Return the tokens of ``text`` that are not stopwords.

    Parameters
    ----------
    text : iterable of str
        Tokens of a single review.
    stop_words : collection of str, optional
        Words to drop. When omitted, the union of ``nltk_stopwords``,
        ``stopwords_700`` and ``CUSTOM_STOPWORDS`` is used. That union is
        rebuilt on each call, so pass a precomputed set when filtering many
        reviews with the same list.

    Returns
    -------
    list of str
        The remaining tokens, in their original order.
    """
    if stop_words is None:
        stop_words = nltk_stopwords.union(stopwords_700, CUSTOM_STOPWORDS)
    return [word for word in text if word not in stop_words]

In [15]:
# Filter stopwords out of every tokenized review.
reviews_nostop = reviews_tokenized.apply(remove_stopwords)

In [16]:
# Preview the token lists after stopword removal.
reviews_nostop

0        [find, hongkong, similar, layout, walk, street...
1        [last, time, visit, hongkong, time, stay, tomo...
2        [thanks, god, hot, humid, visiting, big, issue...
3        [hongkong, great, compact, unfortunately, bit,...
4        [location, city, took, hour, kowlon, kids, lik...
                               ...                        
42651    [went, paris, july, thought, brilliant, visite...
42652    [adults, child, visited, paris, beginning, feb...
42653    [eleven, year, old, daughter, went, visit, son...
42654    [hotel, part, paris, complex, wonderful, place...
42655    [went, disneyparis, resort, small, child, minu...
Name: Review_Text, Length: 42656, dtype: object

In [18]:
lemmatizer = WordNetLemmatizer()


def text_lemmatizer(text):
    """Lemmatize every token of a tokenized review.

    Takes an iterable of word tokens and returns a new list holding the
    result of ``lemmatizer.lemmatize`` for each token, in the same order.
    """
    return list(map(lemmatizer.lemmatize, text))

In [19]:
# Lemmatize the stopword-free tokens of every review.
reviews_lemmatized = reviews_nostop.apply(text_lemmatizer)
reviews_lemmatized

0        [find, hongkong, similar, layout, walk, street...
1        [last, time, visit, hongkong, time, stay, tomo...
2        [thanks, god, hot, humid, visiting, big, issue...
3        [hongkong, great, compact, unfortunately, bit,...
4        [location, city, took, hour, kowlon, kid, like...
                               ...                        
42651    [went, paris, july, thought, brilliant, visite...
42652    [adult, child, visited, paris, beginning, feb,...
42653    [eleven, year, old, daughter, went, visit, son...
42654    [hotel, part, paris, complex, wonderful, place...
42655    [went, disneyparis, resort, small, child, minu...
Name: Review_Text, Length: 42656, dtype: object

In [22]:
stemmer = PorterStemmer()


def text_stemmer(text):
    """Stem every token and join the stems into one space-separated string.

    Takes an iterable of word tokens and returns a single string made of the
    result of ``stemmer.stem`` for each token, in the same order.
    """
    stems = map(stemmer.stem, text)
    return ' '.join(stems)

In [23]:
# Stem the lemmatized tokens and rebuild each review as a single string.
reviews_clean = reviews_lemmatized.apply(text_stemmer)
reviews_clean

0        find hongkong similar layout walk street famil...
1        last time visit hongkong time stay tomorrowlan...
2        thank god hot humid visit big issu shadei arri...
3        hongkong great compact unfortun bit mainten wo...
4        locat citi took hour kowlon kid like fine crow...
                               ...                        
42651    went pari juli thought brilliant visit hotel s...
42652    adult child visit pari begin feb absolut fanta...
42653    eleven year old daughter went visit son london...
42654    hotel part pari complex wonder place famili ki...
42655    went disneypari resort small child minut enter...
Name: Review_Text, Length: 42656, dtype: object