In [1]:
#import libraries
from nltk.corpus import stopwords
import pandas as pd
import numpy as np

from nltk.stem import SnowballStemmer
snowball = SnowballStemmer(language='english')
from nltk.tokenize import word_tokenize

In [2]:
#import VoC data as dataframe
reviews_df = pd.read_csv('disney_reviews.csv', encoding='latin1')
reviews_df

Unnamed: 0,Review_ID,Rating,Year_Month,Reviewer_Location,Review_Text
0,670772142,4,2019-4,Australia,If you've ever been to Disneyland anywhere you...
1,670682799,4,2019-5,Philippines,Its been a while since d last time we visit HK...
2,670623270,4,2019-4,United Arab Emirates,Thanks God it wasn t too hot or too humid wh...
3,670607911,4,2019-4,Australia,HK Disneyland is a great compact park. Unfortu...
4,670607296,4,2019-4,United Kingdom,"the location is not in the city, took around 1..."
...,...,...,...,...,...
42651,1765031,5,missing,United Kingdom,i went to disneyland paris in july 03 and thou...
42652,1659553,5,missing,Canada,2 adults and 1 child of 11 visited Disneyland ...
42653,1645894,5,missing,South Africa,My eleven year old daughter and myself went to...
42654,1618637,4,missing,United States,"This hotel, part of the Disneyland Paris compl..."


In [13]:
#make text a series

reviews = reviews_df.Review_Text
reviews

0        If you've ever been to Disneyland anywhere you...
1        Its been a while since d last time we visit HK...
2        Thanks God it wasn   t too hot or too humid wh...
3        HK Disneyland is a great compact park. Unfortu...
4        the location is not in the city, took around 1...
                               ...                        
42651    i went to disneyland paris in july 03 and thou...
42652    2 adults and 1 child of 11 visited Disneyland ...
42653    My eleven year old daughter and myself went to...
42654    This hotel, part of the Disneyland Paris compl...
42655    I went to the Disneyparis resort, in 1996, wit...
Name: Review_Text, Length: 42656, dtype: object

In [14]:
#create empty list and stem all words
stems = []

for word in reviews:
       stems.append(snowball.stem(word))

In [15]:
#import NLP packages to count words and make them into vectors

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [16]:
#fit to stemmed list
vect = CountVectorizer(stop_words='english', max_df=0.5, ngram_range=(1, 1), max_features=10000)
X_train  = vect.fit_transform(stems)

In [17]:
#print last 50 features out

print((vect.get_feature_names()[-50:]))

['yawning', 'yay', 'yea', 'yeah', 'year', 'yearly', 'years', 'yell', 'yelled', 'yelling', 'yellow', 'yep', 'yes', 'yesterday', 'yeti', 'yi', 'yikes', 'yo', 'yoghurt', 'yoghurts', 'yogurt', 'york', 'youll', 'young', 'younger', 'youngest', 'youngster', 'youngsters', 'youre', 'youth', 'youtube', 'youve', 'yr', 'yrs', 'yuck', 'yuk', 'yum', 'yummy', 'yup', 'zealand', 'zero', 'zig', 'zip', 'zocalo', 'zombies', 'zone', 'zones', 'zoo', 'zoom', 'zurg']


In [18]:
#print stop words

print((vect.get_stop_words()))

frozenset({'anyone', 'are', 'more', 'whatever', 'on', 'him', 'thence', 'few', 'those', 'next', 'many', 'how', 'thereupon', 'move', 'therefore', 'cry', 'too', 'as', 'amount', 'they', 'themselves', 'may', 'upon', 'done', 'latter', 'was', 'amoungst', 'take', 'seemed', 'becomes', 'mill', 'side', 'two', 'somehow', 'noone', 'almost', 'none', 'eleven', 'their', 'so', 'hereby', 'after', 'con', 'herein', 'nine', 'sometime', 'anything', 'and', 'thereby', 'for', 'thus', 'meanwhile', 'whenever', 'indeed', 'ie', 'thereafter', 'himself', 'my', 'behind', 'either', 'under', 'be', 'most', 'ltd', 'wherein', 'since', 'about', 'that', 'without', 'least', 'describe', 'via', 'will', 'myself', 'who', 'there', 'must', 'before', 'below', 'nothing', 'sincere', 'often', 'once', 'between', 'ours', 'been', 'against', 'one', 'someone', 'whereby', 'serious', 'fill', 'whence', 're', 'ourselves', 'another', 'several', 'in', 'into', 'cannot', 'hereafter', 'etc', 'bill', 'had', 'found', 'both', 'onto', 'whereas', 'me', 

In [22]:
#This function gets the top 500 most frequent features that appear in the ccb reviews

def get_count_top_features(reviews,n_top=1000):
  vect = CountVectorizer(stop_words='english', max_df=0.5, ngram_range=(1, 1), max_features=1000)
  count = vect.fit_transform(stems)
  importance = np.argsort(np.asarray(count.sum(axis=0)).ravel())[::-1]
  count_feature_names = np.array(vect.get_feature_names())
  return count_feature_names[importance[:n_top]]

get_count_top_features(stems)

array(['disney', 'rides', 'disneyland', 'time', 'day', 'ride', 'just',
       'great', 'food', 'kids', 'place', 'good', 'really', 'like',
       'visit', 'went', 'people', 'long', 'fast', 'fun', 'parks', 'pass',
       'wait', 'experience', 'times', 'days', 'did', 'parade', 'world',
       'don', 'year', 'children', 'staff', 'characters', 'mountain',
       'got', 'line', 'lines', 'paris', 'old', 'family', 'worth',
       'fireworks', 'attractions', 'going', 'small', 'little', 'hotel',
       'queues', 'best', 'lot', 'expensive', 'loved', 'trip', 'make',
       'queue', 'minutes', 'didn', 'amazing', 'love', 'better', 'way',
       'magic', 'big', 'early', 'closed', 'hours', 'california',
       'visited', 'magical', 'years', 'want', 'enjoyed', 'enjoy', 'hour',
       'night', 'tickets', '10', 'space', 'need', 'sure', 'waiting',
       'think', 'main', 'say', 'shows', 'recommend', 'nice', 'mickey',
       'busy', 'bit', 'new', 'come', 'money', 'took', 'lots', 'things',
       'use', 'ti

In [25]:
#counts the top 1000 features
reviews = stems
n_top=1000  # was 500

vect = CountVectorizer(stop_words='english', max_df=0.5, ngram_range=(1, 1), max_features=10000)
count = vect.fit_transform(reviews)
importance = np.argsort(np.asarray(count.sum(axis=0)).ravel())[::-1]
count_feature_names = np.array(vect.get_feature_names())
count_feature_names[importance[:n_top]]

array(['disney', 'rides', 'disneyland', 'time', 'day', 'ride', 'just',
       'great', 'food', 'kids', 'place', 'good', 'really', 'like',
       'visit', 'went', 'people', 'long', 'fast', 'fun', 'parks', 'pass',
       'wait', 'experience', 'times', 'days', 'did', 'parade', 'world',
       'don', 'year', 'children', 'staff', 'characters', 'mountain',
       'got', 'line', 'lines', 'paris', 'old', 'family', 'worth',
       'fireworks', 'attractions', 'going', 'small', 'little', 'hotel',
       'queues', 'best', 'lot', 'expensive', 'loved', 'trip', 'make',
       'queue', 'minutes', 'didn', 'amazing', 'love', 'better', 'way',
       'magic', 'big', 'early', 'closed', 'hours', 'california',
       'visited', 'magical', 'years', 'want', 'enjoyed', 'enjoy', 'hour',
       'night', 'tickets', '10', 'space', 'need', 'sure', 'waiting',
       'think', 'main', 'say', 'shows', 'recommend', 'nice', 'mickey',
       'busy', 'bit', 'new', 'come', 'money', 'took', 'lots', 'things',
       'use', 'ti

In [26]:
vect = CountVectorizer(stop_words='english', max_df=0.5, ngram_range=(1, 1), max_features=10000)
vect

CountVectorizer(max_df=0.5, max_features=10000, stop_words='english')

In [27]:
count = vect.fit_transform(reviews)

count

<42656x10000 sparse matrix of type '<class 'numpy.int64'>'
	with 1947068 stored elements in Compressed Sparse Row format>

In [28]:
vect.vocabulary_

{'ve': 9551,
 'disneyland': 2872,
 'll': 5334,
 'hong': 4508,
 'kong': 5108,
 'similar': 8040,
 'layout': 5194,
 'walk': 9671,
 'main': 5471,
 'street': 8547,
 'familiar': 3578,
 'feel': 3665,
 'rides': 7521,
 'small': 8136,
 'world': 9912,
 'absolutely': 402,
 'fabulous': 3530,
 'worth': 9922,
 'doing': 2948,
 'day': 2551,
 'visited': 9624,
 'fairly': 3563,
 'hot': 4553,
 'relatively': 7291,
 'busy': 1523,
 'queues': 7048,
 'moved': 5884,
 'time': 9044,
 'visit': 9623,
 'hk': 4465,
 'stay': 8457,
 'tomorrowland': 9105,
 'aka': 611,
 'marvel': 5560,
 'land': 5142,
 'iron': 4914,
 'man': 5506,
 'experience': 3472,
 'newly': 6022,
 'open': 6202,
 'ant': 739,
 'wasp': 9724,
 'ironman': 4917,
 'great': 4196,
 'feature': 3654,
 'exciting': 3411,
 'especially': 3321,
 'scenery': 7740,
 'central': 1738,
 'area': 817,
 'kowloon': 5110,
 'changed': 1767,
 'previous': 6848,
 'buzz': 1534,
 'lightyear': 5279,
 'expecting': 3460,
 'boys': 1380,
 'like': 5281,
 'space': 8269,
 'mountain': 5874,
 't

In [30]:
# are these numbers the count of each word? that would make sense.... but no
# we can see better what is going on if we sort by the values on the right

dict(sorted(vect.vocabulary_.items(), key=lambda x: x[1]))

# the above code is ugly, weird and unnecessary as python will do it for us like this
vect.get_feature_names()

['00',
 '000',
 '00am',
 '00pm',
 '01',
 '02',
 '02m',
 '03',
 '04',
 '05',
 '06',
 '07',
 '08',
 '0800',
 '09',
 '0900',
 '10',
 '100',
 '1000',
 '100hk',
 '100hkd',
 '101',
 '102',
 '102cm',
 '1030',
 '1030am',
 '104',
 '105',
 '10am',
 '10k',
 '10min',
 '10mins',
 '10minutes',
 '10pm',
 '10th',
 '10x',
 '10years',
 '10yrs',
 '11',
 '110',
 '1100',
 '112',
 '115',
 '11am',
 '11pm',
 '11th',
 '12',
 '120',
 '1200',
 '120cm',
 '125',
 '129',
 '12am',
 '12pm',
 '12th',
 '13',
 '130',
 '1300',
 '135',
 '137',
 '139',
 '13th',
 '14',
 '140',
 '1400',
 '14th',
 '15',
 '150',
 '1500',
 '155',
 '15am',
 '15euros',
 '15min',
 '15mins',
 '15minutes',
 '15pm',
 '15th',
 '16',
 '160',
 '1600',
 '16th',
 '17',
 '170',
 '1700',
 '1730',
 '175',
 '17th',
 '18',
 '180',
 '1800',
 '18th',
 '19',
 '1930',
 '1950',
 '1950s',
 '1955',
 '1956',
 '1957',
 '1959',
 '1960',
 '1966',
 '1967',
 '1970',
 '1977',
 '1978',
 '1979',
 '1980',
 '1980s',
 '1984',
 '1990',
 '1991',
 '1992',
 '1993',
 '1994',
 '1995',

In [32]:
importance = np.argsort(np.asarray(count.sum(axis=0)).ravel())[::-1]
#count_feature_names = np.array(vect.get_feature_names())
#count_feature_names[importance[:n_top]]

importance

array([2870, 7521, 2872, ..., 2241, 6817, 4041], dtype=int32)

In [33]:
vect.get_feature_names()[999]

'await'

In [34]:
count_feature_names = np.array(vect.get_feature_names())
#count_feature_names[importance[:n_top]]

count_feature_names

array(['00', '000', '00am', ..., 'zoo', 'zoom', 'zurg'], dtype='<U15')

In [36]:
count_feature_names[importance[:n_top]]

array(['disney', 'rides', 'disneyland', 'time', 'day', 'ride', 'just',
       'great', 'food', 'kids', 'place', 'good', 'really', 'like',
       'visit', 'went', 'people', 'long', 'fast', 'fun', 'parks', 'pass',
       'wait', 'experience', 'times', 'days', 'did', 'parade', 'world',
       'don', 'year', 'children', 'staff', 'characters', 'mountain',
       'got', 'line', 'lines', 'paris', 'old', 'family', 'worth',
       'fireworks', 'attractions', 'going', 'small', 'little', 'hotel',
       'queues', 'best', 'lot', 'expensive', 'loved', 'trip', 'make',
       'queue', 'minutes', 'didn', 'amazing', 'love', 'better', 'way',
       'magic', 'big', 'early', 'closed', 'hours', 'california',
       'visited', 'magical', 'years', 'want', 'enjoyed', 'enjoy', 'hour',
       'night', 'tickets', '10', 'space', 'need', 'sure', 'waiting',
       'think', 'main', 'say', 'shows', 'recommend', 'nice', 'mickey',
       'busy', 'bit', 'new', 'come', 'money', 'took', 'lots', 'things',
       'use', 'ti

In [37]:
np.asarray(count.sum(axis=0)).ravel()[importance[:n_top]]

name_count = dict(zip(count_feature_names[importance[:n_top]], np.asarray(count.sum(axis=0)).ravel()[importance[:n_top]]))

In [38]:
#Ta-da!

name_count

{'disney': 37139,
 'rides': 34956,
 'disneyland': 33569,
 'time': 29824,
 'day': 28923,
 'ride': 18038,
 'just': 16682,
 'great': 16635,
 'food': 14819,
 'kids': 14359,
 'place': 13564,
 'good': 13360,
 'really': 12436,
 'like': 11994,
 'visit': 11930,
 'went': 11381,
 'people': 11180,
 'long': 10373,
 'fast': 10146,
 'fun': 10127,
 'parks': 10050,
 'pass': 9904,
 'wait': 9840,
 'experience': 9047,
 'times': 8997,
 'days': 8843,
 'did': 8662,
 'parade': 8581,
 'world': 8212,
 'don': 8187,
 'year': 8174,
 'children': 8099,
 'staff': 8027,
 'characters': 7846,
 'mountain': 7683,
 'got': 7383,
 'line': 7316,
 'lines': 7236,
 'paris': 7197,
 'old': 7039,
 'family': 7025,
 'worth': 7013,
 'fireworks': 6913,
 'attractions': 6908,
 'going': 6877,
 'small': 6851,
 'little': 6828,
 'hotel': 6738,
 'queues': 6594,
 'best': 6585,
 'lot': 6580,
 'expensive': 6500,
 'loved': 6496,
 'trip': 6433,
 'make': 6415,
 'queue': 6319,
 'minutes': 6223,
 'didn': 6101,
 'amazing': 6076,
 'love': 5989,
 'bette