## Word Count
- In this workbook a word count dataframe is created, for english words only and words that are a noun. 

__Process__ 

- 1) Clean review
- 2) Use the collections library Counter to count each word in the review
- 3) Use nltk library word_tokenize to list a speach tag for each word (In this case we want nouns only)
- 4) Remove Spanish and French stopwords, and filter out non relavent speach tags and french/spanish stopwords
- 5) Check if each word falls in category
- 6) Remove non-english words

In [1]:
from nltk.corpus import stopwords 
import pandas as pd   
from bs4 import BeautifulSoup             
import re
import nltk

In [2]:
import numpy as np
import pandas as pd 
import warnings
import matplotlib.pyplot as plt
import seaborn as sns
import os
import json
import re
import numpy as np
import pandas as pd
import networkx as nx
from collections import Counter
from collections import defaultdict
from nltk.stem import WordNetLemmatizer

In [3]:
df = pd.read_csv('Food_review_with_masterlist.csv')

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 120882 entries, 0 to 120881
Data columns (total 55 columns):
Unnamed: 0         120882 non-null int64
Hotel Code         120882 non-null object
review             120882 non-null object
sentiment          120882 non-null int64
FAC_ID             120882 non-null int64
Brand              120882 non-null object
Hotel Type         120882 non-null object
GroupBy            120882 non-null int64
Rating             120882 non-null int64
Positive           108674 non-null object
Negative review    120856 non-null object
Neg_Food           120882 non-null int64
Neg_Choices        120882 non-null int64
Neg_eating         120882 non-null int64
Neg_plates_cups    120882 non-null int64
Neg_cooked         120882 non-null int64
Neg_menu           120882 non-null int64
Neg_coffee_tea     120882 non-null int64
Neg_restaurant     120882 non-null int64
Neg_buffet         120882 non-null int64
Neg_diet           120882 non-null int64
Neg_breakfast      120

In [5]:
df[['Positive', 'Negative review']].head()

Unnamed: 0,Positive,Negative review
0,"positive: localização em frente ao metrô, bom...",: o sofá cama não é muito confortável.
1,positive: the reception staff were exceptiona...,: i had to repeatedly ask for my room to be m...
2,positive: great layout and location,: all good
3,positive: location was ok,: the room was noisy and very uncomnfortable
4,positive: accogliente e ben organizzata,: la stanza era veramente piccola


In [6]:
df['Positive'] = df['Positive'].astype(np.str)
df['Negative review'] = df['Negative review'].astype(np.str)

In [7]:
pos = df.groupby(['GroupBy'])['Positive'].apply(' '.join)
neg = df.groupby(['GroupBy'])['Negative review'].apply(' '.join)

In [8]:
pos = pos.reset_index()
neg = neg.reset_index()

In [9]:
neg = neg.drop(['GroupBy'], axis = 1)
pos = pos.drop(['GroupBy'], axis = 1)
pos.head()

Unnamed: 0,Positive
0,"positive: localização em frente ao metrô, bom..."


____

#### 1) Clean review - remove stop words and non-letters
___

In [10]:
def clean_review( raw_review ):
    # Function to convert a raw review to a string of words
    #
    # 1. Remove HTML
    review_text = BeautifulSoup(raw_review).get_text() 
    #
    # 2. Remove non-letters        
    letters_only = re.sub("[^a-zA-Z]", " ", review_text) 
    #
    # 3. Convert to lower case, split into individual words
    words = letters_only.lower().split()                                             
    #
    # 4. In Python, searching a set is much faster than searching
    #   a list, so convert the stop words to a set
    stops = set(stopwords.words("english"))                  
    # 
    # 5. Remove stop words
    meaningful_words = [w for w in words if not w in stops]   
    #
    # 6. Join the words back into one string separated by space, 
    # and return the result.
    return( " ".join( meaningful_words ))

In [11]:
import nltk 
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\robert.lowe\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [12]:
pos['Clean__review'] = pos.Positive.apply(clean_review)
neg['Clean__review'] = neg['Negative review'].apply(clean_review)

____

#### 2) Word Count

 - Count each word in a review for both positive and negative reviews
___

In [13]:
from collections import Counter
pos['Clean__review'] = pos['Clean__review'].apply(lambda x: Counter(str(x).split(' ')))
neg['Clean__review'] = neg['Clean__review'].apply(lambda x: Counter(str(x).split(' ')))

In [14]:
pos.head()

Unnamed: 0,Positive,Clean__review
0,"positive: localização em frente ao metrô, bom...","{'positive': 107789, 'localiza': 136, 'em': 43..."


In [19]:
df['Clean_review'].to_clipboard()

____

#### Note

- Split Clean_review (dictionary) into a dataframe consisting of word and Count
___

In [39]:
df_neg = neg.iloc[0, -1]
df_neg = pd.DataFrame.from_dict(df_neg, orient='index').reset_index()


In [40]:
df_neg = df_neg.sort_values(by = 0, ascending=False)
df_neg.columns = [['Negative_Words', 'Count_neg']]
df_neg = df_neg.reset_index()

In [52]:
df_neg.to_clipboard()
df_neg.to_csv('df_neg.csv')

In [42]:
df_pos = pos.iloc[0, -1]
df_pos = pd.DataFrame.from_dict(df_pos, orient='index').reset_index()
df_pos = df_pos.sort_values(0, ascending=False)
df_pos.columns = [['Positive_Words', 'Count_pos']]
df_pos = df_pos.reset_index()

In [53]:
df_pos.to_csv('df_pos.csv')

____

#### Note

- Save positive and negative dataframe (had issues with refering to column names) 
___

In [43]:
import pandas as pd
import numpy as np

In [81]:
df_pos = pd.read_csv('df_pos.csv')
df_neg = pd.read_csv('df_neg.csv')

In [82]:
df_pos.head()

Unnamed: 0.1,Unnamed: 0,index,Positive_Words,Count_pos
0,0,0,positive,107789
1,1,51,staff,37537
2,2,73,room,28472
3,3,70,good,28373
4,4,165,breakfast,26184


In [45]:
df_pos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34263 entries, 0 to 34262
Data columns (total 4 columns):
Unnamed: 0        34263 non-null int64
index             34263 non-null int64
Positive_Words    34262 non-null object
Count_pos         34263 non-null int64
dtypes: int64(3), object(1)
memory usage: 1.0+ MB


In [46]:
df_pos['Positive_Words'] = df_pos['Positive_Words'].astype(np.str)
df_neg['Negative_Words'] = df_neg['Negative_Words'].astype(np.str)

____

#### Clean

- remove digits from words 
___

In [47]:
from string import digits


def remove_digits(x):
    
    
    remove_digits = str.maketrans('', '', digits)
    x = x.translate(remove_digits)
    return str(x)

In [48]:
df_pos['Positive_Words'] = df_pos['Positive_Words'].apply(remove_digits)
df_neg['Negative_Words'] = df_neg['Negative_Words'].apply(remove_digits)

In [49]:
df_pos.head()

Unnamed: 0.1,Unnamed: 0,index,Positive_Words,Count_pos
0,0,0,positive,107789
1,1,51,staff,37537
2,2,73,room,28472
3,3,70,good,28373
4,4,165,breakfast,26184


____

#### 3) Use nltk library word_tokenize to list a speach tag for each word (In this case we want nouns only)

- apply to negative and positive words 
___

In [50]:
import nltk
def tokens(x):
    tokens = nltk.word_tokenize(x)
    tagged = nltk.pos_tag(tokens)
    lst = ','.join(str(v) for v in tagged)
    lst = lst.replace('(', '').replace(')', '')
    lst = lst.split(',')
    lst = lst[-1] 
    return lst.replace("'", '').replace(' ', '')


In [51]:
df_neg['Tokens'] = df_neg['Negative_Words'].apply(tokens)
df_pos['Tokens'] = df_pos['Positive_Words'].apply(tokens)

In [52]:
df_neg.head()

Unnamed: 0.1,Unnamed: 0,index,Negative_Words,Count_neg,Tokens
0,0,8,room,34011,NN
1,1,49,breakfast,19173,NN
2,2,526,nothing,14256,NN
3,3,96,hotel,13862,NN
4,4,17,staff,10091,NN


In [53]:
from stop_words import get_stop_words
spanstop_words = get_stop_words('spanish')

____

#### 4) Remove Spanish and French stopwords, and filter out non relavent speach tags and french/spanish stopwords
___

In [54]:
def remove_span_words(x):
    spanstop_words = get_stop_words('spanish')
    if x in spanstop_words:
        return 1
    else:
        return 0
remove_span_words('hello')

0

In [55]:
def remove_fren_words(x):
    frenstop_words = get_stop_words('french')
    if x in frenstop_words:
        return 1
    else:
        return 0

remove_fren_words('et')

1

In [56]:
import nltk
nltk.download('words')

[nltk_data] Downloading package words to /Users/bobbylowe/nltk_data...
[nltk_data]   Package words is already up-to-date!


True

In [57]:
df_neg['spw'] = df_neg['Negative_Words'].apply(remove_span_words)
df_pos['spw'] = df_pos['Positive_Words'].apply(remove_span_words)

In [58]:
df_neg['fnw'] = df_neg['Negative_Words'].apply(remove_fren_words)
df_pos['fnw'] = df_pos['Positive_Words'].apply(remove_fren_words)

In [59]:
df_pos

Unnamed: 0.1,Unnamed: 0,index,Positive_Words,Count_pos,Tokens,spw,fnw
0,0,0,positive,107789,JJ,0,0
1,1,51,staff,37537,NN,0,0
2,2,73,room,28472,NN,0,0
3,3,70,good,28373,JJ,0,0
4,4,165,breakfast,26184,NN,0,0
5,5,57,location,22895,NN,0,0
6,6,154,clean,19982,NN,0,0
7,7,53,friendly,18826,RB,0,0
8,8,17,hotel,16984,NN,0,0
9,9,55,great,16737,JJ,0,0


____

#### Filter Enlish words and Nouns
___

In [60]:
df_neg = df_neg[df_neg['spw'] == 0]
df_neg = df_neg[df_neg['fnw'] == 0]
df_neg = df_neg[df_neg['Tokens'] != "JJ"]
df_neg = df_neg[df_neg['Tokens'] != "JJR"]
df_neg = df_neg[df_neg['Tokens'] != "JJS"]
df_neg = df_neg[df_neg['Tokens'] != "RB"]
df_neg = df_neg[df_neg['Tokens'] != "CD"]
df_neg = df_neg[df_neg['Tokens'] != "MD"]
df_neg = df_neg[df_neg['Tokens'] != "PRP"]
df_neg = df_neg[df_neg['Tokens'] != "VB"]
df_neg = df_neg[df_neg['Tokens'] != "VBD"]
df_neg = df_neg[df_neg['Tokens'] != "CC"]
df_neg = df_neg[df_neg['Tokens'] != "VBP"]
df_neg = df_neg[df_neg['Tokens'] != "VBG"]
df_neg = df_neg[df_neg['Tokens'] != "VBZ"]
df_neg = df_neg[df_neg['Tokens'] != "VBN"]
df_neg = df_neg[df_neg['Tokens'] != "VBZ"]
df_neg = df_neg[df_neg['Tokens'] != "DT"]
df_neg = df_neg[df_neg['Tokens'] != "IN"]
df_neg = df_neg[df_neg['Tokens'] != "TD"]
df_neg = df_neg[df_neg['Tokens'] != "RBR"]
df_neg = df_neg[df_neg['Tokens'] != "WDT"]
df_neg = df_neg[df_neg['Tokens'] != "WP$"]

In [61]:
df_pos = df_pos[df_pos['spw'] ==0]
df_pos = df_pos[df_pos['fnw'] == 0]
df_pos = df_pos[df_pos['Tokens'] != "JJ"]
df_pos = df_pos[df_pos['Tokens'] != "JJR"]
df_pos = df_pos[df_pos['Tokens'] != "JJS"]
df_pos = df_pos[df_pos['Tokens'] != "RB"]
df_pos = df_pos[df_pos['Tokens'] != "CD"]
df_pos = df_pos[df_pos['Tokens'] != "MD"]
df_pos = df_pos[df_pos['Tokens'] != "PRP"]
df_pos = df_pos[df_pos['Tokens'] != "VB"]
df_pos = df_pos[df_pos['Tokens'] != "VBD"]
df_pos = df_pos[df_pos['Tokens'] != "CC"]
df_pos = df_pos[df_pos['Tokens'] != "VBP"]
df_pos = df_pos[df_pos['Tokens'] != "VBG"]
df_pos = df_pos[df_pos['Tokens'] != "VBZ"]
df_pos = df_pos[df_pos['Tokens'] != "VBN"]
df_pos = df_pos[df_pos['Tokens'] != "VBZ"]
df_pos = df_pos[df_pos['Tokens'] != "DT"]
df_pos = df_pos[df_pos['Tokens'] != "IN"]
df_pos = df_pos[df_pos['Tokens'] != "TD"]
df_pos = df_pos[df_pos['Tokens'] != "RBR"]
df_pos = df_pos[df_pos['Tokens'] != "WDT"]
df_pos = df_pos[df_pos['Tokens'] != "WP$"]

In [62]:
df_neg = df_neg.drop(['spw', 'fnw'], axis = 1)
df_pos = df_pos.drop(['spw', 'fnw'], axis = 1)

In [63]:
df_pos.to_clipboard()

In [64]:
#problems with joining
df_neg.to_clipboard()

In [65]:
df_pos.head()

Unnamed: 0.1,Unnamed: 0,index,Positive_Words,Count_pos,Tokens
1,1,51,staff,37537,NN
2,2,73,room,28472,NN
4,4,165,breakfast,26184,NN
5,5,57,location,22895,NN
6,6,154,clean,19982,NN


____

#### 5) Check if each word falls in a category
___

In [66]:
def WORD(word):
    if word == 'clean' or word == 'dirty' or word == 'cleanliness'or word == 'tidy' or word == 'spotless' or word == 'cleaned' or word == 'filthy' or word == 'housekeeping':        
        return 'Cleanliness'
    
    elif word == 'location' or word == 'miles'  or word == 'station' or word == 'distance' or word == 'train' or word == 'airport' or word ==  'journey'or word =='motorway'or word =='tube'or word =='located'or word =='terminal'or word =='drive'or word =='places' or word == 'city' or word == 'walk' or word == 'convenient':
 
        return 'Location'
    
    elif word == 'staff' or word == 'service' or word == 'concierge' or word == 'friendly' or word == 'welcoming' or word == 'helpful' or word =='polite' or word == 'complaints'or word =='accommodating'or word =='rude'or word =='manager'or word =='personnel'or word =='rude':
        
        return 'Staff'
    elif word == 'lifts' or word == 'lift' or word == 'warm' or word == 'cold' or word == 'temperature' or word == 'refurbished' or  word =='building'or word =='alarm'or word =='corridor'or word =='corridors'or word =='maintenance':
        return 'Unclassified'
    
    elif word == 'check-in' or word == 'check-out' or word == 'check' or word == 'reception'or word =='desk'or word =='arrival'or word =='receptionist'or word =='entrance'or word =='lobby' or word == 'welcome' or word =='checked'or word =='checkout'or word =='checkin':

        return 'Arrival Experience'
    
    elif word == 'spa' or word == 'pool' or word == 'lounge' or word == 'bar' or word == 'parking' or word == 'facilities' or  word =='park'or  word =='bars'or word =='gym'or word =='swimming':

        return 'Facilities'
    
    elif word == 'food' or word == 'breakfast' or word == 'restaurant' or word == 'buffet' or word == 'vegetarians' or word == 'vegan'  or word == 'vegans' or word == 'vegetarian'or word == 'meals' or word =='restaurants'or word =='drinks'or word =='coffee'or word =='tea'or word =='dinner'or word =='menu'or word =='bacon'or word =='egg'or word =='toast'or word =='drink'or word =='eggs'or word =='juice'or word =='eat'or word =='sausages'or word =='dining'or word =='scrambled'or word =='milk'or word =='fruit'or word =='breakfasts' or word =='cooked'or word =='meal'or word =='choices'or word =='croissants'or word =='plates'or word =='continental'or word =='beans'or word =='mushrooms'or word =='eating'or word =='sausage'or word =='tomatoes'or word =='cups':


        return 'Food'
    
    elif word == 'membership' or word == 'club' or word == 'loyalty' or word == 'benefits'or word == 'member' or word == 'rewards' or word == 'reward'  or word == 'tiers' or word == 'elite' or word == 'spire' or word == 'gold' or word == 'silver':
        
        return 'Loyality'
    
    elif word == 'value' or word == 'money' or word == 'priced' or word== 'pices'  or word =='price'or word =='pay'or word =='paid'or word =='pricey'or word =='cheap'or word =='overpriced'or word =='pounds'or word == '£':

        return 'Value'
    
    elif word == 'room' or word == 'rooms' or word == 'size'or word == 'view'or word == 'noisy'or word == 'loud'or word == 'zimmer':

         
        return 'Room'
        
    elif word == 'bath' or word== 'sink' or word == 'soap'or word == 'bathroom' or word == 'shampoo' or word == 'toilet' or word == 'toiletries' or word == 'toothbrush' or word == 'toothpaste' or word == 'toothbrushes' or word == 'toilets' or word == 'shower'  or word == 'towels': 


        return 'Bathroom'
    
    elif word == 'pillow' or word == 'comfy' or word == 'single' or word == 'bedding' or word == 'bed' or word == 'duvet' or word == 'sheets' or word == 'sheet'  or word == 'comfortable' or word == 'pillows'  or word == 'beds'  or word == 'duvets'  or word == 'beds' or word == 'duvets' or word == 'towel' or word == 'sheets'or word == 'sheet' or word == 'comfortable' or word == 'pillow' or word == 'pillows' or   word =='sleep'or word =='bedroom'or word =='king'or word =='twin'or word =='mattress'or word =='comfort'or word =='slept'or word =='relaxing':

        return 'Bed'
    
    elif word == 'wardrobe' or word == 'sofa' or word == 'table' or word == 'tables' or word == 'furniture' or word == 'carpet' or word == 'rug'  or word == 'lamp' or word == 'lighting'  or word == 'lights'  or word == 'blinds' or word == 'curtains' or word == 'wardrobes' or word == 'sofas' or word == 'carpets' or word == 'rugs'  or word == 'lamps'  or word == 'blind'  or word == 'curtain' or word == 'chair'or word == 'mirror'or word == 'cupboard':

        
        return 'Furniture'
    
    elif word == 'iron' or word == 'hairdryer'or word == 'mini bar'or word == 'kettle'or word == 'aircon'or word == 'conditioning' or word == 'aircon' or word == 'heating' or word == 'radiator' or word == 'radiators' or word =='air'or word =='drink'or word =='fridge' or word =='light'or word =='phone'or word =='fan'or word =='windows'or word =='window' or word == 'minibar'or word == 'water'or word == 'air'or word == 'drink'or word == 'fridge'or word == 'ac'or word == 'light'or word == 'phone'or word == 'fan'or word == 'windows'or word == 'window':
        return 'Room_facilities'
    
    elif word == 'safety' or word == 'safe'  or word == 'security'or word == 'secure':
        
        return 'Safety'
    
    elif word == 'movie' or word == 'movies' or word == 'channels' or  word == 'television' or  word == 'pay-per-view' or  word == 'tv' or word == 'remote'or word =='batteries'or word =='battery':

        
        return 'Room_Entertainment'
    
    elif word == 'internet' or word == 'wifi'  or word == 'free-wifi'or word == 'connection'or word == 'connectivity'or word == 'card'or word == 'power'or word == 'payment' or word == 'usb':
        
        return 'Technology'
    

    elif word == 'noise' or word =='guests'or word =='decorated'or word =='customer'or word =='charge'or word =='booking'or word =='booked'or word =='book':
    
        return 'Other'

    else:
        return 0 

In [67]:
# apply each function to the review column
df_pos['Pos_Sentiment_analysis_category'] = df_pos['Positive_Words'].apply(WORD)
df_neg['Neg_Sentiment_analysis_category'] = df_neg['Negative_Words'].apply(WORD)



In [68]:
df_pos.to_clipboard()

____

#### 6) English words only
___

In [69]:
import enchant
d = enchant.Dict("en_UK")
d.check("Hello")

True

In [70]:
def english_words(x):
    d = enchant.Dict("en_UK")
    return d.check(x)

In [71]:
#df_neg['English_Positive_Words'] = df_neg['Negative_Words'].apply(english_words_only)
df_pos['English_Positive_Words'] = df_pos['Positive_Words'].apply(english_words)

In [72]:
df_pos = df_pos.replace(np.nan, False)

In [73]:
df_pos = df_pos[df_pos['English_Positive_Words'] == True]
df_pos

Unnamed: 0.1,Unnamed: 0,index,Positive_Words,Count_pos,Tokens,Pos_Sentiment_analysis_category,English_Positive_Words
1,1,51,staff,37537,NN,Staff,True
2,2,73,room,28472,NN,Room,True
4,4,165,breakfast,26184,NN,Food,True
5,5,57,location,22895,NN,Location,True
6,6,154,clean,19982,NN,Cleanliness,True
8,8,17,hotel,16984,NN,0,True
11,11,54,helpful,13713,NN,Staff,True
12,12,163,bed,13394,NN,Bed,True
15,15,69,excellent,11117,NN,0,True
17,17,164,comfy,7888,NN,Bed,True


In [74]:
df_pos = df_pos.drop(['Unnamed: 0', 'index', 'Tokens', 'English_Positive_Words'], axis = 1)

In [75]:
df_neg['English_Negative_Words'] = df_neg['Negative_Words'].apply(english_words)
df_neg = df_neg.replace(np.nan,False)
df_neg = df_neg[df_neg['English_Negative_Words'] == True]
df_neg

Unnamed: 0.1,Unnamed: 0,index,Negative_Words,Count_neg,Tokens,Neg_Sentiment_analysis_category,English_Negative_Words
0,0,8,room,34011,NN,Room,True
1,1,49,breakfast,19173,NN,Food,True
2,2,526,nothing,14256,NN,0,True
3,3,96,hotel,13862,NN,0,True
4,4,17,staff,10091,NN,Staff,True
5,5,59,bed,9932,NN,Bed,True
8,8,172,night,7064,NN,0,True
9,9,43,bit,6792,NN,0,True
11,11,173,parking,6683,NN,Facilities,True
13,13,1429,food,6454,NN,Food,True


In [76]:
df_neg = df_neg.drop(['Unnamed: 0', 'index', 'Tokens', 'English_Negative_Words'], axis = 1)

In [77]:
df_neg.head()

Unnamed: 0,Negative_Words,Count_neg,Neg_Sentiment_analysis_category
0,room,34011,Room
1,breakfast,19173,Food
2,nothing,14256,0
3,hotel,13862,0
4,staff,10091,Staff


In [78]:
df_pos.head()

Unnamed: 0,Positive_Words,Count_pos,Pos_Sentiment_analysis_category
1,staff,37537,Staff
2,room,28472,Room
4,breakfast,26184,Food
5,location,22895,Location
6,clean,19982,Cleanliness


In [79]:
df_neg.to_csv('Negative_english_noun_words.csv')

In [80]:
df_pos.to_csv('Positive_english_noun_words.csv')