## Sentiment Analysis - Word Clouds
- In this workbook our objective was to analyse reviews where a specific word was mentioned
- Each review was analysed for a specific word and flagged if that word had been mentioned in the review 
- For example if the word breakfast was mentioned, the review was flagged with a 1, so later those reviews can be grouped into a breakfast category
- Our overall aim was to look at what words were associated with specific food categories in a hotel review
- A word cloud was created for each review category, so we can observe the most frequently used words

#### Process
- 1) Clean reviews - Remove stop words, convert to lowercase & remove non-letters
- 2) Create word categories - Create a function that flags if a word has been mentioned in a review
- 3) Filter each dataframe category - Here each negative and positive review was split into individual categories  
- 4) Create a wordcloud for each individual text category

In [1]:
import numpy as np
import pandas as pd 
import warnings
import matplotlib.pyplot as plt
import seaborn as sns
import os
import json
import re
from collections import Counter
from collections import defaultdict
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords 
import pandas as pd   
from bs4 import BeautifulSoup             
import re
import nltk
import wordcloud
from os import path, getcwd
from PIL import Image
import numpy as np
import matplotlib.pyplot as plt
from wordcloud import WordCloud, ImageColorGenerator
from nltk.tokenize import RegexpTokenizer
import uuid
from os.path import basename
from datetime import datetime

In [2]:
# Download the NLTK 'stopwords' corpus that clean_review reads from;
# the call reports "already up-to-date" when the package is installed locally.
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\robert.lowe\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [48]:
# Open negative and positive reviews
# Each CSV is loaded into its own DataFrame (`neg` / `pos`); the paths are
# relative to the notebook's working directory.
neg = pd.read_csv('Negative words only.csv')
pos= pd.read_csv('Positive words only.csv')

In [50]:
# Inspect column dtypes and non-null counts of the raw negative reviews
neg.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 122825 entries, 0 to 122824
Data columns (total 2 columns):
Rating             122825 non-null int64
Negative review    122799 non-null object
dtypes: int64(1), object(1)
memory usage: 1.9+ MB


In [55]:
# Preview the first rows of the negative review text (double brackets keep the result a DataFrame)
neg[['Negative review']].head()

Unnamed: 0,Negative review
0,: o sofÃ¡ cama nÃ£o Ã© muito confortÃ¡vel.
1,: not really unhappy about anything!
2,: just to add pancakes at breakfast!!
3,: restaurant food prices way too expensive. ...
4,: booked a twin room but only had double whi...


In [4]:
# Cast the review columns to strings so clean_review can be applied to every row.
# Missing reviews (NaN) become the literal text 'nan'.
# The builtin `str` is used because the `np.str` alias was deprecated in
# NumPy 1.20 and removed in NumPy 1.24.
neg['Negative review'] = neg['Negative review'].astype(str)
pos['Positive'] = pos['Positive'].astype(str)

________________________________________________________________________________________________________________

### 1) Clean reviews - remove stopwords, remove uppercase & remove non-letters

________________________________________________________________________________________________________________

In [5]:
# Compiled once: matches every character that is not an ASCII letter.
_NON_LETTERS = re.compile("[^a-zA-Z]")

# Lazily-filled cache of the NLTK English stop words (see _english_stop_words).
_STOP_WORDS = None


def _english_stop_words():
    """Return the NLTK English stop words as a frozenset, loading them only once."""
    global _STOP_WORDS
    if _STOP_WORDS is None:
        # Reading the corpus is comparatively slow, and clean_review runs once
        # per review, so the set is built on first use and then reused.
        _STOP_WORDS = frozenset(stopwords.words("english"))
    return _STOP_WORDS


def clean_review(raw_review):
    """Convert a raw review into a lower-case string of meaningful words.

    Steps: strip HTML markup, replace every non-letter with a space,
    lower-case and split into words, drop English stop words, then re-join
    the remaining words with single spaces.

    Args:
        raw_review: The review text (a string, possibly containing HTML).

    Returns:
        str: The cleaned words separated by single spaces; an empty string
        when nothing is left after cleaning.
    """
    # 1. Remove HTML. The parser is named explicitly so the result does not
    #    depend on which optional parsers are installed and so Beautiful Soup
    #    does not emit a "no parser specified" warning on every call.
    review_text = BeautifulSoup(raw_review, "html.parser").get_text()

    # 2. Replace everything that is not a letter with a space.
    letters_only = _NON_LETTERS.sub(" ", review_text)

    # 3. Lower-case and split into individual words.
    words = letters_only.lower().split()

    # 4. Drop stop words (set membership is O(1) per word) and re-join.
    stops = _english_stop_words()
    return " ".join(w for w in words if w not in stops)

In [None]:
# Run clean_review over every review, overwriting the raw text in place
for frame, text_column in ((neg, 'Negative review'), (pos, 'Positive')):
    frame[text_column] = frame[text_column].apply(clean_review)

In [8]:
# Re-check the negative reviews after cleaning (dtypes and non-null counts)
neg.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 122825 entries, 0 to 122824
Data columns (total 2 columns):
Rating             122825 non-null int64
Negative review    122825 non-null object
dtypes: int64(1), object(1)
memory usage: 1.9+ MB


In [9]:
# Check the positive reviews after cleaning (dtypes and non-null counts)
pos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 122825 entries, 0 to 122824
Data columns (total 2 columns):
Rating      122825 non-null int64
Positive    122825 non-null object
dtypes: int64(1), object(1)
memory usage: 1.9+ MB


__________________________________________________________________________________________________________________________________________


### 2) Create word categories
- Create a number of functions that flag whether a word has been mentioned in a review
__________________________________________________________________________________________________________________________________________

In [10]:
def Food(str):
    """Count how many times the word 'food' appears in a review.

    Args:
        str: Review text. It is split on whitespace and each token is compared
            for exact equality, so 'seafood' or 'Food' do not match. (The
            parameter name shadows the builtin; it is kept so existing
            callers are unaffected.)

    Returns:
        int: Number of tokens equal to 'food'; 0 when there are none.
    """
    return str.split().count('food')

In [11]:
def breakfast(str):
    """Count mentions of 'breakfast' or 'breakfasts' in a review.

    Args:
        str: Review text. It is split on whitespace and each token is compared
            for exact equality. (The parameter name shadows the builtin; it
            is kept so existing callers are unaffected.)

    Returns:
        int: Number of matching tokens; 0 when there are none.
    """
    targets = {'breakfast', 'breakfasts'}
    return sum(1 for word in str.split() if word in targets)

In [12]:
def diet(str):
    """Count mentions of vegetarian/vegan diet words in a review.

    Matches the exact tokens 'vegetarian', 'vegetarians', 'vegan' and 'vegans'.

    Args:
        str: Review text. It is split on whitespace and each token is compared
            for exact equality. (The parameter name shadows the builtin; it
            is kept so existing callers are unaffected.)

    Returns:
        int: Number of matching tokens; 0 when there are none.
    """
    targets = {'vegetarians', 'vegan', 'vegans', 'vegetarian'}
    return sum(1 for word in str.split() if word in targets)

In [13]:
def buffet(str):
    """Count how many times the word 'buffet' appears in a review.

    Args:
        str: Review text. It is split on whitespace and each token is compared
            for exact equality. (The parameter name shadows the builtin; it
            is kept so existing callers are unaffected.)

    Returns:
        int: Number of tokens equal to 'buffet'; 0 when there are none.
    """
    return str.split().count('buffet')

In [14]:
def restaurant(str):
    """Count mentions of 'restaurant' or 'restaurants' in a review.

    Args:
        str: Review text. It is split on whitespace and each token is compared
            for exact equality. (The parameter name shadows the builtin; it
            is kept so existing callers are unaffected.)

    Returns:
        int: Number of matching tokens; 0 when there are none.
    """
    targets = {'restaurant', 'restaurants'}
    return sum(1 for word in str.split() if word in targets)

In [15]:
def coffee_tea(str):
    """Count mentions of 'coffee' or 'tea' in a review.

    Args:
        str: Review text. It is split on whitespace and each token is compared
            for exact equality. (The parameter name shadows the builtin; it
            is kept so existing callers are unaffected.)

    Returns:
        int: Number of matching tokens; 0 when there are none.
    """
    targets = {'coffee', 'tea'}
    return sum(1 for word in str.split() if word in targets)

In [16]:
def menu(str):
    """Count how many times the word 'menu' appears in a review.

    Args:
        str: Review text. It is split on whitespace and each token is compared
            for exact equality. (The parameter name shadows the builtin; it
            is kept so existing callers are unaffected.)

    Returns:
        int: Number of tokens equal to 'menu'; 0 when there are none.
    """
    return str.split().count('menu')

In [17]:
def breakfast_components(str):
    """Count mentions of individual breakfast items in a review.

    Matches the exact tokens: bacon, egg, eggs, toast, juice, sausage,
    sausages, scrambled, milk, fruit, croissants, beans, mushrooms, tomatoes.

    Args:
        str: Review text. It is split on whitespace and each token is compared
            for exact equality. (The parameter name shadows the builtin; it
            is kept so existing callers are unaffected.)

    Returns:
        int: Number of matching tokens; 0 when there are none.
    """
    targets = {
        'bacon', 'egg', 'toast', 'eggs', 'juice', 'sausages', 'scrambled',
        'milk', 'fruit', 'croissants', 'beans', 'mushrooms', 'sausage',
        'tomatoes',
    }
    return sum(1 for word in str.split() if word in targets)

In [18]:
def cooked(str):
    """Count mentions of 'cooked' or 'uncooked' in a review.

    Args:
        str: Review text. It is split on whitespace and each token is compared
            for exact equality. (The parameter name shadows the builtin; it
            is kept so existing callers are unaffected.)

    Returns:
        int: Number of matching tokens; 0 when there are none.
    """
    targets = {'cooked', 'uncooked'}
    return sum(1 for word in str.split() if word in targets)

In [19]:
def plates_cups(str):
    """Count mentions of 'plates' or 'cups' in a review.

    Args:
        str: Review text. It is split on whitespace and each token is compared
            for exact equality. (The parameter name shadows the builtin; it
            is kept so existing callers are unaffected.)

    Returns:
        int: Number of matching tokens; 0 when there are none.
    """
    targets = {'plates', 'cups'}
    return sum(1 for word in str.split() if word in targets)

In [20]:
def eating(str):
    """Count how many times the word 'eating' appears in a review.

    Args:
        str: Review text. It is split on whitespace and each token is compared
            for exact equality. (The parameter name shadows the builtin; it
            is kept so existing callers are unaffected.)

    Returns:
        int: Number of tokens equal to 'eating'; 0 when there are none.
    """
    return str.split().count('eating')

In [21]:
def choices(str):
    """Count mentions of 'choice' or 'choices' in a review.

    Args:
        str: Review text. It is split on whitespace and each token is compared
            for exact equality. (The parameter name shadows the builtin; it
            is kept so existing callers are unaffected.)

    Returns:
        int: Number of matching tokens; 0 when there are none.
    """
    targets = {'choice', 'choices'}
    return sum(1 for word in str.split() if word in targets)

### Apply each function on the negative and positive reviews

In [22]:
# Add one count column per category to the negative reviews.
# Column order matters: a later cell slices 'Neg_Food':'Neg_breakfast' by label,
# so 'Neg_Food' has to be created first and 'Neg_breakfast' last.
neg_category_functions = [
    ('Neg_Food', Food),
    ('Neg_Choices', choices),
    ('Neg_eating', eating),
    ('Neg_plates_cups', plates_cups),
    ('Neg_cooked', cooked),
    ('Neg_breakfast_components', breakfast_components),
    ('Neg_menu', menu),
    ('Neg_coffee_tea', coffee_tea),
    ('Neg_restaurant', restaurant),
    ('Neg_buffet', buffet),
    ('Neg_diet', diet),
    ('Neg_breakfast', breakfast),
]
for column_name, count_words in neg_category_functions:
    neg[column_name] = neg['Negative review'].apply(count_words)

In [23]:
# Add one count column per category to the positive reviews.
# Column order matters: a later cell slices 'Pos_Food':'Pos_breakfast' by label,
# so 'Pos_Food' has to be created first and 'Pos_breakfast' last.
pos_category_functions = [
    ('Pos_Food', Food),
    ('Pos_Choices', choices),
    ('Pos_eating', eating),
    ('Pos_plates_cups', plates_cups),
    ('Pos_cooked', cooked),
    ('Pos_breakfast_components', breakfast_components),
    ('Pos_menu', menu),
    ('Pos_coffee_tea', coffee_tea),
    ('Pos_restaurant', restaurant),
    ('Pos_buffet', buffet),
    ('Pos_diet', diet),
    ('Pos_breakfast', breakfast),
]
for column_name, count_words in pos_category_functions:
    pos[column_name] = pos['Positive'].apply(count_words)

____________________________________________________________________________________________________________________________________


#### For this specific case we only need a binary field, as we just want to look at the reviews where each category has been mentioned

____________________________________________________________________________________________________________________________________

In [25]:
# Collapse every category count to a 0/1 flag (1 = mentioned at least once).
# The label slices cover all columns from the first to the last category column.
pos_counts = pos.loc[:, 'Pos_Food':'Pos_breakfast']
pos.loc[:, 'Pos_Food':'Pos_breakfast'] = np.where(pos_counts >= 1, 1, 0)

neg_counts = neg.loc[:, 'Neg_Food':'Neg_breakfast']
neg.loc[:, 'Neg_Food':'Neg_breakfast'] = np.where(neg_counts >= 1, 1, 0)

In [27]:
# Map Rating 1 -> 0 so the later groupby('Rating') can put the reviews in a single group
# (assumes Rating only holds 0 and 1 - TODO confirm against the CSVs)
pos['Rating'] = pos['Rating'].replace(1, 0)
neg['Rating'] = neg['Rating'].replace(1, 0)

In [28]:
# Confirm the category flag columns were added to the positive reviews
pos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 122825 entries, 0 to 122824
Data columns (total 14 columns):
Rating                      122825 non-null int64
Positive                    122825 non-null object
Pos_Food                    122825 non-null int64
Pos_Choices                 122825 non-null int64
Pos_eating                  122825 non-null int64
Pos_plates_cups             122825 non-null int64
Pos_cooked                  122825 non-null int64
Pos_breakfast_components    122825 non-null int64
Pos_menu                    122825 non-null int64
Pos_coffee_tea              122825 non-null int64
Pos_restaurant              122825 non-null int64
Pos_buffet                  122825 non-null int64
Pos_diet                    122825 non-null int64
Pos_breakfast               122825 non-null int64
dtypes: int64(13), object(1)
memory usage: 13.1+ MB


In [29]:
# Make sure the review text is string-typed before it is joined.
# Builtin `str` replaces `np.str`, an alias deprecated in NumPy 1.20 and removed in 1.24.
pos.Positive = pos.Positive.astype(str)

________________________________________________________________________________________________________________________

### 3) Filter each dataframe category

________________________________________________________________________________________________________________________

In [30]:
# Keep, per category, only the positive reviews flagged for that category.
# E.g. Pos_Food holds just the positive reviews that mention the word food.

Pos_Food = pos.loc[pos['Pos_Food'] == 1]
Pos_Choices = pos.loc[pos['Pos_Choices'] == 1]
Pos_eating = pos.loc[pos['Pos_eating'] == 1]
Pos_plates_cups = pos.loc[pos['Pos_plates_cups'] == 1]
Pos_cooked = pos.loc[pos['Pos_cooked'] == 1]
Pos_breakfast_components = pos.loc[pos['Pos_breakfast_components'] == 1]
Pos_menu = pos.loc[pos['Pos_menu'] == 1]
Pos_coffee_tea = pos.loc[pos['Pos_coffee_tea'] == 1]
Pos_restaurant = pos.loc[pos['Pos_restaurant'] == 1]
Pos_buffet = pos.loc[pos['Pos_buffet'] == 1]
Pos_diet = pos.loc[pos['Pos_diet'] == 1]
Pos_breakfast = pos.loc[pos['Pos_breakfast'] == 1]

In [31]:
# join all text together so we can create a wordcloud for the whole review for each category
def pos_text(df):
    """Join the 'Positive' text of ``df`` into one string per 'Rating' value.

    Returns the 'Positive' column of the grouped result: a Series with a fresh
    0..n-1 index holding one space-joined string for each Rating group.
    """
    joined_by_rating = df.groupby(['Rating'])['Positive'].apply(' '.join)
    return joined_by_rating.reset_index()['Positive']

In [54]:
# Example for the above function: all positive reviews mentioning food,
# joined into one text entry per Rating value
pos_text(Pos_Food)

0    positive location perfect mins walk high st sh...
Name: Positive, dtype: object

In [32]:
# function to create an excel file with multiple sheets
def dfs_tabs(df_list, sheet_list, file_name):
    """Write several pandas objects into one Excel workbook, one sheet each.

    Args:
        df_list: Iterable of objects with a ``to_excel`` method (DataFrames or
            Series).
        sheet_list: Sheet names, paired positionally with ``df_list``. If the
            two differ in length, the surplus items are ignored (``zip``).
        file_name: Path of the workbook to create.

    Returns:
        None. The workbook is written to ``file_name``.
    """
    # The context manager writes and closes the workbook on exit, including
    # when a write raises part-way. It replaces ExcelWriter.save(), which was
    # removed in pandas 2.0.
    with pd.ExcelWriter(file_name, engine='xlsxwriter') as writer:
        for dataframe, sheet in zip(df_list, sheet_list):
            dataframe.to_excel(writer, sheet_name=sheet, startrow=0, startcol=0)

In [33]:
# Name for each sheet of the excel file (same order as the dataframes below)
names = [
    'Pos_Food', 'Pos_Choices', 'Pos_eating', 'Pos_plates_cups', 'Pos_cooked',
    'Pos_breakfast_components', 'Pos_menu', 'Pos_coffee_tea', 'Pos_restaurant',
    'Pos_buffet', 'Pos_diet', 'Pos_breakfast',
]

# The category dataframes, one per sheet
pos_category_frames = [
    Pos_Food, Pos_Choices, Pos_eating, Pos_plates_cups, Pos_cooked,
    Pos_breakfast_components, Pos_menu, Pos_coffee_tea, Pos_restaurant,
    Pos_buffet, Pos_diet, Pos_breakfast,
]

# Joined review text for each category dataframe
dfs = [pos_text(frame) for frame in pos_category_frames]

# Write every joined text to its own sheet of a single workbook
dfs_tabs(dfs, names, 'pos_food_reviews_2.xlsx')

________________________________________________________________________________


#### repeat for all negative review categories


________________________________________________________________________________

In [38]:
# Keep, per category, only the negative reviews flagged for that category
Neg_Food = neg.loc[neg['Neg_Food'] == 1]
Neg_Choices = neg.loc[neg['Neg_Choices'] == 1]
Neg_eating = neg.loc[neg['Neg_eating'] == 1]
Neg_plates_cups = neg.loc[neg['Neg_plates_cups'] == 1]
Neg_cooked = neg.loc[neg['Neg_cooked'] == 1]
Neg_breakfast_components = neg.loc[neg['Neg_breakfast_components'] == 1]
Neg_menu = neg.loc[neg['Neg_menu'] == 1]
Neg_coffee_tea = neg.loc[neg['Neg_coffee_tea'] == 1]
Neg_restaurant = neg.loc[neg['Neg_restaurant'] == 1]
Neg_buffet = neg.loc[neg['Neg_buffet'] == 1]
Neg_diet = neg.loc[neg['Neg_diet'] == 1]
Neg_breakfast = neg.loc[neg['Neg_breakfast'] == 1]


In [39]:
def Neg_text(df):
    """Join the 'Negative review' text of ``df`` into one string per 'Rating' value.

    Returns the 'Negative review' column of the grouped result: a Series with a
    fresh 0..n-1 index holding one space-joined string for each Rating group.
    """
    joined_by_rating = df.groupby(['Rating'])['Negative review'].apply(' '.join)
    return joined_by_rating.reset_index()['Negative review']

In [40]:
# function to create an excel file with multiple sheets
def dfs_tabs(df_list, sheet_list, file_name):
    """Write several pandas objects into one Excel workbook, one sheet each.

    This cell redefines the ``dfs_tabs`` declared earlier in the notebook with
    the same behaviour.

    Args:
        df_list: Iterable of objects with a ``to_excel`` method (DataFrames or
            Series).
        sheet_list: Sheet names, paired positionally with ``df_list``. If the
            two differ in length, the surplus items are ignored (``zip``).
        file_name: Path of the workbook to create.

    Returns:
        None. The workbook is written to ``file_name``.
    """
    # The context manager writes and closes the workbook on exit, including
    # when a write raises part-way. It replaces ExcelWriter.save(), which was
    # removed in pandas 2.0.
    with pd.ExcelWriter(file_name, engine='xlsxwriter') as writer:
        for dataframe, sheet in zip(df_list, sheet_list):
            dataframe.to_excel(writer, sheet_name=sheet, startrow=0, startcol=0)

In [41]:
# Name for each sheet of the excel file (same order as the dataframes below)
names = [
    'Neg_Food', 'Neg_Choices', 'Neg_eating', 'Neg_plates_cups', 'Neg_cooked',
    'Neg_breakfast_components', 'Neg_menu', 'Neg_coffee_tea', 'Neg_restaurant',
    'Neg_buffet', 'Neg_diet', 'Neg_breakfast',
]

# The category dataframes, one per sheet
neg_category_frames = [
    Neg_Food, Neg_Choices, Neg_eating, Neg_plates_cups, Neg_cooked,
    Neg_breakfast_components, Neg_menu, Neg_coffee_tea, Neg_restaurant,
    Neg_buffet, Neg_diet, Neg_breakfast,
]

# Joined review text for each category dataframe
dfs = [Neg_text(frame) for frame in neg_category_frames]

# Write every joined text to its own sheet of a single workbook
dfs_tabs(dfs, names, 'neg_food_reviews_2.xlsx')

________________________________________________________________________________

#### 4) Create a wordcloud for each individual text category

________________________________________________________________________________

In [42]:
def plot_word_cloud(text, icon):
    """Render ``text`` as a word cloud shaped by an icon image and save it as a PNG.

    Args:
        text: The text to visualise, as a single string.
        icon: Base name of the PNG that defines the shape; the file
            ``"<icon>.png"`` is opened relative to the working directory.

    Returns:
        The generated cloud as an image, as returned by ``WordCloud.to_image()``.

    Side effects:
        Saves the cloud to ``"<uuid4>.png"`` in the working directory; a random
        UUID is used so repeated calls never overwrite each other.
    """
    # Image saved - icon is the saved png image (normally a clipart image)
    icon_path = "%s.png" % icon

    # Shape: paste the icon onto a white canvas, using the icon itself as the
    # paste mask, and hand the resulting pixels to WordCloud as its mask.
    # The context manager closes the icon file once it has been converted.
    with Image.open(icon_path) as icon_file:
        icon_rgba = icon_file.convert("RGBA")
    mask = Image.new("RGB", icon_rgba.size, (255, 255, 255))
    mask.paste(icon_rgba, icon_rgba)
    mask = np.array(mask)

    # NOTE(review): no icon-based recolouring is applied, so the cloud uses
    # WordCloud's default colours. The same stop-word list is used for every
    # caller, positive and negative alike.
    wc = WordCloud(background_color="white", max_words=200, mask=mask, repeat=False,
               max_font_size=300, stopwords=["holiday", "inn", 'positive', 'hotel', 'room', 'great', 'excellent', 'good', 'nice', 'would', 'lovely', 'comfy'])

    # Generate a word cloud
    wc.generate(text)

    # Save wordcloud - a random string is created for each saved wordcloud
    save = "%s.png" % (str(uuid.uuid4()))
    wc.to_file(save)

    # Visualize the word cloud
    return wc.to_image()

In [36]:
# Similar to function above - creates text field for each image and creates a wordcloud 
def word_cloud_neg(df):
    """Join the negative review text of ``df`` and draw a word cloud from it.

    The reviews are space-joined per 'Rating' value; only the text of the first
    Rating group is passed to ``plot_word_cloud`` with the 'green-circle-hi' icon.
    """
    joined_by_rating = df.groupby(['Rating'])['Negative review'].apply(' '.join)
    first_group_text = joined_by_rating.iloc[0]
    return plot_word_cloud(first_group_text, 'green-circle-hi')

In [46]:
# Negative category dataframes, in the order their word clouds are produced
dfs = [
    Neg_Food, Neg_Choices, Neg_eating, Neg_plates_cups, Neg_cooked,
    Neg_breakfast_components, Neg_menu, Neg_coffee_tea, Neg_restaurant,
    Neg_buffet, Neg_diet, Neg_breakfast,
]

# One word cloud image per negative category dataframe
wordclouds = list(map(word_cloud_neg, dfs))

In [43]:
# Repeat process for positive reviews 
def word_cloud(df):
    """Join the positive review text of ``df`` and draw a word cloud from it.

    Args:
        df: DataFrame with 'Rating' and 'Positive' columns.

    Returns:
        The image returned by ``plot_word_cloud`` for the joined text.
    """
    # Space-join the reviews per Rating value; only the first Rating group's
    # text is used for the cloud.
    joined_by_rating = df.groupby(['Rating'])['Positive'].apply(' '.join)
    first_group_text = joined_by_rating.reset_index().iloc[0, 1]

    # Call plot_word_cloud, as word_cloud_neg does: the previously referenced
    # plot_word_cloud_jpg is not defined anywhere in this notebook.
    return plot_word_cloud(first_group_text, 'green-circle-hi')
    

In [44]:
# Positive category dataframes, in the order their word clouds are produced
dfs = [
    Pos_Food, Pos_Choices, Pos_eating, Pos_plates_cups, Pos_cooked,
    Pos_breakfast_components, Pos_menu, Pos_coffee_tea, Pos_restaurant,
    Pos_buffet, Pos_diet, Pos_breakfast,
]

# One word cloud image per positive category dataframe
wordclouds = list(map(word_cloud, dfs))