In [1]:
#Import libraries:
import pandas as pd
import numpy as np

import re

import nltk
nltk.download('punkt')
nltk.download('wordnet') # wordnet is the most well known lemmatizer for english
nltk.download('stopwords')

from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.probability import FreqDist

from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

import matplotlib.pyplot as plt
import seaborn as sns

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\beatr\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\beatr\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\beatr\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Load and analyze the data

In [31]:
# Load the raw reviews into a DataFrame.
# The path is a raw string so every backslash is taken literally; the previous
# literal mixed "\\" with a bare "\D", which is an invalid escape sequence.
# TODO: this is an absolute path on one machine; a path relative to the
# notebook would let other people run this cell.
data = pd.read_csv(r"C:\Users\beatr\Documents\Beatriz\Ironhack\Projects\NLP_DisneylandReviews\DisneylandReviews.csv", encoding='latin-1')

In [32]:
data.head()

Unnamed: 0,Review_ID,Rating,Year_Month,Reviewer_Location,Review_Text,Branch
0,670772142,4,2019-4,Australia,If you've ever been to Disneyland anywhere you...,Disneyland_HongKong
1,670682799,4,2019-5,Philippines,Its been a while since d last time we visit HK...,Disneyland_HongKong
2,670623270,4,2019-4,United Arab Emirates,Thanks God it wasn t too hot or too humid wh...,Disneyland_HongKong
3,670607911,4,2019-4,Australia,HK Disneyland is a great compact park. Unfortu...,Disneyland_HongKong
4,670607296,4,2019-4,United Kingdom,"the location is not in the city, took around 1...",Disneyland_HongKong


In [None]:
#data = data.sample(n = 10000)

In [None]:
# Data Analysis:

len(data)

In [None]:
data.info()

In [None]:
data.isna().sum()

In [None]:
data["Rating"].value_counts()

In [None]:
data.groupby("Branch")["Rating"].count()

In [None]:
data.groupby(["Branch","Rating"])["Rating"].count()

In [None]:
data["Rating"].describe()

In [None]:
data["Rating"].value_counts(normalize=True)

In [None]:
# Bar chart with the number of reviews for each rating value.
plt.figure(figsize=(8,8))
# The column is passed through the `x` keyword: recent seaborn releases treat
# the first positional argument as `data`, so a positional Series no longer
# means "count the values of this column".
sns.countplot(x=data['Rating'])
plt.title('Ratings Count in the dataset',fontsize=15)
plt.xlabel('Rating',fontsize=8)
plt.ylabel('Count',fontsize=8)

# Treating the Reviews column

In [4]:
def clean_up(s):
    """
    Lower-case a string and keep only its alphabetic words.

    URLs are removed first (both ``http:`` and ``https:``), so that their
    fragments such as "https" or "com" do not end up in the result. Digits,
    punctuation and any other non a-z characters act as separators and are
    dropped.

    Args:
        s: The string to be cleaned up.

    Returns:
        A string containing the remaining lower-case a-z words separated by
        single spaces (empty string if nothing remains).
    """
    final = s.lower()

    # Raw string: "\S" is a regex class, not a Python escape sequence.
    # "https?" also covers secure URLs, which the previous "http:" pattern missed.
    final = re.sub(r"https?:\S+", " ", final)

    # Every maximal run of a-z letters is one word.
    words = re.findall(r"[a-z]+", final)

    return ' '.join(words)

In [5]:
data["Reviews_clean"] = data["Review_Text"].apply(clean_up)
data.head()

Unnamed: 0,Review_ID,Rating,Year_Month,Reviewer_Location,Review_Text,Branch,Reviews_clean
0,670772142,4,2019-4,Australia,If you've ever been to Disneyland anywhere you...,Disneyland_HongKong,if you ve ever been to disneyland anywhere you...
1,670682799,4,2019-5,Philippines,Its been a while since d last time we visit HK...,Disneyland_HongKong,its been a while since d last time we visit hk...
2,670623270,4,2019-4,United Arab Emirates,Thanks God it wasn t too hot or too humid wh...,Disneyland_HongKong,thanks god it wasn t too hot or too humid when...
3,670607911,4,2019-4,Australia,HK Disneyland is a great compact park. Unfortu...,Disneyland_HongKong,hk disneyland is a great compact park unfortun...
4,670607296,4,2019-4,United Kingdom,"the location is not in the city, took around 1...",Disneyland_HongKong,the location is not in the city took around ho...


In [6]:
def tokenize(s):
    """
    Split a piece of text into tokens with NLTK.

    Args:
        s: The text to split.

    Returns:
        The list of tokens that ``nltk.word_tokenize`` produces for ``s``.
    """
    tokens = nltk.word_tokenize(s)
    return tokens

In [7]:
data["Reviews_clean"] = data["Reviews_clean"].apply(tokenize)
data.head()

Unnamed: 0,Review_ID,Rating,Year_Month,Reviewer_Location,Review_Text,Branch,Reviews_clean
0,670772142,4,2019-4,Australia,If you've ever been to Disneyland anywhere you...,Disneyland_HongKong,"[if, you, ve, ever, been, to, disneyland, anyw..."
1,670682799,4,2019-5,Philippines,Its been a while since d last time we visit HK...,Disneyland_HongKong,"[its, been, a, while, since, d, last, time, we..."
2,670623270,4,2019-4,United Arab Emirates,Thanks God it wasn t too hot or too humid wh...,Disneyland_HongKong,"[thanks, god, it, wasn, t, too, hot, or, too, ..."
3,670607911,4,2019-4,Australia,HK Disneyland is a great compact park. Unfortu...,Disneyland_HongKong,"[hk, disneyland, is, a, great, compact, park, ..."
4,670607296,4,2019-4,United Kingdom,"the location is not in the city, took around 1...",Disneyland_HongKong,"[the, location, is, not, in, the, city, took, ..."


In [8]:
def stem_and_lemmatize(l):
    """
    Stem every word with the Porter stemmer, then lemmatize the stem.

    Args:
        l: A list of strings.

    Returns:
        A new list, in the same order as ``l``, holding the WordNet lemma of
        the Porter stem of each input string.
    """
    stemmer = PorterStemmer()
    wordnet_lemmatizer = WordNetLemmatizer()

    processed = []
    for token in l:
        # The lemmatizer receives the stem, not the original word.
        stem = stemmer.stem(token)
        processed.append(wordnet_lemmatizer.lemmatize(stem))

    return processed

In [9]:
data["Reviews_clean"] = data["Reviews_clean"].apply(stem_and_lemmatize)
data.head()

Unnamed: 0,Review_ID,Rating,Year_Month,Reviewer_Location,Review_Text,Branch,Reviews_clean
0,670772142,4,2019-4,Australia,If you've ever been to Disneyland anywhere you...,Disneyland_HongKong,"[if, you, ve, ever, been, to, disneyland, anyw..."
1,670682799,4,2019-5,Philippines,Its been a while since d last time we visit HK...,Disneyland_HongKong,"[it, been, a, while, sinc, d, last, time, we, ..."
2,670623270,4,2019-4,United Arab Emirates,Thanks God it wasn t too hot or too humid wh...,Disneyland_HongKong,"[thank, god, it, wasn, t, too, hot, or, too, h..."
3,670607911,4,2019-4,Australia,HK Disneyland is a great compact park. Unfortu...,Disneyland_HongKong,"[hk, disneyland, is, a, great, compact, park, ..."
4,670607296,4,2019-4,United Kingdom,"the location is not in the city, took around 1...",Disneyland_HongKong,"[the, locat, is, not, in, the, citi, took, aro..."


In [10]:
# English stop-word list from NLTK. Kept as a list under its original name
# because other cells of this notebook read `variable` directly.
variable = stopwords.words("english")

# Set copy of the same words: membership tests on a set are constant time,
# whereas `w in variable` scans the whole list for every token of every review.
_STOPWORD_SET = frozenset(variable)

def remove_stopwords(l):
    """
    Remove English stopwords from a list of strings.

    Matching is exact and case-sensitive: a token is dropped only when it is
    identical to an entry of the NLTK English stop-word list.

    Args:
        l: A list of strings.

    Returns:
        A new list with the stop words removed, original order preserved.
    """
    return [w for w in l if w not in _STOPWORD_SET]

In [11]:
data["Reviews_clean"] = data["Reviews_clean"].apply(remove_stopwords)
data.head()

Unnamed: 0,Review_ID,Rating,Year_Month,Reviewer_Location,Review_Text,Branch,Reviews_clean
0,670772142,4,2019-4,Australia,If you've ever been to Disneyland anywhere you...,Disneyland_HongKong,"[ever, disneyland, anywher, find, disneyland, ..."
1,670682799,4,2019-5,Philippines,Its been a while since d last time we visit HK...,Disneyland_HongKong,"[sinc, last, time, visit, hk, disneyland, yet,..."
2,670623270,4,2019-4,United Arab Emirates,Thanks God it wasn t too hot or too humid wh...,Disneyland_HongKong,"[thank, god, hot, humid, wa, visit, park, othe..."
3,670607911,4,2019-4,Australia,HK Disneyland is a great compact park. Unfortu...,Disneyland_HongKong,"[hk, disneyland, great, compact, park, unfortu..."
4,670607296,4,2019-4,United Kingdom,"the location is not in the city, took around 1...",Disneyland_HongKong,"[locat, citi, took, around, hour, kowlon, kid,..."


In [12]:
def re_blob(row):
    """
    Rebuild one text string from the token list of a row.

    Args:
        row: A mapping-like row whose 'Reviews_clean' entry is a list of strings.

    Returns:
        The tokens joined by single spaces.
    """
    tokens = row['Reviews_clean']
    return " ".join(tokens)

In [13]:
data['Reviews_clean'] = data.apply(re_blob, axis=1)
data.head()

Unnamed: 0,Review_ID,Rating,Year_Month,Reviewer_Location,Review_Text,Branch,Reviews_clean
0,670772142,4,2019-4,Australia,If you've ever been to Disneyland anywhere you...,Disneyland_HongKong,ever disneyland anywher find disneyland hong k...
1,670682799,4,2019-5,Philippines,Its been a while since d last time we visit HK...,Disneyland_HongKong,sinc last time visit hk disneyland yet thi tim...
2,670623270,4,2019-4,United Arab Emirates,Thanks God it wasn t too hot or too humid wh...,Disneyland_HongKong,thank god hot humid wa visit park otherwis wou...
3,670607911,4,2019-4,Australia,HK Disneyland is a great compact park. Unfortu...,Disneyland_HongKong,hk disneyland great compact park unfortun quit...
4,670607296,4,2019-4,United Kingdom,"the location is not in the city, took around 1...",Disneyland_HongKong,locat citi took around hour kowlon kid like di...


In [None]:
#from nltk.sentiment import SentimentIntensityAnalyzer
#nltk.download('vader_lexicon')

#sia = SentimentIntensityAnalyzer()

#def is_positive(review):
#    if sia.polarity_scores(review)["compound"] > 0: # when the compound score is greater than 0 the reviewer is positive
#        return 1 # 1 for positive
#    return 0 # 0 for negative or neutral

#data["sentiment"] = data["Reviews_clean"].apply(is_positive)

In [None]:
#data["sentiment"].value_counts()

In [None]:
#def intensity(review):
#    return (abs(sia.polarity_scores(review)["compound"])+1)**2 # to give more importance the intense reviews we are going to square it

#data['intensity'] = data["Reviews_clean"].apply(intensity)

In [None]:
#data["intensity"].value_counts()

In [37]:
# English stop-word list from NLTK (kept as a list under its original name,
# since remove_stopwords in this notebook reads `variable` too).
variable = stopwords.words("english")

# Helpers built once instead of once per review: clean_review is applied to
# every row of the dataset.
_REVIEW_STEMMER = PorterStemmer()
_REVIEW_LEMMATIZER = WordNetLemmatizer()
_REVIEW_STOPWORDS = frozenset(variable)  # set => constant-time membership tests

def clean_review(review):
    """
    Run the whole text-cleaning pipeline on one raw review.

    Steps, in order: lower-case, strip URLs, keep only a-z words, tokenize,
    Porter-stem, WordNet-lemmatize, drop English stop words.

    Args:
        review: The raw review text.

    Returns:
        The cleaned review as a single space-separated string.
    """
    review_clean = review.lower()

    # Raw string for the regex; "https?" also removes secure URLs, which the
    # previous "http:" pattern left in the text.
    review_clean = re.sub(r"https?:\S+", " ", review_clean)

    words = re.findall(r"[a-z]+", review_clean)

    # NOTE(review): re-tokenizing already separated words looks redundant, but
    # the NLTK tokenizer may split some words further, so it is kept to leave
    # the output unchanged - confirm before removing.
    tokens = nltk.word_tokenize(' '.join(words))

    stemmed = [_REVIEW_STEMMER.stem(w) for w in tokens]
    lemmatized = [_REVIEW_LEMMATIZER.lemmatize(word) for word in stemmed]

    # Stop words are matched against the stemmed/lemmatized forms.
    row = [word for word in lemmatized if word not in _REVIEW_STOPWORDS]

    return " ".join(row)

In [38]:
data["Reviews_clean"] = data["Review_Text"].apply(clean_review)
data.head()

Unnamed: 0,Review_ID,Rating,Year_Month,Reviewer_Location,Review_Text,Branch,Reviews_clean
0,670772142,4,2019-4,Australia,If you've ever been to Disneyland anywhere you...,Disneyland_HongKong,ever disneyland anywher find disneyland hong k...
1,670682799,4,2019-5,Philippines,Its been a while since d last time we visit HK...,Disneyland_HongKong,sinc last time visit hk disneyland yet thi tim...
2,670623270,4,2019-4,United Arab Emirates,Thanks God it wasn t too hot or too humid wh...,Disneyland_HongKong,thank god hot humid wa visit park otherwis wou...
3,670607911,4,2019-4,Australia,HK Disneyland is a great compact park. Unfortu...,Disneyland_HongKong,hk disneyland great compact park unfortun quit...
4,670607296,4,2019-4,United Kingdom,"the location is not in the city, took around 1...",Disneyland_HongKong,locat citi took around hour kowlon kid like di...


# Bag of words

In [39]:
# Word-frequency table over all cleaned reviews.
#
# Each entry of data["Reviews_clean"] is one space-separated string, so it has
# to be split into words: iterating over the string itself yields single
# characters, which made the previous table count letters instead of words.
list_words = nltk.FreqDist(
    word
    for row in data["Reviews_clean"]
    for word in row.split()
)
#print(list_words)

# most_common() sorts by descending count; plain .keys() is in first-seen
# order, so slicing it does not give the most frequent words.
top_words = [word for word, _count in list_words.most_common(5000)]
#print(top_words)
#print(top_words)

In [40]:
# Bag-of-words matrix restricted to the 1000 most common words.
bow_vect = CountVectorizer(max_features = 1000)

# fit_transform learns the vocabulary and counts each word per review;
# toarray() turns the sparse result into a dense (reviews x words) array.
x = bow_vect.fit_transform(data['Reviews_clean']).toarray()

# get_feature_names() was removed from scikit-learn in 1.2; use its
# replacement when available and fall back for older installations.
if hasattr(bow_vect, "get_feature_names_out"):
    feature_names = bow_vect.get_feature_names_out()
else:
    feature_names = bow_vect.get_feature_names()

df = pd.DataFrame(x, columns = feature_names)

# Only the last expression of a cell is displayed, so the shape is printed.
print(df.shape)

df.head()



Unnamed: 0,abl,abov,absolut,accept,access,accommod,across,activ,actual,ad,...,worth,would,wow,wrong,ye,year,yet,young,younger,yr
0,0,0,1,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [43]:
# Save the bag-of-words matrix. Raw string: the previous literal ended in a
# bare "\d", which is an invalid escape sequence; the resulting path is the same.
# TODO: absolute, machine-specific path - consider a path relative to the notebook.
df.to_csv(r"C:\Users\beatr\Documents\Beatriz\Ironhack\Projects\NLP_DisneylandReviews\df.csv")

In [None]:
#df["sentiment_number"] = data["sentiment"]
#df["intensity_number"] = data["intensity"]

In [44]:
y = data["Rating"].copy()

In [None]:
def wordCloud_generator(data, title=None):
    """
    Draw a word cloud for a collection of texts.

    Args:
        data: Object whose ``.values`` is an iterable of strings (for example
            a pandas Series of reviews); all texts are joined with spaces.
        title: Optional title shown on the figure.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    corpus = " ".join(data.values)
    cloud_options = {
        "width": 800,
        "height": 800,
        "background_color": 'black',
        "min_font_size": 10,
    }
    cloud = WordCloud(**cloud_options).generate(corpus)

    plt.figure(figsize = (8, 8), facecolor = None)
    plt.imshow(cloud, interpolation='bilinear')
    plt.axis("off")  # a word cloud has no meaningful axes
    plt.tight_layout(pad = 0)
    plt.title(title, fontsize=30)
    plt.show()


# Word cloud of the raw, uncleaned review texts.
wordCloud_generator(data['Review_Text'], title="Top words in reviews")

In [None]:
# Word cloud of the cleaned reviews. wordCloud_generator is already defined in
# the previous cell; the identical second definition that used to live here was
# removed so there is a single implementation to maintain.
wordCloud_generator(data['Reviews_clean'], title="Top words in reviews after clean")

# LogisticRegression Model

In [45]:
# Instantiating a LogisticRegression Model (this is classification)
# Features: the bag-of-words frame `df`; target: the review rating `y`.

from sklearn.linear_model import LogisticRegression

# random_state is fixed so repeated runs give the same model;
# max_iter is set explicitly to 10000 (presumably to let the solver converge - confirm).
model = LogisticRegression(random_state=0, max_iter=10000)

# Splitting the datasets into training and testing
from sklearn.model_selection import train_test_split

# 80% of the rows are used for training; random_state pins the shuffle.
x_train, x_test, y_train, y_test = train_test_split(df, y, train_size = 0.8, random_state = 0)

# Fitting our model
model.fit(x_train, y_train)

# Predicted rating for every held-out review.
predicted = model.predict(x_test)

# evaluate (y_test == predicted)
from sklearn.metrics import accuracy_score
accuracy_score(y_test, predicted)

0.6166197843413034

In [46]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, predicted)

array([[ 113,   67,   52,   11,   38],
       [  79,  106,  149,   50,   65],
       [  47,  113,  334,  231,  333],
       [  21,   28,  202,  568, 1335],
       [   5,   16,   79,  350, 4140]], dtype=int64)

In [47]:
from sklearn.metrics import r2_score
# NOTE(review): r2_score is a regression metric and is applied here to the
# predicted rating labels, i.e. the ratings are treated as numbers - confirm
# this is the intended reading next to accuracy and the confusion matrix.
r2_score(y_test, predicted)

0.3136426212588598

# RandomForestClassifier

In [None]:
# Instantiating a RandomForest
# Uses the same features (`df`) and target (`y`) as the logistic regression.

from sklearn.ensemble import RandomForestClassifier

# Split the data
# Same arguments (train_size, random_state) as the split in the logistic-regression cell.

x_train, x_test, y_train, y_test = train_test_split(df, y, train_size = 0.8, random_state = 0)

# define models
forest = RandomForestClassifier(random_state=0)

forest.fit(x_train, y_train)

# Overwrites `predicted` from the previous model.
predicted = forest.predict(x_test)

# accuracy_score is imported in the logistic-regression cell.
accuracy_score(y_test, predicted)

In [None]:
confusion_matrix(y_test, predicted)

In [None]:
r2_score(y_test, predicted)

# Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB

# No split is made in this cell: x_train / x_test / y_train / y_test are the
# ones left in the kernel by an earlier cell.
# NOTE(review): the features are word counts; GaussianNB models each feature as
# continuous - confirm that this variant is the intended one.
nb = GaussianNB()
nb.fit(x_train, y_train)

# Overwrites `predicted` from the previous model.
predicted = nb.predict(x_test)

accuracy_score(y_test, predicted)

In [None]:
confusion_matrix(y_test, predicted)

In [None]:
r2_score(y_test, predicted)

# xgboost

In [None]:
import xgboost as xgb
from sklearn.metrics import accuracy_score
#https://xgboost.readthedocs.io/en/latest/parameter.html

# Same split arguments as the other model cells.
x_train, x_test, y_train, y_test = train_test_split(df, y, train_size = 0.8, random_state = 0)

# specify parameters via map
# NOTE(review): no 'objective' is set (both candidates below are commented
# out), so the library default applies - check that it matches a 1-5 rating target.
# NOTE(review): 'sample_type' / 'rate_drop' and 'n_estimators' may not be
# honoured by xgb.train with the 'gbtree' booster - verify against the
# parameter documentation linked above.
param = {
    'booster': 'gbtree'
    ,'max_depth': 3
    ,'learning_rate': 0.3
    ,'subsample': 0.5
    ,'sample_type': 'uniform'
    #,'objective': 'binary:hinge'
    #,'obejective:'binary:logistic'
    ,'rate_drop': 0.0
    ,'n_estimators': 2000
    ,'verbosity':3
    #,'nthread': 5
}

# Wrap features and labels in xgboost's own matrix type.
d_train = xgb.DMatrix(x_train, y_train)
d_test = xgb.DMatrix(x_test, y_test)

# The number of boosting rounds is not passed here, so xgb.train uses its default.
clf = xgb.train(param, d_train)

# make prediction
preds = clf.predict(d_test)

# R^2 of the predictions against the true ratings (this is not an accuracy
# score; accuracy_score is imported above but not used in this cell)
r2_score(y_test, preds)

# Get a new text review 

In [48]:
dataframe_newreview = pd.DataFrame(columns = ['Review_Text'])

dataframe_newreview['Review_Text'] = [input("Can you give us your review?")]

Can you give us your review?i loved the park it was amazing


In [49]:
dataframe_newreview["Reviews_clean"] = dataframe_newreview['Review_Text'].apply(clean_review)

dataframe_newreview.drop(["Review_Text"], axis = 1)

Unnamed: 0,Reviews_clean
0,love park wa amaz


In [None]:
# Step-by-step equivalent of clean_review applied to the new review:
# clean -> tokenize -> stem/lemmatize -> drop stop words (result: list of tokens).
dataframe_newreview["Reviews_clean"] = (
    dataframe_newreview['Review_Text']
    .apply(clean_up)
    .apply(tokenize)
    .apply(stem_and_lemmatize)
    .apply(remove_stopwords)
)

# Join the token list of each row back into one string.
dataframe_newreview["Reviews_clean"] = dataframe_newreview.apply(re_blob, axis = 1)

# Display only the cleaned text (the frame itself keeps Review_Text).
dataframe_newreview.drop(["Review_Text"], axis = 1)

In [50]:
#get the columns and one row of our bag of words

new_review = df.iloc[0]
new_review.values[:] = 0

#from our new review split and add as a list
list_new_review = dataframe_newreview["Reviews_clean"].str.split(" ")

#interate over the new review to get the frequency of each word
for word in list_new_review[0]:
    if word in list(new_review.index):
        new_review[word] = +1

#transform the new review into a dataframe and reset the index
new_review = pd.DataFrame(new_review).T
#new_review.reset_index()

#Predict the review using our model
model.predict(new_review)

array([5], dtype=int64)

In [51]:
new_review

Unnamed: 0,abl,abov,absolut,accept,access,accommod,across,activ,actual,ad,...,worth,would,wow,wrong,ye,year,yet,young,younger,yr
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Get a speech review

In [54]:
import pyaudio
import wave
import speech_recognition as sr

In [55]:
# Record RECORD_SECONDS of microphone audio and save it as a WAV file.
CHUNK = 1024                 # frames read from the stream per call
FORMAT = pyaudio.paInt16     # sample format passed to PyAudio
CHANNELS = 2
RATE = 44100                 # sampling rate passed to the stream and the WAV header
RECORD_SECONDS = 10
WAVE_OUTPUT_FILENAME = "output.wav"

p = pyaudio.PyAudio()
try:
    # Read the sample width while the PyAudio instance is still alive.
    sample_width = p.get_sample_size(FORMAT)

    stream = p.open(format=FORMAT,
                    channels=CHANNELS,
                    rate=RATE,
                    input=True,
                    frames_per_buffer=CHUNK)
    try:
        print("* recording")

        frames = []

        # The chunk is stored in `audio_chunk`, not `data`: the previous loop
        # reused the name `data` and so replaced the reviews DataFrame with
        # raw audio bytes for the rest of the session.
        for chunk_index in range(0, int(RATE / CHUNK * RECORD_SECONDS)):
            audio_chunk = stream.read(CHUNK)
            frames.append(audio_chunk)

        print("* done recording")
    finally:
        # Release the input stream even if reading fails.
        stream.stop_stream()
        stream.close()
finally:
    p.terminate()

# The context manager closes the WAV file even if a write fails.
with wave.open(WAVE_OUTPUT_FILENAME, 'wb') as wf:
    wf.setnchannels(CHANNELS)
    wf.setsampwidth(sample_width)
    wf.setframerate(RATE)
    wf.writeframes(b''.join(frames))

* recording
range(0, 430)
* done recording


In [56]:
# Transcribe the recorded WAV file to text.
filename = "output.wav"

# initialize the recognizer
r = sr.Recognizer()

# Load the complete recording into memory; the file is only needed for this step.
with sr.AudioFile(filename) as source:
    audio_data = r.record(source)

# Convert the in-memory audio from speech to text and show the result.
text = r.recognize_google(audio_data)
print(text)

it was an amazing experience I love you it was super cool


In [57]:
text

'it was an amazing experience I love you it was super cool'

In [58]:
dataframe_newreview = pd.DataFrame(columns = ['Review_Text'])

dataframe_newreview['Review_Text'] = [text]

In [59]:
dataframe_newreview["Reviews_clean"] = dataframe_newreview['Review_Text'].apply(clean_review)

dataframe_newreview.drop(["Review_Text"], axis = 1)

Unnamed: 0,Reviews_clean
0,wa amaz experi love wa super cool


In [None]:
# Step-by-step equivalent of clean_review applied to the transcribed review:
# clean -> tokenize -> stem/lemmatize -> drop stop words (result: list of tokens).
dataframe_newreview["Reviews_clean"] = (
    dataframe_newreview['Review_Text']
    .apply(clean_up)
    .apply(tokenize)
    .apply(stem_and_lemmatize)
    .apply(remove_stopwords)
)

# Join the token list of each row back into one string.
dataframe_newreview["Reviews_clean"] = dataframe_newreview.apply(re_blob, axis = 1)

# Display only the cleaned text (the frame itself keeps Review_Text).
dataframe_newreview.drop(["Review_Text"], axis = 1)

In [60]:
#get the columns and one row of our bag of words

new_review = df.iloc[0]
new_review.values[:] = 0

#from our new review split and add as a list
list_new_review = dataframe_newreview["Reviews_clean"].str.split(" ")

#interate over the new review to get the frequency of each word
for word in list_new_review[0]:
    if word in list(new_review.index):
        new_review[word] = +1

#transform the new review into a dataframe and reset the index
new_review = pd.DataFrame(new_review).T
#new_review.reset_index()

#Predict the review using our model
model.predict(new_review)

array([5], dtype=int64)

# Pickle

In [None]:
# Save all the necessary elements in files to share with a user
# NOTE(review): only `model` is written here. The prediction cells build their
# input from the bag-of-words columns, so the fitted vectorizer `bow_vect`
# presumably has to be shared as well - confirm.

import pickle

# save the model to disk
filename = 'Model_for_review.sav'
# The context manager flushes and closes the file; the previous
# open(...) passed inline to pickle.dump was never closed.
with open(filename, 'wb') as model_file:
    pickle.dump(model, model_file)

In [None]:
# some time later...
 
# load the model from disk
#loaded_model = pickle.load(open(filename, 'rb'))
#result = loaded_model.score(x_test, y_test)
#print(result)