In [2]:
import os
import pandas as pd
import numpy as np
import seaborn as sns
import collections
import time

In [None]:
# Column -> dtype mapping for the CSV load; its keys double as the list of
# columns that are read from the file.
hotel_review_dtypes = {
    'Hotel_Address': 'string',
    'Additional_Number_of_Scoring': 'float',
    'Average_Score': 'float',
    'Hotel_Name': 'string',
    'Reviewer_Nationality': 'string',
    'Negative_Review': 'string',
    'Positive_Review': 'string',
    'Reviewer_Score': 'float',
    'Total_Number_of_Reviews_Reviewer_Has_Given': 'int',
    'Tags': 'string',
}

# Loads only the mapped columns of Hotel_Reviews.csv into hotel_reviews with the
# dtypes above; index_col=False keeps pandas from using a CSV column as the index.
hotel_reviews = pd.read_csv(
    "Hotel_Reviews.csv",
    index_col=False,
    usecols=list(hotel_review_dtypes),
    dtype=hotel_review_dtypes,
)

In [None]:
# Builds the cleaned frame df from hotel_reviews in a single chain:
#   1. drop duplicate rows,
#   2. replace the placeholder answers 'No Negative' / 'No Positive' with '',
#   3. round both score columns to 0 decimal places.
# The replacements go through .assign() instead of attribute assignment on the
# de-duplicated frame, so no column is set on a frame pandas may treat as a copy
# (the pattern that triggers SettingWithCopyWarning).
df = (
    hotel_reviews
    .drop_duplicates()
    .assign(
        Negative_Review=lambda frame: frame['Negative_Review'].replace('No Negative', ''),
        Positive_Review=lambda frame: frame['Positive_Review'].replace('No Positive', ''),
    )
    .round({'Reviewer_Score': 0, 'Average_Score': 0})
)

In [None]:
# prints basic information about cleaned dataframe
print(df.shape)
print(df.describe())
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() appended a stray "None" line.
df.info()

In [None]:
# Count plots of the rounded Reviewer_Score and Average_Score columns.
# NOTE(review): no ax= is passed to any plot in this cell, so seaborn draws on the
# current matplotlib Axes; presumably all three plots end up layered on one Axes -
# confirm whether separate figures were intended.
for score_column in ('Reviewer_Score', 'Average_Score'):
    ax = sns.countplot(x=score_column, data=df)

# Heatmap of the pivot of reviewer score (rows) against average score (columns).
# NOTE(review): no values=/aggfunc= is given, so pandas' pivot_table defaults
# decide which columns are aggregated and how - verify this is the intended table.
score_pivot = df.pivot_table(index='Reviewer_Score', columns='Average_Score')
sns.heatmap(score_pivot)

In [None]:
# GOOGLE API
from google.cloud import language_v1
from google.oauth2 import service_account

In [None]:
# authentication
# The service-account key file can be pointed elsewhere through the
# GOOGLE_APPLICATION_CREDENTIALS environment variable; when it is not set, the
# original ./credentials.json next to the notebook is used.
credentials_path = os.environ.get('GOOGLE_APPLICATION_CREDENTIALS', './credentials.json')
creds = service_account.Credentials.from_service_account_file(credentials_path)
# Natural Language client used by the classification cells below.
client = language_v1.LanguageServiceClient(credentials=creds)

In [None]:
# number of dataframe rows to work with
# (despite its name this is a row count, not a DataFrame: the classification
# cells below use it as the number of leading rows of df to process)
df_subset = 10000

In [None]:
# Classifies the leading df_subset positive reviews and collects every category
# name returned by the Natural Language client in positive_review_category_array.

def sample_classify_text(text_content):
    """Classify one review text with the module-level Natural Language client.

    Args:
        text_content: Review text; it is sent as an English plain-text document.

    Returns:
        The response of client.classify_text; callers read its .categories.
    """
    document = {
        "content": text_content,
        "type_": language_v1.Document.Type.PLAIN_TEXT,
        "language": "en",
    }
    return client.classify_text(request = {'document': document})


positive_review_category_array = []
pos_time_start = time.time()
# Fetch the column values once instead of re-reading the column for every row.
positive_reviews = df['Positive_Review'].values
# Never index past the end of the frame when it holds fewer than df_subset rows.
for i in range(min(df_subset, len(positive_reviews))):
    text_content = positive_reviews[i]
    # Missing reviews count as zero words; only reviews with MORE than 20
    # whitespace-separated words are sent to the API.
    word_count = 0 if pd.isna(text_content) else len(text_content.split())
    if word_count > 20:
        response = sample_classify_text(text_content)
        for category in response.categories:
            positive_review_category_array.append(str(category.name))
    # progress line every 10th row: rows done, percentage and elapsed seconds
    if i%10 == 0:
        print("positive precessed:", i, "/", df_subset, "=", i*100/df_subset,"%", round(time.time() - pos_time_start), "seconds")

In [None]:
# Splits every collected category name on the '/' delimiter and drops the blank
# fragments in the same pass.
positive_review_category_array_clean = [
    word
    for line in positive_review_category_array
    for word in line.split('/')
    if word != ''
]
# One row per unique fragment: column 'index' holds the name, column 0 its count.
# (Counter works on the list directly; no numpy round-trip is needed.)
df_pos = pd.DataFrame.from_dict(collections.Counter(positive_review_category_array_clean), orient='index').reset_index()
# 20 most frequent categories, sorted once and reused for the chart (the earlier
# stand-alone sort_values(...).head(20) statement discarded its result).
df_pos_top20 = df_pos.sort_values(by=0,ascending = False).head(20)
# bar chart representation; note that df_pos_chart receives the return value of
# set_title(), not the barplot Axes
df_pos_chart = sns.barplot(data = df_pos_top20, y = 'index', x = 0).set_title('Top 20 categories of positive reviews')

In [None]:
# Classifies the leading df_subset negative reviews and collects every category
# name returned by the Natural Language client in negative_review_category_array.

def sample_classify_text(text_content):
    """Classify one review text with the module-level Natural Language client.

    The document is built from the text_content argument (the earlier version
    ignored its parameter and re-read df['Negative_Review'] through the closure).

    Args:
        text_content: Review text; it is sent as an English plain-text document.

    Returns:
        The response of client.classify_text; callers read its .categories.
    """
    document = {
        "content": text_content,
        "type_": language_v1.Document.Type.PLAIN_TEXT,
        "language": "en",
    }
    return client.classify_text(request = {'document': document})


negative_review_category_array = []
neg_time_start = time.time()
# Fetch the column values once instead of re-reading the column for every row.
negative_reviews = df['Negative_Review'].values
# Never index past the end of the frame when it holds fewer than df_subset rows.
for i in range(min(df_subset, len(negative_reviews))):
    text_content = negative_reviews[i]
    # Missing reviews count as zero words; only reviews with MORE than 20
    # whitespace-separated words are sent to the API.
    word_count = 0 if pd.isna(text_content) else len(text_content.split())
    if word_count > 20:
        response = sample_classify_text(text_content)
        for category in response.categories:
            negative_review_category_array.append(str(category.name))
    # progress line every 10th row: rows done, percentage and elapsed seconds
    if i%10 == 0:
        print("negative precessed:", i, "/", df_subset, "=", i*100/df_subset,"%", round(time.time() - neg_time_start), "seconds")

In [None]:
# Splits every collected category name on the '/' delimiter and drops the blank
# fragments in the same pass.
negative_review_category_array_clean = [
    word
    for line in negative_review_category_array
    for word in line.split('/')
    if word != ''
]
# One row per unique fragment: column 'index' holds the name, column 0 its count.
# (Counter works on the list directly; no numpy round-trip is needed.)
df_neg = pd.DataFrame.from_dict(collections.Counter(negative_review_category_array_clean), orient='index').reset_index()
# 20 most frequent categories, sorted once and reused for the chart (the earlier
# stand-alone sort_values(...).head(20) statement discarded its result).
df_neg_top20 = df_neg.sort_values(by=0,ascending = False).head(20)
# bar chart representation; note that df_neg_chart receives the return value of
# set_title(), not the barplot Axes
df_neg_chart = sns.barplot(data = df_neg_top20, y = 'index', x = 0).set_title('Top 20 categories of negative reviews')

In [None]:
# second part
import pandas as pd
import numpy as np
import seaborn as sns
import spacy
from collections import Counter
from spacy.matcher import Matcher

# spaCy pipeline "en_core_web_md", loaded once and reused by the cells below
nlp = spacy.load("en_core_web_md")

In [None]:
import os

# Location of the reviews CSV. Set the HOTEL_REVIEWS_CSV environment variable to
# run the notebook on another machine; the original author's local path remains
# the default so existing behaviour is unchanged.
soubor = pd.read_csv(os.environ.get('HOTEL_REVIEWS_CSV', 'C:/Users/zbyne/Downloads/hotel_reviews.csv'))
# Row count noted by the original author: 515738. (The bare len(soubor)
# expression that stood here displayed nothing, as it was not the cell's last
# expression.)
soubor.head(5)
# Vocabulary note kept from the original: latitude and longitude (geographic coordinates).

In [None]:
# summary of the loaded frame as reported by DataFrame.info()
soubor.info()

In [None]:
# Are any values missing? Share of missing values per column.
# Yes, but those columns will not be used anyway, so this is not handled.
soubor.isna().mean()

In [None]:
# number of hotels in the file (distinct Hotel_Address values, a missing value
# counted as one of them)
soubor['Hotel_Address'].nunique(dropna=False)

In [None]:
# basic overview: per-hotel mean of every numeric column
# numeric_only=True is passed explicitly because newer pandas versions no longer
# skip text columns silently and raise a TypeError when asked to average them.
soubor.groupby(['Hotel_Name']).mean(numeric_only=True)

In [None]:
# distribution of the review scores; not a very pretty one
# NOTE(review): this rebinds the name df (the cleaned frame in the first part of
# the notebook) to the Reviewer_Score column of soubor; none of the later cells
# visible here read df again - confirm whether the assignment is still needed.
df = soubor['Reviewer_Score']
sns.countplot(x="Reviewer_Score",data=soubor,palette="RdYlGn")

In [None]:
# attempt to split the reviews by reviewer score: bin (0, 6] is labelled
# "Negative" and bin (6, 10] "Positive" (right=True makes each bin include its
# upper edge)
soubor["P/N"] = pd.cut(soubor["Reviewer_Score"], 
                   bins=[0,6,10], 
                   labels=["Negative","Positive"],right=True)
# count of reviews in each of the two classes
sns.countplot(x="P/N",data=soubor,palette="RdYlGn")

In [None]:
# number of non-missing Reviewer_Nationality values in each P/N class
subset = soubor[['Reviewer_Nationality','P/N']]
subset.groupby("P/N").count()

In [None]:
# hotel address column; the following cell assigns the same value to stat again
stat = soubor['Hotel_Address']

In [None]:
# Extracts the country from the hotel addresses and adds it to the frame as the
# 'stat' column.
stat = soubor['Hotel_Address']


def _country_from_address(address):
    """Return the last spaCy token of an address as the country name.

    A last token of "Kingdom" is expanded to 'United Kingdom'.
    """
    last_token = nlp(address)[-1].text
    return 'United Kingdom' if last_token == "Kingdom" else last_token


# The spaCy pipeline is by far the most expensive step here, so it runs once per
# distinct address instead of once per row; every row then looks its result up.
country_by_address = {address: _country_from_address(address) for address in stat.unique()}
staty = [country_by_address[address] for address in stat]

soubor['stat'] = staty
print('Transformation was successful.')

In [None]:
# check: distinct values of the extracted 'stat' column
pd.unique(soubor['stat'])

In [None]:
# Most frequent nouns in the negative reviews of a sample of the file (the first
# 9000 rows); drawing the sample with sample(n=...) would be the more sensible choice.
vzorek = soubor[0:9000]

vzorek_negative_review_list = list(vzorek['Negative_Review'])
# Each review is parsed as its own document through nlp.pipe. Joining all reviews
# into one huge string ties the sample size to spaCy's nlp.max_length limit and
# lets neighbouring reviews influence each other's tagging.
nouns = [
    token.lemma_
    for review_doc in nlp.pipe(map(str, vzorek_negative_review_list))
    for token in review_doc
    # keep lemmas of nouns that are neither stop words nor punctuation
    if not token.is_stop and not token.is_punct and token.pos_ == 'NOUN'
]
word_freq = Counter(nouns)
word_freq.most_common(10)

In [None]:
# Most frequent verbs in the negative reviews of a sample made of the first 9000
# rows of the file.
vzorek = soubor[0:9000]

vzorek_negative_review_list = list(vzorek['Negative_Review'])
# Each review is parsed as its own document through nlp.pipe. Joining all reviews
# into one huge string ties the sample size to spaCy's nlp.max_length limit and
# lets neighbouring reviews influence each other's tagging.
verbs = [
    token.lemma_
    for review_doc in nlp.pipe(map(str, vzorek_negative_review_list))
    for token in review_doc
    # keep lemmas of verbs that are neither stop words nor punctuation
    if not token.is_stop and not token.is_punct and token.pos_ == 'VERB'
]
word_freq = Counter(verbs)
word_freq.most_common(10)

In [None]:
# Adjectives attached to the most frequent noun "room" (taken from the earlier
# noun-frequency cell) in the negative reviews of the whole file.

# an adjective token immediately followed by the exact text 'room'
pattern = [{'POS': 'ADJ'},{'TEXT': 'room'}]
list1 = []
matcher = Matcher(nlp.vocab)
# Patterns are passed as a list in the second argument: this signature is
# accepted by spaCy >= 2.2.2 and is the only one spaCy v3 supports (the older
# add(key, None, pattern) form fails there).
matcher.add('ADJ_NOUN_PATTERN', [pattern])

# Column position 6 is presumably Negative_Review - TODO confirm against the CSV
# header. nlp.pipe processes the texts in batches, one document per review.
for doc in nlp.pipe(soubor.iloc[:, 6]):
    for match_id, start, end in matcher(doc):
        list1.append(doc[start:end].text)


In [None]:
# helper returning the most frequent items of a list (top 5 by default)
from collections import Counter
def most_frequent(List, n=5):
    """Return the n most common items of List, compared case-insensitively.

    Args:
        List: Iterable of strings; every item is lower-cased before counting.
        n: How many of the most frequent items to return (default 5).

    Returns:
        A list of (lower-cased item, count) tuples ordered from the most to the
        least frequent; it is shorter than n when fewer distinct items exist.
    """
    occurence_count = Counter(item.lower() for item in List)
    return occurence_count.most_common(n)


In [None]:
# adjectives attached to the most frequent noun "room" (taken from the earlier
# noun-frequency cell) in the negative reviews of the file - the result
print(most_frequent(list1))

In [None]:
# Adjectives attached to the frequent noun "breakfast" in the negative reviews of
# the whole file.

# an adjective token immediately followed by the exact text 'breakfast'
pattern = [{'POS': 'ADJ'},{'TEXT': 'breakfast'}]
list_breakfast = []
matcher = Matcher(nlp.vocab)
# Patterns are passed as a list in the second argument: this signature is
# accepted by spaCy >= 2.2.2 and is the only one spaCy v3 supports (the older
# add(key, None, pattern) form fails there).
matcher.add('ADJ_NOUN_PATTERN', [pattern])

# Column position 6 is presumably Negative_Review - TODO confirm against the CSV
# header. nlp.pipe processes the texts in batches, one document per review.
for doc in nlp.pipe(soubor.iloc[:, 6]):
    for match_id, start, end in matcher(doc):
        list_breakfast.append(doc[start:end].text)

print(most_frequent(list_breakfast))

In [None]:
# Relative number of reviews per country, i.e. what share of all reviews relates
# to hotels in each country.
import seaborn as sns

# Share of rows per country, most frequent first.
country_shares = soubor['stat'].value_counts() / len(soubor)
# The labels are taken from the same value_counts() result as the shares, so a
# label can never be paired with another country's share (the previous
# hand-written country list relied on matching the frequency order by position).
rel_ln = list(country_shares.index)
rel_l = list(country_shares)
temp = pd.DataFrame({'stát': rel_ln, 'rel_pocty': rel_l})
sns.barplot(y='rel_pocty',x="stát",data=temp,palette="rocket")

In [None]:
# the 3 most frequent reviewer nationalities
# (order= restricts the plot to the first three entries of value_counts())
sns.countplot(x='Reviewer_Nationality',data=soubor,palette="rocket",order=soubor.Reviewer_Nationality.value_counts().iloc[:3].index)

In [None]:
# absolute counts of the reviewer nationalities
soubor['Reviewer_Nationality'].value_counts()

In [None]:
# Contingency table: number of reviews per selected reviewer nationality (rows)
# and hotel country (columns), with the 'All' margins added.
# The nationality literals carry a leading and a trailing space on purpose; they
# are compared to the column values exactly as written.
selected_nationalities = [
    ' United Kingdom ',
    ' United States of America ',
    ' Australia ',
    ' Ireland ',
    ' United Arab Emirates ',
    ' Slovakia ',
    ' Czech Republic ',
]
mask = soubor['Reviewer_Nationality'].isin(selected_nationalities)
table = pd.pivot_table(
    soubor[mask],
    values='Hotel_Address',
    index='Reviewer_Nationality',
    columns='stat',
    aggfunc='count',
    margins=True,
)
table