In [None]:
import pandas as pd
import numpy as np

# Visualization
import matplotlib.pylab as plt
import seaborn as sns
from wordcloud import WordCloud

# Warnings
import warnings
warnings.filterwarnings("ignore")

In [None]:
import nltk

# Fetch only the NLTK resources this notebook asks for. A bare
# nltk.download() opens the interactive downloader, which blocks
# "Restart Kernel -> Run All" and headless runs while it waits for input.
for _resource in ("punkt", "stopwords", "wordnet"):
    nltk.download(_resource, quiet=True)

In [None]:
# Load the hotel reviews workbook into a DataFrame.
# Later cells read its 'Review' (text) and 'Rating' columns.
hotel=pd.read_excel('hotel_reviews.xlsx')

In [None]:
# Display the frame
hotel

In [None]:
# First rows of the frame
hotel.head()

In [None]:
# Column dtypes and non-null counts
hotel.info()

In [None]:
# Summary statistics of the columns
hotel.describe()

In [None]:
# Number of missing values per column
hotel.isnull().sum()

In [None]:
# Number of distinct values per column
hotel.nunique()

In [None]:
# How many rows are duplicates of an earlier row (True) versus not (False)
hotel.duplicated().value_counts()


In [None]:
# Row index of the frame
hotel.index

In [None]:
# Number of reviews for each rating value
hotel["Rating"].value_counts()

# Exploratory Data Analysis (EDA)

In [None]:

# Bar chart of how many reviews received each rating, with the count
# written in the middle of every bar.
_, ax = plt.subplots(figsize=(8, 6))
sns.countplot(
    data=hotel,
    x='Rating',
    edgecolor='black',
    linewidth=1,
    palette="Set1",
    ax=ax,
)
ax.set_title('Count of Ratings')
for bars in ax.containers:
    ax.bar_label(bars, label_type='center')
plt.show()

In [None]:
# Most of the ratings are 4 or 5.

In [None]:
# Share of reviews per rating.
# value_counts() orders its result by frequency, not by rating value, so the
# wedge labels are built from the index of the counts instead of a
# hard-coded 5..1 list that could label a wedge with the wrong rating.
# Sorting by rating (highest first) keeps the wedge order stable.
rating_counts = hotel['Rating'].value_counts().sort_index(ascending=False)
plt.figure(figsize=(15,7))
plt.pie(rating_counts, labels=[f'Rating_{rating:g}' for rating in rating_counts.index], autopct='%.0f%%');

In [None]:
# Histogram of the ratings over the whole dataset, with a KDE curve on top.
_, rating_ax = plt.subplots(figsize=(10, 6))
sns.histplot(hotel['Rating'], bins=20, kde=True, ax=rating_ax)
rating_ax.set(
    title='Rating Distribution Across Dataset',
    xlabel='User Rating',
    ylabel='Frequency',
)
plt.show()

In [None]:
# Turns the rating into a two-class sentiment problem
def map_sentiment(rating):
    """Collapse a rating into a two-class sentiment label.

    Parameters
    ----------
    rating : number
        Rating of a single review.

    Returns
    -------
    str
        'Negative' when ``rating <= 2``, 'Positive' for anything else.
    """
    return 'Negative' if rating <= 2 else 'Positive'

In [None]:
# Label every review 'Positive' or 'Negative' from its rating
# (the labels are the strings returned by map_sentiment).
hotel['Sentiment'] = hotel['Rating'].map(map_sentiment)
hotel.head()

In [None]:
# Number of reviews per sentiment class ('Positive' / 'Negative').
# The column is named through data=/x= because seaborn's first positional
# parameter is `data`, not the variable to count; this also matches the
# rating countplot earlier in the notebook.
sns.countplot(data=hotel, x='Sentiment').set(title="Count of Reviews by Sentiment");

In [None]:
# Word count of every review. .str.len() on the token lists leaves a missing
# review as NaN, whereas apply(len) raises a TypeError on it.
hotel["Length"] = hotel["Review"].str.split().str.len()

# One full-width axes: add_subplot(122) drew only in the right half of the
# figure and left the other half empty.
fig, ax1 = plt.subplots(figsize=(14,7))
five_star_lengths = hotel.loc[hotel['Rating']==5, 'Length']
sns.histplot(five_star_lengths, ax=ax1, color='green')
fig.suptitle('Distribution of text length for 5 Star Rating', fontsize=16)
display(five_star_lengths.describe())

In [None]:
# Distribution of review length (the "Length" word-count column) for
# reviews rated 1. One full-width axes replaces add_subplot(122), which drew
# only in the right half of the figure and left the other half empty.
fig2, ax2 = plt.subplots(figsize=(14,8))
one_star_lengths = hotel.loc[hotel["Rating"]==1, "Length"]
sns.histplot(one_star_lengths, ax=ax2, color='red')
fig2.suptitle("Distribution of text length for 1 Star Rating",fontsize=16)
display(one_star_lengths.describe())

# Visualization



# Most Used Words

In [None]:
import plotly.express as px

In [None]:
# Treemap of the 200 most frequent whitespace-separated tokens in the raw
# review text.
# explode() yields one token per row without building the wide
# (reviews x longest-review) frame that split(expand=True).stack() creates.
mostUsedWords = hotel['Review'].str.split().explode().value_counts()
mostUsedWords_top200 = mostUsedWords[:200]

# Explicit, named columns: the treemap then does not depend on how pandas
# names the value_counts() result, which is what `values=0` relied on.
top200_frame = pd.DataFrame({
    'Word': mostUsedWords_top200.index,
    'Frequency': mostUsedWords_top200.to_numpy(),
})

fig = px.treemap(top200_frame, path=['Word'], values='Frequency')
fig.update_layout(title_text='The 200 Most Frequent Words',
                  title_x=0.5, title_font=dict(size=20)
                  )
fig.update_traces(textinfo="label+value")
fig.show()

In [None]:
# Treemap of the 200 most frequent tokens in the cleaned review text.
# The 'Cleaned_Review' column is only added to `hotel` by the text-cleaning
# cells further down the notebook, so on a fresh top-to-bottom run it does
# not exist yet at this point. Report that clearly instead of stopping the
# run with a KeyError.
if 'Cleaned_Review' not in hotel.columns:
    print("Column 'Cleaned_Review' does not exist yet - run the text-cleaning cells first, then re-run this cell.")
else:
    # explode() yields one token per row without building a wide
    # intermediate frame.
    mostUsedWords = hotel['Cleaned_Review'].str.split().explode().value_counts()
    mostUsedWords_top200 = mostUsedWords[:200]

    # Explicit, named columns: the treemap then does not depend on how
    # pandas names the value_counts() result, which `values=0` relied on.
    top200_frame = pd.DataFrame({
        'Word': mostUsedWords_top200.index,
        'Frequency': mostUsedWords_top200.to_numpy(),
    })

    fig = px.treemap(top200_frame, path=['Word'], values='Frequency')
    fig.update_layout(title_text='The 200 Most Frequent Words',
                      title_x=0.5, title_font=dict(size=20)
                      )
    fig.update_traces(textinfo="label+value")
    fig.show()

# Word Cloud

In [None]:
from wordcloud import WordCloud
import os

In [None]:
pip install --upgrade pyodbc

In [None]:

pip install --upgrade pip

In [None]:
pip install --upgrade pillow


# **WordCloud for Negative Rating**:

In [None]:
# Word cloud built from the raw text of every review rated below 3.
negative_text = " ".join(hotel.loc[hotel["Rating"] < 3, "Review"])
Neg_wc = WordCloud(
    width=1600,
    height=800,
    max_words=2000,
    min_font_size=10,
    background_color="black",
).generate(negative_text)
plt.figure(figsize=(20,20))
plt.imshow(Neg_wc)

# **WordCloud for Neutral Rating**

In [None]:
# Word cloud built from the raw text of every review rated exactly 3.
neutral_text = " ".join(hotel.loc[hotel["Rating"] == 3, "Review"])
Neutral_wc = WordCloud(
    width=1600,
    height=800,
    max_words=2000,
    min_font_size=10,
    background_color="skyblue",
).generate(neutral_text)
plt.figure(figsize=(20,20))
plt.imshow(Neutral_wc)

# **WordCloud for Positive Rating**

In [None]:
# Word cloud built from the raw text of every review rated above 3.
positive_text = " ".join(hotel.loc[hotel["Rating"] > 3, "Review"])
Pos_wc = WordCloud(
    width=1600,
    height=800,
    max_words=2000,
    min_font_size=10,
    background_color="white",
).generate(positive_text)
plt.figure(figsize=(20,20))
plt.imshow(Pos_wc)

In [None]:
pip install --upgrade numpy==1.22.0

In [None]:
pip install --upgrade nltk

In [None]:
# Download the NLTK 'punkt' resource.
# NOTE(review): no visible cell calls an NLTK tokenizer - confirm this download is needed.
nltk.download('punkt')

In [None]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Lemmatizer

In [None]:
# Download the NLTK resources named 'wordnet' and 'stopwords'; the notebook
# goes on to use WordNetLemmatizer and stopwords.words('english').
import nltk
nltk.download('wordnet')
nltk.download('stopwords')

In [None]:
# Lemmatization
# NOTE(review): `wordnet` is created here but no visible cell calls it; the
# cleaning loop below does not lemmatize - confirm whether that was intended.
wordnet = WordNetLemmatizer()

# NLTK's English stop-word collection
stop_words = stopwords.words('english')


In [None]:
# Negations and intensifiers that must NOT be treated as stop words: any of
# them that appears in `stop_words` is left out of `stop_words_`.
not_stopwords = (
    "aren", "aren't", "couldn", "couldn't", "didn", "didn't", "doesn",
    "doesn't", "don", "don't", "hadn", "hadn't", "hasn", "hasn't", "haven",
    "haven't", "isn", "isn't", "mustn", "mustn't", "no", "not", "only",
    "shouldn", "shouldn't", "should've", "wasn", "wasn't", "weren",
    "weren't", "will", "wouldn", "wouldn't", "won't", "very",
)
stop_words_ = [candidate for candidate in stop_words if candidate not in not_stopwords]

# One additional stop word on top of the filtered list
stop_words_.append("n")
print(stop_words_)

In [None]:
# Build `corpus`: one cleaned string per review, in row order.
# NOTE(review): no lemmatization is applied here although a
# WordNetLemmatizer is created earlier in the notebook - confirm intent.

# Compiled once instead of being looked up again for every review.
_non_alnum = re.compile('[^a-zA-Z0-9*]')
# Set membership is a constant-time lookup; the list would be scanned once
# for every word of every review.
_stop_word_set = set(stop_words_)

corpus = []
# Iterate over the values instead of hotel['Review'][i]: looking rows up by
# the labels 0..len-1 only works while the frame has its default index.
for raw_review in hotel['Review']:

    # Removal of punctuation: anything but letters, digits and '*' becomes a space
    review = _non_alnum.sub(' ', raw_review)

    # Lower-case the text and split it into words - eg ['i','was','happy']
    review = review.lower().split()

    # Removal of stop words
    review = [word for word in review if word not in _stop_word_set]

    # Joining the words back into one string
    review = ' '.join(review)
    corpus.append(review)


In [None]:
hotel['Cleaned_Review'] = corpus

In [None]:
# remove both the leading and the trailing characters
review = [x.strip() for x in hotel.Cleaned_Review]

# removes empty strings, because they are considered in Python as False
hotel['Cleaned_Review'] = [x for x in review if x]

In [None]:
hotel

In [None]:
# Flatten all cleaned reviews into a single list of words, then show how
# many words there are in total (repeats included).
clean_review_words = " ".join(hotel['Cleaned_Review']).split()
len(clean_review_words)

In [None]:
# Frequency table of every word in the cleaned reviews
all_words = len(clean_review_words)
print('Total words in clean review is {}'.format(all_words))

word_dist = nltk.FreqDist(clean_review_words)

# One row per distinct word, most frequent first
word_frequency_pairs = word_dist.most_common(all_words)
freq_data_all = pd.DataFrame(word_frequency_pairs, columns=['Word', 'Frequency'])

# Each word's share of all word occurrences, in percent
total_occurrences = freq_data_all['Frequency'].sum()
freq_data_all['percentFreq_all'] = (freq_data_all['Frequency'] / total_occurrences) * 100

print('No of unique words {}'.format(len(freq_data_all)))
print(freq_data_all.head(10))

In [None]:
# How many distinct words occur exactly once (True) versus not (False)
freq_data_all['Frequency'].eq(1).value_counts()

# Term Frequency with Count Vectorizer

In [None]:
# Stop words for keyword extraction: NLTK's English list followed by extra
# words that should be ignored in the n-gram counts below.
stop_words_keywords = stopwords.words('english') + [
    "will", "always", "go", "one", "very", "good", "only", "mr", "lot", "two",
    "th", "etc", "don", "due", "didn", "since", "nt", "ms", "ok", "almost",
    "put", "pm", "hyatt", "grand", "till", "add", "let", "hotel", "able",
    "per", "st", "couldn", "yet", "par", "hi", "well", "would", "I", "the",
    "s", "also", "great", "get", "like", "take", "thank",
]

# unigram

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# find the most frequent words in the data, extracting information about its content and topics.
stops = set(stopwords.words('english') + ['com'])
co = CountVectorizer(stop_words=stop_words_keywords)
counts = co.fit_transform(corpus)
most_freq_terms = pd.DataFrame(counts.sum(axis=0), columns=co.get_feature_names_out()).T.sort_values(0, ascending=False).head(25)


In [None]:
most_freq_terms.plot(kind='bar', title='Unigram Frequency', figsize=(15, 10))

# BI-Gram

In [None]:
# Now we can check for frequent bi-grams:
co = CountVectorizer(ngram_range=(2, 2), stop_words=stop_words_keywords)
counts1 = co.fit_transform(corpus)
bi_grams = pd.DataFrame(counts1.sum(axis=0), columns=co.get_feature_names_out()).T.sort_values(0, ascending=False).head(25)


In [None]:
bi_grams.plot(kind='bar', title='bi-grams', figsize=(13, 10))

# Tri-Grams

In [None]:
## Now we can check for frequent tri-grams:
co = CountVectorizer(ngram_range=(3, 3), stop_words=stop_words_keywords)
counts1 = co.fit_transform(corpus)
tri_grams = pd.DataFrame(counts1.sum(axis=0), columns=co.get_feature_names_out()).T.sort_values(0, ascending=False).head(25)


In [None]:
tri_grams.plot(kind='bar', title='tri-grams', figsize=(13, 10))

# Term Frequency with TFIDF

# unigram

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
tfidf2 = TfidfVectorizer(norm="l2", analyzer='word', stop_words=stop_words_keywords, ngram_range=(1, 1))
tfidf2_x = tfidf2.fit_transform(corpus)
most_freq_terms = pd.DataFrame(tfidf2_x.sum(axis=0), columns=tfidf2.get_feature_names_out()).T.sort_values(0, ascending=False).head(25)


In [None]:

most_freq_terms.plot(kind='bar',title='most frequent terms & their frequency',figsize=(15, 10))

# BI-Grams

In [None]:
# Now we can check for frequent bi-grams:

tfidf2 = TfidfVectorizer(norm="l2",analyzer='word',stop_words=stop_words_keywords,ngram_range=(2, 2))
tfidf2_x = tfidf2.fit_transform(corpus)
most_freq_terms = pd.DataFrame(tfidf2_x.sum(axis=0),columns=tfidf2.get_feature_names_out()).T.sort_values(0,ascending=False).head(25)

In [None]:
plt.rcParams.update({'font.size': 22})
most_freq_terms.plot(kind='bar', title='Bi-Gram', figsize=(15, 10))

# Tri-Grams

In [None]:
# Now we can check for frequent tri-grams:

tfidf2 = TfidfVectorizer(norm="l2",analyzer='word',stop_words=stop_words_keywords,ngram_range=(3, 3))
tfidf2_x = tfidf2.fit_transform(corpus)
most_freq_terms = pd.DataFrame(tfidf2_x.sum(axis=0),columns=tfidf2.get_feature_names_out()).T.sort_values(0,ascending=False).head(25)


In [None]:
plt.rcParams.update({'font.size': 22})
most_freq_terms.plot(kind='bar', title='Tri-Gram', figsize=(15, 10))