In [None]:
import pandas as pd
import numpy as np
import datetime
from datetime import date, timedelta
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Shorthand for multi-index slicing; not referenced in the cells visible here.
idx=pd.IndexSlice
# Silences every warning for the session, including library deprecation notices.
warnings.filterwarnings("ignore")
# Show all columns whenever a DataFrame is displayed.
pd.set_option('display.max_columns', None)
# Apply the 'ggplot' style to all matplotlib figures.
plt.style.use('ggplot')

In [None]:
# Load the reviews dataset from the working directory (relative path);
# later cells read its 'Review' and 'Rating' columns.
data = pd.read_csv('tripadvisor_hotel_reviews.csv')

In [None]:
# Preview the first rows of the freshly loaded frame.
data.head()

In [None]:
# Column dtypes and non-null counts.
data.info()

In [None]:
# Number of missing values per column.
data.isna().sum()

In [None]:
# Number of rows that duplicate an earlier row across all columns.
data.duplicated().sum()

In [None]:
# Number of rows whose review text repeats an earlier review, regardless of the other columns.
data.duplicated(subset='Review').sum()

In [None]:
# Collect the positional indices of reviews that carry no usable text.
# A review counts as blank when it is missing (a NaN float has no .isspace(),
# so the plain string check would raise), empty, or whitespace-only.
blanks = [
    position
    for position, review in enumerate(data['Review'])
    if not isinstance(review, str) or not review.strip()
]

blanks

In [None]:
# Part of speech tagging and analysis 

In [None]:
# Text of the first review, used as the running example for POS tagging.
data['Review'].iloc[0]

In [None]:
import spacy
# Load the 'en_core_web_sm' spaCy pipeline used for the tagging cells below.
nlp = spacy.load('en_core_web_sm')

In [None]:
# Run the spaCy pipeline on the first review.
doc = nlp(data['Review'].iloc[0])

In [None]:
# One line per token: text (padded to 13), coarse POS (10), fine-grained tag (10),
# then spaCy's plain-English explanation of the fine-grained tag.
for t in doc:
    print(f"{t.text:{13}}{t.pos_:{10}}{t.tag_:{10}}{spacy.explain(t.tag_)}") 

In [None]:
# Frequency of each coarse POS attribute in the document; the keys are spaCy IDs
# (a later cell resolves them to names through doc.vocab).
POS_count = doc.count_by(spacy.attrs.POS)

In [None]:
# Show the raw id -> count mapping.
POS_count

In [None]:
# Print each POS id, its text looked up in the vocab, and its count.
for k, v in POS_count.items():
    print(f"{k}. {doc.vocab[k].text} {v}")

In [None]:
from spacy import displacy

In [None]:
# Rendering options passed to displaCy for the dependency plot.
# 'compact' is a real boolean: the string 'True' only worked because any
# non-empty string is truthy (the string 'False' would have enabled it too).
options = {
    'distance': 110,
    'compact': True,
    'color': 'yellow',
    'bg': '#09a3d5',
    'font': 'Times',
}

In [None]:
# Draw the dependency parse of the sample review using the `options` dict.
displacy.render(doc, style='dep', options=options)

In [None]:
# Labeling each review with NLTK's VADER sentiment analyzer

In [None]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [None]:
# VADER sentiment scorer shared by the labeling cells.
# NOTE(review): presumably needs the NLTK 'vader_lexicon' resource downloaded beforehand — confirm.
sid = SentimentIntensityAnalyzer()

In [None]:
# Score every review with VADER; each cell of 'Scores' holds the mapping
# returned by polarity_scores.
data['Scores'] = data['Review'].apply(sid.polarity_scores)

In [None]:
# Pull the 'compound' value out of each score mapping, in row order.
data['Compound'] = [scores['compound'] for scores in data['Scores']]

In [None]:
# Binarise the compound score: strictly positive -> 'POS',
# everything else (including exactly 0) -> 'NEG'.
data['Label'] = np.where(data['Compound'] > 0, 'POS', 'NEG')

In [None]:
# Preview the frame with the new 'Scores', 'Compound' and 'Label' columns.
data.head()

In [None]:
#EDA 

In [None]:
# Drop the raw score mappings now that 'Compound' and 'Label' are extracted.
# Rebinding with errors='ignore' (instead of inplace=True) keeps the cell safe
# to re-run: a second run no longer raises KeyError for the missing column.
data = data.drop(columns='Scores', errors='ignore')

In [None]:
# Confirm the 'Scores' column is gone.
data.head()

In [None]:
# Distribution of compound scores over all reviews.
sns.histplot(data=data, x='Compound', bins='auto')

In [None]:
# Overlay the compound-score histograms of the two labels:
# 'POS' in the default colour, 'NEG' in red.
for label, colour in (('POS', None), ('NEG', 'red')):
    sns.histplot(data.loc[data['Label'] == label, 'Compound'], color=colour, bins=20)
plt.legend(['POS', 'NEG'])

In [None]:
# Number of reviews per sentiment label.
data['Label'].value_counts()

In [None]:
# Pie chart of the label distribution.
# The columns are named explicitly because the names produced by a bare
# value_counts().reset_index() differ between pandas versions
# ('index'/'Label' vs 'Label'/'count').
df = data['Label'].value_counts().rename_axis('Label').reset_index(name='Count')

fig, ax = plt.subplots()
ax.pie(
    x=df['Count'],
    startangle=90,
    labels=df['Label'],
    autopct="%.0f%%",
    # Pull out only the largest slice; sized to however many labels exist.
    explode=[0.1 if position == 0 else 0 for position in range(len(df))],
)

ax.set_title('Label Distribution in Data', fontsize=12);

In [None]:
#Avg Rating for each label

In [None]:
# Mean star rating within each sentiment label.
data.groupby('Label')['Rating'].mean()

In [None]:
# Mean star rating per sentiment label, with the overall mean as a dashed reference line.
df = data.groupby('Label')['Rating'].mean().reset_index()
sns.barplot(data=df, x='Label', y='Rating')
# Label the reference line itself so the legend entry is tied to it rather than
# to whichever artist happens to come first on the axes.
plt.axhline(data['Rating'].mean(), ls='--', color='black', alpha=0.3, label='Avg Rating')
plt.legend()
plt.title('Avg Rating for each Label', fontsize=12);

In [None]:
# Avg Score for each label

In [None]:
# Mean compound score within each sentiment label.
data.groupby('Label')['Compound'].mean()

In [None]:
# Bar chart of the mean compound score for each label.
df = data.groupby('Label', as_index=False)['Compound'].mean()
sns.barplot(x='Label', y='Compound', data=df)
plt.title('Avg Compound Score for each Label', fontsize=12)

In [None]:
# Avg Compound Score for each Rating

In [None]:
# Mean compound score for each star rating.
data.groupby('Rating')['Compound'].mean()

In [None]:
# Mean compound score per star rating, with the overall mean as a dashed reference line.
df = data.groupby('Rating')['Compound'].mean().reset_index()
sns.barplot(data=df, x='Rating', y='Compound')
# Label the reference line itself so the legend entry is tied to it rather than
# to whichever artist happens to come first on the axes.
plt.axhline(data['Compound'].mean(), ls='--', color='black', alpha=0.3, label='Avg Compound Score')
plt.legend()
plt.title('Avg Compound Score for each Rating', fontsize=12);

In [None]:
# Same per-rating means as a line chart, one marker per rating.
df = data.groupby('Rating', as_index=False)['Compound'].mean()
sns.lineplot(x='Rating', y='Compound', data=df, marker='o')
plt.title('Avg Compound Score for each Rating', fontsize=12);

In [None]:
# Quick look at the frame before the rating-by-label breakdown.
data.head()

In [None]:
# Mean compound score for every (rating, label) pair, ratings as rows and labels as columns.
data.groupby(['Rating', 'Label'])['Compound'].mean().unstack()

In [None]:
# Grouped bar chart of the same table: one bar group per rating, one bar per label.
data.groupby(['Rating', 'Label'])['Compound'].mean().unstack().plot(kind='bar', rot=0)

In [None]:
# The sentiment analyzer does not always agree with the star rating:
# these are the 5-star reviews that were nevertheless labeled 'NEG'.
negative_label = data['Label'].eq('NEG')
five_stars = data['Rating'].eq(5)
data[negative_label & five_stars]

In [None]:
# Read one review in full by hard-coded row position.
# NOTE(review): presumably one of the 5-star/'NEG' rows surfaced by the filter above — confirm.
data.iloc[20170]['Review']

In [None]:
#Topic Modeling

In [None]:
# Frame as it enters the topic-modeling section.
data.head()

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF vectorizer for topic modeling: drops English stop words, terms that
# appear in more than 95% of documents, and terms that appear in fewer than 2.
tfidf = TfidfVectorizer(max_df=0.95, min_df=2, stop_words='english')

In [None]:
# Learn the vocabulary from every review and build the document-term matrix.
dtm = tfidf.fit_transform(data['Review'])

In [None]:
from sklearn.decomposition import NMF

# Factorize into 10 topics; the fixed random_state makes runs repeatable.
nmf_model = NMF(n_components=10, random_state=42)

In [None]:
# Fit the topic model on the TF-IDF document-term matrix.
nmf_model.fit(dtm)

In [None]:
# Vocabulary size. get_feature_names() was removed in scikit-learn 1.2;
# get_feature_names_out() is its replacement.
len(tfidf.get_feature_names_out())

In [None]:
import random

# Peek at one randomly chosen vocabulary term. The upper bound comes from the
# fitted vocabulary itself instead of a hard-coded size, so the index is always
# valid; get_feature_names_out() replaces the removed get_feature_names().
feature_names = tfidf.get_feature_names_out()
random_id = random.randrange(len(feature_names))

feature_names[random_id]

In [None]:
# Fitted component matrix; later cells treat each row as one topic's term weights.
nmf_model.components_

In [None]:
# Column indices of the 15 highest-weight terms of the first topic (ascending by weight).
nmf_model.components_[0].argsort()[-15:]

In [None]:
# Print the 15 highest-weight words of every topic.
# The vocabulary is fetched once, outside the loop, through
# get_feature_names_out() (get_feature_names() was removed in scikit-learn 1.2).
feature_names = tfidf.get_feature_names_out()

for index, topic in enumerate(nmf_model.components_):
    print(f"THE TOP 15 WORDS FOR TOPIC #{index}:")
    # argsort is ascending, so the last 15 positions are the heaviest terms,
    # listed from the weakest of them to the strongest.
    print([feature_names[i] for i in topic.argsort()[-15:]])
    print('')
    print('')

In [None]:
# Project every review onto the fitted topics (one row per review).
results = nmf_model.transform(dtm)

In [None]:
# Assign each review the index of its highest-scoring topic.
data['topic'] = results.argmax(axis=1)

In [None]:
# Preview the frame with the new 'topic' column.
data.head()

In [None]:
# Machine Learning Prediction on Label

In [None]:
from sklearn.model_selection import train_test_split

# First experiment: predict the VADER label from the review text alone.
X, y = data['Review'], data['Label']

# Hold out 33% of the reviews; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.33,
)

In [None]:
# Size of the feature series.
X.shape

In [None]:
# Size of the target series.
y.shape

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC

In [None]:
# Text classifier: TF-IDF features fed straight into a linear SVM.
model = Pipeline(
    steps=[
        ('tfidf', TfidfVectorizer()),
        ('svc', LinearSVC()),
    ]
)

In [None]:
# Train the pipeline on the training split.
model.fit(X_train, y_train)

In [None]:
# Predict labels for the held-out reviews.
pred = model.predict(X_test)

In [None]:
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

In [None]:
# scikit-learn metrics take (y_true, y_pred); with the arguments reversed,
# precision and recall are reported under each other's name.
print(classification_report(y_test, pred))

In [None]:
# Share of held-out reviews whose predicted label matches the true label
# (arguments in scikit-learn's y_true, y_pred order; the value is order-independent).
print(accuracy_score(y_test, pred))

In [None]:
# Pass (y_true, y_pred) so rows are the actual labels and columns the predicted
# ones; the reversed order printed the transposed matrix.
print(confusion_matrix(y_test, pred))

In [None]:
# Columns available for the second, multi-feature experiment.
data.head()

In [None]:
# Second experiment: review text plus the numeric 'Rating' and 'Compound' columns.
# NOTE(review): 'Label' was derived from 'Compound' (> 0), so using 'Compound'
# as a feature hands the model the target; scores from this setup are optimistic.
X = data[['Review', 'Rating', 'Compound']]

# Keep the target one-dimensional: a single-column DataFrame makes scikit-learn
# emit a DataConversionWarning when the estimator is fitted.
y = data['Label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fresh vectorizer for the second experiment. Rebinding `tfidf` replaces the
# vectorizer fitted for topic modeling, so re-running those earlier cells after
# this one would pick up this instance instead.
tfidf = TfidfVectorizer(stop_words='english')

In [None]:
# X_train_tfidf = tfidf.fit_transform(X_train['Review'])
# X_test_tfidf = tfidf.transform(X_test['Review'])

In [None]:
# X_train_final = hstack([
#     X_train_tfidf,
#     X_train.loc[:, ['Rating', 'Compound']].values
# ])

# X_test_final = hstack([
#     X_test_tfidf,
#     X_test.loc[:, ['Rating', 'Compound']].values
# ])

In [None]:
# Imported here because no visible cell brings `hstack` into scope, so the
# calls below raised NameError.
from scipy.sparse import hstack

# Learn the vocabulary from the training reviews only, then append the two
# numeric columns to the sparse TF-IDF matrix. Converting to CSR keeps the
# combined matrix row-sliceable.
X_train_final = hstack([
    tfidf.fit_transform(X_train['Review']),
    X_train[['Rating', 'Compound']].to_numpy(),
]).tocsr()

# The test split reuses the fitted vocabulary (transform, not fit_transform).
X_test_final = hstack([
    tfidf.transform(X_test['Review']),
    X_test[['Rating', 'Compound']].to_numpy(),
]).tocsr()

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with default settings; rebinding `model` replaces the
# TF-IDF + LinearSVC pipeline from the first experiment.
model = LogisticRegression()

In [None]:
# Train on the combined text + numeric feature matrix.
model.fit(X_train_final, y_train)

In [None]:
# Predict labels for the held-out rows; overwrites the first experiment's `pred`.
pred = model.predict(X_test_final)

In [None]:
# scikit-learn metrics take (y_true, y_pred); with the arguments reversed,
# precision and recall are reported under each other's name.
print(classification_report(y_test, pred))

In [None]:
# Share of held-out rows whose predicted label matches the true label
# (arguments in scikit-learn's y_true, y_pred order; the value is order-independent).
print(accuracy_score(y_test, pred))