In [None]:
import pandas as pd
import numpy as np
import nltk

In [None]:
# Load the raw hotel-review table from a CSV file in the working directory.
raw_csv_path = '515K hotel dataset.csv'
data = pd.read_csv(raw_csv_path)

In [None]:
# Missing values: drop every review that has a missing field, remove the
# 'Unnamed: 0' column and renumber the rows from 0.
# `columns=` replaces the positional axis argument (`drop(label, 1)`), which
# pandas 2.0+ no longer accepts; `errors='ignore'` lets the cell be re-run
# after the column is already gone. Building a new frame instead of mutating
# in place keeps the cell idempotent.
data = (
    data.dropna()
        .drop(columns='Unnamed: 0', errors='ignore')
        .reset_index(drop=True)
)

In [None]:
# One-sided reviews: a cell whose whole value is the placeholder
# "No Negative" / "No Positive" is replaced by an empty string.
# The result is assigned back to the column. Calling replace(inplace=True) on
# a selected column works on an intermediate object and is not guaranteed to
# update `data` (pandas warns about it, and with Copy-on-Write it does nothing).
data['NegativeReview'] = data['NegativeReview'].replace('No Negative', "")
data['PositiveReview'] = data['PositiveReview'].replace('No Positive', "")

In [None]:
# Combine: the negative and positive parts are treated as a single review,
# which is then lower-cased.
# A space is placed between the two parts so that the last word of the
# negative text and the first word of the positive text cannot fuse into one
# token; extra whitespace is harmless for the later tokenisation step.
corpus = data.NegativeReview + " " + data.PositiveReview
data.insert(0, "Review", corpus.str.lower())

In [None]:
# Digits, punctuation marks and symbols (ASCII and full-width) to delete.
import re
remove1 = '[0-9’!"#$%&\'()*+,-./:;<=>?@，。?★、…【】《》？“”‘’！[\\]^_`{|}~]+'  

# Strip every run of those characters from all reviews in one vectorised call.
# This replaces a Python loop that assigned through data['Review'][i] (chained
# indexing: slow, and not guaranteed to write back into `data`) and printed
# the row number for every record.
data['Review'] = data['Review'].str.replace(re.compile(remove1), "", regex=True)

In [None]:
# Tokenisation: run word_tokenize on every review.
# (Sentence tokenising is more complicated in this case, so it is not used.)
from nltk.tokenize import sent_tokenize, word_tokenize  
word_tokenized = data["Review"].map(word_tokenize)
data.insert(loc=0, column="WordToken", value=word_tokenized)

In [None]:
# Anomalous records: keep only reviews that still have at least one token
# (an alternative would be to drop reviews with fewer than 5 words).
word_count = data["WordToken"].map(len)
filter_count = word_count.ge(1)
data = data.loc[filter_count].reset_index(drop=True)

In [None]:
# Lemmatisation: map differently inflected forms of a term to one base form.
from nltk.stem import WordNetLemmatizer

wnl = WordNetLemmatizer()
def lemmatize_text(text):
    """Return a list holding wnl.lemmatize(token) for each token in `text`."""
    return list(map(wnl.lemmatize, text))

data.insert(loc=0, column='Lemmatized', value=data['WordToken'].map(lemmatize_text))

In [None]:
# Stop-word removal: filter NLTK's built-in English stop words out of every
# lemmatised token list.
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
wosw = data['Lemmatized'].map(
    lambda tokens: [tok for tok in tokens if tok not in stop_words]
)
data.insert(loc=0, column='preprocessing_finished', value=wosw)

In [None]:
# data.to_csv('hotelreviews.csv',sep=',')
import ast

# Reload the preprocessed reviews. A CSV file stores a Python list as its text
# form (e.g. "['room', 'clean']"), so the token column comes back as plain
# strings; the TF-IDF step, which uses an identity tokenizer, would then
# iterate over single characters instead of words. literal_eval turns each
# cell back into a real list of tokens.
df = pd.read_csv('hotelreviews.csv',
                 converters={'preprocessing_finished': ast.literal_eval})

In [None]:
# Remove the 'Unnamed: 0' column (presumably the index written out with the
# CSV — confirm), then display the frame.
# `columns=` replaces the positional axis argument that pandas 2.0+ rejects;
# `errors='ignore'` keeps the cell re-runnable once the column is gone.
df = df.drop(columns='Unnamed: 0', errors='ignore')
df

In [None]:
# Bag-of-words TF-IDF features with unigrams (or uni+bigrams via ngram_range).
def dum(doc):
    """Identity function: return `doc` unchanged.

    Passed as both tokenizer and preprocessor so the vectorizer uses each
    document exactly as given (documents are expected to be token sequences
    already).
    """
    return doc
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(analyzer = 'word',
                             ngram_range = (1,1),     # original note: "1% more" — presumably the gain seen with (1,2); TODO confirm
                             tokenizer = dum,
                             preprocessor = dum,
                             token_pattern = None,    # not used once a tokenizer is given; None avoids the unused-parameter warning
                             min_df = 5000)           # keep only terms found in at least 5000 documents
x = vectorizer.fit_transform(df.preprocessing_finished)
# Dense copy of the sparse TF-IDF matrix for the models below.
vec = x.toarray()

In [None]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
x_ = vec
# Integer-encode the trip style. LabelEncoder numbers classes in sorted order
# (original note: 0 for Business, 1 for Leisure trip — check le.classes_).
lbl_ = le.fit_transform(df['TripStyle'])

# Split into train/test sets at random with a test size of 0.33.
# A fixed random_state makes the split, and every score computed from it,
# repeatable across kernel restarts.
from sklearn.model_selection import train_test_split
x_train, x_test, lbl_train, lbl_test = train_test_split(
    x_, lbl_, test_size = 0.33, shuffle = True, random_state = 42)

In [None]:
# Sanity check: the feature matrix and the label vector must have the same
# number of rows.
print(x_train.shape, lbl_train.shape, sep='\n')

In [None]:
# Second names for the label arrays as they are at this point; the DNN
# evaluation cell scores its predictions against lbl_test2.
lbl_train2, lbl_test2 = lbl_train, lbl_test
print(lbl_train2.shape, lbl_test2.shape, sep='\n')

In [None]:
# Naive Bayes (Gaussian) on the dense TF-IDF features.
from sklearn.naive_bayes import GaussianNB
clf_gnb = GaussianNB()
# Fit on lbl_train2, the integer-label array set aside earlier, so the cell
# gives the same result no matter what lbl_train is bound to when it is
# (re-)run; the classifier needs 1-D class labels.
clf_gnb.fit(x_train, lbl_train2)

In [None]:
# Logistic regression on the TF-IDF features.
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
# Fit on lbl_train2, the integer-label array set aside earlier, so the cell
# gives the same result no matter what lbl_train is bound to when it is
# (re-)run; the classifier needs 1-D class labels.
lr.fit(x_train, lbl_train2)

In [None]:
# DNN: one hidden layer followed by a 2-way softmax output.
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten
# to_categorical turns integer labels into one-hot vectors. Its import path
# differs between Keras releases, so try the top-level name first.
try:
    from keras.utils import to_categorical
except ImportError:
    from keras.utils.np_utils import to_categorical

# Simple model whose layers run one after another.
model = Sequential()
# Input -> hidden layer with 256 units. The input width is read from the
# training matrix rather than hard-coded (it was fixed at 300), so the network
# always matches the number of TF-IDF features actually produced.
model.add(Dense(units=256, input_dim=x_train.shape[1], kernel_initializer='normal', activation='relu')) 
# Output layer: one unit per class.
model.add(Dense(units=2, kernel_initializer='normal', activation='softmax'))
# summary() prints by itself; wrapping it in print() only adds a stray "None".
model.summary()
# Compile: choose the loss function, the optimiser and the reported metric.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy']) 
# One-hot targets get their own names. lbl_train / lbl_test stay as integer
# labels, so the scikit-learn cells keep working and this cell can be re-run
# without encoding the labels twice. num_classes matches the 2-unit output.
lbl_train_onehot = to_categorical(lbl_train, num_classes=2)
lbl_test_onehot = to_categorical(lbl_test, num_classes=2)
# Train, holding out 20% of the training rows for validation.
model.fit(x=x_train, y=lbl_train_onehot, validation_split=0.2, epochs=10, batch_size=64, verbose=2)
scores = model.evaluate(x_test, lbl_test_onehot)
print('test loss:', scores[0])
print('test accuracy:', scores[1])

In [None]:
#Evaluation function:
from sklearn.metrics import precision_recall_curve, auc, confusion_matrix, accuracy_score, classification_report
import imblearn
from imblearn.metrics import classification_report_imbalanced
import matplotlib.pyplot as plt

#ref: https://acutecaretesting.org/en/articles/precision-recall-curves-what-are-they-and-how-are-they-used
def evaluating(test, pred, ax=None):
    """Print classification metrics and draw a precision-recall curve.

    Prints the accuracy, the imbalanced-learn classification report and the
    confusion matrix for `pred` against `test`, then plots precision against
    recall on `ax`.

    Parameters
    ----------
    test : true class labels.
    pred : predicted class labels. The same values are handed to
        precision_recall_curve, so with hard 0/1 predictions the curve has
        only a few operating points.
    ax : matplotlib Axes to draw on. When omitted, the current Axes
        (plt.gca()) is used; the previous default, the builtin `object`,
        failed on the first plotting call.

    Returns
    -------
    The Axes that was drawn on.
    """
    if ax is None:
        ax = plt.gca()

    print('accuracy:',accuracy_score(test, pred))
    print('\n')
    print( classification_report_imbalanced(test, pred))    
    print('\n')
    print ( confusion_matrix(test, pred))
    # The thresholds returned alongside the curve are not needed for the plot.
    precision, recall, _ = precision_recall_curve(test, pred)

    ax.step(recall, precision, color='b', alpha=1, where='post')
    ax.fill_between(recall, precision, step='post', alpha=0.5, color='b')
    ax.set_xlabel('Recall')
    ax.set_ylabel('Precision')
    ax.set_ylim([0.0, 1.05])
    ax.set_xlim([0.0, 1.0])
    ax.set_title('Precision-Recall curve')
    return ax

In [None]:
# Predict on the test set with Naive Bayes and evaluate.
f, (ax1, ax2) = plt.subplots(1, 2, figsize=(12,6))
pred_gnb = clf_gnb.predict(x_test)
# Score against lbl_test2, the integer-label array that the DNN evaluation
# also uses: the metrics need 1-D class labels, not a one-hot matrix.
evaluating(lbl_test2, pred_gnb, ax1)

In [None]:
# Predict classes on the test set with logistic regression and evaluate.
f, (ax1, ax2) = plt.subplots(1, 2, figsize=(12,6))
y_pred = lr.predict(x_test)
# Score against lbl_test2, the integer-label array that the DNN evaluation
# also uses: the metrics need 1-D class labels, not a one-hot matrix.
evaluating(lbl_test2, y_pred, ax1)

In [None]:
# Predict classes on the test set with the DNN model and evaluate.
f, (ax1, ax2) = plt.subplots(1, 2, figsize=(12,6))
# Sequential.predict_classes no longer exists in current Keras releases.
# Taking the argmax over the softmax outputs gives the same class index.
pred = np.argmax(model.predict(x_test), axis=-1)
evaluating(lbl_test2, pred, ax1)