In [52]:
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer
import csv
from sklearn.feature_selection import SelectKBest, chi2, mutual_info_classif
from sklearn.svm import SVC 
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import re
import string
import itertools

In [53]:
# Load the raw hotel reviews.
df = pd.read_csv('/input/hotels_reviews.csv')
# A bare `df.shape` used to sit here; as a non-final expression in the cell it
# was evaluated and discarded, so it has been dropped.
# Class balance of the `label` column, followed by a preview of the first rows.
print(df['label'].value_counts())
print(df.head())

1    12207
0     3225
Name: label, dtype: int64
        city                                            content      country  \
0  Amsterdam  What can I say... ,Everything was just fantast...  Netherlands   
1  Amsterdam  My wife and I stayed at Ambassade Hotel for 7 ...  Netherlands   
2  Amsterdam  We started our Europe 2019 trip in Amsterdam a...  Netherlands   
3  Amsterdam  If you get a chance to visit the French Restau...  Netherlands   
4  Amsterdam  This is a charming small hotel in a central lo...  Netherlands   

   rating                                              title       titleHotel  \
0       5                               Absolutely wonderful  Ambassade Hotel   
1       5                 Fabulous First Visit to Amsterdam   Ambassade Hotel   
2       5  Fantastic Ambassade Hotel - Location, Restaura...  Ambassade Hotel   
3       5                               Wonderful Restaurant  Ambassade Hotel   
4       5                                    Will stay again  Amba

# Data Preprocessing

In [54]:
# Drop the reviews whose rating is exactly 3.
# .copy() turns the filtered result into an independent DataFrame, so the
# column assignments made on `df` afterwards do not raise pandas'
# SettingWithCopyWarning.
df = df[df['rating'] != 3].copy()

# Map a review's rating to a binary label.
def get_rating(rating):
    """Return '1' when the rating is greater than 3, otherwise '0'.

    The rating is converted with int() before the comparison, so numeric
    strings are accepted and floats are truncated first. The label is
    returned as a string.
    """
    return '1' if int(rating) > 3 else '0'

# Attach the binary label derived from the rating.
df['label'] = df['rating'].apply(get_rating)

# Optional sanity checks, currently disabled:
#   duplicates = df.duplicated()
#   print(duplicates.value_counts())      # duplicate rows
#   print(df.isnull().any().sum())        # columns containing missing values


def _normalize_review(text):
    """Lowercase the text, delete digit runs, blank out punctuation, trim the ends."""
    text = text.lower()
    text = re.sub(r'\d+', '', text)
    text = re.sub(r'[^\w\s]', ' ', text)
    return text.strip()


stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Each stage below consumes the previous stage's output.
reviews = df['content'].apply(_normalize_review)
reviews = reviews.apply(word_tokenize)  # string -> list of tokens
# Lemmatize every token as a verb (pos='v').
reviews = reviews.apply(lambda tokens: [lemmatizer.lemmatize(tok, pos='v') for tok in tokens])
# Stop words are removed after lemmatization, i.e. the lemmas are what get compared.
df['content'] = reviews.apply(lambda tokens: [tok for tok in tokens if tok not in stop_words])
# Extra words to strip from the reviews: hotel / city / country names, month
# names, filler verbs and apostrophe-less contractions.
# Every entry must be followed by a comma. Python silently concatenates adjacent
# string literals, which previously fused 'april'+'u', 'ataköy'+'londonbruggeamsterdam'
# and 'theres'+'dorint' into single words that never matched any token.
clean_words = ['get', 'go', 'say', 'tell', 'ask', 'also', 'would', 'en', 'amsterdam', 'i', 'im', 'ambassade', 'europe', 'tripadvisor', 'germany', 'hungary', 'netherlands', 'marriot', 'marriott', 'hampton', 'hilton', 'america', 'aa', 'april', 'u', 'bahn', 'four', 'seasons', 'gresham', 'president', 'green', 'shangrila', 'shangri', 'budapest', 'buda', 'astoria', 'alexanderplatz',
               'brandenburg', 'sbahn', 'zoo', 'kurfürstendam', 'kudamm', 'berlin', 'australia', 'nd', 'th', 'st', 'mr', 'mrs', 'belfort', 'radisson', 'blu', 'london', 'brugge', 'turkey', 'istanbul', 'bosphorus', 'conrad', 'ıstanbul', 'isg', 'ısg', 'divan', 'asia', 'sheraton', 'atakoy', 'ataköy',
               'londonbruggeamsterdam', 'usa', 'september', 'anne', 'frank', 'herengracht', 'basel', 'switzerland', 'european', 'november', 'tmy', 'uk', 'october', 'february', 'december', 'holland', 'schipol', 'amstel', 'singel', 'rotterdam', 'january', 'sultanahmet', 'princess', 'bostanci',
               'cambridge', 'england', 'airways', 'victoria', 'lane', 'grosvenor', 'euston', 'square', 'garden', 'court', 'newham', 'oyo', 'somerset', 'spain', 'madrid', 'atlantico', 'plaza', 'espana', 'es', 'melia', 'arosa', 'dome', 'urban', 'tepa', 'nh', 'collection', 'palacio', 'de',
               'roosevelt', 'york', 'ritz', 'carlton', 'ritzcarlton', 'doubletree', 'chelsea', 'riverside', 'tower', 'windsor', 'novotel', 'paris', 'france', 'les', 'halles', 'sure', 'western', 'gare', 'du', 'nord', 'normandy', 'chansonniers', 'aston', 'champs', 'elysees', 'ibis', 'ıbis',
               'porte', 'montmartre', 'zurich', 'prague', 'ea', 'esprit', 'art', 'deco', 'imperial', 'mandarin', 'oriental', 'botanique', 'stary', 'pivovar', 'al', 'czech', 'republic', 'manthia', 'via', 'veneto', 'mecenate', 'amalfi', 'roma', 'rome', 'italy', 'milton', 'colonna', 'et',
               'noir', 'dei', 'massimi', 'olympia', 'xtra', 'crowne', 'schweizerhof', 'widder', 'theres', 'dorint', 'da', 'vinci', 'splendid', 'isnt', 'ısnt', 'arent', 'aint', 'wont', 'didnt', 'shant', 'havent', 'hadnt', 'hadnt', 'hasnt', 'dont', 'wasnt', 'werent', 'doesnt', 'weve', 'couldnt',
               'thats', 'ive', 'youre']
# Remove special names (hotel and city names) and pointless words.
# The lookup goes through a set so each token test is a hash lookup instead of
# a scan over the whole `clean_words` list.
_clean_word_lookup = set(clean_words)
df['content'] = df.content.apply(lambda tokens: [tok for tok in tokens if tok not in _clean_word_lookup])

# Persist the preprocessed frame without the index column.
# NOTE(review): `content` holds Python lists of tokens at this point, so the CSV
# will contain their string representation — confirm the downstream reader
# expects that format.
df.to_csv('preprocessed.csv', index=False)