# **Trip Advisor hotel reviews**
## DATA PREPARATION FOR ML MODELING
***
***

In [1]:
# import packages
import pandas as pd
import numpy as np
import os
import seaborn as sns
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from collections import Counter
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import re

# lift the row cap so pandas renders every row of a displayed frame
pd.set_option('display.max_rows', None)

# project folders, all derived from the notebook's current working directory
PROJECT_PATH = os.getcwd()
_PARENT_PATH = os.path.join(PROJECT_PATH, os.pardir)
_DATA_PATH = os.path.join(_PARENT_PATH, 'data')
RAWDATA_PATH = os.path.join(_DATA_PATH, 'raw')
# NOTE(review): 'proccessed' is reproduced exactly as written - confirm it matches the folder on disk
PROCDATA_PATH = os.path.join(_DATA_PATH, 'proccessed')
IMAGES_PATH = os.path.join(_PARENT_PATH, 'images')

In [2]:
# load the raw reviews file
file_1 = os.path.join(RAWDATA_PATH, "base tripadvisor hotel.csv")

# A separator longer than one character is treated as a regular expression by
# the python engine. After parsing, the review text sits in the index, so it is
# moved into a regular column and both columns get short names.
# NOTE(review): the parsing relies on the exact layout of this file - confirm
# against the raw data if the source file ever changes.
df_raw_data = (
    pd.read_csv(file_1, sep=",  ,", engine='python')
    .reset_index()
    .rename(columns={"index": "review", "Review,Rating": "rating"})
)

## Cleaning text
***

In [3]:
# preview the first five parsed rows
df_raw_data.head(n=5)

Unnamed: 0,review,rating
0,nice hotel expensive parking got good deal sta...,4
1,ok nothing special charge diamond member hilto...,2
2,nice rooms not 4* experience hotel monaco seat...,3
3,"unique, great stay, wonderful time hotel monac...",5
4,"great stay great stay, went seahawk game aweso...",5


In [4]:
# define text preprocessing function
try:
    stop_words = set(stopwords.words('english'))
except LookupError:
    # the stopwords corpus is not installed in this environment yet:
    # fetch it once so the notebook also runs on a fresh machine
    nltk.download('stopwords')
    stop_words = set(stopwords.words('english'))

# built once and shared by every call instead of being recreated per review
_NON_LETTERS = re.compile('[^a-zA-Z]')
_STEMMER = PorterStemmer()


def text_preprocessing(text):
    """Return a normalized, stemmed version of a review.

    Processing steps, in order:
      1. every character that is not an ASCII letter is replaced by a space
         (digits, punctuation and accented letters are therefore dropped);
      2. the text is lower-cased and split on whitespace;
      3. tokens found in ``stop_words`` are discarded;
      4. the remaining tokens are reduced to their Porter stem;
      5. the stems are joined back with single spaces.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    str
        Space-separated stems; an empty string when no token survives.
    """
    tokens = _NON_LETTERS.sub(' ', text).lower().split()
    return ' '.join(_STEMMER.stem(tok) for tok in tokens if tok not in stop_words)

In [5]:
# run every review through the cleaning function and keep the result
# next to the original text
df_raw_data['clean_review'] = df_raw_data['review'].map(text_preprocessing)
df_raw_data.head()

Unnamed: 0,review,rating,clean_review
0,nice hotel expensive parking got good deal sta...,4,nice hotel expens park got good deal stay hote...
1,ok nothing special charge diamond member hilto...,2,ok noth special charg diamond member hilton de...
2,nice rooms not 4* experience hotel monaco seat...,3,nice room experi hotel monaco seattl good hote...
3,"unique, great stay, wonderful time hotel monac...",5,uniqu great stay wonder time hotel monaco loca...
4,"great stay great stay, went seahawk game aweso...",5,great stay great stay went seahawk game awesom...


In [6]:
# number of characters in each review before and after cleaning
# (these are character counts, not word counts)
df_raw_data['length'] = df_raw_data['review'].str.len()
df_raw_data['clean_length'] = df_raw_data['clean_review'].str.len()
df_raw_data.head()

Unnamed: 0,review,rating,clean_review,length,clean_length
0,nice hotel expensive parking got good deal sta...,4,nice hotel expens park got good deal stay hote...,590,481
1,ok nothing special charge diamond member hilto...,2,ok noth special charg diamond member hilton de...,1686,1407
2,nice rooms not 4* experience hotel monaco seat...,3,nice room experi hotel monaco seattl good hote...,1424,1174
3,"unique, great stay, wonderful time hotel monac...",5,uniqu great stay wonder time hotel monaco loca...,597,506
4,"great stay great stay, went seahawk game aweso...",5,great stay great stay went seahawk game awesom...,1278,1058


In [7]:
# compare the summary statistics of the numeric columns to see how much
# shorter the reviews became after text cleaning
df_raw_data.describe().round(decimals=2)

Unnamed: 0,rating,length,clean_length
count,20491.0,20491.0,20491.0
mean,3.95,721.9,597.84
std,1.23,689.1,564.26
min,1.0,41.0,31.0
25%,3.0,336.0,282.0
50%,4.0,534.0,444.0
75%,5.0,856.0,709.0
max,5.0,13498.0,11189.0


In [8]:
# save clean data: keep only the cleaned text and its rating
df_reviews = df_raw_data[["clean_review", "rating"]]

# create the output folder when it is missing so the export does not fail
# on a fresh checkout
os.makedirs(PROCDATA_PATH, exist_ok=True)
df_reviews.to_csv(os.path.join(PROCDATA_PATH, "clean_reviews_data.csv"), index=False)