### Open file and prepare Data

In [1]:
import pandas as pd
import numpy as np
import pickle
import json

In [2]:
# Load the raw hotel-review data; the CSV is read straight from the zip archive.
df = pd.read_csv(
    filepath_or_buffer="./data/515k-hotel-reviews-data-in-europe.zip")

#### Create country column

In [3]:
def get_country(adress):
    """Return the country found at the end of a hotel address.

    The country is taken to be the last whitespace-separated word of
    ``adress``. When that word is "Kingdom" the full name
    "United Kingdom" is returned instead.
    """
    last_word = adress.split()[-1]
    return "United Kingdom" if last_word == "Kingdom" else last_word


# Derive the country of every review from its hotel address, then show how
# many reviews fall into each country.
df['Country'] = df['Hotel_Address'].apply(get_country)
df['Country'].value_counts()

United Kingdom    262301
Spain              60149
France             59928
Netherlands        57214
Austria            38939
Italy              37207
Name: Country, dtype: int64

#### Create city column

In [4]:
def get_city(adress, country):
    """Return the city word of a hotel address.

    The address is split on whitespace. For "United Kingdom" the fifth
    word from the end is returned; for every other country the
    second-to-last word is returned.
    """
    words = adress.split()
    position = -5 if country == "United Kingdom" else -2
    return words[position]


# Derive the city of every review from its address and country.
# The two columns are walked in lockstep rather than through a row-wise
# ``DataFrame.apply`` with ``x[0]`` / ``x[1]``: integer keys on a
# label-indexed row are treated as positions only through a deprecated
# fallback in recent pandas, and building one Series per row is slow.
df['City'] = [
    get_city(address, country)
    for address, country in zip(df['Hotel_Address'], df['Country'])
]
df.City.value_counts()

London       262301
Barcelona     60149
Paris         59928
Amsterdam     57214
Vienna        38939
Milan         37207
Name: City, dtype: int64

#### Create Tags columns

Steps
- Strip from the `Tags` field the characters (brackets and quotes) that prevent it from being parsed as a list, and split it into a list of tags.
- After some EDA we found the patterns that identify each of the 6 possible fields in the tags. Most cells have fewer than 6 tags, so the strategy is to insert blank fields into the list at the positions where the expected category tag is missing.
- The third step is to split the Tags column into a new DataFrame, with one column per field.
- As a last step, we merge the original DataFrame with the DataFrame that holds the tags separated into columns.

In [5]:
def _split_tags(raw_tags):
    """Turn the raw ``Tags`` text of one review into a list of tag strings.

    Removes, in this order, every "[", "]", "' " and " '" from the text
    and splits what is left on ", ".
    """
    cleaned = raw_tags
    for fragment in ('[', ']', "' ", " '"):
        cleaned = cleaned.replace(fragment, '')
    return cleaned.split(', ')


Tags = df.Tags.apply(_split_tags)

In [6]:
def _pad_tags(tags):
    """Return a copy of ``tags`` padded with '' so that it has 6 slots.

    The slots are, in order: pet, trip purpose, travel party, room, length
    of stay and submitting device. A blank string is inserted wherever the
    value found at a slot position does not match that slot's pattern.

    Compared with indexing the list directly, every positional check is
    guarded by a length test, so short lists are padded instead of raising
    IndexError. The final step keeps inserting blanks until there are 6
    fields, rather than inserting at most one.
    """
    padded = list(tags)  # work on a copy; the caller's list is left untouched

    # Slot 0: pet.
    if len(padded) < 1 or padded[0] not in ('With a pet', ''):
        padded.insert(0, '')

    # Slot 1: purpose of the trip.
    if len(padded) < 2 or padded[1] not in ('Leisure trip', 'Business trip',
                                            ''):
        padded.insert(1, '')

    # Slot 2: who the reviewer travelled with.
    if len(padded) < 3 or padded[2] not in (
            'Couple', 'Solo traveler', 'Group', 'Family with young children',
            'Family with older children', 'Travelers with friends', ''):
        padded.insert(2, '')

    # Last slot: submitting device.
    if not padded[-1].startswith('Submitted from') and padded[-1] != '':
        padded.append('')

    # Second-to-last slot: length of stay (inserted just before the device).
    if not padded[-2].startswith('Stayed') and padded[-2] != '':
        padded.insert(-1, '')

    # Whatever is still missing sits between the first three slots and the
    # last two, so blanks go in at index 3 until there are 6 fields.
    while len(padded) < 6:
        padded.insert(3, '')

    return padded


# Building a new Series avoids mutating the lists in place and does not rely
# on the index labels being 0..n-1, as a ``Tags[i]`` lookup would.
Tags = Tags.apply(_pad_tags)

In [7]:
# Sanity check: every review should now have exactly 6 tag fields.
Tags.map(len).value_counts()

6    515738
Name: Tags, dtype: int64

In [8]:
# Wrap the Series of tag lists in a one-column DataFrame.
dfTags = Tags.to_frame()

In [9]:
# Spread each 6-element tag list over six named columns, keeping the
# original row index so the result lines up with ``df``.
dfTags_extended = pd.DataFrame(
    dfTags['Tags'].tolist(),
    index=dfTags.index,
    columns=['Pet', 'Purpose', 'Whom', 'Room', 'Length', 'Device'],
)
dfTags_extended.shape

(515738, 6)

In [10]:
# Put the six tag columns next to the original review columns.
dfFull = pd.concat(
    [df, dfTags_extended],
    axis='columns',
    sort=False,
)

In [11]:
# Write the combined frame to disk; ``index_label=False`` leaves the index
# column without a header field.
dfFull.to_csv(
    path_or_buf='./data/df_with_tags.csv',
    index_label=False,
)

##### Exploratory Analysis from Tags

In [12]:
# Number of reviews with and without the pet tag.
dfTags_extended['Pet'].value_counts()

              514333
With a pet      1405
Name: Pet, dtype: int64

In [13]:
# Number of reviews per trip purpose (blank = no purpose tag).
dfTags_extended['Purpose'].value_counts()

Leisure trip     417778
Business trip     82939
                  15021
Name: Purpose, dtype: int64

In [14]:
# Number of reviews per travel-party type.
dfTags_extended['Whom'].value_counts()

Couple                        252294
Solo traveler                 108545
Group                          65392
Family with young children     61015
Family with older children     26349
Travelers with friends          2143
Name: Whom, dtype: int64

In [15]:
# Number of distinct values in the Room column.
dfTags_extended['Room'].nunique()

2388

In [16]:
# Five most frequent lengths of stay.
dfTags_extended['Length'].value_counts().head(5)

Stayed 1 night     193645
Stayed 2 nights    133937
Stayed 3 nights     95821
Stayed 4 nights     47817
Stayed 5 nights     20845
Name: Length, dtype: int64

In [17]:
# Most frequent submitting-device values (blank = no device tag).
dfTags_extended['Device'].value_counts().head(5)

Submitted from a mobile device    307640
                                  208098
Name: Device, dtype: int64

#### Create Diff column (Review Score - Hotel Average Score)

In [18]:
# Difference between the score given in the review and the hotel's average
# score.
dfFull['Diff'] = dfFull['Reviewer_Score'].sub(dfFull['Average_Score'])

#### Save File

In [19]:
# Rebuild the combined frame and compute ``Diff`` on it in the same step.
# Concatenating ``df`` and ``dfTags_extended`` alone would replace the
# previous ``dfFull`` and silently drop its ``Diff`` column, so the saved
# file would not contain it.
dfFull = pd.concat([df, dfTags_extended], axis=1, sort=False).assign(
    Diff=lambda frame: frame['Reviewer_Score'] - frame['Average_Score'])

In [20]:
# Save the final frame; ``index_label=False`` leaves the index column
# without a header field.
dfFull.to_csv(
    path_or_buf='./data/df_with_tags.csv',
    index_label=False,
)