In [71]:
# generate an nlp model to cluster 21 different appartments based on descriptions and slightly fluctauting prices

# import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from unidecode import unidecode
from num2words import num2words

In [72]:
# import data
jan_apr_desc = pd.read_csv('dados_afonso/room_say_report/reservas_por_data_criada/jan_apr.csv')
may_june_desc = pd.read_csv('dados_afonso/room_say_report/reservas_por_data_criada/may_june.csv')
jul_aug_desc = pd.read_csv('dados_afonso/room_say_report/reservas_por_data_criada/jul_aug.csv')

data_desc = pd.concat([jan_apr_desc, may_june_desc, jul_aug_desc])

jan_mar_pri = pd.read_csv('dados_afonso/reservation_summary/reservas_por_data_criada/jan_mar.csv')
apr_jun_pri = pd.read_csv('dados_afonso/reservation_summary/reservas_por_data_criada/apr_jun.csv')
jul_set_pri = pd.read_csv('dados_afonso/reservation_summary/reservas_por_data_criada/jul_set.csv')
aug_pri = pd.read_csv('dados_afonso/reservation_summary/reservas_por_data_criada/aug.csv')

data_pri = pd.concat([jan_mar_pri, apr_jun_pri, jul_set_pri, aug_pri])


In [73]:
data = pd.merge(data_desc, data_pri, on='Reservation ID', how='left')

data = data[data['Channel_x'] == 'Booking.com']

In [74]:
data_cols = data[['Reservation ID', 'Room', 'Total Amount']]

In [75]:
data_cols.rename(columns={'Reservation ID' : 'id', 'Room' : 'desc', 'Total Amount': 'price'}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_cols.rename(columns={'Reservation ID' : 'id', 'Room' : 'desc', 'Total Amount': 'price'}, inplace=True)


In [76]:
data_cols.price = data_cols.price.str[:-5]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_cols.price = data_cols.price.str[:-5]


In [77]:
data_cols.price = data_cols.price.astype('float')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_cols.price = data_cols.price.astype('float')


In [78]:
# use nlp to cluster appartments based on descriptions
data_cols = data_cols.drop_duplicates(subset='id', keep=False)
data = data.drop_duplicates(subset='Reservation ID', keep=False)

def clean_text(text):
    text = unidecode(text)
    text = text.lower()
    return text

data_cols['desc'] = data_cols['desc'].apply(clean_text)

# select number before "quarto" and make it a column
def add_n_rooms(text):
    n_rooms = re.findall(r'(\d+)\s+quarto', text)
    if len(n_rooms) > 0:
        return int(n_rooms[0])
    else:
        return 1
    
data_cols['n_rooms'] = data_cols['desc'].apply(add_n_rooms)

# add a columns for road
def add_road(text):
    rua = re.findall(r'rua\s+(\w+)', text)
    if len(rua) > 0:
        return rua[0]
    else:
        return 'no_road'
    
data_cols['road'] = data_cols['desc'].apply(add_road)

# remove stuff that I've noticed is not relevant
def remove_manually(text):
    text = re.sub(r'\d{1,2}%', '', text)
    text = text.replace('united states pos', '')
    text = text.replace('nao reembolsavel', '')
    text = text.replace('()', '')
    text = text.replace('-', '')
    text = text.replace('general', '')
    text = text.replace('oferta', '')
    text = text.replace('apartamento', '')
    text = text.replace('com', '')
    text = text.replace('high season', '')
    text = text.replace('night', '')
    text = text.replace('reserva antecipada', '')
    text = re.sub(r'\d{1,2} adultos', '', text)
    text = re.sub(r'\d{1,2} de \w{3} de', '', text)
    text = re.sub(r'\w{3} \d{1,2}', '', text)
    text = re.sub(r'\d', '', text)
    return text


data_cols['desc'] = data_cols['desc'].apply(remove_manually)


# lemmatize data
lemmatizer = WordNetLemmatizer()
def lemmatize_text(text):
    text = [lemmatizer.lemmatize(word) for word in text.split()]
    return " ".join(text)

data_cols['desc'] = data_cols['desc'].apply(lemmatize_text)

# stem data
stemmer = PorterStemmer()
def stem_text(text):
    text = [stemmer.stem(word) for word in text.split()]
    return " ".join(text)

data_cols['desc'] = data_cols['desc'].apply(stem_text)

# remove stopwords
stop_words = set(stopwords.words('portuguese'))

def remove_stopwords(text):
    text = [word for word in text.split() if word not in stop_words]
    return " ".join(text)

data_cols['desc'] = data_cols['desc'].apply(remove_stopwords)

# vectorize data
vectorizer = TfidfVectorizer()

# one hot encode road
road_ohe = pd.get_dummies(data_cols['road'])

# define X, which contains a vector for each description and a price
vectors = vectorizer.fit_transform(data_cols['desc'])
vectors_df = pd.DataFrame(vectors.toarray())
price = data_cols['price']
n_rooms = data_cols['n_rooms']
road_ohe = pd.get_dummies(data_cols['road'])
X = np.column_stack((vectors.toarray(), n_rooms, road_ohe))


# cluster data
kmeans_room = KMeans(n_clusters=21, random_state=0).fit(X)
kmeans_type = KMeans(n_clusters=14, random_state=0).fit(X)

# add cluster column to data
data_cols['cluster_room'] = kmeans_room.labels_
data_cols['cluster_type'] = kmeans_type.labels_

# add cluster column to data
data['cluster_room'] = kmeans_room.labels_
data['cluster_type'] = kmeans_type.labels_


In [79]:
data_cols.head(10)


Unnamed: 0,id,desc,price,n_rooms,road,cluster_room,cluster_type
0,2468606756,piso terreo inacio,82.08,1,no_road,0,2
1,3404160971,"quarto () rua remedios, inacio",123.12,1,dos,1,12
2,3545920362,"quarto rua santo estevao, s",379.62,1,de,18,3
3,3807966157,piso terreo s,179.82,1,no_road,0,2
4,2769935846,"quarto () rua remedios, n.o s",390.94,2,dos,11,1
5,3554122759,quarto s,253.08,1,no_road,9,0
6,3554197542,"quarto () rua remedios, n.o limitedtim deal",145.08,2,dos,11,1
7,2719193349,"quarto () rua remedios, n.o limitedtim deal",199.02,2,dos,11,1
8,2841079705,"quarto rua remedios, limitedtim deal",564.2,1,dos,1,12
9,3086509714,"superior quarto rua remedios, geniusgeor",180.88,1,dos,12,12


In [80]:
data_cols.id.nunique()

1388

In [82]:
data_cols.groupby('cluster_room').cluster_type.nunique()

cluster_room
0     1
1     1
2     1
3     1
4     1
5     1
6     1
7     1
8     1
9     1
10    1
11    1
12    1
13    1
14    1
15    1
16    3
17    1
18    1
19    5
20    1
Name: cluster_type, dtype: int64

In [83]:
data_cols[data_cols.cluster_room == 15][['desc', 'n_rooms']]

Unnamed: 0,desc,n_rooms
102,quarto tempo limitado,1
148,quarto tempo limitado,1
454,quarto tempo limitado,1
503,quarto tempo limitado,1
594,quarto tempo limitado,1
702,quarto tempo limitado,1
747,quarto tempo limitado,1
864,quarto tempo limitado,1
1010,quarto tempo limitado,1
1036,quarto tempo limitado,1


In [84]:
data_cols.groupby('cluster_room').n_rooms.nunique()

cluster_room
0     1
1     1
2     1
3     1
4     1
5     1
6     1
7     1
8     1
9     1
10    1
11    1
12    1
13    1
14    1
15    1
16    1
17    1
18    1
19    1
20    1
Name: n_rooms, dtype: int64

In [106]:
data.Room[data.cluster_room == 1].values

array(['Apartamento com 1 Quarto (2 Adultos) - Rua dos Remédios, 30 - 24% - Oferta InÃ\xadcio de 2022 - 7 de Dez de 2021',
       'Apartamento com 1 Quarto - Rua dos Remédios, 3 - 38% - Limited-time Deal - 3 Jan 2022',
       'Apartamento com 1 Quarto - Rua dos Remédios, 3 - 38% - Limited-time Deal - 3 Jan 2022',
       'Apartamento com 1 Quarto (2 Adultos) - Rua dos Remédios, 30 - 38% - Limited-time Deal - 3 Jan 2022',
       'Apartamento com 1 Quarto (2 Adultos) - Rua dos Remédios, 30 - 24% - Oferta InÃ\xadcio de 2022 - 7 de Dez de 2021',
       'Apartamento com 1 Quarto (2 Adultos) - Rua dos Remédios, 30 - General - 24% - Oferta InÃ\xadcio de 2022 - 7 de Dez de 2021',
       'Apartamento com 1 Quarto (2 Adultos) - Rua dos Remédios, 30 - General - High Season 3 Nights',
       'Apartamento com 1 Quarto (2 Adultos) - Rua dos Remédios, 30 - General - High Season 3 Nights',
       'Apartamento com 1 Quarto (2 Adultos) - Rua dos Remédios, 30 - General - 24% - Oferta InÃ\xadcio de 2022 - 