In [92]:
# generate an nlp model to cluster 21 different appartments based on descriptions and slightly fluctauting prices

# import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from unidecode import unidecode
from num2words import num2words

In [93]:
# import data
jan_apr_desc = pd.read_csv('dados_afonso/room_say_report/reservas_por_data_criada/jan_apr.csv')
may_june_desc = pd.read_csv('dados_afonso/room_say_report/reservas_por_data_criada/may_june.csv')
jul_aug_desc = pd.read_csv('dados_afonso/room_say_report/reservas_por_data_criada/jul_aug.csv')

data_desc = pd.concat([jan_apr_desc, may_june_desc, jul_aug_desc])

jan_mar_pri = pd.read_csv('dados_afonso/reservation_summary/reservas_por_data_criada/jan_mar.csv')
apr_jun_pri = pd.read_csv('dados_afonso/reservation_summary/reservas_por_data_criada/apr_jun.csv')
jul_set_pri = pd.read_csv('dados_afonso/reservation_summary/reservas_por_data_criada/jul_set.csv')
aug_pri = pd.read_csv('dados_afonso/reservation_summary/reservas_por_data_criada/aug.csv')

data_pri = pd.concat([jan_mar_pri, apr_jun_pri, jul_set_pri, aug_pri])


In [94]:
data = pd.merge(data_desc, data_pri, on='Reservation ID', how='left')

data = data[data['Channel_x'] == 'Booking.com']

In [95]:
data_cols = data[['Reservation ID', 'Room', 'Total Amount']]

In [96]:
data_cols.rename(columns={'Reservation ID' : 'id', 'Room' : 'desc', 'Total Amount': 'price'}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_cols.rename(columns={'Reservation ID' : 'id', 'Room' : 'desc', 'Total Amount': 'price'}, inplace=True)


In [97]:
data_cols.price = data_cols.price.str[:-5]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_cols.price = data_cols.price.str[:-5]


In [98]:
data_cols.price = data_cols.price.astype('float')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_cols.price = data_cols.price.astype('float')


Unnamed: 0,id,desc,price
0,2468606756,Apartamento - Piso Térreo - 24% - Oferta InÃ­c...,82.08
1,3404160971,Apartamento com 1 Quarto (2 Adultos) - Rua dos...,123.12
2,3545920362,Apartamento com 1 Quarto - Rua de Santo Estêvã...,379.62
3,3807966157,Apartamento - Piso Térreo - High Season 3 Nights,179.82
4,2769935846,Apartamento com 2 Quartos (4 Adultos) - Rua do...,390.94


In [149]:
# use nlp to cluster appartments based on descriptions

# clean data but leave numbers
data_cols = data_cols.drop_duplicates(subset='id', keep=False)
data = data.drop_duplicates(subset='Reservation ID', keep=False)

def clean_text(text):
    text = unidecode(text)
    text = text.lower()
    return text

data_cols['desc'] = data_cols['desc'].apply(clean_text)

# select number before "quarto" and make it a column
def add_n_rooms(text):
    n_rooms = re.findall(r'(\d+)\s+quarto', text)
    if len(n_rooms) > 0:
        return int(n_rooms[0])
    else:
        return 1
    
data_cols['n_rooms'] = data_cols['desc'].apply(add_n_rooms)

# add a columns for road
def add_road(text):
    rua = re.findall(r'rua\s+(\w+)', text)
    if len(rua) > 0:
        return rua[0]
    else:
        return 'no_road'
    
data_cols['road'] = data_cols['desc'].apply(add_road)

# remove stuff that I've noticed is not relevant
def remove_manually(text):
    text = re.sub(r'\d{1,2}%', '', text)
    text = text.replace('nao reembolsavel', '')
    text = text.replace('()', '')
    text = text.replace('-', '')
    text = text.replace('gener', '')
    text = text.replace('oferta', '')
    text = text.replace('apartamento', '')
    text = text.replace('com', '')
    text = text.replace('high season', '')
    text = text.replace('night', '')
    text = re.sub(r'\d{1,2} adultos', '', text)
    text = re.sub(r'\d{1,2} de \w{3} de', '', text)
    text = re.sub(r'\w{3} \d{1,2}', '', text)
    text = re.sub(r'\d', '', text)
    return text


data_cols['desc'] = data_cols['desc'].apply(remove_manually)


# lemmatize data
lemmatizer = WordNetLemmatizer()
def lemmatize_text(text):
    text = [lemmatizer.lemmatize(word) for word in text.split()]
    return " ".join(text)

data_cols['desc'] = data_cols['desc'].apply(lemmatize_text)

# stem data
stemmer = PorterStemmer()
def stem_text(text):
    text = [stemmer.stem(word) for word in text.split()]
    return " ".join(text)

data_cols['desc'] = data_cols['desc'].apply(stem_text)

# remove stopwords
stop_words = set(stopwords.words('portuguese'))

def remove_stopwords(text):
    text = [word for word in text.split() if word not in stop_words]
    return " ".join(text)

data_cols['desc'] = data_cols['desc'].apply(remove_stopwords)

# vectorize data
vectorizer = TfidfVectorizer()

# one hot encode road
road_ohe = pd.get_dummies(data_cols['road'])

# define X, which contains a vector for each description and a price
vectors = vectorizer.fit_transform(data_cols['desc'])
vectors_df = pd.DataFrame(vectors.toarray())
price = data_cols['price']
n_rooms = data_cols['n_rooms']
road_ohe = pd.get_dummies(data_cols['road'])
X = np.column_stack((vectors.toarray(), n_rooms, road_ohe))


# cluster data
kmeans_room = KMeans(n_clusters=21, random_state=0).fit(X)
kmeans_type = KMeans(n_clusters=14, random_state=0).fit(X)

# add cluster column to data
data_cols['cluster_room'] = kmeans_room.labels_
data_cols['cluster_type'] = kmeans_type.labels_

# add cluster column to data
data['cluster_room'] = kmeans_room.labels_
data['cluster_type'] = kmeans_type.labels_


In [150]:
data_cols.head(10)


Unnamed: 0,id,desc,price,n_rooms,cluster_room,cluster_type,road
0,2468606756,piso terreo inacio,82.08,1,14,5,no_road
1,3404160971,"quarto rua remedios, inacio",123.12,1,15,0,remedios
2,3545920362,"quarto rua santo estevao, s",379.62,1,2,8,santo
3,3807966157,piso ter s,179.82,1,16,5,no_road
4,2769935846,"quarto rua remedios, n.o s",390.94,1,0,0,remedios
5,3554122759,quarto al s,253.08,1,6,7,no_road
6,3554197542,"quarto rua remedios, n.o limitedtim d jan",145.08,1,0,0,remedios
7,2719193349,"quarto rua remedios, n.o limitedtim d jan",199.02,1,0,0,remedios
8,2841079705,"quarto rua remedios, limitedtim d jan",564.2,1,0,0,remedios
9,3086509714,"super quarto rua remedios, geniusgeor",180.88,1,0,0,remedios


In [151]:
data_cols.id.nunique()

1388

In [152]:
data_read = data[['Room', 'cluster_room', 'cluster_type']]
sample = data_read.sample(1)

print(f"{sample['Room']} is room {sample['cluster_room']} which is in cluster {sample['cluster_type']}")


89    Apartamento com 2 Quartos (4 Adultos) - Rua do...
Name: Room, dtype: object is room 89    0
Name: cluster_room, dtype: int32 which is in cluster 89    0
Name: cluster_type, dtype: int32


In [153]:
data_cols.groupby('cluster_room').cluster_type.nunique()

cluster_room
0     1
1     1
2     1
3     2
4     2
5     2
6     2
7     3
8     1
9     2
10    1
11    1
12    1
13    1
14    2
15    1
16    2
17    1
18    1
19    5
20    1
Name: cluster_type, dtype: int64

In [156]:
data_cols[data_cols.cluster_room == 19][['desc', 'n_rooms']]

Unnamed: 0,desc,n_rooms
182,quarto al reserva antecipada,1
195,quarto al reserva antecipada,1
265,quarto al reserva antecipada,1
278,piso terreo terraco al reserva antecipada,1
284,estudio vista rio al reserva antecipada,1
383,quarto al reserva antecipada,1
443,piso terreo al reserva antecipada,1
450,quarto al reserva antecipada,1
457,piso terreo terraco al reserva antecipada,1
459,quarto al reserva antecipada,1


In [157]:
texto = data.Room.iloc[182]

In [158]:
texto

'Apartamento com 2 Quartos e Vista Rio - Escadinhas de Santo Estê - General - Canada POS'