In [1]:
# predict the apartment based on a description

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from unidecode import unidecode
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

In [2]:
# import data
jan_apr_desc = pd.read_csv('dados_afonso/room_say_report/reservas_por_data_criada/jan_apr.csv')
may_june_desc = pd.read_csv('dados_afonso/room_say_report/reservas_por_data_criada/may_june.csv')
jul_aug_desc = pd.read_csv('dados_afonso/room_say_report/reservas_por_data_criada/jul_aug.csv')

data_desc = pd.concat([jan_apr_desc, may_june_desc, jul_aug_desc])

jan_mar_pri = pd.read_csv('dados_afonso/reservation_summary/reservas_por_data_criada/jan_mar.csv')
apr_jun_pri = pd.read_csv('dados_afonso/reservation_summary/reservas_por_data_criada/apr_jun.csv')
jul_set_pri = pd.read_csv('dados_afonso/reservation_summary/reservas_por_data_criada/jul_set.csv')
aug_pri = pd.read_csv('dados_afonso/reservation_summary/reservas_por_data_criada/aug.csv')

data_pri = pd.concat([jan_mar_pri, apr_jun_pri, jul_set_pri, aug_pri])

In [3]:
data = pd.merge(data_desc, data_pri, on='Reservation ID', how='left')

data = data[data['Channel_x'] == 'Booking.com']

In [4]:
data_cols = data[['Reservation ID', 'Room', 'Total Amount']]

In [5]:
data_cols.rename(columns={'Reservation ID' : 'id', 'Room' : 'desc', 'Total Amount': 'price'}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_cols.rename(columns={'Reservation ID' : 'id', 'Room' : 'desc', 'Total Amount': 'price'}, inplace=True)


In [6]:
data_cols.price = data_cols.price.str[:-5]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_cols.price = data_cols.price.str[:-5]


In [7]:
data_cols.price = data_cols.price.astype('float')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_cols.price = data_cols.price.astype('float')


In [None]:
# use nlp to cluster appartments based on descriptions

# clean data but leave numbers
data_cols = data_cols.drop_duplicates(subset='id', keep=False)
data = data.drop_duplicates(subset='Reservation ID', keep=False)

def clean_text(text):
    text = unidecode(text)
    text = text.lower()
    return text

data_cols['desc'] = data_cols['desc'].apply(clean_text)

# select number before "quarto" and make it a column
def add_n_rooms(text):
    n_rooms = re.findall(r'(\d+)\s+quarto', text)
    if len(n_rooms) > 0:
        return int(n_rooms[0])
    else:
        return 1
    
data_cols['n_rooms'] = data_cols['desc'].apply(add_n_rooms)

# add a columns for road
def add_road(text):
    rua = re.findall(r'rua\s+(\w+)', text)
    if len(rua) > 0:
        return rua[0]
    else:
        return 'no_road'
    
data_cols['road'] = data_cols['desc'].apply(add_road)

# remove stuff that I've noticed is not relevant
def remove_manually(text):
    text = re.sub(r'\d{1,2}%', '', text)
    text = text.replace('nao reembolsavel', '')
    text = text.replace('()', '')
    text = text.replace('-', '')
    text = text.replace('gener', '')
    text = text.replace('oferta', '')
    text = text.replace('apartamento', '')
    text = text.replace('com', '')
    text = text.replace('high season', '')
    text = text.replace('night', '')
    text = re.sub(r'\d{1,2} adultos', '', text)
    text = re.sub(r'\d{1,2} de \w{3} de', '', text)
    text = re.sub(r'\w{3} \d{1,2}', '', text)
    text = re.sub(r'\d', '', text)
    return text


data_cols['desc'] = data_cols['desc'].apply(remove_manually)


# lemmatize data
lemmatizer = WordNetLemmatizer()
def lemmatize_text(text):
    text = [lemmatizer.lemmatize(word) for word in text.split()]
    return " ".join(text)

data_cols['desc'] = data_cols['desc'].apply(lemmatize_text)

# stem data
stemmer = PorterStemmer()
def stem_text(text):
    text = [stemmer.stem(word) for word in text.split()]
    return " ".join(text)

data_cols['desc'] = data_cols['desc'].apply(stem_text)

# remove stopwords
stop_words = set(stopwords.words('portuguese'))

def remove_stopwords(text):
    text = [word for word in text.split() if word not in stop_words]
    return " ".join(text)

data_cols['desc'] = data_cols['desc'].apply(remove_stopwords)

# vectorize data
vectorizer = TfidfVectorizer()

# one hot encode road
road_ohe = pd.get_dummies(data_cols['road'])

# define X, which contains a vector for each description and a price
vectors = vectorizer.fit_transform(data_cols['desc'])
vectors_df = pd.DataFrame(vectors.toarray())
price = data_cols['price']
n_rooms = data_cols['n_rooms']
road_ohe = pd.get_dummies(data_cols['road'])
X = np.column_stack((vectors.toarray(), n_rooms, road_ohe))


# cluster data
kmeans_room = KMeans(n_clusters=21, random_state=0).fit(X)
kmeans_type = KMeans(n_clusters=14, random_state=0).fit(X)

# add cluster column to data
data_cols['cluster_room'] = kmeans_room.labels_
data_cols['cluster_type'] = kmeans_type.labels_

# add cluster column to data
data['cluster_room'] = kmeans_room.labels_
data['cluster_type'] = kmeans_type.labels_
