In [1]:
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)

import spacy
from spacy.lang.en.stop_words import STOP_WORDS

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk import word_tokenize

from langdetect import detect
from googletrans import Translator

import re

from sklearn.feature_extraction.text import TfidfVectorizer

[nltk_data] Downloading package stopwords to /Users/iudh/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
listings=pd.read_csv('../data/raw_data/listings.csv.gz', compression='gzip', low_memory=False)


for c in listings.select_dtypes(include='int'):
    listings[c]=pd.to_numeric(listings[c], downcast='integer')

for c in listings.select_dtypes(include='float'):
    listings[c]=pd.to_numeric(listings[c], downcast='float')
    
listings.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21495 entries, 0 to 21494
Columns: 106 entries, id to reviews_per_month
dtypes: float32(22), float64(1), int16(8), int32(5), int64(1), int8(7), object(62)
memory usage: 159.6 MB


In [3]:
reviews=pd.read_csv('../data/raw_data/reviews.csv.gz', compression='gzip', low_memory=False)

for c in reviews.select_dtypes(include='int'):
    reviews[c]=pd.to_numeric(reviews[c], downcast='integer')

reviews.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 812726 entries, 0 to 812725
Data columns (total 6 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   listing_id     812726 non-null  int32 
 1   id             812726 non-null  int32 
 2   date           812726 non-null  object
 3   reviewer_id    812726 non-null  int32 
 4   reviewer_name  812725 non-null  object
 5   comments       812351 non-null  object
dtypes: int32(3), object(3)
memory usage: 373.0 MB


In [4]:
primera=reviews.groupby('listing_id').first().reset_index()

primera.head()

Unnamed: 0,listing_id,id,date,reviewer_id,reviewer_name,comments
0,6369,29428,2010-03-14,84790,Nancy,Simon and Arturo have the ultimate location in...
1,21853,21051116,2014-10-10,8506071,Pedro Abel,"Mi experiencia en casa de Adel fue buena, aunq..."
2,24805,23330835,2014-11-29,9091591,Berk,"During my stay, I enjoyed all around and had a..."
3,24836,294356,2011-06-02,18734,Albert,Incredible location! Tenty and Goyo were very...
4,26825,41524,2010-05-10,45915,Cybill,"Agustina is a great host, she is very thoughtf..."


In [None]:
total=listings.merge(primera, left_on='id', right_on='listing_id')[['id_x', 'price', 'comments']]

total=total.dropna()

total=dd.from_pandas(total, npartitions=5)

total.head()

In [5]:
total.price=total.price.apply(lambda x: float(x.replace('$', '').replace(',', '')))

total.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 17202 entries, 0 to 17203
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   id_x      17202 non-null  int32  
 1   price     17202 non-null  float64
 2   comments  17202 non-null  object 
dtypes: float64(1), int32(1), object(1)
memory usage: 470.4+ KB


In [6]:
total.head()

Unnamed: 0,id_x,price,comments
0,6369,70.0,Simon and Arturo have the ultimate location in...
1,21853,17.0,"Mi experiencia en casa de Adel fue buena, aunq..."
2,24805,80.0,"During my stay, I enjoyed all around and had a..."
3,24836,115.0,Incredible location! Tenty and Goyo were very...
4,26825,25.0,"Agustina is a great host, she is very thoughtf..."


In [7]:
%%time
def detect_lang(x):
    try:
        return detect(x)
    except:
        return 'unknown'

#total['lang']=total.comments.apply(detect_lang)

total.head()

CPU times: user 37 µs, sys: 7 µs, total: 44 µs
Wall time: 41 µs


Unnamed: 0,id_x,price,comments
0,6369,70.0,Simon and Arturo have the ultimate location in...
1,21853,17.0,"Mi experiencia en casa de Adel fue buena, aunq..."
2,24805,80.0,"During my stay, I enjoyed all around and had a..."
3,24836,115.0,Incredible location! Tenty and Goyo were very...
4,26825,25.0,"Agustina is a great host, she is very thoughtf..."


In [8]:
#total.lang.value_counts()

In [9]:
nlp=spacy.load('en_core_web_sm')   # modelo preentrenado, ingles

# stopwords
stop_words_en=set(stopwords.words('english') + list(STOP_WORDS) + list(nlp.Defaults.stop_words) + ['edit', 'plot'])

stop_words=stop_words_en 

In [11]:
%%time

total.comments=total.comments.apply(lambda x: Translator().translate(x, dest='en').text)

CPU times: user 3min 57s, sys: 27.2 s, total: 4min 24s
Wall time: 58min 35s


In [12]:
%%time

total.comments=total.comments.apply(lambda x: ' '.join([ent.text for ent in nlp(x) if not ent.ent_type_]))

CPU times: user 2min 32s, sys: 643 ms, total: 2min 32s
Wall time: 2min 33s


In [13]:
def spacy_tokenizer(frase):

    tokens=word_tokenize(frase)  # lista de palabras
    
    filtrado=[]
    
    for palabra in tokens:
        
        lemma=nlp(str(palabra))[0].lemma_.lower().strip()
        
        if lemma not in stop_words and re.search('^[a-zA-Z]+$', lemma) and len(lemma)>2:
            filtrado.append(lemma)
            
    return filtrado

In [14]:
spacy_tokenizer(total.comments[1])

['experience',
 'house',
 'good',
 'begin',
 'cost',
 'bit',
 'agree',
 'entrance',
 'end',
 'manage',
 'find',
 'convenient',
 'time',
 'house',
 'especially',
 'kitchen',
 'somewhat',
 'messy',
 'room',
 'clean',
 'tidy',
 'site',
 'quiet',
 'neighborhood',
 'connect']

In [15]:
total.comments[1]

"My experience at 's house was good , although at the beginning it cost us a bit to agree to make the entrance , in the end we managed to find a convenient time for both of us . Although the house and especially the kitchen was somewhat messy , the room was clean and tidy . The site is a quiet neighborhood and well connected ."

In [16]:
tfidf=TfidfVectorizer(min_df=0.05, 
                      tokenizer=spacy_tokenizer, 
                      #stop_words=stop_words,
                      #ngram_range=(1, 1)
                     )

In [17]:
%%time

tfidf_matrix=tfidf.fit_transform(total.comments)

CPU times: user 27min 36s, sys: 2.48 s, total: 27min 38s
Wall time: 27min 38s


In [18]:
tfidf_matrix.shape

(17202, 63)

In [19]:
terms=tfidf.get_feature_names_out()

terms[:15], len(terms)

(array(['accommodation', 'apartment', 'area', 'arrival', 'attentive',
        'automate', 'bathroom', 'beautiful', 'bed', 'cancel', 'center',
        'central', 'city', 'clean', 'close'], dtype=object),
 63)

In [20]:
tfidf_df=pd.DataFrame(tfidf_matrix.toarray(), columns=terms)

tfidf_df.head()

Unnamed: 0,accommodation,apartment,area,arrival,attentive,automate,bathroom,beautiful,bed,cancel,center,central,city,clean,close,comfortable,communication,cozy,definitely,easy,equip,excellent,experience,feel,flat,friendly,good,great,help,helpful,highly,home,host,house,kind,kitchen,like,locate,location,lot,metro,need,neighborhood,new,nice,people,perfect,place,post,quiet,recommend,repeat,restaurant,room,small,spacious,station,stay,super,thank,time,walk,welcome
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.20086,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.296765,0.306338,0.0,0.0,0.0,0.0,0.0,0.288985,0.0,0.0,0.0,0.0,0.0,0.306765,0.0,0.376163,0.319619,0.0,0.0,0.317564,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.196776,0.0,0.0,0.0,0.0,0.0,0.0,0.380492,0.0,0.0,0.0,0.268568,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.209592,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.335624,0.0,0.0,0.0,0.216001,0.0,0.0,0.0,0.0,0.0,0.0,0.584324,0.0,0.340663,0.0,0.0,0.0,0.0,0.0,0.0,0.331369,0.0,0.0,0.0,0.0,0.0,0.0,0.290718,0.0,0.0,0.0,0.263735,0.0,0.0,0.0,0.0,0.0,0.0,0.274604,0.0,0.0
2,0.0,0.255669,0.449461,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.301644,0.0,0.0,0.0,0.0,0.0,0.0,0.518891,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.610231,0.0,0.0,0.0,0.0,0.0
3,0.0,0.251769,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.317225,0.398585,0.0,0.0,0.0,0.0,0.0,0.0,0.424584,0.0,0.0,0.0,0.452021,0.0,0.307782,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.297043,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.330986,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.427686,0.0,0.0,0.0,0.0,0.0,0.0,0.263714,0.33135,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.511729,0.0,0.405029,0.0,0.0,0.256647,0.0,0.0,0.0,0.0,0.0,0.246937,0.0,0.0,0.0,0.0,0.0,0.291524,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [21]:
tfidf_df.describe()

Unnamed: 0,accommodation,apartment,area,arrival,attentive,automate,bathroom,beautiful,bed,cancel,center,central,city,clean,close,comfortable,communication,cozy,definitely,easy,equip,excellent,experience,feel,flat,friendly,good,great,help,helpful,highly,home,host,house,kind,kitchen,like,locate,location,lot,metro,need,neighborhood,new,nice,people,perfect,place,post,quiet,recommend,repeat,restaurant,room,small,spacious,station,stay,super,thank,time,walk,welcome
count,17202.0,17202.0,17202.0,17202.0,17202.0,17202.0,17202.0,17202.0,17202.0,17202.0,17202.0,17202.0,17202.0,17202.0,17202.0,17202.0,17202.0,17202.0,17202.0,17202.0,17202.0,17202.0,17202.0,17202.0,17202.0,17202.0,17202.0,17202.0,17202.0,17202.0,17202.0,17202.0,17202.0,17202.0,17202.0,17202.0,17202.0,17202.0,17202.0,17202.0,17202.0,17202.0,17202.0,17202.0,17202.0,17202.0,17202.0,17202.0,17202.0,17202.0,17202.0,17202.0,17202.0,17202.0,17202.0,17202.0,17202.0,17202.0,17202.0,17202.0,17202.0,17202.0,17202.0
mean,0.026486,0.117365,0.032733,0.033017,0.031909,0.026091,0.01873,0.024718,0.022878,0.026868,0.034942,0.027323,0.025858,0.067726,0.042491,0.053589,0.021164,0.022622,0.022453,0.02228,0.018723,0.047383,0.021879,0.026019,0.029217,0.032806,0.085852,0.092684,0.021484,0.024636,0.031674,0.025279,0.073008,0.039409,0.027563,0.017418,0.024396,0.050289,0.083205,0.021126,0.023864,0.040472,0.021653,0.025087,0.066278,0.021775,0.054645,0.07629,0.026156,0.032209,0.071447,0.02343,0.021879,0.047173,0.022359,0.019623,0.02172,0.082323,0.027364,0.040617,0.038399,0.033577,0.024337
std,0.117497,0.16671,0.109741,0.112973,0.114611,0.110479,0.083982,0.101841,0.093448,0.11197,0.112001,0.109013,0.099327,0.132511,0.11895,0.128885,0.09494,0.098786,0.092529,0.095647,0.086293,0.149832,0.0966,0.096371,0.116001,0.114859,0.184933,0.182036,0.092685,0.09867,0.112336,0.100284,0.135719,0.132652,0.107055,0.078529,0.098042,0.133489,0.15171,0.090671,0.093103,0.115112,0.091383,0.102788,0.154076,0.096687,0.148952,0.16271,0.110322,0.106144,0.140542,0.103564,0.085,0.134176,0.0966,0.089799,0.091293,0.151702,0.108117,0.12638,0.116041,0.103868,0.100736
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.231505,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.141755,0.0,0.0,0.0,0.0,0.131325,0.0,0.0,0.0,0.0,0.0,0.162207,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.110945,0.0,0.0,0.0,0.0,0.0,0.0,0.158188,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,0.869172,1.0,0.566767,1.0,1.0,1.0,0.969153,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.901309,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.826572,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.859297,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.670329,1.0,1.0,1.0,0.783148,1.0,1.0,0.900966,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [40]:
data=pd.concat([total, tfidf_df], axis=1).drop(columns=['id_x', 'comments'])

data.fillna(0, inplace=True)

data.head()

Unnamed: 0,price,accommodation,apartment,area,arrival,attentive,automate,bathroom,beautiful,bed,cancel,center,central,city,clean,close,comfortable,communication,cozy,definitely,easy,equip,excellent,experience,feel,flat,friendly,good,great,help,helpful,highly,home,host,house,kind,kitchen,like,locate,location,lot,metro,need,neighborhood,new,nice,people,perfect,place,post,quiet,recommend,repeat,restaurant,room,small,spacious,station,stay,super,thank,time,walk,welcome
0,70.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.20086,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.296765,0.306338,0.0,0.0,0.0,0.0,0.0,0.288985,0.0,0.0,0.0,0.0,0.0,0.306765,0.0,0.376163,0.319619,0.0,0.0,0.317564,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.196776,0.0,0.0,0.0,0.0,0.0,0.0,0.380492,0.0,0.0,0.0,0.268568,0.0
1,17.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.209592,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.335624,0.0,0.0,0.0,0.216001,0.0,0.0,0.0,0.0,0.0,0.0,0.584324,0.0,0.340663,0.0,0.0,0.0,0.0,0.0,0.0,0.331369,0.0,0.0,0.0,0.0,0.0,0.0,0.290718,0.0,0.0,0.0,0.263735,0.0,0.0,0.0,0.0,0.0,0.0,0.274604,0.0,0.0
2,80.0,0.0,0.255669,0.449461,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.301644,0.0,0.0,0.0,0.0,0.0,0.0,0.518891,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.610231,0.0,0.0,0.0,0.0,0.0
3,115.0,0.0,0.251769,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.317225,0.398585,0.0,0.0,0.0,0.0,0.0,0.0,0.424584,0.0,0.0,0.0,0.452021,0.0,0.307782,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.297043,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.330986,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,25.0,0.0,0.0,0.0,0.0,0.0,0.0,0.427686,0.0,0.0,0.0,0.0,0.0,0.0,0.263714,0.33135,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.511729,0.0,0.405029,0.0,0.0,0.256647,0.0,0.0,0.0,0.0,0.0,0.246937,0.0,0.0,0.0,0.0,0.0,0.291524,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [41]:
from catboost import CatBoostRegressor as CTR

from sklearn.model_selection import train_test_split as tts

from sklearn.metrics import mean_squared_error as mse 
from sklearn.metrics import mean_absolute_error as mae
from sklearn.metrics import r2_score as r2

In [42]:
X=data.drop('price', axis=1)

y=data.price

In [43]:
X_train, X_test, y_train, y_test=tts(X, y)

X_train.head()

Unnamed: 0,accommodation,apartment,area,arrival,attentive,automate,bathroom,beautiful,bed,cancel,center,central,city,clean,close,comfortable,communication,cozy,definitely,easy,equip,excellent,experience,feel,flat,friendly,good,great,help,helpful,highly,home,host,house,kind,kitchen,like,locate,location,lot,metro,need,neighborhood,new,nice,people,perfect,place,post,quiet,recommend,repeat,restaurant,room,small,spacious,station,stay,super,thank,time,walk,welcome
6857,0.0,0.297622,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.428505,0.611036,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.386465,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.452862,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
754,0.0,0.158172,0.278064,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.099647,0.0,0.0,0.0,0.0,0.15577,0.0,0.0,0.0,0.159567,0.441678,0.0,0.0,0.102694,0.386724,0.158895,0.153045,0.143366,0.0,0.193953,0.0,0.0,0.0,0.304373,0.120337,0.093308,0.0,0.0,0.0,0.157544,0.0,0.110156,0.0,0.0,0.0,0.0,0.138217,0.195242,0.0,0.0,0.0,0.0,0.164221,0.0,0.283144,0.0,0.0,0.130556,0.133237,0.155043
2655,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.720887,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.693053,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2080,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.411048,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.366541,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.334293,0.0,0.0,0.0,0.0,0.635269,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.249055,0.0,0.345456,0.0,0.0,0.0
14950,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.360969,0.0,0.0,0.0,0.0,0.0,0.379209,0.0,0.0,0.389394,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.678512,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.246031,0.0,0.0,0.231007,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [44]:
modelo=CTR(verbose=0)

modelo.fit(X_train, y_train)

<catboost.core.CatBoostRegressor at 0x2abfad5e0>

In [46]:
y_pred=modelo.predict(X_test)

In [47]:
print(f'RMSE: {mse(y_test, y_pred, squared=False)}')
print(f'MAE: {mae(y_test, y_pred)}')
print(f'R2: {r2(y_test, y_pred)}')

RMSE: 380.8813401956164
MAE: 80.051527323711
R2: -0.040668755683688884


In [49]:
dict(zip(X.columns, modelo.feature_importances_))   

{'accommodation': 0.0689455957461982,
 'apartment': 2.583348704864902,
 'area': 1.6232290205670838,
 'arrival': 0.030497308811408882,
 'attentive': 0.08761519378847529,
 'automate': 0.0038273834884368758,
 'bathroom': 0.32965533911625927,
 'beautiful': 0.15096830455477275,
 'bed': 0.20133760410017237,
 'cancel': 0.00011795794668213536,
 'center': 3.6274906714724726,
 'central': 1.7239283641876875,
 'city': 0.18402455012468374,
 'clean': 0.5959314669323307,
 'close': 2.3108264560679026,
 'comfortable': 0.8081552384873124,
 'communication': 1.1973749144954748,
 'cozy': 0.04417851465354841,
 'definitely': 2.0938733396469256,
 'easy': 3.116799032931333,
 'equip': 0.07797860802423723,
 'excellent': 0.28527127327182655,
 'experience': 0.09700316989107358,
 'feel': 0.23327040724215464,
 'flat': 0.7876342527547727,
 'friendly': 4.022096625676751,
 'good': 0.39428807517305997,
 'great': 4.313213074630407,
 'help': 2.0633272225230543,
 'helpful': 0.6514906310631602,
 'highly': 8.384554163205495,

In [69]:
# top 10

carac={k:v for k,v in zip(X.columns, modelo.feature_importances_) if v>7.5}

carac

{'apartment': 11.771411303610563,
 'friendly': 10.2445083155243,
 'highly': 8.054403393692843,
 'locate': 9.465872595667458,
 'location': 9.297442177840352,
 'nice': 8.639297701699562,
 'place': 8.052983379515114,
 'repeat': 8.14408259241404}

In [72]:
X_train, X_test, y_train, y_test=tts(X, y)

modelo=CTR(verbose=0)

modelo.fit(X_train, y_train)

y_pred=modelo.predict(X_test)

print(f'RMSE: {mse(y_test, y_pred, squared=False)}')
print(f'MAE: {mae(y_test, y_pred)}')
print(f'R2: {r2(y_test, y_pred)}')

RMSE: 305.9182889204606
MAE: 76.46723845308809
R2: -0.03997772148661416
