# 4.1 - TFIDF

Voy a intentar una transformación tfidf y usar solamente esos vectores como datos para entrenar un modelo.

In [1]:
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)

import spacy
from spacy.lang.en.stop_words import STOP_WORDS

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk import word_tokenize

from langdetect import detect
from googletrans import Translator

import re

from sklearn.feature_extraction.text import TfidfVectorizer

[nltk_data] Downloading package stopwords to /Users/iudh/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
listings=pd.read_csv('../data/raw_data/listings.csv.gz', compression='gzip', low_memory=False)
    
listings.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21495 entries, 0 to 21494
Columns: 106 entries, id to reviews_per_month
dtypes: float64(23), int64(21), object(62)
memory usage: 163.8 MB


In [3]:
reviews=pd.read_csv('../data/raw_data/reviews.csv.gz', compression='gzip', low_memory=False)

for c in reviews.select_dtypes(include='int'):
    reviews[c]=pd.to_numeric(reviews[c], downcast='integer')

reviews.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 812726 entries, 0 to 812725
Data columns (total 6 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   listing_id     812726 non-null  int32 
 1   id             812726 non-null  int32 
 2   date           812726 non-null  object
 3   reviewer_id    812726 non-null  int32 
 4   reviewer_name  812725 non-null  object
 5   comments       812351 non-null  object
dtypes: int32(3), object(3)
memory usage: 373.0 MB


In [4]:
primera=reviews.groupby('listing_id').first().reset_index()

primera.head()

Unnamed: 0,listing_id,id,date,reviewer_id,reviewer_name,comments
0,6369,29428,2010-03-14,84790,Nancy,Simon and Arturo have the ultimate location in...
1,21853,21051116,2014-10-10,8506071,Pedro Abel,"Mi experiencia en casa de Adel fue buena, aunq..."
2,24805,23330835,2014-11-29,9091591,Berk,"During my stay, I enjoyed all around and had a..."
3,24836,294356,2011-06-02,18734,Albert,Incredible location! Tenty and Goyo were very...
4,26825,41524,2010-05-10,45915,Cybill,"Agustina is a great host, she is very thoughtf..."


In [5]:
total=listings.merge(primera, left_on='id', right_on='listing_id')[['id_x', 'price', 'comments']]

total=total.dropna()

total.head()

Unnamed: 0,id_x,price,comments
0,6369,$70.00,Simon and Arturo have the ultimate location in...
1,21853,$17.00,"Mi experiencia en casa de Adel fue buena, aunq..."
2,24805,$80.00,"During my stay, I enjoyed all around and had a..."
3,24836,$115.00,Incredible location! Tenty and Goyo were very...
4,26825,$25.00,"Agustina is a great host, she is very thoughtf..."


In [6]:
total.price=total.price.apply(lambda x: float(x.replace('$', '').replace(',', '')))

total.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
Int64Index: 17202 entries, 0 to 17203
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   id_x      17202 non-null  int64  
 1   price     17202 non-null  float64
 2   comments  17202 non-null  object 
dtypes: float64(1), int64(1), object(1)
memory usage: 5.9 MB


In [7]:
%%time

def detect_lang(x: str) -> str:
    """
    Función para detectar el idioma de una string.
    
    param x: string de entrada 
    
    return: string con el idioma (en, es, etc...)
    """
    try:
        return detect(x)
    except:
        return 'unknown'

total['lang']=total.comments.apply(detect_lang)

total.head()

CPU times: user 59.5 s, sys: 424 ms, total: 59.9 s
Wall time: 59.9 s


Unnamed: 0,id_x,price,comments,lang
0,6369,70.0,Simon and Arturo have the ultimate location in...,en
1,21853,17.0,"Mi experiencia en casa de Adel fue buena, aunq...",es
2,24805,80.0,"During my stay, I enjoyed all around and had a...",en
3,24836,115.0,Incredible location! Tenty and Goyo were very...,en
4,26825,25.0,"Agustina is a great host, she is very thoughtf...",en


In [8]:
# voy a traducir al ingles 

total.lang.value_counts()

es         7962
en         7275
fr          794
pt          293
it          229
de          105
zh-cn        68
ru           66
ro           61
ko           61
unknown      53
ca           42
nl           41
af           16
tl           14
so           14
pl           11
id           10
sk           10
ja            8
cs            7
vi            7
da            7
sl            7
tr            6
cy            6
el            5
sv            5
lt            4
sw            3
bg            2
fi            2
zh-tw         1
he            1
uk            1
hu            1
ar            1
lv            1
hr            1
no            1
Name: lang, dtype: int64

In [9]:
nlp=spacy.load('en_core_web_lg')   # modelo preentrenado, ingles

# stopwords
stop_words_en=set(stopwords.words('english') + list(STOP_WORDS) + list(nlp.Defaults.stop_words) + ['edit', 'plot'])

stop_words=stop_words_en 

In [10]:
%%time

total.comments=total.comments.apply(lambda x: Translator().translate(x, dest='en').text)

CPU times: user 12min 57s, sys: 34.9 s, total: 13min 32s
Wall time: 1h 9min 49s


In [11]:
%%time

# elimino las entidades nombradas, mi intención es usar solo adjetivos

total.comments=total.comments.apply(lambda x: ' '.join([ent.text for ent in nlp(x) if not ent.ent_type_]))

CPU times: user 2min 40s, sys: 496 ms, total: 2min 41s
Wall time: 2min 42s


In [12]:
def spacy_tokenizer(frase: str) -> list:

    tokens=word_tokenize(frase)  # lista de palabras
    
    filtrado=[]
    
    for palabra in tokens:
        
        lemma=nlp(str(palabra))[0].lemma_.lower().strip()
        
        if lemma not in stop_words and re.search('^[a-zA-Z]+$', lemma) and len(lemma)>2:
            filtrado.append(lemma)
            
    return filtrado

In [13]:
spacy_tokenizer(total.comments[1])

['experience',
 'good',
 'beginning',
 'cost',
 'bit',
 'agree',
 'entrance',
 'end',
 'manage',
 'find',
 'convenient',
 'time',
 'house',
 'especially',
 'kitchen',
 'somewhat',
 'messy',
 'room',
 'clean',
 'tidy',
 'site',
 'quiet',
 'neighborhood',
 'connect']

In [14]:
total.comments[1]

'My experience at was good , although at the beginning it cost us a bit to agree to make the entrance , in the end we managed to find a convenient time for both of us . Although the house and especially the kitchen was somewhat messy , the room was clean and tidy . The site is a quiet neighborhood and well connected .'

In [15]:
tfidf=TfidfVectorizer(min_df=0.05, 
                      tokenizer=spacy_tokenizer, 
                      ngram_range=(1, 3))

In [16]:
%%time

tfidf_matrix=tfidf.fit_transform(total.comments)

CPU times: user 33min 32s, sys: 2.97 s, total: 33min 35s
Wall time: 33min 36s


In [17]:
tfidf_matrix.shape

(17202, 67)

In [18]:
terms=tfidf.get_feature_names_out()

terms[:15], len(terms)

(array(['accommodation', 'apartment', 'area', 'arrival',
        'arrival automate', 'arrival automate post', 'attentive',
        'automate', 'automate post', 'bathroom', 'beautiful', 'bed',
        'cancel', 'center', 'central'], dtype=object),
 67)

In [19]:
tfidf_df=pd.DataFrame(tfidf_matrix.toarray(), columns=terms)

tfidf_df.head()

Unnamed: 0,accommodation,apartment,area,arrival,arrival automate,arrival automate post,attentive,automate,automate post,bathroom,beautiful,bed,cancel,center,central,city,clean,close,comfortable,communication,cozy,definitely,easy,excellent,experience,feel,flat,friendly,good,great,help,helpful,highly,highly recommend,home,host,host cancel,house,kind,kitchen,like,locate,location,lot,metro,need,neighborhood,new,nice,people,perfect,place,post,quiet,recommend,repeat,restaurant,room,small,spacious,station,stay,super,thank,time,walk,welcome
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.191372,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.285344,0.29416,0.0,0.0,0.0,0.0,0.0,0.277626,0.280842,0.0,0.0,0.0,0.0,0.0,0.0,0.294638,0.0,0.360232,0.306736,0.0,0.0,0.305003,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.188623,0.0,0.0,0.0,0.0,0.0,0.0,0.365502,0.0,0.0,0.0,0.258097,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.242328,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.391274,0.0,0.0,0.0,0.234289,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.339605,0.0,0.3966,0.0,0.0,0.0,0.0,0.0,0.0,0.386214,0.0,0.0,0.0,0.0,0.0,0.0,0.338245,0.0,0.0,0.0,0.307374,0.0,0.0,0.0,0.0,0.0,0.0,0.320341,0.0,0.0
2,0.0,0.255545,0.449306,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.300862,0.0,0.0,0.0,0.0,0.0,0.0,0.519191,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.610527,0.0,0.0,0.0,0.0,0.0
3,0.0,0.251864,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.315059,0.398691,0.0,0.0,0.0,0.0,0.0,0.425053,0.0,0.0,0.0,0.452408,0.0,0.308784,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.296528,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.331253,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.427736,0.0,0.0,0.0,0.0,0.0,0.0,0.261784,0.331274,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.51314,0.0,0.405411,0.0,0.0,0.0,0.256082,0.0,0.0,0.0,0.0,0.0,0.0,0.246387,0.0,0.0,0.0,0.0,0.0,0.291226,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [20]:
tfidf_df.describe()

Unnamed: 0,accommodation,apartment,area,arrival,arrival automate,arrival automate post,attentive,automate,automate post,bathroom,beautiful,bed,cancel,center,central,city,clean,close,comfortable,communication,cozy,definitely,easy,excellent,experience,feel,flat,friendly,good,great,help,helpful,highly,highly recommend,home,host,host cancel,house,kind,kitchen,like,locate,location,lot,metro,need,neighborhood,new,nice,people,perfect,place,post,quiet,recommend,repeat,restaurant,room,small,spacious,station,stay,super,thank,time,walk,welcome
count,17202.0,17202.0,17202.0,17202.0,17202.0,17202.0,17202.0,17202.0,17202.0,17202.0,17202.0,17202.0,17202.0,17202.0,17202.0,17202.0,17202.0,17202.0,17202.0,17202.0,17202.0,17202.0,17202.0,17202.0,17202.0,17202.0,17202.0,17202.0,17202.0,17202.0,17202.0,17202.0,17202.0,17202.0,17202.0,17202.0,17202.0,17202.0,17202.0,17202.0,17202.0,17202.0,17202.0,17202.0,17202.0,17202.0,17202.0,17202.0,17202.0,17202.0,17202.0,17202.0,17202.0,17202.0,17202.0,17202.0,17202.0,17202.0,17202.0,17202.0,17202.0,17202.0,17202.0,17202.0,17202.0,17202.0,17202.0
mean,0.026393,0.117402,0.032704,0.026332,0.018542,0.018542,0.031827,0.018628,0.018542,0.01876,0.024753,0.022874,0.019452,0.034803,0.027243,0.025686,0.068659,0.042293,0.053463,0.020901,0.023848,0.022344,0.022109,0.047122,0.021791,0.025854,0.029173,0.032772,0.094773,0.091884,0.021346,0.024543,0.029053,0.028084,0.025208,0.068617,0.017893,0.039694,0.027476,0.017675,0.024357,0.050012,0.083178,0.021072,0.0324,0.040462,0.021588,0.025013,0.06638,0.02176,0.05455,0.075691,0.018713,0.032394,0.070013,0.023313,0.021788,0.047092,0.022501,0.019585,0.020925,0.081992,0.030486,0.040554,0.038179,0.033371,0.024307
std,0.11731,0.16666,0.109288,0.090274,0.078706,0.078706,0.114388,0.078909,0.078706,0.083854,0.101838,0.093374,0.081565,0.111302,0.10854,0.098683,0.132884,0.118124,0.128571,0.094572,0.101233,0.092112,0.094982,0.148945,0.096246,0.095964,0.115649,0.114784,0.185225,0.181281,0.09214,0.098297,0.101061,0.099713,0.099904,0.128876,0.077796,0.132873,0.106422,0.079537,0.097991,0.132852,0.151294,0.090264,0.101834,0.115209,0.091026,0.102819,0.153817,0.096592,0.148706,0.161217,0.078912,0.106549,0.137415,0.103049,0.084627,0.13391,0.097254,0.089861,0.08827,0.151163,0.114146,0.126052,0.115587,0.103358,0.100643
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.231093,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.153738,0.137443,0.0,0.0,0.0,0.0,0.0,0.135618,0.0,0.0,0.0,0.0,0.0,0.0,0.162616,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.115307,0.0,0.0,0.0,0.0,0.0,0.0,0.156732,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,0.871878,0.384791,0.384791,1.0,0.567139,0.384791,1.0,1.0,1.0,0.969156,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.809708,0.641723,1.0,1.0,0.658923,1.0,1.0,1.0,1.0,1.0,1.0,0.859084,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.670646,1.0,1.0,1.0,0.782914,1.0,1.0,0.901195,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [21]:
data=pd.concat([total, tfidf_df], axis=1).drop(columns=['id_x', 'comments'])

data.fillna(0, inplace=True)

data.head()

Unnamed: 0,price,lang,accommodation,apartment,area,arrival,arrival automate,arrival automate post,attentive,automate,automate post,bathroom,beautiful,bed,cancel,center,central,city,clean,close,comfortable,communication,cozy,definitely,easy,excellent,experience,feel,flat,friendly,good,great,help,helpful,highly,highly recommend,home,host,host cancel,house,kind,kitchen,like,locate,location,lot,metro,need,neighborhood,new,nice,people,perfect,place,post,quiet,recommend,repeat,restaurant,room,small,spacious,station,stay,super,thank,time,walk,welcome
0,70.0,en,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.191372,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.285344,0.29416,0.0,0.0,0.0,0.0,0.0,0.277626,0.280842,0.0,0.0,0.0,0.0,0.0,0.0,0.294638,0.0,0.360232,0.306736,0.0,0.0,0.305003,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.188623,0.0,0.0,0.0,0.0,0.0,0.0,0.365502,0.0,0.0,0.0,0.258097,0.0
1,17.0,es,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.242328,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.391274,0.0,0.0,0.0,0.234289,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.339605,0.0,0.3966,0.0,0.0,0.0,0.0,0.0,0.0,0.386214,0.0,0.0,0.0,0.0,0.0,0.0,0.338245,0.0,0.0,0.0,0.307374,0.0,0.0,0.0,0.0,0.0,0.0,0.320341,0.0,0.0
2,80.0,en,0.0,0.255545,0.449306,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.300862,0.0,0.0,0.0,0.0,0.0,0.0,0.519191,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.610527,0.0,0.0,0.0,0.0,0.0
3,115.0,en,0.0,0.251864,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.315059,0.398691,0.0,0.0,0.0,0.0,0.0,0.425053,0.0,0.0,0.0,0.452408,0.0,0.308784,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.296528,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.331253,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,25.0,en,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.427736,0.0,0.0,0.0,0.0,0.0,0.0,0.261784,0.331274,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.51314,0.0,0.405411,0.0,0.0,0.0,0.256082,0.0,0.0,0.0,0.0,0.0,0.0,0.246387,0.0,0.0,0.0,0.0,0.0,0.291226,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [22]:
from catboost import CatBoostRegressor as CTR

from sklearn.model_selection import train_test_split as tts

from sklearn.metrics import mean_squared_error as mse 
from sklearn.metrics import mean_absolute_error as mae
from sklearn.metrics import r2_score as r2

In [26]:
X=data.drop(columns=['price', 'lang'])

y=data.price

In [27]:
X_train, X_test, y_train, y_test=tts(X, y)

X_train.head()

Unnamed: 0,accommodation,apartment,area,arrival,arrival automate,arrival automate post,attentive,automate,automate post,bathroom,beautiful,bed,cancel,center,central,city,clean,close,comfortable,communication,cozy,definitely,easy,excellent,experience,feel,flat,friendly,good,great,help,helpful,highly,highly recommend,home,host,host cancel,house,kind,kitchen,like,locate,location,lot,metro,need,neighborhood,new,nice,people,perfect,place,post,quiet,recommend,repeat,restaurant,room,small,spacious,station,stay,super,thank,time,walk,welcome
9448,0.0,0.168325,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.33034,0.0,0.0,0.0,0.32486,0.0,0.0,0.266452,0.0,0.0,0.0,0.0,0.0,0.28407,0.0,0.0,0.323654,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.198175,0.0,0.0,0.539749,0.0,0.0,0.0,0.0,0.256396,0.0,0.0,0.0,0.0,0.0,0.322834,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8121,0.0,0.247695,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.425606,0.0,0.468671,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.469818,0.0,0.0,0.377498,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.417877,0.0
17121,0.0,0.361808,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.305299,0.0,0.0,0.0,0.324947,0.0,0.0,0.363848,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.290042,0.0,0.353306,0.0,0.0,0.0,0.237926,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.299532,0.299147,0.305196,0.0
11667,0.0,0.251924,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.315135,0.0,0.363007,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.30827,0.0,0.441639,0.477841,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.310609,0.0,0.0,0.0,0.0,0.0,0.0,0.300939,0.0,0.0,0.0,0.0,0.0
14558,0.0,0.241439,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.347898,0.0,0.0,0.0,0.0,0.40746,0.0,0.0,0.0,0.433683,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.412272,0.387098,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.383088,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [28]:
modelo=CTR(verbose=0)

modelo.fit(X_train, y_train)

<catboost.core.CatBoostRegressor at 0x7f88c7ae3fd0>

In [29]:
y_pred=modelo.predict(X_test)

In [30]:
print(f'RMSE: {mse(y_test, y_pred, squared=False)}')
print(f'MAE: {mae(y_test, y_pred)}')
print(f'R2: {r2(y_test, y_pred)}')

RMSE: 348.06956647211814
MAE: 80.61660122510466
R2: -0.1598777292192033


In [31]:
dict(zip(X.columns, modelo.feature_importances_))   

{'accommodation': 0.033900967900881165,
 'apartment': 1.5762995867768077,
 'area': 0.09445145467762729,
 'arrival': 0.03958272697086324,
 'arrival automate': 0.0,
 'arrival automate post': 0.0,
 'attentive': 0.23861164190098255,
 'automate': 0.004365279060863718,
 'automate post': 0.0,
 'bathroom': 0.037272187812747944,
 'beautiful': 0.1103009322486174,
 'bed': 0.1435448378701375,
 'cancel': 0.02084866608534267,
 'center': 2.3102897877923474,
 'central': 2.1179133294443666,
 'city': 0.05498135226246881,
 'clean': 1.1415153224535226,
 'close': 3.2959858503994,
 'comfortable': 0.35147270722525326,
 'communication': 0.8251756991167004,
 'cozy': 0.028696759290856074,
 'definitely': 0.4525679678160304,
 'easy': 3.2260153260718654,
 'excellent': 5.11163124751083,
 'experience': 1.5030442697176853,
 'feel': 0.04390082705771135,
 'flat': 0.47918658780359025,
 'friendly': 0.18392028620866666,
 'good': 2.344901951968637,
 'great': 2.238147096161271,
 'help': 1.1427697903918328,
 'helpful': 0.017

De nuevo, el hecho de usar solo la primera review me lleva al subajuste(underfitting) del modelo. Habría que rehacerlo con todas las reviews y comprobar su performance. Además se debería probar si estos vectores junto con datos tabulares que teniamos mejoran la explicabilidad del precio.