# 4.1 - TFIDF

Voy a aplicar una transformación TF-IDF a los comentarios de las reviews y usar solamente esos vectores como datos para entrenar un modelo que prediga el precio.

In [1]:
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)

import spacy
from spacy.lang.en.stop_words import STOP_WORDS

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk import word_tokenize

from langdetect import detect
from googletrans import Translator

import re

from sklearn.feature_extraction.text import TfidfVectorizer

[nltk_data] Downloading package stopwords to /Users/iudh/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Load the raw listings table from the gzip-compressed CSV.
listings = pd.read_csv(
    '../data/raw_data/listings.csv.gz',
    compression='gzip',
    low_memory=False,
)

# Column summary, with memory usage that also counts the contents of object columns.
listings.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21495 entries, 0 to 21494
Columns: 106 entries, id to reviews_per_month
dtypes: float64(23), int64(21), object(62)
memory usage: 163.8 MB


In [3]:
# Load the raw reviews table from the gzip-compressed CSV.
reviews = pd.read_csv(
    '../data/raw_data/reviews.csv.gz',
    compression='gzip',
    low_memory=False,
)

# Shrink every integer column to the smallest integer dtype that holds its values.
int_columns = reviews.select_dtypes(include='int').columns
for column in int_columns:
    reviews[column] = pd.to_numeric(reviews[column], downcast='integer')

reviews.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 812726 entries, 0 to 812725
Data columns (total 6 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   listing_id     812726 non-null  int32 
 1   id             812726 non-null  int32 
 2   date           812726 non-null  object
 3   reviewer_id    812726 non-null  int32 
 4   reviewer_name  812725 non-null  object
 5   comments       812351 non-null  object
dtypes: int32(3), object(3)
memory usage: 373.0 MB


In [4]:
# One row per listing, taking the first entry of each column within the group
# (in file order; pandas' GroupBy.first picks the first non-null value per column).
primera = reviews.groupby('listing_id', as_index=False).first()

primera.head()

Unnamed: 0,listing_id,id,date,reviewer_id,reviewer_name,comments
0,6369,29428,2010-03-14,84790,Nancy,Simon and Arturo have the ultimate location in...
1,21853,21051116,2014-10-10,8506071,Pedro Abel,"Mi experiencia en casa de Adel fue buena, aunq..."
2,24805,23330835,2014-11-29,9091591,Berk,"During my stay, I enjoyed all around and had a..."
3,24836,294356,2011-06-02,18734,Albert,Incredible location! Tenty and Goyo were very...
4,26825,41524,2010-05-10,45915,Cybill,"Agustina is a great host, she is very thoughtf..."


In [5]:
# Attach each listing's first review, keep only id / price / comment text,
# and discard rows where any of the three is missing.
total = (
    listings
    .merge(primera, left_on='id', right_on='listing_id')
    .loc[:, ['id_x', 'price', 'comments']]
    .dropna()
)

total.head()

Unnamed: 0,id_x,price,comments
0,6369,$70.00,Simon and Arturo have the ultimate location in...
1,21853,$17.00,"Mi experiencia en casa de Adel fue buena, aunq..."
2,24805,$80.00,"During my stay, I enjoyed all around and had a..."
3,24836,$115.00,Incredible location! Tenty and Goyo were very...
4,26825,$25.00,"Agustina is a great host, she is very thoughtf..."


In [6]:
# Prices arrive as text such as '$1,115.00': strip the thousands separator and the
# currency sign, then convert to float.
# NOTE: this overwrites the column, so the cell cannot be re-run on the converted
# floats (they have no .replace method).
total['price'] = total['price'].apply(
    lambda raw: float(raw.replace(',', '').replace('$', ''))
)

total.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
Int64Index: 17202 entries, 0 to 17203
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   id_x      17202 non-null  int64  
 1   price     17202 non-null  float64
 2   comments  17202 non-null  object 
dtypes: float64(1), int64(1), object(1)
memory usage: 5.9 MB


In [7]:
%%time

def detect_lang(x: str) -> str:
    """
    Detect the language of a string.

    param x: input text

    return: language code as returned by ``detect`` (en, es, etc...), or
            'unknown' when x is not a non-blank string or detection fails
    """
    # Missing, non-text or blank values carry nothing to detect: skip the detector.
    if not isinstance(x, str) or not x.strip():
        return 'unknown'

    try:
        return detect(x)
    except Exception:
        # Best-effort labelling: any detector failure maps to 'unknown'.
        # `except Exception` (rather than a bare `except:`) lets KeyboardInterrupt
        # and SystemExit through, so the long-running apply can still be interrupted.
        return 'unknown'

# Label every comment with its detected language code.
total['lang'] = total['comments'].apply(lambda texto: detect_lang(texto))

total.head()

CPU times: user 59.5 s, sys: 424 ms, total: 59.9 s
Wall time: 59.9 s


Unnamed: 0,id_x,price,comments,lang
0,6369,70.0,Simon and Arturo have the ultimate location in...,en
1,21853,17.0,"Mi experiencia en casa de Adel fue buena, aunq...",es
2,24805,80.0,"During my stay, I enjoyed all around and had a...",en
3,24836,115.0,Incredible location! Tenty and Goyo were very...,en
4,26825,25.0,"Agustina is a great host, she is very thoughtf...",en


In [8]:
# The comments will be translated to English; first look at how many there are per language.

total['lang'].value_counts()

es         7962
en         7275
fr          794
pt          293
it          229
de          105
zh-cn        68
ru           66
ro           61
ko           61
unknown      53
ca           42
nl           41
af           16
tl           14
so           14
pl           11
id           10
sk           10
ja            8
cs            7
vi            7
da            7
sl            7
tr            6
cy            6
el            5
sv            5
lt            4
sw            3
bg            2
fi            2
zh-tw         1
he            1
uk            1
hu            1
ar            1
lv            1
hr            1
no            1
Name: lang, dtype: int64

In [9]:
# Pretrained English spaCy pipeline.
nlp = spacy.load('en_core_web_lg')

# English stop words: union of the NLTK list, both spaCy lists, and two extra words.
stop_words_en = (
    set(stopwords.words('english'))
    | set(STOP_WORDS)
    | set(nlp.Defaults.stop_words)
    | {'edit', 'plot'}
)

# Alias read by the tokenizer.
stop_words = stop_words_en

In [None]:
%%time

# One shared client for every request, instead of building a new Translator per row.
translator = Translator()

# Comments already labelled 'en' by the language-detection step are left untouched;
# only the remaining rows are sent to the translation service.
needs_translation = total['lang'] != 'en'

total.loc[needs_translation, 'comments'] = (
    total.loc[needs_translation, 'comments']
    .apply(lambda x: translator.translate(x, dest='en').text)
)

In [None]:
%%time

# Remove named entities (the intention is to keep only adjectives):
# rebuild each comment from the tokens that carry no entity type.

total.comments = total.comments.apply(
    lambda texto: ' '.join(token.text for token in nlp(texto) if not token.ent_type_)
)

In [None]:
# Word -> lemma cache. Each nlp() call runs the spaCy pipeline, so every distinct
# word is lemmatised once and then looked up on later occurrences.
_LEMMA_CACHE = {}

# A lemma is kept only when it consists solely of ASCII letters.
_ONLY_LETTERS = re.compile('^[a-zA-Z]+$')


def _lemma(palabra):
    """Return the lower-cased, stripped lemma of the first spaCy token of `palabra` (cached)."""
    palabra = str(palabra)
    lemma = _LEMMA_CACHE.get(palabra)
    if lemma is None:
        lemma = nlp(palabra)[0].lemma_.lower().strip()
        _LEMMA_CACHE[palabra] = lemma
    return lemma


def spacy_tokenizer(frase):
    """
    Tokenise a sentence into filtered lemmas.

    The sentence is split with NLTK's word_tokenize and each word is lemmatised on
    its own with the spaCy pipeline. A lemma is kept when it is not in `stop_words`,
    contains only ASCII letters and is longer than two characters.

    param frase: input text

    return: list of lemmas, in order of appearance
    """
    return [
        lemma
        for lemma in map(_lemma, word_tokenize(frase))
        if lemma not in stop_words and len(lemma) > 2 and _ONLY_LETTERS.search(lemma)
    ]

In [None]:
# Sanity check: tokenise the comment stored under index label 1.
spacy_tokenizer(total.comments.loc[1])

In [None]:
# The same comment (index label 1) as plain text, for comparison with its tokens.
total.comments.loc[1]

In [None]:
# TF-IDF over uni-, bi- and tri-grams of the custom lemma tokens, keeping only
# terms that appear in at least 5% of the documents.
tfidf = TfidfVectorizer(
    tokenizer=spacy_tokenizer,
    ngram_range=(1, 3),
    min_df=0.05,
)

In [None]:
%%time

# Learn the vocabulary and IDF weights from the comments and vectorise them in one step.
tfidf_matrix = tfidf.fit_transform(total['comments'])

In [None]:
# Dimensions of the TF-IDF matrix: (number of comments, number of retained terms).
tfidf_matrix.shape

In [None]:
# Vocabulary retained by the vectoriser, in column order of the TF-IDF matrix.
terms = tfidf.get_feature_names_out()

# Peek at the first 15 terms and the vocabulary size.
(terms[:15], len(terms))

In [None]:
# Dense copy of the sparse TF-IDF matrix, with one column per term.
dense_weights = tfidf_matrix.toarray()
tfidf_df = pd.DataFrame(data=dense_weights, columns=terms)

tfidf_df.head()

In [None]:
# Summary statistics of the TF-IDF weight of every term.
tfidf_df.describe()

In [None]:
# Build the modelling table: the target (price) next to the TF-IDF features.
#
# `total` still carries its pre-dropna row labels (with gaps), while `tfidf_df` has
# a fresh 0..n-1 index. pd.concat(axis=1) aligns rows by label, so both sides are
# re-indexed positionally first. Without this, features get attached to the wrong
# listing's price and the unmatched rows end up as NaN.
#
# Only `price` is taken from `total`: `id_x` and `comments` are not features, and
# `lang` is a raw string label that does not belong to the TF-IDF vectors.
#
# NOTE: if 'price' is itself one of the TF-IDF terms, the frame will contain two
# 'price' columns; check `terms` if the target selection misbehaves downstream.
assert len(total) == len(tfidf_df), 'expected exactly one TF-IDF row per comment'

data = pd.concat(
    [
        total[['price']].reset_index(drop=True),
        tfidf_df.reset_index(drop=True),
    ],
    axis=1,
)

data.head()

In [None]:
from catboost import CatBoostRegressor as CTR

from sklearn.model_selection import train_test_split as tts

from sklearn.metrics import mean_squared_error as mse 
from sklearn.metrics import mean_absolute_error as mae
from sklearn.metrics import r2_score as r2

In [None]:
# Target: the listing price. Features: every remaining column.
y = data['price']

X = data.drop(columns='price')

In [None]:
# Hold-out split with scikit-learn's default train/test proportions.
# The seed is fixed so the split, and therefore the reported metrics, are
# reproducible across runs.
X_train, X_test, y_train, y_test = tts(X, y, random_state=42)

X_train.head()

In [None]:
# CatBoost regressor with default hyper-parameters; verbose=0 silences the training log.
modelo = CTR(verbose=0)

modelo.fit(X_train, y=y_train)

In [None]:
# Predicted prices for the held-out test rows.
y_pred=modelo.predict(X_test)

In [None]:
# Test-set metrics.
# RMSE is computed as sqrt(MSE) instead of passing `squared=False`, which is
# deprecated and no longer accepted by recent scikit-learn releases.
print(f'RMSE: {np.sqrt(mse(y_test, y_pred))}')
print(f'MAE: {mae(y_test, y_pred)}')
print(f'R2: {r2(y_test, y_pred)}')

In [None]:
# Importance assigned by the fitted model to each feature column.
{columna: importancia for columna, importancia in zip(X.columns, modelo.feature_importances_)}

In [None]:
# Repeat split -> fit -> evaluate on a second random split.
# The seed is fixed so this run is reproducible too.
X_train, X_test, y_train, y_test = tts(X, y, random_state=0)

modelo = CTR(verbose=0)

modelo.fit(X_train, y_train)

y_pred = modelo.predict(X_test)

# RMSE is computed as sqrt(MSE) instead of passing `squared=False`, which is
# deprecated and no longer accepted by recent scikit-learn releases.
print(f'RMSE: {np.sqrt(mse(y_test, y_pred))}')
print(f'MAE: {mae(y_test, y_pred)}')
print(f'R2: {r2(y_test, y_pred)}')

In [None]:
# Top 8 features by importance.
# Selected by rank rather than by a fixed importance cut-off, so the result always
# holds the 8 most important features (or all of them if there are fewer),
# whatever values a particular fit produces.
TOP_N = 8

carac = (
    pd.Series(modelo.feature_importances_, index=X.columns)
    .nlargest(TOP_N)
    .to_dict()
)

carac

De nuevo, el hecho de usar solo la primera review me lleva al subajuste (underfitting) del modelo. Habría que rehacerlo con todas las reviews y comprobar su performance. Además, se debería probar si estos vectores, junto con los datos tabulares que teníamos, mejoran la explicabilidad del precio.