# 4.1 - TFIDF

Voy a intentar una transformación tfidf y usar solamente esos vectores como datos para entrenar un modelo.

In [None]:
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)

import spacy
from spacy.lang.en.stop_words import STOP_WORDS

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk import word_tokenize

from langdetect import detect
from googletrans import Translator

import re

from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Load the raw listings table from the gzip-compressed CSV.
listings=pd.read_csv('../data/raw_data/listings.csv.gz', compression='gzip', low_memory=False)

# Downcast numeric columns to the smallest dtype that can hold their values.
# 'integer' selects every integer width; the 'int' alias resolves to a single
# platform-dependent dtype and can silently skip 64-bit columns.
for c in listings.select_dtypes(include='integer'):
    listings[c]=pd.to_numeric(listings[c], downcast='integer')

for c in listings.select_dtypes(include='float'):
    listings[c]=pd.to_numeric(listings[c], downcast='float')

# Report the resulting dtypes and the real (deep) memory footprint.
listings.info(memory_usage='deep')

In [None]:
# Load the raw reviews table from the gzip-compressed CSV.
reviews=pd.read_csv('../data/raw_data/reviews.csv.gz', compression='gzip', low_memory=False)

# Downcast integer columns to the smallest dtype that can hold their values.
# 'integer' selects every integer width; the 'int' alias resolves to a single
# platform-dependent dtype and can silently skip 64-bit columns.
for c in reviews.select_dtypes(include='integer'):
    reviews[c]=pd.to_numeric(reviews[c], downcast='integer')

# Report the resulting dtypes and the real (deep) memory footprint.
reviews.info(memory_usage='deep')

In [None]:
# One row per listing: for every listing_id, the first entry of each column
# (GroupBy.first skips missing values by default).
primera = reviews.groupby('listing_id', as_index=False).first()

primera.head()

In [None]:
# Join every listing with its first review and keep only the listing id,
# the price and the review text.
total=listings.merge(primera, left_on='id', right_on='listing_id')[['id_x', 'price', 'comments']]

# Drop incomplete rows and renumber the index 0..n-1 so that later
# column-wise concatenation with other 0..n-1 indexed frames lines up row by row.
total=total.dropna().reset_index(drop=True)

# NOTE: the former `dd.from_pandas(total, npartitions=5)` call was removed:
# `dd` is never imported (NameError) and the following cells use the pandas
# API on `total` (`.info()`, label indexing, `pd.concat`).

total.head()

In [None]:
# Turn the price strings into floats by stripping the '$' sign and the ','
# separators before converting.
total['price'] = [float(raw.replace('$', '').replace(',', '')) for raw in total['price']]

total.info()

In [None]:
# Preview the first rows of `total`.
total.head()

In [None]:
%%time
def detect_lang(x):
    """Return whatever ``detect`` reports for ``x``, or ``'unknown'`` on failure.

    Detection is best-effort: any error raised by ``detect`` is mapped to
    ``'unknown'`` so that a single bad text does not abort a column-wise apply.
    """
    try:
        return detect(x)
    except Exception:
        # A bare ``except:`` would also swallow KeyboardInterrupt/SystemExit,
        # making a long-running apply impossible to interrupt.
        return 'unknown'

# Tag every review with the result of detect_lang.
total['lang'] = total['comments'].map(detect_lang)

total.head()

In [None]:
# The reviews are going to be translated to English; count how many reviews
# were detected in each language.

total.lang.value_counts()

In [None]:
# Pretrained English spaCy pipeline.
nlp = spacy.load('en_core_web_sm')

# Stop words: union of the NLTK English list, both spaCy lists and two extra words.
stop_words_en = (
    set(stopwords.words('english'))
    | set(STOP_WORDS)
    | set(nlp.Defaults.stop_words)
    | {'edit', 'plot'}
)

stop_words = stop_words_en

In [None]:
%%time

# Translate the reviews to English.
# A single Translator is reused instead of building a new client for every
# row, and only reviews whose detected language is not already English are
# sent (rows tagged 'unknown' are still translated).
translator = Translator()
needs_translation = total.lang != 'en'

total.loc[needs_translation, 'comments'] = total.loc[needs_translation, 'comments'].apply(
    lambda x: translator.translate(x, dest='en').text
)

In [None]:
%%time

# Remove named entities (the stated intention is to work with adjectives only):
# rebuild each review from the tokens that spaCy left without an entity type.

total['comments'] = total['comments'].map(
    lambda texto: ' '.join(tok.text for tok in nlp(texto) if tok.ent_type_ == '')
)

In [None]:
def spacy_tokenizer(frase):
    """Turn ``frase`` into a list of cleaned, lower-cased lemmas.

    The text is split with ``word_tokenize``; every word is then run through
    the spaCy pipeline on its own and the lemma of its first token is taken.
    A lemma is kept only when it is not in ``stop_words``, consists solely of
    ASCII letters and is longer than two characters.
    """
    lemmas = (
        nlp(str(word))[0].lemma_.lower().strip()
        for word in word_tokenize(frase)
    )

    return [
        lemma
        for lemma in lemmas
        if lemma not in stop_words
        and re.search('^[a-zA-Z]+$', lemma)
        and len(lemma) > 2
    ]

In [None]:
# Try the tokenizer on the review stored under index label 1.
spacy_tokenizer(total.comments[1])

In [None]:
# Raw text of the review stored under index label 1.
total.comments[1]

In [None]:
# TF-IDF vectorizer driven by the custom lemmatizing tokenizer.
# min_df=0.05 ignores terms that appear in fewer than 5% of the documents.
# token_pattern=None: the default regex is not used once a tokenizer is given,
# and leaving it set only triggers a UserWarning when fitting.
tfidf=TfidfVectorizer(min_df=0.05, 
                      tokenizer=spacy_tokenizer, 
                      token_pattern=None,
                      #stop_words=stop_words,
                      #ngram_range=(1, 1)
                     )

In [None]:
%%time

# Fit the vectorizer on the review texts and build the document-term matrix.
tfidf_matrix=tfidf.fit_transform(total.comments)

In [None]:
# Dimensions of the document-term matrix.
tfidf_matrix.shape

In [None]:
# Feature names learned by the vectorizer.
terms=tfidf.get_feature_names_out()

# First 15 terms and the total number of terms.
terms[:15], len(terms)

In [None]:
# Densify the TF-IDF matrix into a DataFrame with one column per term.
tfidf_df=pd.DataFrame(tfidf_matrix.toarray(), columns=terms)

tfidf_df.head()

In [None]:
# Summary statistics of every TF-IDF column.
tfidf_df.describe()

In [None]:
# Put the price next to the TF-IDF features.
# `total` is re-indexed to 0..n-1 first: concat(axis=1) aligns on index labels,
# so any gaps left by an earlier dropna() would pair rows with the wrong
# vectors and leave NaN prices that fillna(0) then turns into zeros.
# 'lang' is dropped together with the id and the raw text: it is a string
# column, and the model is meant to be trained on the TF-IDF vectors only.
data=pd.concat([total.reset_index(drop=True), tfidf_df], axis=1).drop(columns=['id_x', 'comments', 'lang'])

data.fillna(0, inplace=True)

data.head()

In [None]:
from catboost import CatBoostRegressor as CTR

from sklearn.model_selection import train_test_split as tts

from sklearn.metrics import mean_squared_error as mse 
from sklearn.metrics import mean_absolute_error as mae
from sklearn.metrics import r2_score as r2

In [None]:
# Target: the price column. Features: every other column of `data`.
y = data['price']

X = data.drop(columns='price')

In [None]:
# Train/test split with train_test_split's default settings; no random_state
# is passed, so the partition is not fixed between runs.
X_train, X_test, y_train, y_test=tts(X, y)

X_train.head()

In [None]:
# CatBoost regressor with training output silenced (verbose=0).
modelo=CTR(verbose=0)

# Fit on the training split.
modelo.fit(X_train, y_train)

In [None]:
# Predict prices for the held-out test split.
y_pred=modelo.predict(X_test)

In [None]:
# Evaluation on the test split.
# RMSE is computed as sqrt(MSE): the `squared=False` shortcut of
# mean_squared_error is deprecated and no longer accepted by recent
# scikit-learn releases.
print(f'RMSE: {np.sqrt(mse(y_test, y_pred))}')
print(f'MAE: {mae(y_test, y_pred)}')
print(f'R2: {r2(y_test, y_pred)}')

In [None]:
# Map every feature name to the importance assigned by the fitted model.
dict(zip(X.columns, modelo.feature_importances_))   

In [None]:
# Repeat the whole experiment on a fresh split: split, fit, predict, evaluate.
X_train, X_test, y_train, y_test=tts(X, y)

modelo=CTR(verbose=0)

modelo.fit(X_train, y_train)

y_pred=modelo.predict(X_test)

# RMSE is computed as sqrt(MSE): the `squared=False` shortcut of
# mean_squared_error is deprecated and no longer accepted by recent
# scikit-learn releases.
print(f'RMSE: {np.sqrt(mse(y_test, y_pred))}')
print(f'MAE: {mae(y_test, y_pred)}')
print(f'R2: {r2(y_test, y_pred)}')

In [None]:
# Top 8 features, ordered from most to least important.
# Ranking and slicing always yields (at most) eight entries; a fixed
# importance threshold only does so for one particular fit.

carac=dict(sorted(zip(X.columns, modelo.feature_importances_),
                  key=lambda par: par[1],
                  reverse=True)[:8])

carac

De nuevo, el hecho de usar solo la primera review me lleva al subajuste (underfitting) del modelo. Habría que rehacerlo con todas las reviews y comprobar su performance. Además, se debería probar si estos vectores, junto con los datos tabulares que teníamos, mejoran la explicabilidad del precio.