# NLP applied to the descriptions

#### The description column might be important when predicting the price, so in order to add more useful information to our dataset we decided to apply NLP to the description column and obtain a bag-of-words (BoW) representation, which will be used later.

#### We will be working with Spanish text, since we have previously translated all the descriptions into Spanish.

In [9]:
import pandas as pd
import unicodedata
import numpy as np
from tqdm import tqdm
import pickle
import pymongo
from pymongo import MongoClient
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.corpus import webtext
from nltk.probability import FreqDist
from functools import reduce
import matplotlib.pyplot as plt
pd.options.mode.chained_assignment = None

#### We need to define some functions to clean the texts:

In [2]:
# Accent removal helper
def strip_accents(text):
    """Return ``text`` with accents and all other non-ASCII characters removed.

    The text is decomposed with Unicode NFD normalization, so an accented
    letter becomes its base letter followed by a combining mark. Encoding to
    ASCII with ``'ignore'`` then discards those marks, together with any
    character that has no ASCII form.
    """
    try:
        # On Python 2 a byte string is decoded first. On Python 3 the name
        # ``unicode`` does not exist and the text is used as given.
        text = unicode(text, 'utf-8')
    except NameError:
        pass

    decomposed = unicodedata.normalize('NFD', text)
    ascii_only = decomposed.encode('ascii', 'ignore')
    return str(ascii_only.decode("utf-8"))

In [3]:
# Text cleaning: punctuation, extra spaces, numbers, stopwords, accents, lemmatization
def pre_processing_es(text):
    """Clean a pandas Series of Spanish descriptions.

    Each description goes through these steps, in order:

    1. lower-casing;
    2. replacing punctuation and symbols with a space and trimming the ends;
    3. collapsing runs of whitespace into a single space;
    4. deleting digits;
    5. dropping NLTK's Spanish stop words (done before accents are removed,
       so accented stop words still match);
    6. removing accents with ``strip_accents``;
    7. lemmatizing every word with NLTK's ``WordNetLemmatizer``.

    Entries that are missing or are not strings are treated as empty text and
    come back as ``''``.

    Parameters
    ----------
    text : pandas.Series
        The raw descriptions.

    Returns
    -------
    pandas.Series
        The cleaned descriptions, with the same index as ``text``.
    """
    delete_punct = re.compile(r'[\.,_:€¡!*?¿;\-"\(\)\[\]\{\}%\'�=$&+æ<></ \>#@ \ ]')
    delete_2_spaces = re.compile(r'\s{2,}')
    delete_numbers = re.compile(r'[0-9]')

    # A set gives constant-time membership tests; the list returned by NLTK
    # would be scanned from the start for every single token.
    stop = set(stopwords.words('spanish'))

    # NOTE(review): WordNetLemmatizer is based on WordNet, an English lexical
    # database - confirm it is the intended lemmatizer for Spanish text.
    wordnet_lemmatizer = WordNetLemmatizer()

    def _clean(doc):
        """Apply steps 2-7 to a single lower-cased description."""
        doc = delete_punct.sub(" ", doc).strip()
        doc = delete_2_spaces.sub(" ", doc)
        doc = delete_numbers.sub("", doc)
        doc = " ".join(word for word in doc.split() if word not in stop)
        doc = strip_accents(doc)
        return " ".join(wordnet_lemmatizer.lemmatize(word) for word in doc.split())

    # ``.str.lower()`` yields NaN for missing / non-string entries; turn those
    # into empty strings so the regex steps do not fail on a float.
    lowered = text.str.lower().fillna("")
    return lowered.apply(_clean)

#### In our case, we have uploaded all the data to MongoDB, so we will now create a table by retrieving the data we need:

In [10]:
# Connect to the local MongoDB server and select the "alojamientos"
# collection of the "proyecto" database.
client = MongoClient('localhost', 27017)
db = client.proyecto
collection = db.alojamientos
# Load every document that has a "traduccion" field into a DataFrame.
descrip = pd.DataFrame(list(collection.find({ "traduccion" : {"$exists" : True }})))
# Keep only the columns used below. The explicit copy makes `datos` an
# independent frame, so assigning to its columns later never goes through a
# slice of `descrip` (the chained-assignment situation pandas warns about).
datos = descrip[["id", "traduccion", "price", "room_type"]].copy()

In [11]:
# Show the last rows of the table as a quick sanity check of the loaded data.
datos.tail()

Unnamed: 0,id,traduccion,price,room_type
19215,43430853,Este apartamento rezuma exclusividad en todas ...,85.0,Entire home/apt
19216,43430860,Apartamento de un dormitorio en Avda. de Améri...,43.0,Entire home/apt
19217,43431846,Impresionante vivienda exclusiva en pleno cent...,174.0,Entire home/apt
19218,43437149,APARTAMENTO CON DOS HABITACIONES Y DOS BAÑOS. ...,80.0,Entire home/apt
19219,43440208,Los tabiques de Santa Ana I se han reducido a ...,64.0,Entire home/apt


#### Now it's time to pre-process the texts and remove those that are empty:

In [12]:
# Clean every description with the pre-processing function defined above.
datos["traduccion"] = pre_processing_es(datos["traduccion"])
# Remove those with no text: mark empty strings as missing, then drop them.
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on `datos["traduccion"]`: that form mutates the
# intermediate Series returned by the indexing, and under pandas'
# Copy-on-Write behaviour the change does not propagate to `datos`.
datos["traduccion"] = datos["traduccion"].replace('', np.nan)
datos.dropna(subset=["traduccion"], inplace=True)

#### We are going to use spaCy now to tokenize the texts:

In [13]:
import spacy
# If the Spanish model is not installed yet, download it once with:
# import spacy.cli
# spacy.cli.download("es_core_news_lg")
nlp = spacy.load("es_core_news_lg")

# Renumber the rows 0..n-1; the previous labels are kept in an "index" column.
datos.reset_index(inplace = True)

# A token is kept when the part-of-speech tag of its syntactic head is one of
# these. Adding "VERB" would also keep words governed by a verb.
# NOTE(review): the test uses token.head.pos_ (the tag of the word the token
# depends on), not token.pos_ (the tag of the token itself) - confirm that
# this is the intended filter.
pos_validos = {"NOUN", "ADJ"}

columna = []
# nlp.pipe streams the texts through the pipeline in batches and yields one
# Doc per input text, in the same order, so the i-th list belongs to row i.
for doc in tqdm(nlp.pipe(datos["traduccion"]), total=len(datos)):
    columna.append([token.text for token in doc if token.head.pos_ in pos_validos])
datos["Words"] = columna

100%|████████████████████████████████████████████████████████████████████████████| 18769/18769 [06:32<00:00, 47.80it/s]


In [14]:
# Drop the descriptions for which no word was kept. A single boolean mask
# replaces the row-by-row in-place drop, which rebuilt the frame for every
# removed row and only worked when the index was exactly 0..n-1.
sin_palabras = datos.index[datos["Words"].map(len) == 0]
datos.drop(index = sin_palabras, inplace = True)

# Renumber the remaining rows 0..n-1.
datos.reset_index(inplace = True, drop = True)

#### To create the BoW we first need a table with the words that appear in the column and their frequencies:

In [15]:
from collections import Counter

import gensim
from gensim.corpora import Dictionary

# Give every distinct word an integer id, then encode each description as a
# list of (token_id, count) pairs.
dictionary = Dictionary(datos["Words"])
bow_corpus = [dictionary.doc2bow(doc) for doc in datos["Words"]]

# Add up the per-description counts of every token id. Counting directly
# avoids building a dense (descriptions x vocabulary) table only to sum its
# columns.
conteo = Counter()
for doc in bow_corpus:
    for token_id, veces in doc:
        conteo[token_id] += veces

# Build both columns from the same sorted list of ids, so each word is paired
# with its own total rather than relying on two separate orderings agreeing.
token_ids = sorted(dictionary.token2id.values())
vocab = [dictionary[token_id] for token_id in token_ids]
# Frequencies are stored as floats, as in the table printed below.
vocab_tf = [float(conteo[token_id]) for token_id in token_ids]
frecuencias = pd.DataFrame({"palabra" : vocab, "freq" : vocab_tf})

#### There are some words that can be removed based on the fact that they don't provide any useful information when giving an apartment or a room a price.
#### Let's see the most frequent words and analyze which ones can be removed:

In [17]:
# Rank the vocabulary from most to least frequent and keep the first 100
# entries for inspection.
por_frecuencia = frecuencias.sort_values(by="freq", ascending=False)
comunes = por_frecuencia.head(100)
print(comunes)

         palabra    freq
32         metro  7061.0
56   apartamento  6989.0
78        madrid  6955.0
65            do  6919.0
12          cama  5503.0
..           ...     ...
524      privado   980.0
292    ubicacion   976.0
231         mesa   962.0
252     completo   951.0
481       planta   936.0

[100 rows x 2 columns]


In [18]:
# Frequent words judged uninformative for the price; they are removed from
# the vocabulary below.
borrar = [
    "madrid", "do", "barrio", "zona", "plaza", "ciudad",
    "persona", "puede", "cuenta", "si", "estancia", "encuentra",
    "cualquier", "dispone", "m", "puedes", "pueden", "etc",
]

In [19]:
# Remove the hand-picked words from the vocabulary. `isin` selects all
# matching rows at once, instead of testing and dropping one row at a time
# (which also required the index to be exactly 0..n-1).
a_eliminar = frecuencias.index[frecuencias["palabra"].isin(borrar)]
frecuencias.drop(index = a_eliminar, inplace = True)
frecuencias.reset_index(inplace = True, drop = True)

#### We can also remove the less frequent words:

In [20]:
# Words seen fewer than this many times in the whole corpus are discarded.
frecuencia_minima = 50

# Select every rare word with one comparison, instead of testing and dropping
# one row at a time (which also required the index to be exactly 0..n-1).
poco_frecuentes = frecuencias.index[frecuencias["freq"] < frecuencia_minima]
frecuencias.drop(index = poco_frecuentes, inplace = True)

frecuencias.reset_index(inplace = True, drop = True)

#### Let's create the BoW:

In [21]:
# Map each word of the final vocabulary to its column position, built once.
# The dict lookup replaces rebuilding a list of the whole vocabulary for
# every token.
posicion = {palabra: j for j, palabra in enumerate(frecuencias["palabra"])}

# Binary matrix: cell (i, j) is 1 when word j occurs in description i.
# It is filled as a plain NumPy array, which is much cheaper than assigning
# single cells through DataFrame.loc.
matriz = np.zeros((datos.shape[0], len(frecuencias)), dtype=np.uint8)
for fila, tokens in enumerate(tqdm(datos["Words"], total=datos.shape[0])):
    for tok in tokens:
        j = posicion.get(tok)
        # Words removed from the vocabulary have no column and are skipped.
        if j is not None:
            matriz[fila, j] = 1

# Rows carry the same labels as `datos`, so a later column-wise concat with
# `datos` lines up row for row.
BoWMatrix = pd.DataFrame(matriz, index=datos.index, columns=frecuencias.palabra)

16959it [02:50, 99.66it/s] 


In [22]:
# Show the first rows of the binary word-presence matrix.
BoWMatrix.head()

palabra,acogedor,ambiente,autobus,banos,bares,bien,bonito,buena,cafeteria,cama,...,ponzano,diurno,sujeto,establecimiento,cerradas,encima,tale,registros,taquillas,hostel
0,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


#### To merge this matrix with the table containing all the information, we need to have the id of the apartment/room:

In [23]:
# Put the listing id in front of the word columns. The two frames are joined
# side by side and rows are matched by index label.
columna_id = datos.loc[:, ["id"]]
BoW = pd.concat(objs=[columna_id, BoWMatrix], axis="columns")

In [24]:
# Show the first rows of the final table: the id followed by the word columns.
BoW.head()

Unnamed: 0,id,acogedor,ambiente,autobus,banos,bares,bien,bonito,buena,cafeteria,...,ponzano,diurno,sujeto,establecimiento,cerradas,encima,tale,registros,taquillas,hostel
0,21853,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
1,23001,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,24836,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,26825,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,28200,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


#### Now the BoW is ready to be merged with the dataset. This will help to improve the price prediction.