* Aplicar transformaciones y dejar el Dataset para entrenar solamente

In [None]:
import findspark
import pandas as pd
import nltk
import re

from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk import ngrams
from optimus import Optimus
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType
from unicodedata import normalize


#### Start Spark

In [None]:
findspark.init()
spark = SparkSession.builder.appName('meetUp_optimus').getOrCreate()

#### Start Optimus 

In [None]:
op = Optimus()

#### Read csv

In [4]:
df_items_meli = op.read.csv(
    '../data/train.csv',
    header=True
)

#### Download nltk helpers 

In [None]:
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

In [None]:
stop_words = set(stopwords.words('spanish'))
wordnet_lemmatizer = WordNetLemmatizer()

In [5]:
df_items_meli_tiny = df_items_meli.limit(2500)

In [42]:
def normalizer(title, n_gram):
    #filter
    only_letters =  re.sub(
        r"([^n\u0300-\u036f]|n(?!\u0303(?![\u0300-\u036f])))[\u0300-\u036f]+", 
        r"\1",
        normalize( "NFD", title), 0, re.I
        ) 
    only_letters = re.sub(r'[^\w]', ' ', only_letters)
    #tokenize
    tokens = nltk.word_tokenize(only_letters)
    lower_case = [l.lower() for l in tokens]
    #stop_words
    filtered_result = list(filter(lambda l: l not in stop_words, lower_case))
    lemmas = [wordnet_lemmatizer.lemmatize(t) for t in filtered_result]
    
    #create n grams
    if n_gram > 1:
        grams = [''.join(word for word in tupla) for tupla in list(ngrams(lemmas, n_gram))]
    else:
        grams = []
    #union lemmas and grams
    lemmas_ngrams = lemmas + grams
    #list to str
    text = ' '.join(str(text) for text in lemmas_ngrams)
    return text

In [30]:
# float type output
normalizer_udf_str = udf(
    lambda x: normalizer(x, n_gram=2), StringType()
)

In [39]:
(
    df_items_meli_tiny.select(
        'title',
        normalizer_udf_str('title').alias('title_clean_str')
    ).table()
)

title  1 (string)  nullable,title_clean_str  2 (string)  nullable
Hidrolavadora⋅Lavor⋅One⋅120⋅Bar⋅1700w⋅⋅Bomba⋅Aluminio⋅Italia,hidrolavadora⋅lavor⋅one⋅120⋅bar⋅1700w⋅bomba⋅aluminio⋅italia⋅hidrolavado...
Placa⋅De⋅Sonido⋅-⋅Behringer⋅Umc22,placa⋅sonido⋅behringer⋅umc22⋅placasonido⋅sonidobehringer⋅behringerumc22
Maquina⋅De⋅Lavar⋅Electrolux⋅12⋅Kilos,maquina⋅lavar⋅electrolux⋅12⋅kilo⋅maquinalavar⋅lavarelectrolux⋅electrolux12⋅12kilo
Par⋅Disco⋅De⋅Freio⋅Diant⋅Vent⋅Gol⋅8v⋅08/⋅Fremax⋅Bd5298,par⋅disco⋅freio⋅diant⋅vent⋅gol⋅8v⋅08⋅fremax⋅bd5298⋅pardisco⋅...
Flashes⋅Led⋅Pestañas⋅Luminoso⋅Falso⋅Pestañas⋅Para⋅Partido⋅,flash⋅led⋅pestan⋅a⋅luminoso⋅falso⋅pestan⋅a⋅partido⋅flashled⋅ledpe...
"4⋅Microaspersor⋅Irrigação⋅Ultra⋅7,20⋅Metros",4⋅microaspersor⋅irrigacao⋅ultra⋅7⋅20⋅metro⋅4microaspersor⋅microaspersorirriga...
Raquete⋅Clash⋅100⋅Tour⋅-⋅Nova,raquete⋅clash⋅100⋅tour⋅nova⋅raqueteclash⋅clash100⋅100tour⋅tournova
"Kit⋅Tripe⋅Para⋅Celular⋅Ou⋅Câmera⋅Fotog⋅1,20m⋅+⋅Brinde⋅+⋅Nf-e",kit⋅tripe⋅celular⋅ou⋅camera⋅fotog⋅1⋅20m⋅brinde⋅nf⋅kittripe⋅...
Filtro⋅Ar⋅Bonanza⋅1984/1990⋅Sar3589,filtro⋅ar⋅bonanza⋅1984⋅1990⋅sar3589⋅filtroar⋅arbonanza⋅bonanza1984⋅1984...
Gatito⋅Lunchera⋅Neoprene⋅,gatito⋅lunchera⋅neoprene⋅gatitolunchera⋅luncheraneoprene
