* Aplicar transformaciones y dejar el Dataset para entrenar solamente

In [1]:
import findspark
import pandas as pd
import nltk
import re

from keras.models import Sequential
from keras import layers
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.wrappers.scikit_learn import KerasClassifier

from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk import ngrams
from optimus import Optimus
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType
from unicodedata import normalize


Using TensorFlow backend.


#### Start Spark

In [2]:
findspark.init()
spark = (
    SparkSession.builder
    .appName('meetUp_optimus')
    .getOrCreate()
)

#### Start Optimus 

In [3]:
op = Optimus()

#### Read csv

In [4]:
df_items_meli = op.read.csv(
    '../data/train.csv',
    header=True
)

#### Download nltk stopwords, punkt, wordnet

In [5]:
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

#### Variables y constantes

In [6]:
stop_words = set(stopwords.words('spanish'))
wordnet_lemmatizer = WordNetLemmatizer()

#### 

filter dataframe

In [7]:
df_items_meli_tiny = df_items_meli.limit(200)

In [8]:
def normalizer(title, n_gram):
    #filter
    only_letters =  re.sub(
        r"([^n\u0300-\u036f]|n(?!\u0303(?![\u0300-\u036f])))[\u0300-\u036f]+", 
        r"\1",
        normalize( "NFD", title), 0, re.I
        ) 
    only_letters = re.sub(r'[^\w]', ' ', only_letters)
    #tokenize
    tokens = nltk.word_tokenize(only_letters)
    lower_case = [l.lower() for l in tokens]
    #stop_words
    filtered_result = list(filter(lambda l: l not in stop_words, lower_case))
    lemmas = [wordnet_lemmatizer.lemmatize(t) for t in filtered_result]
    
    #create n grams
    if n_gram > 1:
        grams = [''.join(word for word in tupla) for tupla in list(ngrams(lemmas, n_gram))]
    else:
        grams = []
    #union lemmas and grams
    lemmas_ngrams = lemmas + grams
    #list to str
    text = ' '.join(str(text) for text in lemmas_ngrams)
    return text

In [None]:
# float type output
normalizer_udf_str = udf(
    lambda x: normalizer(x, n_gram=2), StringType()
)

In [None]:
df_items_meli_tiny = (
    df_items_meli_tiny.select(
        '*',
        normalizer_udf_str('title').alias('title_clean_str'),
    )
)

In [None]:
df_items_meli_tiny.table(limit=10)

title  1 (string)  nullable,label_quality  2 (string)  nullable,language  3 (string)  nullable,category  4 (string)  nullable,title_clean_str  5 (string)  nullable
Hidrolavadora⋅Lavor⋅One⋅120⋅Bar⋅1700w⋅⋅Bomba⋅Aluminio⋅Italia,unreliable,spanish,ELECTRIC_PRESSURE_WASHERS,hidrolavadora⋅lavor⋅one⋅120⋅bar⋅1700w⋅bomba⋅aluminio⋅italia⋅hidrolavado...
Placa⋅De⋅Sonido⋅-⋅Behringer⋅Umc22,unreliable,spanish,SOUND_CARDS,placa⋅sonido⋅behringer⋅umc22⋅placasonido⋅sonidobehringer⋅behringerumc22
Maquina⋅De⋅Lavar⋅Electrolux⋅12⋅Kilos,unreliable,portuguese,WASHING_MACHINES,maquina⋅lavar⋅electrolux⋅12⋅kilo⋅maquinalavar⋅lavarelectrolux⋅electrolux12⋅12kilo
Par⋅Disco⋅De⋅Freio⋅Diant⋅Vent⋅Gol⋅8v⋅08/⋅Fremax⋅Bd5298,unreliable,portuguese,VEHICLE_BRAKE_DISCS,par⋅disco⋅freio⋅diant⋅vent⋅gol⋅8v⋅08⋅fremax⋅bd5298⋅pardisco⋅...
Flashes⋅Led⋅Pestañas⋅Luminoso⋅Falso⋅Pestañas⋅Para⋅Partido⋅,unreliable,spanish,FALSE_EYELASHES,flash⋅led⋅pestan⋅a⋅luminoso⋅falso⋅pestan⋅a⋅partido⋅flashled⋅ledpe...
"4⋅Microaspersor⋅Irrigação⋅Ultra⋅7,20⋅Metros",unreliable,portuguese,IRRIGATION_SPRINKLERS,4⋅microaspersor⋅irrigacao⋅ultra⋅7⋅20⋅metro⋅4microaspersor⋅microaspersorirriga...
Raquete⋅Clash⋅100⋅Tour⋅-⋅Nova,unreliable,portuguese,RACQUETS,raquete⋅clash⋅100⋅tour⋅nova⋅raqueteclash⋅clash100⋅100tour⋅tournova
"Kit⋅Tripe⋅Para⋅Celular⋅Ou⋅Câmera⋅Fotog⋅1,20m⋅+⋅Brinde⋅+⋅Nf-e",unreliable,portuguese,CAMERA_TRIPODS,kit⋅tripe⋅celular⋅ou⋅camera⋅fotog⋅1⋅20m⋅brinde⋅nf⋅kittripe⋅...
Filtro⋅Ar⋅Bonanza⋅1984/1990⋅Sar3589,unreliable,portuguese,AUTOMOTIVE_AIR_FILTERS,filtro⋅ar⋅bonanza⋅1984⋅1990⋅sar3589⋅filtroar⋅arbonanza⋅bonanza1984⋅1984...
Gatito⋅Lunchera⋅Neoprene⋅,unreliable,spanish,LUNCHBOXES,gatito⋅lunchera⋅neoprene⋅gatitolunchera⋅luncheraneoprene


In [None]:
df_pandas = df_items_meli_tiny.select('title_clean_str', 'category').toPandas()
title_values = df_pandas.loc[:, 'title_clean_str']
y_category = df_pandas.loc[:, 'category']

In [None]:
from pyspark.ml.feature import HashingTF, IDF, Tokenizer

In [None]:
tokenizer = Tokenizer(inputCol="title_clean_str", outputCol="words")
wordsData = tokenizer.transform(df_items_meli_tiny)

In [None]:
hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=50000)
featurizedData = hashingTF.transform(wordsData)

In [None]:
from pyspark.sql.functions import col

In [None]:
idf = IDF(inputCol="rawFeatures", outputCol="rawFeatures_IDF")
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)

Label to numeric column

In [None]:
from pyspark.ml.feature import StringIndexer

indexer = StringIndexer(inputCol="category", outputCol="label")
rescaledData = indexer.fit(rescaledData).transform(rescaledData)

In [None]:
from pyspark.ml.feature import VectorAssembler

assembler = VectorAssembler(
    inputCols=["rawFeatures_IDF"],
    outputCol="features")

transformed = assembler.transform(rescaledData)

In [None]:
transformed.table(10)

title  1 (string)  nullable,label_quality  2 (string)  nullable,language  3 (string)  nullable,category  4 (string)  nullable,title_clean_str  5 (string)  nullable,words  6 (array<string>)  nullable,rawFeatures  7 (vector)  nullable,rawFeatures_IDF  8 (vector)  nullable,label  9 (double)  not nullable,features  10 (vector)  nullable
Hidrolavadora⋅Lavor⋅One⋅120⋅Bar⋅1700w⋅⋅Bomba⋅Aluminio⋅Italia,unreliable,spanish,ELECTRIC_PRESSURE_WASHERS,hidrolavadora⋅lavor⋅one⋅120⋅bar⋅1700w⋅bomba⋅aluminio⋅italia⋅hidrolavado...,"['hidrolavadora',⋅'lavor',⋅'one',⋅'120',⋅'bar',⋅'1700w',⋅'bomba',⋅'aluminio',⋅...",,,130.0,
Placa⋅De⋅Sonido⋅-⋅Behringer⋅Umc22,unreliable,spanish,SOUND_CARDS,placa⋅sonido⋅behringer⋅umc22⋅placasonido⋅sonidobehringer⋅behringerumc22,"['placa',⋅'sonido',⋅'behringer',⋅'umc22',⋅'placasonido',⋅'sonidobehringer',⋅'behringerumc22']",,,87.0,
Maquina⋅De⋅Lavar⋅Electrolux⋅12⋅Kilos,unreliable,portuguese,WASHING_MACHINES,maquina⋅lavar⋅electrolux⋅12⋅kilo⋅maquinalavar⋅lavarelectrolux⋅electrolux12⋅12kilo,"['maquina',⋅'lavar',⋅'electrolux',⋅'12',⋅'kilo',⋅'maquinalavar',⋅'lavarelectrolux',⋅...",,,13.0,
Par⋅Disco⋅De⋅Freio⋅Diant⋅Vent⋅Gol⋅8v⋅08/⋅Fremax⋅Bd5298,unreliable,portuguese,VEHICLE_BRAKE_DISCS,par⋅disco⋅freio⋅diant⋅vent⋅gol⋅8v⋅08⋅fremax⋅bd5298⋅pardisco⋅...,"['par',⋅'disco',⋅'freio',⋅'diant',⋅'vent',⋅'gol',⋅'8v',⋅'08',⋅'fremax',⋅...",,,71.0,
Flashes⋅Led⋅Pestañas⋅Luminoso⋅Falso⋅Pestañas⋅Para⋅Partido⋅,unreliable,spanish,FALSE_EYELASHES,flash⋅led⋅pestan⋅a⋅luminoso⋅falso⋅pestan⋅a⋅partido⋅flashled⋅ledpe...,"['flash',⋅'led',⋅'pestan',⋅'a',⋅'luminoso',⋅'falso',⋅'pestan',⋅'a',⋅'partido'...",,,24.0,
"4⋅Microaspersor⋅Irrigação⋅Ultra⋅7,20⋅Metros",unreliable,portuguese,IRRIGATION_SPRINKLERS,4⋅microaspersor⋅irrigacao⋅ultra⋅7⋅20⋅metro⋅4microaspersor⋅microaspersorirriga...,"['4',⋅'microaspersor',⋅'irrigacao',⋅'ultra',⋅'7',⋅'20',⋅'metro',⋅'4microaspersor',&...",,,18.0,
Raquete⋅Clash⋅100⋅Tour⋅-⋅Nova,unreliable,portuguese,RACQUETS,raquete⋅clash⋅100⋅tour⋅nova⋅raqueteclash⋅clash100⋅100tour⋅tournova,"['raquete',⋅'clash',⋅'100',⋅'tour',⋅'nova',⋅'raqueteclash',⋅'clash100',⋅'100tour',&...",,,125.0,
"Kit⋅Tripe⋅Para⋅Celular⋅Ou⋅Câmera⋅Fotog⋅1,20m⋅+⋅Brinde⋅+⋅Nf-e",unreliable,portuguese,CAMERA_TRIPODS,kit⋅tripe⋅celular⋅ou⋅camera⋅fotog⋅1⋅20m⋅brinde⋅nf⋅kittripe⋅...,"['kit',⋅'tripe',⋅'celular',⋅'ou',⋅'camera',⋅'fotog',⋅'1',⋅'20m',⋅'brinde',...",,,76.0,
Filtro⋅Ar⋅Bonanza⋅1984/1990⋅Sar3589,unreliable,portuguese,AUTOMOTIVE_AIR_FILTERS,filtro⋅ar⋅bonanza⋅1984⋅1990⋅sar3589⋅filtroar⋅arbonanza⋅bonanza1984⋅1984...,"['filtro',⋅'ar',⋅'bonanza',⋅'1984',⋅'1990',⋅'sar3589',⋅'filtroar',⋅'arbonanza',Y...",,,168.0,
Gatito⋅Lunchera⋅Neoprene⋅,unreliable,spanish,LUNCHBOXES,gatito⋅lunchera⋅neoprene⋅gatitolunchera⋅luncheraneoprene,"['gatito',⋅'lunchera',⋅'neoprene',⋅'gatitolunchera',⋅'luncheraneoprene']",,,19.0,


In [None]:
from pyspark.ml.classification import LogisticRegression

lr_classifier = LogisticRegression(family="multinomial")
lr_classifier.fit(transformed)

----------------------------------------
Exception happened during processing of request from ('127.0.0.1', 35544)
Traceback (most recent call last):
  File "/opt/conda/lib/python3.7/socketserver.py", line 316, in _handle_request_noblock
    self.process_request(request, client_address)


In [None]:
from pyspark.ml.classification import GBTClassifier

model = GBTClassifier(labelCol="label", featuresCol="features", maxIter=10)
model.fit(transformed)