In [1]:
# importando bibliotecas 
from pathlib import Path
from IPython.display import Markdown
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.compose import ColumnTransformer
import nltk

# importando biblioteca para particionamento do conjunto de dados 
import dask.dataframe as dd

# importando redução de dimensionalidade
from sklearn.decomposition import PCA

# importando stopwords
nltk.download('rslp')
nltk.download('punkt')
nltk.download('stopwords')
stop_words = nltk.corpus.stopwords.words('portuguese')

[nltk_data] Downloading package rslp to /home/joseaurelio/nltk_data...
[nltk_data]   Package rslp is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /home/joseaurelio/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/joseaurelio/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# carregamento conjunto de dados
data_path = Path("../data/processed/lemmatization_sem_stopwords.csv")

# carregamento dicionário de dados
dict_path = Path("../data/external/dicionario.csv")

In [3]:
# leitura conjunto de dados
df_data = dd.read_csv(data_path)
df_data.to_csv(data_path, single_file=True)


# visualização dados
display(Markdown("### Dados"))
display(df_data.head())

# leitura dicionário de dados
df_dict = pd.read_csv(dict_path)

# visualização dicionário de dados
display(Markdown("### Dicionário"))
display(df_dict.head())


### Dados

Unnamed: 0.6,Unnamed: 0.5,Unnamed: 0.4,Unnamed: 0.3,Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,id,tweet_date,sentiment,query_used,tweet_text
0,0,0,0,0,0,0,1050785521201541121,2018-10-12 13:29:25-03:00,1,:),"['Laranjito76','pessoa','certo','valer','azeve..."
1,1,1,1,1,1,1,1050785431955140608,2018-10-12 13:29:04-03:00,1,:),"['behin_d_curtain','eu','precisamente','contrá..."
2,2,2,2,2,2,2,1050785401248645120,2018-10-12 13:28:56-03:00,1,:),"['Vou','fazer','video','hoje','...','pensar','..."
3,3,3,3,3,3,3,1050785370982547461,2018-10-12 13:28:49-03:00,1,:),"['aaaaaaaar','ameir','tanto','polaroids','sabe..."
4,4,4,4,4,4,4,1050785368902131713,2018-10-12 13:28:49-03:00,1,:),"['Valoriza','coração','menininho','vc','difere..."


### Dicionário

Unnamed: 0,variavel,significado,tipo,valores
0,id,ID único por usuário,useless,
1,tweet_text,Texto publicado,text,
2,tweet_date,Data de publicação,time,
3,sentiment,Algorítmo de classificação do sentimento do us...,nominal,"[0,1,2]"
4,query_used,Palavra relevante,nominal,"[':)', ':(', 'veja', 'jornaloglobo', 'g1', 'fo..."


In [4]:
# tipos de variáveis obtidas do dicionário de dados
df_dict.tipo.unique()

array(['useless', 'text', 'time', 'nominal'], dtype=object)

In [5]:
target_column = "sentiment"
useless_columns =  df_dict.query("tipo == 'useless'").variavel.to_list()
nominal_columns = (
    df_dict
    .query(
        "tipo == 'nominal' and "
        "variavel not in @useless_columns and "
        "variavel != @target_column"
    )
    .variavel
    .to_list()
)
text_columns = (
    df_dict
    .query(
        "tipo == 'text' and "
        "variavel not in @useless_columns and "
        "variavel != @target_column"
    )
    .variavel
    .to_list()
)
time_columns = (
    df_dict
    .query(
        "tipo == 'time' and "
        "variavel not in @useless_columns and "
        "variavel != @target_column"
    )
    .variavel
    .to_list()
)

In [6]:
nominal_preprocessor = Pipeline([
    # Tratamento de dados discrepantes
    ("missing", SimpleImputer(strategy='most_frequent')), # Tratamento de dados faltantes
    ("encoder", OneHotEncoder(sparse=False)), # Codificação de variáveis
    # Seleção de variáveis
    ("normalization", StandardScaler()), # Normalização
    ("pca", PCA())# Redução de dimensionalidade - PCA
])
text_preprocessor = Pipeline([
    ("bag of words", CountVectorizer(max_features=3000, stop_words=stop_words, strip_accents='ascii', 
                      lowercase=True)),
    # Tratamento de dados faltantes
    # Codificação de variáveis
    # Seleção de variáveis
     ("normalization", StandardScaler(with_mean=False)) # Normalização
])
time_preprocessor = Pipeline([
    # Tratamento de dados discrepantes
    # Tratamento de dados faltantes
    ("encoder", OrdinalEncoder()),# Codificação de variáveis
    # Seleção de variáveis
    # Normalização
])

In [7]:
nominal_preprocessor

In [8]:
preprocessor = ColumnTransformer(transformers=[
    ("nominal", nominal_preprocessor, nominal_columns),
    ("text", text_preprocessor, text_columns),
    ("time", time_preprocessor, time_columns),
])

In [9]:
X = df_data.drop(columns=[*useless_columns, target_column], axis=1)
y = df_data[[target_column]]

ValueError: Metadata inference failed in `drop_by_shallow_copy`.

You have supplied a custom function and Dask is unable to 
determine the type of output that that function returns. 

To resolve this please provide a meta= keyword.
The docstring of the Dask function you ran should have more information.

Original error is below:
------------------------
KeyError('[None] not found in axis')

Traceback:
---------
  File "/home/joseaurelio/.cache/pypoetry/virtualenvs/src-2zHzSJE6-py3.8/lib/python3.8/site-packages/dask/dataframe/utils.py", line 195, in raise_on_meta_error
    yield
  File "/home/joseaurelio/.cache/pypoetry/virtualenvs/src-2zHzSJE6-py3.8/lib/python3.8/site-packages/dask/dataframe/core.py", line 6557, in _emulate
    return func(*_extract_meta(args, True), **_extract_meta(kwargs, True))
  File "/home/joseaurelio/.cache/pypoetry/virtualenvs/src-2zHzSJE6-py3.8/lib/python3.8/site-packages/dask/dataframe/utils.py", line 731, in drop_by_shallow_copy
    df2.drop(columns=columns, inplace=True, errors=errors)
  File "/home/joseaurelio/.cache/pypoetry/virtualenvs/src-2zHzSJE6-py3.8/lib/python3.8/site-packages/pandas/util/_decorators.py", line 317, in wrapper
    return func(*args, **kwargs)
  File "/home/joseaurelio/.cache/pypoetry/virtualenvs/src-2zHzSJE6-py3.8/lib/python3.8/site-packages/pandas/core/frame.py", line 5391, in drop
    return super().drop(
  File "/home/joseaurelio/.cache/pypoetry/virtualenvs/src-2zHzSJE6-py3.8/lib/python3.8/site-packages/pandas/util/_decorators.py", line 317, in wrapper
    return func(*args, **kwargs)
  File "/home/joseaurelio/.cache/pypoetry/virtualenvs/src-2zHzSJE6-py3.8/lib/python3.8/site-packages/pandas/core/generic.py", line 4510, in drop
    obj = obj._drop_axis(labels, axis, level=level, errors=errors)
  File "/home/joseaurelio/.cache/pypoetry/virtualenvs/src-2zHzSJE6-py3.8/lib/python3.8/site-packages/pandas/core/generic.py", line 4551, in _drop_axis
    new_axis = axis.drop(labels, errors=errors)
  File "/home/joseaurelio/.cache/pypoetry/virtualenvs/src-2zHzSJE6-py3.8/lib/python3.8/site-packages/pandas/core/indexes/base.py", line 6972, in drop
    raise KeyError(f"{list(labels[mask])} not found in axis")


In [None]:
pca = PCA (X, y)

In [None]:
preprocessor.fit(X)

In [None]:
preprocessor.transform(X)