In [4]:
# importando bibliotecas 
import pandas as pd
import numpy as np
import nltk
import warnings
import joblib
from pathlib import Path
from IPython.display import Markdown
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import TruncatedSVD 
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.model_selection import cross_validate, ShuffleSplit, GridSearchCV
from sklearn.metrics import fbeta_score, make_scorer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.ensemble import GradientBoostingClassifier

# Silence every warning for the rest of the session.
# NOTE(review): this also hides library deprecation warnings — confirm that is intended.
warnings.filterwarnings('ignore')

# Fetch the NLTK stopword corpus and load the Portuguese stopword list.
# NOTE(review): `stop_words` is not passed to the CountVectorizer visible in this
# notebook — confirm whether it is still needed.
nltk.download('stopwords')
stop_words = nltk.corpus.stopwords.words('portuguese')

[nltk_data] Downloading package stopwords to /home/madson/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
# Raw dataset location
data_path = Path("../data") / "raw" / "data.csv"

# Data dictionary location
dict_path = Path("../data") / "external" / "dicionario.csv"

# (name, path) pairs for each pre-processed text variant; every CSV lives in
# ../data/processed and is named after its variant.
datasets = [
    (variant, Path("../data/processed") / f"{variant}.csv")
    for variant in (
        "lemmatization_sem_stopwords",
        "lemmatization_com_stopwords",
        "stemming_com_stopwords",
        "stemming_sem_stopwords",
    )
]

In [14]:
# Quick 100-row sample of one processed dataset for ad-hoc inspection.
# The sample is seeded so that re-running the notebook shows the same rows.
# NOTE(review): `df` is reassigned inside the training loop further down, so this
# sample does not feed the model training.
df = (
    pd.read_csv(Path('../data/processed/lemmatization_sem_stopwords.csv'))
    .sample(100, random_state=42)
)

In [6]:
# Read the raw dataset and keep a seeded 100-row sample so the preview below is
# reproducible across runs.
df_data = pd.read_csv(data_path, sep=";").sample(100, random_state=42)

# Standardise tweet_date. Parsing with utc=True always yields a timezone-aware
# datetime column (naive or mixed-offset strings would otherwise make
# tz_convert fail); the values are then converted to Brazilian local time.
df_data['tweet_date'] = (
    pd.to_datetime(df_data['tweet_date'], utc=True)
    .dt.tz_convert('Brazil/East')
)

# Preview the data
display(Markdown("### Dados"))
display(df_data.head())

# Read the data dictionary
df_dict = pd.read_csv(dict_path)

# Preview the data dictionary
display(Markdown("### Dicionário"))
display(df_dict.head())

### Dados

Unnamed: 0,id,tweet_text,tweet_date,sentiment,query_used
0,1050785521201541121,@Laranjito76 A pessoa certa para isso seria o ...,2018-10-12 13:29:25-03:00,1,:)
1,1050785431955140608,"@behin_d_curtain Para mim, é precisamente o co...",2018-10-12 13:29:04-03:00,1,:)
2,1050785401248645120,Vou fazer um video hoje... estou pensando em f...,2018-10-12 13:28:56-03:00,1,:)
3,1050785370982547461,"aaaaaaaa amei tanto essas polaroids, nem sei e...",2018-10-12 13:28:49-03:00,1,:)
4,1050785368902131713,Valoriza o coração do menininho que vc tem. El...,2018-10-12 13:28:49-03:00,1,:)


### Dicionário

Unnamed: 0,variavel,significado,tipo,valores
0,id,ID único por usuário,useless,
1,tweet_text,Texto publicado,text,
2,tweet_date,Data de publicação,time,
3,sentiment,Algorítmo de classificação do sentimento do us...,nominal,"[0,1,2]"
4,query_used,Palavra relevante,nominal,"[':)', ':(', 'veja', 'jornaloglobo', 'g1', 'fo..."


In [7]:
target_column = "sentiment"


def _columns_of_type(kind, excluded=()):
    """Return the variables of the data dictionary whose ``tipo`` equals ``kind``.

    Parameters
    ----------
    kind : str
        Value to match against the ``tipo`` column of ``df_dict``.
    excluded : iterable of str, optional
        Variable names to leave out of the result even when their type matches.

    Returns
    -------
    list of str
        Matching entries of the ``variavel`` column, in dictionary order.
    """
    selected = df_dict["tipo"].eq(kind) & ~df_dict["variavel"].isin(list(excluded))
    return df_dict.loc[selected, "variavel"].to_list()


# Columns flagged as useless in the dictionary
useless_columns = _columns_of_type("useless")

# Neither the useless columns nor the target may be used as a feature
_not_features = [*useless_columns, target_column]

# Feature columns grouped by the type declared in the dictionary
nominal_columns = _columns_of_type("nominal", excluded=_not_features)
text_columns = _columns_of_type("text", excluded=_not_features)
time_columns = _columns_of_type("time", excluded=_not_features)

In [8]:
def _dense_one_hot_encoder():
    """Build a dense-output OneHotEncoder that ignores unseen categories.

    Newer scikit-learn releases name the dense/sparse switch ``sparse_output``
    and no longer accept ``sparse``; older releases only know ``sparse``.
    Try the new keyword first and fall back to the old one.
    """
    try:
        return OneHotEncoder(sparse_output=False, handle_unknown='ignore')
    except TypeError:
        return OneHotEncoder(sparse=False, handle_unknown='ignore')


# Nominal features: impute, one-hot encode, then standardise
nominal_preprocessor = Pipeline([
    ("missing", SimpleImputer(strategy='most_frequent')),  # fill missing values with the mode
    ("encoder", _dense_one_hot_encoder()),  # categorical encoding
    ("normalization", StandardScaler()),  # scaling
])

# Text feature: TF-IDF weighted bag of words followed by dimensionality reduction
text_preprocessor = Pipeline([
    (
        "bag of words",
        Pipeline([
            ('count', CountVectorizer(max_features=3000, strip_accents='ascii', lowercase=True)),
            ('tfid', TfidfTransformer()),
        ]),
    ),
    # The step is named "pca" but uses TruncatedSVD; it is seeded so that the
    # reduced features are reproducible between runs.
    ("pca", TruncatedSVD(n_components=500, random_state=42)),
    # ("normalization", StandardScaler())  # scaling, currently disabled
])

# Time features: ordinal encoding only
time_preprocessor = Pipeline([
    ("encoder", OrdinalEncoder()),  # categorical encoding
])

In [9]:
# Route each group of columns through its own preprocessing pipeline.
preprocessor = ColumnTransformer(
    transformers=[
        ("nominal", nominal_preprocessor, nominal_columns),
        # The text branch receives a single column name (the first text column,
        # i.e. tweet_text) rather than a list of columns.
        ("text", text_preprocessor, text_columns[0]),
        # The time branch is currently disabled:
        # ("time", time_preprocessor, time_columns),
    ],
)

In [10]:
# Candidate models as (short name, estimator, hyper-parameter grid) triples.
models = [
    # Logistic regression
    ("LR", LogisticRegression(solver='liblinear', max_iter=10000), dict(penalty=['l1', 'l2'])),
    # k-nearest neighbours
    ("KNN", KNeighborsClassifier(metric='euclidean'), dict(n_neighbors=[3, 5, 11, 15])),
    # Support vector machine
    ("SVM", SVC(), dict(kernel=['linear', 'rbf'])),
    # Gradient boosting
    (
        "GB",
        GradientBoostingClassifier(random_state=42),
        dict(
            n_estimators=[100, 200, 300],
            learning_rate=[0.5, 1.0, 1.5],
            max_depth=[1, 2, 3],
        ),
    ),
]

In [11]:
# Scorers reported by cross-validation; precision, recall and F1 are macro-averaged.
metrics = dict(
    accuracy=make_scorer(accuracy_score),
    precision=make_scorer(precision_score, average='macro'),
    recall=make_scorer(recall_score, average='macro'),
    f1=make_scorer(f1_score, average='macro'),
)

In [12]:
# Macro-averaged recall scorer, defined for use as the GridSearchCV scoring criterion.
# NOTE(review): the GridSearchCV calls visible in this notebook pass
# scoring='accuracy' and do not reference this variable — confirm which is intended.
scoring_metric = make_scorer(recall_score, average='macro')

In [13]:
# Splitting predictor features from the class label (disabled; X is built
# inside the training loop instead)
#X = df_data.drop(columns=[*useless_columns, target_column], axis=1)


# Cross-validation strategy: one shuffled split with 80% of the rows used for
# training, seeded for reproducibility
cv = ShuffleSplit(n_splits=1, train_size=0.8, random_state=42)

In [None]:
# Train and evaluate every candidate model on every processed dataset.
#
# `results` ends up as a dict of flat arrays (one entry per cross-validation
# split of every dataset/model combination), ready to be turned into a DataFrame.

results = {}
for dataset_name, dataset_path in datasets:
    df = pd.read_csv(dataset_path)

    # Features: everything except useless columns, time columns and the target
    X = df.drop(columns=useless_columns + time_columns + [target_column])
    # Target as a flat 1-D array
    y = df[target_column].to_numpy()

    for model_name, model_obj, model_params in models:
        print(f'{model_name} run...')

        # Hyper-parameter search nested inside the preprocessing pipeline
        model_gs = GridSearchCV(model_obj, model_params, scoring='accuracy')
        approach = Pipeline([("preprocessing", preprocessor), ("model", model_gs)])

        model_results = cross_validate(approach, X=X, y=y, scoring=metrics, cv=cv, n_jobs=4)

        # Tag every split's row with the model and dataset it belongs to
        n_rows = len(model_results['score_time'])
        model_results['name'] = [model_name] * n_rows
        model_results['dataset'] = [dataset_name] * n_rows

        # The first run seeds the accumulator; later runs are appended key by key
        if not results:
            results = model_results
            continue
        for key, value in model_results.items():
            results[key] = np.append(results[key], value)

LR run...


In [13]:
# Build a DataFrame from the accumulated cross-validation results.
# NOTE(review): a later cell rebinds `results` to a summary table and then to a
# Styler, so this cell must run before that one to see the raw results dict.
df_results = pd.DataFrame(results)
# df_results.groupby('name').agg([np.mean, np.std])

Unnamed: 0_level_0,fit_time,fit_time,score_time,score_time,test_accuracy,test_accuracy,test_precision,test_precision,test_recall,test_recall,test_f1,test_f1
Unnamed: 0_level_1,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std
name,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
GB,50.219681,7.980814,0.007844,0.002278,0.988333,0.0252,0.988955,0.023289,0.99011,0.021353,0.988554,0.024304
KNN,0.068988,0.010007,0.008644,0.00187,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
LR,0.067378,0.115742,0.010618,0.012737,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
SVM,0.048589,0.007366,0.008011,0.002409,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0


In [14]:
# Helper functions used to pick and highlight the best model
def highlight_max(s, props=''):
    """Return per-cell CSS for one summary row, marking the best model's cell.

    The first value of ``s`` is the score name; every other value is a string
    whose first whitespace-separated token is a number. For rows whose score
    name ends with ``'time'`` the smallest number is best; for every other row
    the largest is. The returned list has one entry per value of ``s``: ``props``
    at the best cell and ``''`` elsewhere.
    """
    label, *cells = s.values
    numbers = [float(cell.split()[0]) for cell in cells]
    pick = np.argmin if label.endswith('time') else np.argmax

    styles = [''] * len(s.values)
    # +1 skips the leading score-name cell
    styles[pick(numbers) + 1] = props
    return styles

def get_winner(s):
    """Return the name of the best-scoring model for one summary row.

    Parameters
    ----------
    s : pandas.Series
        A row whose first value is the score name and whose remaining values are
        strings starting with a number (e.g. ``"0.988 ± 0.025"``). The index
        labels of those remaining values are the model names.

    Returns
    -------
    The index label of the winning model: the smallest number for rows whose
    score name ends with ``'time'``, the largest number otherwise.
    """
    values = [float(value.split()[0]) for value in s.values[1:]]
    # Take the model names from the row itself rather than from a global table,
    # so the function works on any row regardless of notebook state.
    model_names = s.index[1:]

    if s.values[0].endswith('time'):
        return model_names[np.argmin(values)]
    return model_names[np.argmax(values)]

# Summarise each numeric score per model as a "mean ± std" string.
# Only numeric columns are aggregated: the formatting lambda cannot average
# text columns such as the dataset label.
score_columns = df_results.select_dtypes(include='number').columns.to_list()
results = (
    df_results
    .groupby(['name'])[score_columns]
    .agg([lambda x: f"{np.mean(x):.3f} ± {np.std(x):.3f}"])
    .transpose()
    .reset_index()
    .rename(columns={"level_0": "score"})
    .drop(columns="level_1")
)

# The overall winner is the model that wins the most non-timing scores
time_scores = ['fit_time', 'score_time']
winner = (
    results
    .query('score not in @time_scores')
    .apply(get_winner, axis=1)
    .value_counts()
    .index[0]
)

# Render the summary table with the best cell of each row highlighted
results.columns.name = ''
results = (
    results
    .style
    .hide(axis='index')
    .apply(highlight_max, props='color:white;background-color:gray', axis=1)
)
display(results)
display(Markdown(f'O melhor modelo é o : **{winner}**'))

score,GB,KNN,LR,SVM
fit_time,50.220 ± 7.847,0.069 ± 0.010,0.067 ± 0.114,0.049 ± 0.007
score_time,0.008 ± 0.002,0.009 ± 0.002,0.011 ± 0.013,0.008 ± 0.002
test_accuracy,0.988 ± 0.025,1.000 ± 0.000,1.000 ± 0.000,1.000 ± 0.000
test_precision,0.989 ± 0.023,1.000 ± 0.000,1.000 ± 0.000,1.000 ± 0.000
test_recall,0.990 ± 0.021,1.000 ± 0.000,1.000 ± 0.000,1.000 ± 0.000
test_f1,0.989 ± 0.024,1.000 ± 0.000,1.000 ± 0.000,1.000 ± 0.000


O melhor modelo é o : **KNN**

In [15]:
# Fit the winning model on a complete dataset.
#
# The training data is loaded explicitly here rather than taken from variables
# left behind by the evaluation loop. The last entry of `datasets` is used,
# which is the dataset the loop finished on.
# NOTE(review): `winner` is chosen from scores grouped by model name only, so it
# is not tied to one particular dataset — confirm which dataset should be used.

model_name, model, model_params = next(
    candidate for candidate in models if candidate[0] == winner
)

final_dataset_name, final_dataset_path = datasets[-1]
df_final = pd.read_csv(final_dataset_path)
X = df_final.drop(columns=[*useless_columns, *time_columns, target_column])
y = df_final[target_column].to_numpy()

model_gs = GridSearchCV(model, model_params, scoring='accuracy')
approach = Pipeline([
    ("preprocessing", preprocessor),
    ("model", model_gs)
])
approach.fit(X, y)

In [16]:
# Persist the fitted pipeline, creating the target directory first so the dump
# does not fail on a fresh checkout where it does not exist yet.
model_path = '../models/model.joblib'
Path(model_path).parent.mkdir(parents=True, exist_ok=True)
joblib.dump(approach, model_path)

['../models/model.joblib']