In [1]:
# importando bibliotecas 
import pandas as pd
import numpy as np
import nltk
import warnings
import joblib
from pathlib import Path
from IPython.display import Markdown
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import TruncatedSVD 
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.model_selection import cross_validate, ShuffleSplit, GridSearchCV
from sklearn.metrics import fbeta_score, make_scorer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB

# ignorando warnings
warnings.filterwarnings('ignore')

# importando stopwords
nltk.download('stopwords')
stop_words = nltk.corpus.stopwords.words('portuguese')

[nltk_data] Downloading package stopwords to /home/madson/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# carregamento conjunto de dados
data_path = Path("../data/raw/data.csv")

# carregamento dicionário de dados
dict_path = Path("../data/external/dicionario.csv")

datasets = [
    ("lemmatization_sem_stopwords", Path('../data/processed/lemmatization_sem_stopwords.csv')),
    ("lemmatization_com_stopwords", Path('../data/processed/lemmatization_com_stopwords.csv')),
    ("stemming_com_stopwords", Path('../data/processed/stemming_com_stopwords.csv')),
    ("stemming_sem_stopwords", Path('../data/processed/stemming_sem_stopwords.csv'))
]

In [3]:
df = pd.read_csv(Path('../data/processed/lemmatization_sem_stopwords.csv')).sample(100)

In [4]:
# leitura conjunto de dados
df_data = pd.read_csv(data_path, sep=";").sample(100)

# padronização tweet_date
df_data['tweet_date'] = pd.to_datetime(df_data['tweet_date'])
df_data['tweet_date'] = df_data.tweet_date.dt.tz_convert('Brazil/East')

# visualização dados
display(Markdown("### Dados"))
display(df_data.head())

# leitura dicionário de dados
df_dict = pd.read_csv(dict_path)

# visualização dicionário de dados
display(Markdown("### Dicionário"))
display(df_dict.head())

### Dados

Unnamed: 0,id,tweet_text,tweet_date,sentiment,query_used
19981,1046795689781551111,@rafinhabastos Cabo Daciolo é O Cara!!! :D,2018-10-01 13:15:15-03:00,1,:)
84397,1039248939004964865,País ganha primeiro fundo de renda fixa com ap...,2018-09-10 17:27:10-03:00,2,veja
77797,1043666255268450305,'O tempo não para': Samuca e Waleska quase se ...,2018-09-22 22:00:00-03:00,2,jornaloglobo
1837,1050737799421448192,Feliz dia das crianças pra vc que tem essas at...,2018-10-12 10:19:47-03:00,1,:)
56753,1046783240294547456,is dis marupokkk? — Sobra :(( https://t.co/cb8...,2018-10-01 12:25:47-03:00,0,:(


### Dicionário

Unnamed: 0,variavel,significado,tipo,valores
0,id,ID único por usuário,useless,
1,tweet_text,Texto publicado,text,
2,tweet_date,Data de publicação,time,
3,sentiment,Algorítmo de classificação do sentimento do us...,nominal,"[0,1,2]"
4,query_used,Palavra relevante,nominal,"[':)', ':(', 'veja', 'jornaloglobo', 'g1', 'fo..."


In [5]:
target_column = "sentiment"
useless_columns =  df_dict.query("tipo == 'useless'").variavel.to_list()
useless_columns.append('query_used')
nominal_columns = (
    df_dict
    .query(
        "tipo == 'nominal' and "
        "variavel not in @useless_columns and "
        "variavel != @target_column"
    )
    .variavel
    .to_list()
)
text_columns = (
    df_dict
    .query(
        "tipo == 'text' and "
        "variavel not in @useless_columns and "
        "variavel != @target_column"
    )
    .variavel
    .to_list()
)
time_columns = (
    df_dict
    .query(
        "tipo == 'time' and "
        "variavel not in @useless_columns and "
        "variavel != @target_column"
    )
    .variavel
    .to_list()
)

In [6]:
nominal_preprocessor = Pipeline([
    # Tratamento de dados discrepantes
    ("missing", SimpleImputer(strategy='most_frequent')), # Tratamento de dados faltantes
    ("encoder", OneHotEncoder(sparse=False,handle_unknown='ignore')), # Codificação de variáveis
    # Seleção de variáveis
    ("normalization", StandardScaler()), # Normalização
])
text_preprocessor = Pipeline([
    ("bag of words", Pipeline([('count', CountVectorizer(max_features=1000, strip_accents='ascii', lowercase=True)),('tfid', TfidfTransformer())])),
    # Tratamento de dados faltantes
    # Codificação de variáveis
    # Seleção de variáveis
    ("pca", TruncatedSVD(n_components=500, random_state=42)) # Redução de dimensionalidade - PCA
    #("normalization", StandardScaler()) # Normalização
])
time_preprocessor = Pipeline([
    # Tratamento de dados discrepantes
    # Tratamento de dados faltantes
    ("encoder", OrdinalEncoder()), # Codificação de variáveis
    # Seleção de variáveis
    # Normalização
])

In [7]:
preprocessor = ColumnTransformer(transformers=[
    # ("nominal", nominal_preprocessor, nominal_columns),
    ("text", text_preprocessor, text_columns[0]), # setando a coluna tweet_text
    #("time", time_preprocessor, time_columns),
])

In [8]:
# Definindo parâmetros dos modelos
models = [
    (
        "LR",
        LogisticRegression(solver='liblinear', max_iter=1000),
        {"penalty": ['l1', 'l2']}
    ),
    (
        "KNN",
        KNeighborsClassifier(metric='euclidean'),
        {"n_neighbors": [3, 5, 11, 15]}
    ),
    (
        "SVM",
        SVC(max_iter=1000),
        {"kernel": ['linear', 'rbf']}
    ),
    (
        "NB",
        GaussianNB(),
        {'var_smoothing': np.logspace(0,-9, num=3)}
    ),
]

In [9]:
# Definindo métricas
metrics = {
    'accuracy': make_scorer(accuracy_score),
    'precision': make_scorer(precision_score, average='macro'),
    'recall': make_scorer(recall_score, average='macro'),
    'f1': make_scorer(f1_score, average='macro')
}

In [10]:
# Definindo scoring para GridSearchCV
scoring_metric = make_scorer(recall_score, average='macro')

In [11]:
# Separando características previsores da classe
#X = df_data.drop(columns=[*useless_columns, target_column], axis=1)


# Definindo cross validation
cv = ShuffleSplit(n_splits=1, train_size=0.8, random_state=42)

In [12]:
# Realizando treinamento dos modelos selecionados
results = {}

for dataset_name, dataset_path in datasets:
    df = pd.read_csv(dataset_path)
    X = df.drop(columns=[*useless_columns, *time_columns, target_column], axis=1)
    y = df[[target_column]].to_numpy().ravel()

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.8, random_state=42, shuffle=True)

    X_train_transformed = preprocessor.fit_transform(X_train)
    X_test_transformed = preprocessor.transform(X_test)

    model_results = {}
    for model_name, model_obj, model_params in models:
        print(f'{model_name}-{dataset_name} run...')
        model_gs = GridSearchCV(model_obj, model_params, scoring='accuracy')
        # approach = Pipeline([
        #     ("preprocessing", preprocessor),
        #     ("model", model_gs)
        # ])
        # model_results = cross_validate(
        #     approach,
        #     X=X,
        #     y=y,
        #     scoring=metrics,
        #     cv=cv,
        #     n_jobs=4
        # )
        model_gs.fit(X_train_transformed, y_train)
        y_hat = model_gs.predict(X_test_transformed)
        
        for metric_name, metric_func in metrics.items():
            result = metric_func(model_gs, X_test_transformed, y_test)
            if metric_name in model_results.keys():
                model_results[metric_name] = np.append(model_results[metric_name], result)
            else:
                model_results[metric_name] = [result]
        model_results['name'] = [f"{model_name}-{dataset_name}"] * len(model_results['precision'])
        if results:
            for key, value in model_results.items():
                results[key] = np.append(results[key], value)
        else:
            results = model_results

LR-lemmatization_sem_stopwords run...
KNN-lemmatization_sem_stopwords run...
SVM-lemmatization_sem_stopwords run...
NB-lemmatization_sem_stopwords run...
LR-lemmatization_com_stopwords run...
KNN-lemmatization_com_stopwords run...
SVM-lemmatization_com_stopwords run...
NB-lemmatization_com_stopwords run...
LR-stemming_com_stopwords run...
KNN-stemming_com_stopwords run...
SVM-stemming_com_stopwords run...
NB-stemming_com_stopwords run...
LR-stemming_sem_stopwords run...
KNN-stemming_sem_stopwords run...
SVM-stemming_sem_stopwords run...
NB-stemming_sem_stopwords run...


In [15]:
# Criando dataframe com os resultados
df_results = pd.DataFrame(results)
# df_results.groupby('name').agg([np.mean, np.std])

In [16]:
results_ = results

In [17]:
# Criando funções para selecionar o melhor modelo
def highlight_max(s, props=''):
    values = [float(value.split()[0]) for value in s.values[1:]]
    result = [''] * len(s.values)
    if s.values[0].endswith('time'):
        result[np.argmin(values)+1] = props
    else:
        result[np.argmax(values)+1] = props
    return result

def get_winner(s):
    metric = s.values[0]
    values = [float(value.split()[0]) for value in s.values[1:]]
    models = results.columns[1:]
    
    if s.values[0].endswith('time'):
        return models[np.argmin(values)]
    else:
        return models[np.argmax(values)]

results = (
    pd
    .DataFrame(df_results)
    .groupby(['name'])
    .agg([lambda x: f"{np.mean(x):.3f} ± {np.std(x):.3f}"])#
    .transpose()
    .reset_index()
    .rename(columns={"level_0": "score"})
    .drop(columns="level_1")
    # .set_index('score')
)
time_scores = ['fit_time', 'score_time']
winner = results.query('score not in @time_scores').apply(get_winner, axis=1).value_counts().index[0]
results.columns.name = ''
results = (
    results
    .style
    .hide(axis='index')
    .apply(highlight_max, props='color:white;background-color:gray', axis=1)
)
display(results)
display(Markdown(f'O melhor modelo é o : **{winner}**'))

score,KNN-lemmatization_com_stopwords,KNN-stemming_com_stopwords,KNN-stemming_sem_stopwords,LR-lemmatization_com_stopwords,LR-stemming_com_stopwords,LR-stemming_sem_stopwords,NB-lemmatization_com_stopwords,NB-lemmatization_sem_stopwords,NB-stemming_com_stopwords,NB-stemming_sem_stopwords,SVM-lemmatization_com_stopwords,SVM-stemming_com_stopwords,SVM-stemming_sem_stopwords
accuracy,0.739 ± 0.046,0.727 ± 0.061,0.722 ± 0.049,0.785 ± 0.000,0.789 ± 0.000,0.771 ± 0.000,0.691 ± 0.077,0.714 ± 0.050,0.695 ± 0.077,0.685 ± 0.071,0.731 ± 0.039,0.732 ± 0.050,0.721 ± 0.040
precision,0.741 ± 0.040,0.732 ± 0.053,0.732 ± 0.037,0.781 ± 0.000,0.785 ± 0.000,0.768 ± 0.000,0.690 ± 0.079,0.714 ± 0.049,0.697 ± 0.074,0.689 ± 0.071,0.732 ± 0.035,0.734 ± 0.043,0.727 ± 0.031
recall,0.739 ± 0.046,0.727 ± 0.061,0.722 ± 0.049,0.785 ± 0.000,0.789 ± 0.000,0.771 ± 0.000,0.691 ± 0.077,0.714 ± 0.050,0.695 ± 0.077,0.685 ± 0.071,0.731 ± 0.039,0.732 ± 0.051,0.721 ± 0.040
f1,0.739 ± 0.043,0.728 ± 0.058,0.724 ± 0.045,0.783 ± 0.000,0.787 ± 0.000,0.769 ± 0.000,0.689 ± 0.080,0.713 ± 0.050,0.691 ± 0.082,0.681 ± 0.078,0.731 ± 0.037,0.732 ± 0.048,0.722 ± 0.037


O melhor modelo é o : **LR-stemming_com_stopwords**

In [25]:
# Realizando treino do modelo completo
model_winner = winner.split("-")[0]
dataset_winner = winner.split("-")[1]
model_name, model, model_params  = [foo for foo in models if foo[0] == model_winner][0]

dataset_name, dataset_path  = [foo for foo in datasets if foo[0] == dataset_winner][0]

df = pd.read_csv(dataset_path).sample(frac=1)
X = df.drop(columns=[*useless_columns, *time_columns, target_column], axis=1)
y = df[[target_column]].to_numpy().ravel()


model_gs = GridSearchCV(model, model_params, scoring='accuracy')
approach = Pipeline([
    ("preprocessing", preprocessor),
    ("model", model_gs)
])


approach.fit(X, y)

In [26]:
# Salvando modelo treinado
joblib.dump(approach, '../models/model.joblib') 

['../models/model.joblib']