In [None]:
!pip install autogoal
!pip install autogoal[wikipedia]
!pip install autogoal[sklearn]
!pip install autogoal[gensim]
!pip install autogoal[transformers]
!pip install autogoal[keras]
!pip install autogoal[spacy]
!pip install WordCloud
!pip install NLTK
!pip install plotly


import nltk
from wordcloud import STOPWORDS
# download the NLTK stopword corpus that `stopwords.words` reads below
nltk.download('stopwords')
from nltk.corpus import stopwords
# text pre-processing and feature extraction
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from nltk.stem.snowball import SpanishStemmer

# Spanish stopwords as a set; passed as `stop_words` to CountVectorizer in spanish_stemmer
stop_words_sp = set(stopwords.words('spanish'))

In [None]:
import pandas as pd
import numpy as np
from collections import Counter

#  para construir gráficas y realizar análisis exploratorio de los datos
import plotly.graph_objects as go
import plotly.figure_factory as ff
import plotly.express as px

# para guardar el modelo
import pickle

# para construir pipelines
from sklearn.pipeline import Pipeline

# para evaluar los modelos 
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_curve, auc
from sklearn.utils.multiclass import unique_labels
from sklearn.preprocessing import StandardScaler


# algoritmos de clasificación
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier

# column names assigned to the CSV files when the corpus and labels are loaded
TEXT_COL      = 'tweet'  # tweet text column
CLASS_COL_ST1 = 'Sentiment'  # single label column for subtask 1
CLASS_COL_ST2 = ['Theft', 'Homicide', 'Kidnapping', 'Accident', 'None of the above']  # label columns for subtask 2

# Global settings
N_JOBS = 6 # number of cores for grid search to use during hyper-parameter tuning (not used in the visible cells)
CV = 5 # number of cross-validation iterations (not used in the visible cells)

In [None]:
# helper meant to be used as a CountVectorizer analyzer for processing sentences
def spanish_stemmer(sentence):
    """
    Tokenize `sentence` and yield the Spanish stem of every token.

    Tokenization is done by a word-level, unigram CountVectorizer analyzer
    configured with the module-level `stop_words_sp` stopword set; each token
    it produces is then passed through a Snowball `SpanishStemmer`.

    sentence: text to analyze
    returns: a generator of stemmed tokens (lazy; consumed once)
    """
    tokenize = CountVectorizer(
        binary=False,
        analyzer='word',
        stop_words=stop_words_sp,
        ngram_range=(1, 1),
    ).build_analyzer()
    snowball = SpanishStemmer()

    # the tokens are computed right away; only the stemming is deferred
    return (snowball.stem(token) for token in tokenize(sentence))


# saves a trained pipeline
def save_model(model, modelName = "pickle_model.pkl"):
    """
    Serialize `model` with pickle into the file `modelName`.

    model: object to persist (e.g. a fitted pipeline)
    modelName: destination path; the file is created or overwritten
    """
    with open(modelName, 'wb') as handle:
        pickle.dump(model, handle)


# loads a previously trained and saved pipeline
def load_model(rutaModelo = "pickle_model.pkl"):
    """
    Read the file `rutaModelo` and return the object pickled in it.

    rutaModelo: path of the pickle file to read
    returns: the unpickled object

    NOTE: unpickling can execute arbitrary code, so only load files
    that come from a trusted source.
    """
    with open(rutaModelo, 'rb') as handle:
        return pickle.load(handle)


# helper that runs predictions with the model
def predict_model(model, data, pref='m'):
    """
    Predict labels (and per-class scores, when available) for `data`.

    model: object exposing `predict`; if it also exposes `predict_proba`,
           its `classes_` attribute is used to name the score columns
    data: list of the text to predict
    pref: identifier embedded in the column names
          (labels_[pref], scores_[pref]_[class 1], ...)
    returns: DataFrame with the columns in alphabetical order, i.e. the
             label column first, followed by one score column per class
    """
    predicted = model.predict(data)
    columns = {}

    if hasattr(model, 'predict_proba'):
        probabilities = model.predict_proba(data)
        # one column per class; class names are used as-is (not lowercased)
        for cls, class_scores in zip(model.classes_, probabilities.T):
            columns[f'scores_{pref}_{cls}'] = class_scores

    # add the predicted labels
    columns[f'labels_{pref}'] = predicted

    return pd.DataFrame(columns, columns=sorted(columns))


# helper that evaluates the results of a classification
def evaluate_model(y_true, y_pred, y_score=None, pos_label='positive'):
    """
    Print a classification summary and plot the confusion matrix.

    Prints sklearn's classification report and the accuracy, then shows an
    annotated plotly heatmap of the confusion matrix.

    y_true: ground-truth labels
    y_pred: predicted labels
    y_score: accepted for backward compatibility; not used by this function
    pos_label: accepted for backward compatibility; not used by this function
    returns: None (output is printed / displayed)
    """
    print('==== Sumario de la clasificación ==== ')
    print(classification_report(y_true, y_pred))

    print('Accuracy -> {:.2%}\n'.format(accuracy_score(y_true, y_pred)))

    # plot the confusion matrix
    display_labels = sorted(unique_labels(y_true, y_pred), reverse=True)
    # rows of `cm` are the true labels, columns are the predicted labels
    cm = confusion_matrix(y_true, y_pred, labels=display_labels)

    # flip the rows (and the row labels with them) before plotting
    z = cm[::-1]
    x = display_labels
    y = x[::-1]
    z_text = [[str(cell) for cell in row] for row in z]

    fig_cm = ff.create_annotated_heatmap(z, x=x, y=y, annotation_text=z_text, colorscale='Viridis')

    # heatmap columns map to the x axis and rows to the y axis, so the
    # x axis carries the predicted value and the y axis the real value
    fig_cm.update_layout(
        height=400, width=400,
        showlegend=True,
        margin={'t':150, 'l':0},
        title={'text' : 'Matriz de Confusión', 'x':0.5, 'xanchor': 'center'},
        xaxis = {'title_text':'Valor Predicho', 'tickangle':45, 'side':'top'},
        yaxis = {'title_text':'Valor Real', 'tickmode':'linear'},
    )
    fig_cm.show()

In [None]:
# read the tweet corpus and the labels for
# subtask 1 (binary classification) and subtask 2 (multilabel classification);
# the files are read without a header row and the column names are assigned here
df_train = pd.read_csv(
    'https://raw.githubusercontent.com/carlossuazo/davincis-iberlef-2022/main/data/training_data/train_data.csv',
    header=None,
    names=[TEXT_COL],
)
df_train_labels_st1 = pd.read_csv(
    'https://raw.githubusercontent.com/carlossuazo/davincis-iberlef-2022/main/data/training_data/train_labels_subtask_1.csv',
    header=None,
    names=[CLASS_COL_ST1],
)
df_train_labels_st2 = pd.read_csv(
    'https://raw.githubusercontent.com/carlossuazo/davincis-iberlef-2022/main/data/training_data/train_labels_subtask_2.csv',
    header=None,
    names=CLASS_COL_ST2,
)

In [None]:

# attach each label set to the tweet text, side by side (column-wise)
df_train_lst1 = pd.concat([df_train, df_train_labels_st1], axis='columns')
df_train_lst2 = pd.concat([df_train, df_train_labels_st2], axis='columns')
display(df_train_lst1, df_train_lst2)

In [None]:
# basic statistics about the subtask 1 data
categories = sorted(df_train_lst1[CLASS_COL_ST1].unique(), reverse=False)
hist = Counter(df_train_lst1[CLASS_COL_ST1])
print(f'Total de instancias -> {df_train_lst1.shape[0]}')

print(f'Categorías -> {categories}')
# the example comment is the tweet text (TEXT_COL) of the first row; its category is the label of that row
print(f'Comentario de ejemplo -> {df_train_lst1[TEXT_COL].iloc[0]}')
print(f'Categoría del comentario -> {df_train_lst1[CLASS_COL_ST1].iloc[0]}')

# bar chart with the number of instances per category
fig = go.Figure(layout=go.Layout(height=400, width=600))
fig.add_trace(go.Bar(x=categories, y=[hist[cat] for cat in categories]))
fig.show()
# TODO: train with the unbalanced corpus
# TODO: try padding the data of categories 0 and 1, trimming the data of category 2 with iloc, taking 1052 from each category
print('Done!')