In [17]:
import pandas as pd
import sys; sys.path.append('..')
from prototype.web.pretrain.pretrained_models import models
pd.set_option('max_colwidth', None)


class SentimentEmotion(object):
    """Add sentiment, emotion and hate-speech score columns to a DataFrame.

    Each analyzer is obtained from the project's ``models()`` factory and is
    used through ``predict(text).probas``, a mapping from label to score.
    """

    # Names of every column that ``extract`` adds, in insertion order.
    extra_fields = ['sent_neg', 'sent_neutral', 'sent_pos', 'joy', 'sadness',
                    'anger', 'surprise', 'disgust', 'fear', 'others', 'hateful', 'targeted', 'aggressive']

    def __init__(self):
        """Load the pre-trained analyzers returned by ``models()``."""
        self.sentiment_analyzer, self.emotion_analyzer, self.hate_speech_analyzer, \
            self.ner_analyzer, self.pos_tagger = models()

    def __extract_sentiment(self, text):
        """Return the (NEG, NEU, POS) scores for ``text``."""
        probas = self.sentiment_analyzer.predict(text).probas
        return probas['NEG'], probas['NEU'], probas['POS']

    def __extract_emotion(self, text):
        """Return the (joy, sadness, anger, surprise, disgust, fear, others) scores for ``text``."""
        probas = self.emotion_analyzer.predict(text).probas
        return (probas['joy'], probas['sadness'], probas['anger'], probas['surprise'],
                probas['disgust'], probas['fear'], probas['others'])

    def __extract_hate(self, text):
        """Return the (hateful, targeted, aggressive) scores for ``text``."""
        probas = self.hate_speech_analyzer.predict(text).probas
        return probas['hateful'], probas['targeted'], probas['aggressive']

    def extract(self, df, text_field, sample=30):
        """Score every row of ``df[text_field]`` and store the results as new columns.

        Parameters
        ----------
        df : pandas.DataFrame
            Frame holding the texts. It is modified in place (the columns listed
            in ``extra_fields`` are added or overwritten) and also returned.
        text_field : str
            Name of the column containing the text to analyse.
        sample : int, optional
            Accepted for backward compatibility but ignored: all rows are scored.

        Returns
        -------
        pandas.DataFrame
            The same ``df`` object, with the score columns added.
        """
        self.text_field = text_field  # e.g. 'tema'
        textual = df
        texts = list(textual[self.text_field])

        column_groups = (
            (('sent_neg', 'sent_neutral', 'sent_pos'), self.__extract_sentiment),
            (('joy', 'sadness', 'anger', 'surprise', 'disgust', 'fear', 'others'), self.__extract_emotion),
            (('hateful', 'targeted', 'aggressive'), self.__extract_hate),
        )
        for columns, extractor in column_groups:
            scores = [extractor(text) for text in texts]
            # Build each column explicitly instead of unpacking ``zip(*scores)``:
            # with zero rows the zip is empty and tuple-unpacking raises ValueError.
            for position, column in enumerate(columns):
                textual[column] = [row[position] for row in scores]

        return textual


In [3]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation


class TopicDiscovery(object):
    """Topic discovery based on LatentDirichletAllocation over bag-of-words counts."""

    def __init__(self):
        print('##### TopicDiscovery __init__ #####')

    def preprocessing(self, data_tr, text_field):
        """Return a cleaned copy of ``data_tr``.

        Rows containing any missing value are dropped, then ``text_field`` is
        lower-cased and stripped of ``,``, ``.`` and ``;`` characters. The input
        frame is not modified.
        """
        cleaned = data_tr.dropna().copy()
        # ``Series.replace(',', "")`` only matches cells whose whole value is ',',
        # so it never removed punctuation inside the text; ``.str.replace`` does.
        cleaned[text_field] = (
            cleaned[text_field]
            .str.lower()
            .str.replace(r'[,.;]', '', regex=True)
        )
        return cleaned

    def __get_stopwords(self, stop_file_path):
        """Load stop words (one per line, UTF-8) and add a few extra Spanish fillers."""
        with open(stop_file_path, 'r', encoding="utf-8") as f:
            stop_set = {line.strip() for line in f}
        stop_set.discard('')  # blank lines are not stop words
        stop_set.update(['si', 'ahi', 'ahí', 'ah'])
        return sorted(stop_set)

    @staticmethod
    def _attach_topic_flags(data, doc_topics, topic_labels, threshold=0.3):
        """Join 0/1 topic indicator columns and an ``otros`` column onto ``data``.

        ``doc_topics`` is a (rows, topics) array with one row per row of ``data``,
        in the same order. A flag is 1 where the value is ``>= threshold``;
        ``otros`` is 1 for rows in which no topic reached the threshold.
        """
        flags = (np.asarray(doc_topics) >= threshold).astype(int)
        # Reuse ``data``'s own index: ``join`` aligns on index labels, so a fresh
        # 0..n-1 index would misalign rows whenever ``data`` has gaps in its index
        # (e.g. after ``dropna``). Assumes ``data.index`` has no duplicate labels.
        topics_encode = pd.DataFrame(flags, index=data.index, columns=topic_labels)
        other = pd.DataFrame({"otros": (flags.sum(axis=1) == 0).astype(int)}, index=data.index)
        return data.join(topics_encode).join(other)

    def discover(self, data, text_field, stopw_path, n_components=5, n_top_words=3, threshold=0.3):
        """Fit LDA on ``data[text_field]`` and return ``data`` with topic columns added.

        Parameters
        ----------
        data : pandas.DataFrame
            Frame containing the documents.
        text_field : str
            Column holding the document text.
        stopw_path : str
            Path to a UTF-8 stop-word file, one word per line.
        n_components : int, optional
            Number of LDA topics (default 5, the value previously hard-coded).
        n_top_words : int, optional
            Number of highest-weighted words used to label each topic (default 3).
        threshold : float, optional
            Minimum document-topic weight for a topic flag to be set (default 0.3).

        Returns
        -------
        pandas.DataFrame
            ``data`` joined with one 0/1 column per topic (named
            ``'word1 | word2 | ...'``) plus an ``otros`` column.
        """
        stop_spanish = self.__get_stopwords(stopw_path)
        count = CountVectorizer(stop_words=stop_spanish, max_df=0.1, max_features=5000)
        X = count.fit_transform(data[text_field].values)

        lda = LatentDirichletAllocation(n_components=n_components, random_state=123, learning_method='batch')
        # Keep the fit_transform result instead of running a second transform pass.
        doc_topics = lda.fit_transform(X)

        feature_names = count.get_feature_names_out()
        topics_discovered = [
            # argsort is ascending; the reversed tail slice yields the top words first.
            ' | '.join(feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1])
            for topic in lda.components_
        ]
        print(topics_discovered)

        return self._attach_topic_flags(data, doc_topics, topics_discovered, threshold)


In [18]:
# Read the semicolon-separated CSV as UTF-8, silently skipping undecodable bytes.
dataframe = pd.read_csv(
    './notebooks/dataset/banca_respuestas.csv',
    sep=";",
    encoding='utf_8',
    encoding_errors="ignore",
)
# Remove, in place, every row that has at least one missing value.
dataframe.dropna(axis=0, inplace=True)

# Instantiate the two feature extractors defined in the cells above.
td = TopicDiscovery()
se = SentimentEmotion()


##### TopicDiscovery __init__ #####
####### Init pre-trained models #########


loading configuration file config.json from cache at /Users/beltre.wilton/.cache/huggingface/hub/models--pysentimiento--robertuito-sentiment-analysis/snapshots/e3be95c8efad7f480ce8aab2221188ecb78e40f3/config.json
Model config RobertaConfig {
  "_name_or_path": "pysentimiento/robertuito-sentiment-analysis",
  "architectures": [
    "RobertaForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "NEG",
    "1": "NEU",
    "2": "POS"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "NEG": 0,
    "NEU": 1,
    "POS": 2
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 130,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "problem_

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
loading configuration file config.json from cache at /Users/beltre.wilton/.cache/huggingface/hub/models--pysentimiento--robertuito-ner/snapshots/c5c1a4673c8e833e9a66b5bf2942988e65349538/config.json
Model config RobertaConfig {
  "_name_or_path": "pysentimiento/robertuito-ner",
  "architectures": [
    "RobertaForTokenClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "O",
    "1": "B-EVENT",
    "2": "I-EVENT",
    "3": "B-GROUP",
    "4": "I-GROUP",
    "5": "B-LOC",
    "6": "I-LO

In [19]:
# Name of the text column that the topic and sentiment steps operate on.
text_field = 'razones'

# Clean the frame with TopicDiscovery.preprocessing and rebind `dataframe` to the result.
dataframe = td.preprocessing(data_tr=dataframe, text_field=text_field)


# Show (rows, columns) of the preprocessed frame.
dataframe.shape

(2471, 3)

In [22]:
# NOTE(review): in this view `df_feaured` is first assigned by the `td.discover(...)` cell
# further down, so this cell would raise NameError on a fresh top-to-bottom run.
df_feaured.shape

(2471, 9)

In [21]:
# Rebind `td` to a fresh TopicDiscovery, then run topic discovery on the text column;
# the returned frame is `dataframe` joined with the topic columns.
td = TopicDiscovery()
df_feaured = td.discover(data=dataframe, text_field=text_field, stopw_path='./prototype/web/components/stopwords_spanish.txt')


# Optional down-sampling to 500 rows, currently disabled.
# df_feaured = df_feaured.sample(500, random_state=42)


##### TopicDiscovery __init__ #####
['crdito | hacer | dinero', 'ms | intereses | inters', 'colas | ir | evito', 'problema | comision | accesible', 'sistema | retirar | agentes']


In [8]:
# Show (rows, columns) of the frame returned by topic discovery.
df_feaured.shape

(2471, 9)

In [23]:
# Add the sentiment, emotion and hate-speech columns computed from `text_field`.
df_feaured = se.extract(df=df_feaured, text_field=text_field)



In [26]:
# Write the enriched frame to CSV without the index column.
df_feaured.to_csv('./notebooks/dataset/banca_respuestas_se_topic_features.csv', index=False)