# Workshop Final - NLP - Clasificación de Sentimientos

**Integrantes del grupo:**

- Kevin Tasat
- Martha Alvarez
- Fernando Roa

### Resumen



El dataset a evaluar tiene un total de 16000 textos a los cuales se les asocian diferentes sentimientos o emociones. El objetivo de este trabajo es, mediante técnicas de procesamiento de lenguaje natural, preparar la información y posteriormente entrenar un modelo de clasificación de emociones.

## Telegram BOT

### Cargue de librerías

In [5]:
import telegram.ext
import joblib
from sklearn.base import BaseEstimator, TransformerMixin

### Creación de clases

In [17]:
class negation_transformer(BaseEstimator, TransformerMixin):
    """Scikit-learn transformer that rewrites negated words as their antonyms.

    A word that follows a negation marker is first tagged as ``"not <word>"``
    and then, when WordNet knows an adjective antonym for it, replaced by that
    antonym. The goal is to keep the inverted meaning of the sentence once the
    text has gone through stop-word removal.

    NOTE(review): this class relies on the module-level names ``wn``,
    ``word_tokenize`` and ``pd``, which are not imported in the visible
    cells -- presumably ``nltk.corpus.wordnet``, ``nltk.word_tokenize`` and
    ``pandas``; confirm they are in scope before the pipeline is used.
    """

    def __init__(self):
        pass

    @staticmethod
    def negate_sequence(text):
        """Return ``text`` with negated words replaced by an antonym.

        Args:
            text: Sentence to process; it is split on whitespace.

        Returns:
            The lower-cased words, stripped of the punctuation in
            ``"?.,!:;"``, joined with single spaces. A negated word becomes
            its antonym when one is found, and stays as ``"not <word>"``
            otherwise.
        """

        def antonyms_for(word):
            """Return one adjective antonym of ``word``, or ``word`` itself."""
            antonyms = set()
            for synset in wn.synsets(word):
                for lemma in synset.lemmas():
                    for antonym in lemma.antonyms():
                        name = antonym.name()
                        # Keep only antonyms that can act as adjectives.
                        if any(s.pos() == wn.ADJ for s in wn.synsets(name)):
                            antonyms.add(name)
            # min() instead of "first element of the set": set order depends
            # on string hashing, so the choice used to vary between runs.
            return min(antonyms) if antonyms else word

        delims = "?.,!:;"
        negation_markers = ("not", "n't", "no")

        # Pass 1: prefix every word inside a negation scope with "not ".
        negation = False
        tagged = []
        for word in text.split():
            stripped = word.strip(delims).lower()
            tagged.append("not " + stripped if negation else stripped)

            # A marker toggles the scope, so a second marker switches it off.
            # NOTE(review): this is a substring test, so words such as "know"
            # or "now" toggle the scope too. Left as is because the pickled
            # model was presumably trained on this behaviour -- confirm
            # before tightening it.
            if any(marker in word for marker in negation_markers):
                negation = not negation

            # Any delimiter attached to the word closes the negation scope.
            if any(c in word for c in delims):
                negation = False

        # Pass 2: swap each clean "not <word>" item for an antonym.
        result = []
        for item in tagged:
            tokens = word_tokenize(item)
            # len() is checked first: a word made only of delimiters yields an
            # empty item and an empty token list, which used to raise
            # IndexError. The join comparison skips items the tokenizer
            # splits differently (e.g. "not don't" -> "not do n't").
            if (
                len(tokens) > 1
                and tokens[0] == "not"
                and tokens[1] != "feel"
                and " ".join(tokens) == item
            ):
                result.append(antonyms_for(tokens[1]))
            else:
                result.append(item)

        return " ".join(result)

    def fit(self, texto, y=None):
        """Do nothing and return ``self``; the transformer has no state."""
        return self

    def transform(self, texto):
        """Apply :meth:`negate_sequence` to every text.

        Args:
            texto: A ``pd.Series`` of strings, or anything ``pd.Series``
                accepts (a single string becomes a one-element Series).

        Returns:
            A ``pd.Series`` with the transformed texts.
        """
        if not isinstance(texto, pd.Series):
            texto = pd.Series(texto)
        return texto.apply(self.negate_sequence)

In [7]:
class clean_texto(BaseEstimator, TransformerMixin):
    """Scikit-learn transformer that normalises raw text before vectorising.

    NOTE(review): this class relies on the module-level names ``re``,
    ``defaultdict``, ``pd``, ``wn``, ``word_tokenize``, ``pos_tag``,
    ``WordNetLemmatizer``, ``contractions`` and ``stop_words``, none of which
    is defined in the visible cells -- confirm they are in scope before the
    pipeline is used.
    """

    def __init__(self):
        pass

    @staticmethod
    def clean_text(text):
        """Remove unwanted characters and stop words, then lemmatize.

        Steps, in order: lower-case, expand contractions, drop URLs and HTML
        leftovers, replace punctuation with spaces, remove stop words and
        lemmatize every remaining token with its part-of-speech tag.

        Args:
            text: Raw text to clean.

        Returns:
            The cleaned text as a single space-separated string.
        """
        text = text.lower()

        # Replace contractions with their longer forms.
        text = " ".join(
            contractions[word] if word in contractions else word
            for word in text.split()
        )

        # Format words and remove unwanted characters.
        text = re.sub(r'https?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE)
        text = re.sub(r'\<a href', ' ', text)
        text = re.sub(r'&amp;', '', text)
        # Must run before the punctuation strip below: that strip replaces
        # "/" with a space, after which "<br />" could never match.
        text = re.sub(r'<br />', ' ', text)
        text = re.sub(r'[_"\-;%()|+&=*%.,!?:#$@\[\]/]', ' ', text)
        text = re.sub(r'\'', ' ', text)

        # Remove stop words.
        text = " ".join(w for w in text.split() if w not in stop_words)

        # Spelling correction with TextBlob was tried and dropped: it was too
        # slow and did not give good results.

        # Map the first letter of the POS tag to a WordNet category; the
        # defaultdict makes every other tag fall back to NOUN instead of
        # raising KeyError.
        tag_map = defaultdict(lambda: wn.NOUN)
        tag_map['J'] = wn.ADJ
        tag_map['V'] = wn.VERB
        tag_map['R'] = wn.ADV

        lemmatizer = WordNetLemmatizer()
        lemmas = [
            lemmatizer.lemmatize(token, tag_map[tag[0]])
            for token, tag in pos_tag(word_tokenize(text))
        ]

        # Join once, after the loop; with no tokens the text is returned
        # unchanged.
        return " ".join(lemmas) if lemmas else text

    def fit(self, texto, y=None):
        """Do nothing and return ``self``; the transformer has no state."""
        return self

    def transform(self, texto):
        """Apply :meth:`clean_text` to every text.

        Args:
            texto: A ``pd.Series`` of strings, or anything ``pd.Series``
                accepts (a single string becomes a one-element Series).

        Returns:
            A ``pd.Series`` with the cleaned texts.
        """
        if not isinstance(texto, pd.Series):
            texto = pd.Series(texto)
        return texto.apply(self.clean_text)

In [8]:
# Load the trained model.
# NOTE(review): joblib files are pickle-based, so only load a file you trust.
# Presumably the pickled pipeline references the transformer classes defined
# in the cells above, which is why they must exist before this runs -- confirm.

path_pipe = "pipe_model.sav"
pipe = joblib.load(path_pipe)

### Cargue del bot de telegram

In [9]:
# Read the bot token from a local file so it is not hard-coded in the notebook.
with open('lala.py', 'r') as f:
    # Strip surrounding whitespace: text files usually end with a newline,
    # and a token containing whitespace is rejected as invalid.
    token = f.read().strip()

In [10]:
# Create the updater from the token read above and keep a handle on its
# dispatcher, where the handlers are registered further down.
# use_context=True matches the (update, context) callbacks defined in this file.
updater=telegram.ext.Updater(token,use_context=True)
disp=updater.dispatcher

In [11]:
def start(update, context):
    """Answer the /start command with the bot's greeting."""
    greeting = "Hello I'm SentiBot, I'm here for you, please tell me how you feel"
    update.message.reply_text(greeting)

In [12]:
def ayuda(update, context):
    """Answer the /help command with a short description of the bot."""
    about = "This is a Sentiment Analyzer Bot created by Group 2 of DH Academy"
    update.message.reply_text(about)

In [13]:
def handle_message(update, context):
    """Classify the emotion of an incoming text message and reply with it.

    Args:
        update: Incoming Telegram update.
        context: Callback context supplied by the dispatcher (unused).

    Returns:
        Whatever ``reply_text`` returns, or ``None`` when the update carries
        no text to classify.
    """
    # effective_message also covers edited messages and channel posts, for
    # which update.message is None and the old code raised AttributeError.
    message = update.effective_message
    if message is None or message.text is None:
        return None

    user_message = message.text
    # NOTE(review): "111" looks like a hard-coded sample of a non-feeling
    # input -- confirm whether a more general check is intended.
    if user_message == "111":
        return message.reply_text("That's not a feeling")

    # predict() returns one label per input text; only one text is sent.
    prediction = pipe.predict(user_message)[0]
    return message.reply_text(f"You feel {prediction}")
    

In [14]:
# Register the handlers: /start and /help first, then a handler for text.
# NOTE(review): registration order presumably matters (first matching handler
# wins), and Filters.text may also match unknown "/commands", which would then
# be classified as feelings -- confirm against the python-telegram-bot docs.
disp.add_handler(telegram.ext.CommandHandler("start",start))
disp.add_handler(telegram.ext.CommandHandler("help",ayuda))
disp.add_handler(telegram.ext.MessageHandler(telegram.ext.Filters.text,handle_message))

In [15]:
# Start fetching updates from Telegram, then keep the cell running;
# idle() presumably blocks until the process is interrupted -- confirm.
updater.start_polling()
updater.idle()