In [1]:
import sys
# Put the parent directory on the import path so that the `utils` package
# imported in the following cells can be resolved from this notebook.
sys.path.append('../')

In [3]:
from utils.utils import Utils
#from utils.preprocess import Preprocess

import pandas as pd
import numpy as np

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/eastwind/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [4]:
# File of the class Preprocess, containing various functions for text preprocessing
# File: preprocess.py
# Author: Atharva Kulkarni


import pandas as pd
import numpy as np
import re
import spacy
import nltk
from nltk.corpus import stopwords
from nltk.corpus import words, wordnet, brown
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import unicodedata
from pycontractions import Contractions
from autocorrect import Speller
from utils import Utils

# Fetch every NLTK resource the Preprocess class relies on (stop-word list,
# tokenizer model and the three vocabularies merged into its word list).
# The order matches the original sequence of individual calls.
for _nltk_resource in ('stopwords', 'punkt', 'wordnet', 'words', 'brown'):
    nltk.download(_nltk_resource)






class Preprocess():
    """Helper class for normalizing and cleaning free-form text.

    Two pipelines are offered:

    * ``normalize_text`` / ``normalize_corpus``: spaces out punctuation,
      strips accents, expands abbreviations and contractions, collapses
      letter repetitions ("wordplay") and auto-corrects spelling.
    * ``clean_text``: strips e-mails, punctuation, digits and single
      letters, lower-cases, and optionally removes stop words and
      lemmatizes.
    """

    # Matches any doubled character inside a word; compiled once because
    # remove_wordplay() applies it repeatedly for every token.
    _WORDPLAY_PATTERN = re.compile(r"(\w*)(\w)\2(\w*)")

    # Abbreviation table used when no explicit path is given to __init__.
    _DEFAULT_ABBREVIATIONS_PATH = "/home/eastwind/PycharmProjects/WASSA-2021-Shared-Task/resources/custom-dictionaries/social-media-abbreviations.csv"

    # -------------------------------------------- Class Constructor --------------------------------------------

    def __init__(self, mode="normalize", contractions_model_path="/home/eastwind/word-embeddings/word2vec/GoogleNews-vectors-negative300.bin", abbreviations_path=None):
        """ Class Constructor
        @param mode (str): when equal to "normalize" the contractions model is
                           loaded; any other value skips that load, in which
                           case expand_contractions()/normalize_text() cannot
                           be used.
        @param contractions_model_path (str): model to be loaded for contractions expansion.
        @param abbreviations_path (str): CSV file with "acronym" and "full_form"
                           columns; None selects the built-in default path.
        """
        self.utils = Utils()
        self.stop_words = stopwords.words('english')
        self.wordnet_lemmatizer = WordNetLemmatizer()
        self.speller = Speller(lang='en')
        self.wordlist = set(words.words()).union(set(wordnet.words()), set(brown.words()))
        # POS tags for which a token is never rewritten.
        self.nouns = ['NNP', 'NNPS']
        self.nlp = spacy.load('en_core_web_sm')
        self.abbreviations_path = abbreviations_path or self._DEFAULT_ABBREVIATIONS_PATH
        # Filled on first use by _get_abbreviations().
        self._abbr_dict = None
        # Always defined so a missing model is detected explicitly.
        self.cont = None
        if mode == "normalize":
            self.cont = Contractions(contractions_model_path)
            self.cont.load_models()

    # -------------------------------------------- Abbreviation dictionary (cached) --------------------------------------------

    def _get_abbreviations(self):
        """ Return the acronym -> full form mapping, reading the file only once.
        return (dict-like): mapping produced by Utils.get_dict.
        """
        abbr_dict = getattr(self, "_abbr_dict", None)
        if abbr_dict is None:
            path = getattr(self, "abbreviations_path", None) or self._DEFAULT_ABBREVIATIONS_PATH
            abbr_dict = self.utils.get_dict(path, key_column="acronym", value_column="full_form")
            self._abbr_dict = abbr_dict
        return abbr_dict

    # -------------------------------------------- Function to expand contractions --------------------------------------------

    def expand_contractions(self, text):
        """ Function to expand contractions
        @param text (str): input text to expand contractions.
        return text (str): Contraction expanded text.
        raises RuntimeError: if the contractions model was not loaded
                             (instance not created with mode="normalize").
        """
        cont = getattr(self, "cont", None)
        if cont is None:
            raise RuntimeError(
                "Contractions model is not loaded; create Preprocess with mode=\"normalize\"."
            )
        return list(cont.expand_texts([text], precise=True))[0]

    # -------------------------------------------- Function to Correct Spellings --------------------------------------------

    def correct_spelling(self, word, pos):
        """ Function to autocorrect words
        @param word (str): possibly misspelled word.
        @param pos (str): POS tag of the word; words tagged as one of
                          self.nouns are left untouched.
        return (str): the word unchanged when it is known or tagged as listed
                      in self.nouns, otherwise the speller's output for the
                      lower-cased word.
        """
        if word.lower() in self.wordlist or pos in self.nouns:
            return word
        return self.speller(word.lower())

    # --------------------------------------- Remove Wordplay ---------------------------------------

    def remove_wordplay(self, word, pos):
        """ Function to collapse repeated letters (e.g. "coooool" -> "cool").
        One doubled character is removed per pass until the word is found in
        self.wordlist or no doubled character is left.
        @param word (str): token to process.
        @param pos (str): POS tag of the token; tokens tagged as one of
                          self.nouns are returned as-is.
        return (str): the de-duplicated word.
        """
        while True:
            if word.lower() in self.wordlist or pos in self.nouns:
                return word
            new_word = self._WORDPLAY_PATTERN.sub(r"\1\2\3", word)
            if new_word == word:
                # Nothing left to collapse.
                return word
            word = new_word

    # -------------------------------------------- Function to normalize input text --------------------------------------------

    def normalize_text(self, text):
        """ Function to normalize text inputs.
        @param text (str): Input text.
        return text (str): normalized text with space separated tokens.
        """
        # Adding space for all punctuation marks
        text = re.sub(r"([\w/'+$\s-]+|[^\w/'+$\s-]+)\s*", r"\1 ", text)

        # Remove accented characters
        text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')

        # Remove long spaces
        text = re.sub(r'^\s*|\s\s*', ' ', text).strip()

        tokenized_text = text.split()

        # Loaded once per instance instead of once per call.
        abbr_dict = self._get_abbreviations()
        for i, token in enumerate(tokenized_text):
            key = re.sub(r'[^\w\s]', '', token).lower()

            # Expand acronyms
            if key in abbr_dict:
                token = abbr_dict[key]

            # Expand contractions
            tokenized_text[i] = self.expand_contractions(token)

        doc = self.nlp(" ".join(tokenized_text))

        # Remove wordplay
        text = " ".join([self.remove_wordplay(word.text, word.tag_) for word in doc])

        # Correct Spellings (re-tagged because the tokens may have changed)
        doc = self.nlp(text)
        return " ".join([self.correct_spelling(word.text, word.tag_) for word in doc])

    # -------------------------------------------- Function to Normalize corpus --------------------------------------------

    def normalize_corpus(self, df, column_name):
        """ Function to normalize corpus.
        @param df (pandas.DataFrame): frame holding the corpus; the column is
                                      overwritten in place.
        @param column_name (str): name of column to normalize.
        return df (pandas.DataFrame): the same frame that was passed in.
        """
        df[column_name] = df[column_name].apply(self.normalize_text)
        return df

    # -------------------------------------------- Function to clean text --------------------------------------------

    def clean_text(self, text, remove_stopwords=True, lemmatize=True):
        """ Function to clean text
        @param text (str): text to be cleaned
        @param remove_stopwords (bool): To remove stopwords or not.
        @param lemmatize (bool): to lemmatize or not.
        return text (str): lower-cased, cleaned text.
        """
        # Remove emails
        text = re.sub(r'\S*@\S*\s?', '', text)

        # Remove new line characters
        text = re.sub(r'\s+', ' ', text)

        # Remove distracting single quotes
        text = re.sub(r"'", '', text)

        # Remove punctuations and numbers
        text = re.sub(r'[^a-zA-Z]', ' ', text)

        # Remove single characters. Only letters and spaces remain at this
        # point, so a word boundary on both sides isolates one-letter tokens
        # (including those at the very start or end of the text).
        text = re.sub(r'\b[a-zA-Z]\b', ' ', text)

        # remove multiple spaces
        text = re.sub(r'\s+', ' ', text).strip()
        text = text.lower()

        if not remove_stopwords and not lemmatize:
            return text

        # Remove unnecessary stopwords
        if remove_stopwords:
            tokens = word_tokenize(text)
            text = " ".join([word for word in tokens if word not in self.stop_words])

        # Word lemmatization
        if lemmatize:
            doc = self.nlp(text)
            # isalpha() also rejects the '-PRON-' placeholder lemma, so no
            # separate check for it is required.
            text = " ".join([word.lemma_.lower() for word in doc if word.lemma_.isalpha()])

        return text
        
        
        
        

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/eastwind/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/eastwind/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/eastwind/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package words to /home/eastwind/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package brown to /home/eastwind/nltk_data...
[nltk_data]   Package brown is already up-to-date!


In [5]:
%%time
# Helper object used below for reading the dataset files.
utils = Utils()
# mode="normalize" makes the constructor load the contractions model as well
# (see Preprocess.__init__), which normalize_corpus() relies on later.
pre = Preprocess(mode="normalize")

CPU times: user 39.5 s, sys: 5.45 s, total: 45 s
Wall time: 1min 30s


In [62]:
%%time
# Load the gold-standard emotion labels of the test split.
emo_data = utils.read_data("../dataset/test/gold_standard_test_EMO.tsv")
# Display the frame dimensions as a quick sanity check.
emo_data.shape

CPU times: user 9.5 ms, sys: 0 ns, total: 9.5 ms
Wall time: 56 ms


(525, 1)

In [63]:
%%time
# Load the test messages together with their feature columns.
data = utils.read_data("../dataset/test/messages_test_features_ready_for_WS.tsv")
# Display the frame dimensions as a quick sanity check.
data.shape

CPU times: user 24 ms, sys: 0 ns, total: 24 ms
Wall time: 463 ms


(525, 18)

In [64]:
# Attach the gold emotion labels as a new column. The values are assigned
# positionally, so this assumes both files list the messages in the same
# order -- TODO confirm.
data['gold_emotion'] = emo_data.emotion.values.tolist()
# data['gold_empathy'] = emo_data.empathy.values.tolist()
# data['gold_distress'] = emo_data.distress.values.tolist()
data.shape

(525, 19)

In [65]:
# Position of the essay inspected before and after normalization.
post_no = 111
# Show the raw essay text at that position.
data.essay.values.tolist()[post_no]

"This isn't very surprising at all, is it? Another one of Putin's bodies. I like how it's just a thing that we expect to happen, and no one really cares very much except as a minor blip on the news. It's just expected, and the fact that people in our own government have an association with people who operate like this is largely going to be gotten away with. What do you think?"

In [66]:
%%time
# normalize_corpus() overwrites the given column in place, so a copy is passed
# to keep the raw essays in `data` untouched.
normalized_data = pre.normalize_corpus(data.copy(), column_name="essay")
normalized_data

CPU times: user 17.7 s, sys: 71.4 ms, total: 17.8 s
Wall time: 19.6 s


Unnamed: 0,message_id,response_id,article_id,essay,gender,education,race,age,income,personality_conscientiousness,personality_openess,personality_extraversion,personality_agreeableness,personality_stability,iri_perspective_taking,iri_personal_distress,iri_fantasy,iri_empathatic_concern,gold_emotion
0,R_2qabL3aLPfRMyA1_1,R_2qabL3aLPfRMyA1,17,"Hello Friend , i am writing to you as regards ...",2.0,6.0,3.0,22.0,100000.0,5.5,4.5,3.5,6.0,6.0,3.714,2.857,2.571,3.429,sadness
1,R_2qabL3aLPfRMyA1_2,R_2qabL3aLPfRMyA1,164,Hello friend i will like to tell you that Indi...,2.0,6.0,3.0,22.0,100000.0,5.5,4.5,3.5,6.0,6.0,3.714,2.857,2.571,3.429,joy
2,R_2qabL3aLPfRMyA1_3,R_2qabL3aLPfRMyA1,196,Hello friend I will like to let you know Leona...,2.0,6.0,3.0,22.0,100000.0,5.5,4.5,3.5,6.0,6.0,3.714,2.857,2.571,3.429,neutral
3,R_2qabL3aLPfRMyA1_4,R_2qabL3aLPfRMyA1,259,"Hello friend , I will like to tell you Qatar l...",2.0,6.0,3.0,22.0,100000.0,5.5,4.5,3.5,6.0,6.0,3.714,2.857,2.571,3.429,neutral
4,R_2qabL3aLPfRMyA1_5,R_2qabL3aLPfRMyA1,361,"Dear friend , I will like to know that Trump s...",2.0,6.0,3.0,22.0,100000.0,5.5,4.5,3.5,6.0,6.0,3.714,2.857,2.571,3.429,neutral
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
520,R_1gShpZuf35KXreS_1,R_1gShpZuf35KXreS,73,"Hey , I have always liked Billy Bob Thornton a...",2.0,4.0,1.0,28.0,21000.0,7.0,5.0,1.0,5.0,3.5,4.143,3.286,4.429,3.571,joy
521,R_1gShpZuf35KXreS_2,R_1gShpZuf35KXreS,164,The fact that Donald Trump just peaced out of ...,2.0,4.0,1.0,28.0,21000.0,7.0,5.0,1.0,5.0,3.5,4.143,3.286,4.429,3.571,anger
522,R_1gShpZuf35KXreS_3,R_1gShpZuf35KXreS,182,I am not okay ! How can anyone harm something ...,2.0,4.0,1.0,28.0,21000.0,7.0,5.0,1.0,5.0,3.5,4.143,3.286,4.429,3.571,sadness
523,R_1gShpZuf35KXreS_4,R_1gShpZuf35KXreS,260,I do not know anything outside of this article...,2.0,4.0,1.0,28.0,21000.0,7.0,5.0,1.0,5.0,3.5,4.143,3.286,4.429,3.571,sadness


In [67]:
# Show the same essay after normalization for comparison with the raw text.
normalized_data.essay.values.tolist()[post_no]

"This is not very surprising at all , is it ? Another one of Putin 's bodies . I like how it is just a thing that we expect to happen , and no one really cares very much except as a minor blip on the news . it is just expected , and the fact that people in our own government have an association with people who operate like this is largely going to be gotten away with . What do you think ?"

In [56]:
# normalized_data['gold_empathy_bin'] = normalized_data.gold_empathy.apply(lambda x: 1 if x>=4.0 else 0)
# normalized_data.shape

(1860, 27)

In [57]:
# normalized_data['gold_distress_bin'] = normalized_data.gold_distress.apply(lambda x: 1 if x>=4.0 else 0)
# normalized_data.shape

(1860, 28)

In [68]:
# Persist the normalized test set as a comma-separated file, without writing
# the DataFrame index.
normalized_data.to_csv("../dataset/test/test-data-normalized.csv", sep=",", index=False)

In [70]:
# Read the saved file back and display it to verify the export.
df = pd.read_csv("../dataset/test/test-data-normalized.csv")
df

Unnamed: 0,message_id,response_id,article_id,essay,gender,education,race,age,income,personality_conscientiousness,personality_openess,personality_extraversion,personality_agreeableness,personality_stability,iri_perspective_taking,iri_personal_distress,iri_fantasy,iri_empathatic_concern,gold_emotion
0,R_2qabL3aLPfRMyA1_1,R_2qabL3aLPfRMyA1,17,"Hello Friend , i am writing to you as regards ...",2.0,6.0,3.0,22.0,100000.0,5.5,4.5,3.5,6.0,6.0,3.714,2.857,2.571,3.429,sadness
1,R_2qabL3aLPfRMyA1_2,R_2qabL3aLPfRMyA1,164,Hello friend i will like to tell you that Indi...,2.0,6.0,3.0,22.0,100000.0,5.5,4.5,3.5,6.0,6.0,3.714,2.857,2.571,3.429,joy
2,R_2qabL3aLPfRMyA1_3,R_2qabL3aLPfRMyA1,196,Hello friend I will like to let you know Leona...,2.0,6.0,3.0,22.0,100000.0,5.5,4.5,3.5,6.0,6.0,3.714,2.857,2.571,3.429,neutral
3,R_2qabL3aLPfRMyA1_4,R_2qabL3aLPfRMyA1,259,"Hello friend , I will like to tell you Qatar l...",2.0,6.0,3.0,22.0,100000.0,5.5,4.5,3.5,6.0,6.0,3.714,2.857,2.571,3.429,neutral
4,R_2qabL3aLPfRMyA1_5,R_2qabL3aLPfRMyA1,361,"Dear friend , I will like to know that Trump s...",2.0,6.0,3.0,22.0,100000.0,5.5,4.5,3.5,6.0,6.0,3.714,2.857,2.571,3.429,neutral
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
520,R_1gShpZuf35KXreS_1,R_1gShpZuf35KXreS,73,"Hey , I have always liked Billy Bob Thornton a...",2.0,4.0,1.0,28.0,21000.0,7.0,5.0,1.0,5.0,3.5,4.143,3.286,4.429,3.571,joy
521,R_1gShpZuf35KXreS_2,R_1gShpZuf35KXreS,164,The fact that Donald Trump just peaced out of ...,2.0,4.0,1.0,28.0,21000.0,7.0,5.0,1.0,5.0,3.5,4.143,3.286,4.429,3.571,anger
522,R_1gShpZuf35KXreS_3,R_1gShpZuf35KXreS,182,I am not okay ! How can anyone harm something ...,2.0,4.0,1.0,28.0,21000.0,7.0,5.0,1.0,5.0,3.5,4.143,3.286,4.429,3.571,sadness
523,R_1gShpZuf35KXreS_4,R_1gShpZuf35KXreS,260,I do not know anything outside of this article...,2.0,4.0,1.0,28.0,21000.0,7.0,5.0,1.0,5.0,3.5,4.143,3.286,4.429,3.571,sadness
