# Pipeline Classes - AI Democracy

This notebook implements the classes responsible for the preprocessing pipeline. Note that it assumes you have already passed the text through the TypoParser functions in the previous notebooks. All these objects are designed to work only with the text (X) and label (y) columns.

# Importing main libraries

In [1]:
!pip install -U spacy

Collecting spacy
[?25l  Downloading https://files.pythonhosted.org/packages/1b/d8/0361bbaf7a1ff56b44dca04dace54c82d63dad7475b7d25ea1baefafafb2/spacy-3.0.6-cp37-cp37m-manylinux2014_x86_64.whl (12.8MB)
[K     |████████████████████████████████| 12.8MB 277kB/s 
Collecting spacy-legacy<3.1.0,>=3.0.4
  Downloading https://files.pythonhosted.org/packages/8d/67/d4002a18e26bf29b17ab563ddb55232b445ab6a02f97bf17d1345ff34d3f/spacy_legacy-3.0.5-py2.py3-none-any.whl
Collecting thinc<8.1.0,>=8.0.3
[?25l  Downloading https://files.pythonhosted.org/packages/61/87/decceba68a0c6ca356ddcb6aea8b2500e71d9bc187f148aae19b747b7d3c/thinc-8.0.3-cp37-cp37m-manylinux2014_x86_64.whl (1.1MB)
[K     |████████████████████████████████| 1.1MB 40.2MB/s 
Collecting catalogue<2.1.0,>=2.0.3
  Downloading https://files.pythonhosted.org/packages/9c/10/dbc1203a4b1367c7b02fddf08cb2981d9aa3e688d398f587cea0ab9e3bec/catalogue-2.0.4-py3-none-any.whl
Collecting typer<0.4.0,>=0.3.0
  Downloading https://files.pythonhosted.org/pac

In [2]:
!python -m spacy download pt_core_news_lg

2021-06-10 15:29:05.448256: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0
Collecting pt-core-news-lg==3.0.0
[?25l  Downloading https://github.com/explosion/spacy-models/releases/download/pt_core_news_lg-3.0.0/pt_core_news_lg-3.0.0-py3-none-any.whl (578.1MB)
[K     |████████████████████████████████| 578.1MB 27kB/s 
Installing collected packages: pt-core-news-lg
Successfully installed pt-core-news-lg-3.0.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('pt_core_news_lg')


In [3]:
import pandas as pd
import re
import nltk
import numpy as np
import pickle
import spacy
from gensim.test.utils import common_texts
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.models import Word2Vec, KeyedVectors

# Loading the Data

In [4]:
# Load the dataframe stored in 'no_typos.pkl'.
# NOTE(review): pickle.load can execute arbitrary code; only open files from a trusted source.
with open('no_typos.pkl', 'rb') as f:
    df = pickle.load(f)

# Text column that the preprocessing pipeline below transforms.
X = df['Text']

# TODO: the classification labels are not available yet; enable this once they are.
# y = df['classification']  # binary: 0 - interrupt; 1 - continuity

# Tokenizer

In [5]:
class CustomTokenizer():
    """Turn raw text into lists of lowercase word tokens.

    Person and party names learned by :meth:`fit` are collapsed into single
    tokens (lowercased, whitespace removed) so that a multi-word name survives
    the whitespace split as one token, e.g.
    "Inês de Sousa Real" -> "inêsdesousareal".

    Parameters
    ----------
    mappers : str, optional
        Path to a pickle holding a ``(person_map, party_map)`` pair as written
        by :meth:`fit`. When empty, both maps start out empty.
    custom_specials : str or iterable of str, optional
        Substrings replaced by a single space before tokenizing. The value
        ``'default'`` selects ``DEFAULT_SPECIALS``.
    """

    # Punctuation, symbols and digits that are blanked out by default. The
    # backslash is escaped explicitly so the literal contains no invalid
    # escape sequence.
    DEFAULT_SPECIALS = "!\"#$%&'()*+,¸./:;<=>?@[\\]^_`{|}-–⎯—«»´°‘’…~ªº€0123456789"

    def __init__(self, mappers='', custom_specials='default'):
        if custom_specials == 'default':
            self.custom_specials = self.DEFAULT_SPECIALS
        else:
            self.custom_specials = custom_specials

        # Start with empty maps so transform() works even before fit().
        self.person_map = {}
        self.party_map = {}

        if mappers:
            # NOTE: pickle can execute arbitrary code on load; only pass
            # mapper files that come from a trusted source.
            with open(mappers, 'rb') as f:
                loaded = pickle.load(f)
            self.person_map = loaded[0]
            self.party_map = loaded[1]

    @staticmethod
    def _build_mapper(names):
        """Map each name's collapsed lowercase form to the original name."""
        mapper = {}
        for name in names:
            # Missing values carry no name to protect; skip them.
            if pd.isna(name):
                continue
            name = str(name)
            key = ''.join(name.lower().split())
            if key:
                mapper[key] = name
        return mapper

    def fit(self, dataframe, output_path='mappers.pkl'):
        """Learn the person and party names that must stay single tokens.

        Parameters
        ----------
        dataframe : pandas.DataFrame
            Must contain the columns ``'Person'`` and ``'Party'``.
        output_path : str, optional
            Where to pickle the ``(person_map, party_map)`` pair. Pass an
            empty value to skip writing the file.

        Returns
        -------
        CustomTokenizer
            ``self``, so calls can be chained.
        """
        self.person_map = self._build_mapper(dataframe['Person'].unique())
        self.party_map = self._build_mapper(dataframe['Party'].unique())

        if output_path:
            with open(output_path, 'wb') as f:
                pickle.dump((self.person_map, self.party_map), f)

        return self

    def remove_specials_chars(self, text):
        """Replace every configured special substring in ``text`` by a space."""
        for special_char in self.custom_specials:
            text = text.replace(special_char, ' ')
        # The hyphen is one of the default specials; restore it for this party
        # name so it can still be matched as a whole.
        text = text.replace('CDS PP', 'CDS-PP')
        return text

    def apply_mappers(self, text):
        """Collapse known person and party names into single lowercase tokens.

        Example: "Inês de Sousa Real" -> "inêsdesousareal".
        """
        for mapping in (self.person_map, self.party_map):
            # Replace longer names first so that a short name which is a
            # substring of a longer one (e.g. "PS" inside "PSD") cannot break
            # the longer match.
            by_length = sorted(mapping.items(),
                               key=lambda item: len(item[1]),
                               reverse=True)
            for key, name in by_length:
                text = text.replace(name, key)
        return text

    def convert_text(self, text):
        """Lowercase ``text`` and split it on whitespace into a token list."""
        return text.lower().split()

    def transform(self, X):
        """Tokenize every text of a pandas Series.

        Steps, in order: blank out special characters, collapse known names,
        then lowercase and split. Returns a Series of token lists.
        """
        X = X.apply(self.remove_specials_chars)
        X = X.apply(self.apply_mappers)
        X = X.apply(self.convert_text)
        return X


In [6]:
# Learn the person/party names from df, then tokenize every text.
# Note: X is rebound here, so re-running this cell alone feeds token lists back in.
tokenizer = CustomTokenizer()
tokenizer.fit(df)
X = tokenizer.transform(X)

In [7]:
X[0]

['dirijo',
 'um',
 'abraço',
 'a',
 'todos',
 'neste',
 'regresso',
 'dos',
 'plenários',
 'à',
 'casa',
 'da',
 'democracia',
 'esperávamos',
 'que',
 'nesta',
 'altura',
 'já',
 'pudéssemos',
 'ter',
 'regras',
 'mais',
 'flexíveis',
 'mas',
 'infelizmente',
 'os',
 'números',
 'e',
 'as',
 'consequências',
 'concretas',
 'não',
 'nos',
 'permitem',
 'tal',
 'e',
 'portanto',
 'continuamos',
 'no',
 'essencial',
 'com',
 'as',
 'regras',
 'que',
 'presidiram',
 'aos',
 'últimos',
 'plenários',
 'da',
 'sessão',
 'legislativa',
 'srs',
 'deputados',
 'da',
 'nossa',
 'ordem',
 'do',
 'dia',
 'constam',
 'declarações',
 'políticas',
 'porém',
 'antes',
 'disso',
 'a',
 'sr',
 'secretária',
 'mariadaluzrosinha',
 'fará',
 'o',
 'favor',
 'de',
 'anunciar',
 'a',
 'entrada',
 'de',
 'algumas',
 'iniciativas',
 'tem',
 'a',
 'palavra',
 'sr',
 'secretária']

# Stopwords

In [8]:
class StopwordsParser():
    """Remove stopwords from tokenized documents.

    Parameters
    ----------
    stopwords_file : str, optional
        Path to a text file with one stopword per line. When empty, no
        stopwords are loaded and documents pass through unchanged.
    """

    def __init__(self, stopwords_file=''):
        self.stopwords = []
        if stopwords_file:
            # 'utf-8-sig' reads plain UTF-8 and also drops a leading BOM, so the
            # first word is not silently corrupted. The context manager closes
            # the file handle.
            with open(stopwords_file, 'r', encoding='utf-8-sig') as f:
                lines = f.read().splitlines()
            # Surrounding whitespace and blank lines can never match a token.
            self.stopwords = [line.strip() for line in lines if line.strip()]

    def fit(self, X=None, y=None):
        """No-op kept for pipeline compatibility; returns ``self``."""
        return self

    def remove_stopwords(self, text):
        """Return the tokens of ``text`` that are not stopwords.

        ``text`` is an iterable of string tokens; a new list is returned.
        """
        # A set gives constant-time lookups instead of scanning the list for
        # every token.
        blocked = set(self.stopwords)
        return [token for token in text if token not in blocked]

    def transform(self, X):
        """Apply stopword removal to every token list of a pandas Series."""
        # Build the lookup set once for the whole Series.
        blocked = set(self.stopwords)
        return X.apply(
            lambda tokens: [token for token in tokens if token not in blocked]
        )


In [10]:
# Drop every token that is listed in 'complete_stopwords_set.txt'.
stopwords_parser = StopwordsParser('complete_stopwords_set.txt')
X = stopwords_parser.transform(X)

In [11]:
X[0]

['dirijo',
 'abraço',
 'todos',
 'neste',
 'regresso',
 'plenários',
 'casa',
 'democracia',
 'esperávamos',
 'nesta',
 'altura',
 'pudéssemos',
 'ter',
 'regras',
 'flexíveis',
 'infelizmente',
 'números',
 'consequências',
 'concretas',
 'permitem',
 'tal',
 'portanto',
 'continuamos',
 'essencial',
 'regras',
 'últimos',
 'plenários',
 'sessão',
 'legislativa',
 'ordem',
 'dia',
 'constam',
 'declarações',
 'políticas',
 'porém',
 'antes',
 'disso',
 'secretária',
 'mariadaluzrosinha',
 'fará',
 'favor',
 'anunciar',
 'entrada',
 'algumas',
 'iniciativas',
 'palavra',
 'secretária']

# Lemmatizer

In [12]:
class CustomLemmatizer():
    """Lemmatize token lists with spaCy and restore readable names.

    Parameters
    ----------
    mappers : str, optional
        Path to a pickle holding a ``(person_map, party_map)`` pair. When
        empty, both maps are empty and :meth:`undo_mapping` changes nothing.
    model_name : str, optional
        Name of the spaCy pipeline to load.
    max_length : int, optional
        Value assigned to ``nlp.max_length``.
    """

    def __init__(self, mappers='', model_name='pt_core_news_lg',
                 max_length=6136000):
        # Components listed here are excluded when loading the pipeline.
        self.nlp = spacy.load(
            model_name,
            exclude=['tok2vec', 'morphologizer', 'parser', 'senter',
                     'attribute_ruler', 'ner'],
        )
        self.nlp.max_length = max_length

        # Start with empty maps so undo_mapping() works without a mapper file.
        self.person_map = {}
        self.party_map = {}

        if mappers:
            # NOTE: pickle can execute arbitrary code on load; only pass
            # mapper files that come from a trusted source.
            with open(mappers, 'rb') as f:
                loaded = pickle.load(f)
            self.person_map = loaded[0]
            self.party_map = loaded[1]

    def fit(self, X=None, y=None):
        """No-op kept for pipeline compatibility; returns ``self``."""
        return self

    def undo_mapping(self, tokens):
        """Turn collapsed name tokens back into their readable form.

        A token found in ``person_map`` is replaced by the person name,
        otherwise one found in ``party_map`` by the party name; every other
        token is kept. Returns a new list and leaves ``tokens`` untouched.
        """
        restored = []
        for word in tokens:
            if word in self.person_map:
                restored.append(self.person_map[word])
            elif word in self.party_map:
                restored.append(self.party_map[word])
            else:
                restored.append(word)
        return restored

    def normalize_tokens(self, tokens):
        """Return the lemma of every token spaCy finds in the joined text.

        The tokens are joined with spaces and passed through the pipeline, so
        the result follows spaCy's own tokenization of that string.
        """
        document = self.nlp(' '.join(tokens))
        return [token.lemma_ for token in document]

    def transform(self, X):
        """Lemmatize every token list of a pandas Series, then restore names."""
        X = X.apply(self.normalize_tokens)
        X = X.apply(self.undo_mapping)
        return X

In [13]:
# Lemmatize the tokens and restore person/party names using the maps stored in 'mappers.pkl'.
lemmatizer = CustomLemmatizer('mappers.pkl')
X = lemmatizer.transform(X)

In [14]:
X[0]

['dirigir',
 'abraçar',
 'todo',
 'neste',
 'regressar',
 'plenário',
 'casar',
 'democracia',
 'esperar',
 'nesta',
 'altura',
 'poder',
 'ter',
 'regrar',
 'flexível',
 'infelizmente',
 'número',
 'consequência',
 'concreto',
 'permitir',
 'tal',
 'portanto',
 'continuar',
 'essencial',
 'regrar',
 'último',
 'plenário',
 'sessão',
 'legislativo',
 'ordem',
 'dia',
 'constar',
 'declaração',
 'político',
 'porém',
 'antar',
 'disso',
 'Secretário',
 'Maria da Luz Rosinha',
 'fazer',
 'favor',
 'anunciar',
 'entrar',
 'algum',
 'iniciativo',
 'palavra',
 'Secretário']

# Embeddings:

In [15]:
class CustomEmbeddings():
    """Turn token lists into fixed-size vectors.

    Two modes are supported:

    * no ``model`` given: :meth:`fit` trains a Doc2Vec model and
      :meth:`transform` uses its ``infer_vector``;
    * ``model`` given: word vectors are loaded from a word2vec-format file
      and :meth:`transform` averages the vectors of the known tokens.

    Parameters
    ----------
    model : str, optional
        Path to a word2vec-format file read with
        ``KeyedVectors.load_word2vec_format``.
    vector_size : int, optional
        Dimensionality used when training Doc2Vec in :meth:`fit`.
    window_size : int, optional
        Context window used when training Doc2Vec in :meth:`fit`.
    """

    def __init__(self, model='', vector_size=20, window_size=2):
        # Always define the attribute so an unfitted instance fails with a
        # clear message instead of a bare attribute lookup error.
        self.model = None
        if model:
            self.model = KeyedVectors.load_word2vec_format(model)
        self.vector_size = vector_size
        self.window_size = window_size

    def fit(self, X):
        """Train a Doc2Vec model on the token lists in ``X``.

        Each document is tagged with its position in ``X``. Replaces any
        previously loaded model. Returns ``self``.
        """
        documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(X)]
        self.model = Doc2Vec(
            documents=documents,
            vector_size=self.vector_size,
            window=self.window_size,
        )
        return self

    def _require_model(self):
        """Raise ``AttributeError`` when no model is loaded or trained."""
        if self.model is None:
            raise AttributeError(
                "CustomEmbeddings has no model: pass `model` to the "
                "constructor or call fit() first."
            )

    def _average_vector(self, tokens):
        """Mean of the word vectors of the tokens the model knows.

        Returns a zero vector when none of the tokens is in the vocabulary.
        """
        vectors = [self.model[token] for token in tokens if token in self.model]
        if not vectors:
            return np.zeros(self.model.vector_size, dtype=np.float32)
        return np.mean(vectors, axis=0)

    def save_model(self, document_name):
        """Persist the current model through its own ``save`` method."""
        self._require_model()
        self.model.save(document_name)

    def transform(self, X):
        """Map every token list of a pandas Series to a vector.

        Raises
        ------
        AttributeError
            If no model was loaded and :meth:`fit` has not been called.
        """
        self._require_model()
        # Plain word vectors have no infer_vector; fall back to averaging.
        if hasattr(self.model, 'infer_vector'):
            return X.apply(self.model.infer_vector)
        return X.apply(self._average_vector)

In [16]:
# Train Doc2Vec on the lemmatized documents, replace each document by its
# inferred vector, then persist the trained model.
# NOTE(review): the model is written through its own save() method, so the
# '.txt' extension presumably does not mean a plain-text file - confirm.
embedding = CustomEmbeddings()
embedding.fit(X)
X = embedding.transform(X)
embedding.save_model('embedding_model.txt')

In [17]:
X[0]

array([ 0.07701786, -0.03248294, -0.19406416, -0.14431924, -0.10805328,
       -0.02573857,  0.07646266, -0.09554859, -0.05867622,  0.07883334,
       -0.19267483,  0.03703659, -0.11316131, -0.31990945, -0.05868788,
        0.09510308, -0.04018577,  0.14918438,  0.34055197, -0.20006648],
      dtype=float32)