In [1]:
# PIVIC - "Um modelo computacional para identificação de notícias falsas sobre a Covid-19 no Brasil"
# Code: Machine Learning - Supervised Learning
# Author: Anísio Pereira Batista Filho

In [2]:
##Essentials
import os
import csv
import numpy as np
import pandas as pd
import gensim
import time

from sklearn.decomposition import PCA

In [3]:
pd.set_option("display.max_columns", None)

## Abertura de arquivo e criação do dataframe

In [4]:
df = pd.read_csv('data/corpus_labeled/iguais/bases_tcc/03_geracao_carcteristicas_base.csv', sep=",", low_memory=False)

In [5]:
df['tweet_text_lemmatization'] = df['tweet_text_lemmatization'].map(eval)

In [6]:
df['tweet_text_stemming'] = df['tweet_text_stemming'].map(eval)

In [7]:
coluna_selecionada = df['tweet_text_lemmatization']

### Criação do modelo Word2Vec

In [8]:
start = time.time()
model = gensim.models.Word2Vec(coluna_selecionada,
                 vector_size=100
                 # Size is the length of our vector.
                )

end = round(time.time()-start,2)
print("Esse processo levou ",end,"segundos")

Esse processo levou  0.29 segundos


### Classe criadora de sequências

In [9]:
class Sequencer():
    
    def __init__(self,
                 all_words,
                 max_words,
                 seq_len,
                 embedding_matrix
                ):
        
        self.seq_len = seq_len
        self.embed_matrix = embedding_matrix
        """
        temp_vocab = Vocab which has all the unique words
        self.vocab = Our last vocab which has only most used N words.
    
        """
        temp_vocab = list(set(all_words))
        self.vocab = []
        self.word_cnts = {}
        """
        Now we'll create a hash map (dict) which includes words and their occurencies
        """
        for word in temp_vocab:
            # 0 does not have a meaning, you can add the word to the list
            # or something different.
            count = len([0 for w in all_words if w == word])
            self.word_cnts[word] = count
            counts = list(self.word_cnts.values())
            indexes = list(range(len(counts)))
        
        # Now we'll sort counts and while sorting them also will sort indexes.
        # We'll use those indexes to find most used N word.
        cnt = 0
        while cnt + 1 != len(counts):
            cnt = 0
            for i in range(len(counts)-1):
                if counts[i] < counts[i+1]:
                    counts[i+1],counts[i] = counts[i],counts[i+1]
                    indexes[i],indexes[i+1] = indexes[i+1],indexes[i]
                else:
                    cnt += 1
        
        for ind in indexes[:max_words]:
            self.vocab.append(temp_vocab[ind])
                    
    def textToVector(self,text):
        # First we need to split the text into its tokens and learn the length
        # If length is shorter than the max len we'll add some spaces (100D vectors which has only zero values)
        # If it's longer than the max len we'll trim from the end.
        tokens = text.split()
        len_v = len(tokens)-1 if len(tokens) < self.seq_len else self.seq_len-1
        vec = []
        for tok in tokens[:len_v]:
            try:
                vec.append(self.embed_matrix[tok])
            except Exception as E:
                pass
        
        last_pieces = self.seq_len - len(vec)
        for i in range(last_pieces):
            vec.append(np.zeros(100,))
        
        return np.asarray(vec).flatten()
                
                
            
        

In [10]:
start = time.time()
sequencer = Sequencer(all_words = [token for seq in coluna_selecionada for token in seq],
              max_words = 5000,
              seq_len = 500,
              embedding_matrix = model.wv
             )

end = round(time.time()-start,2)
print("Esse processo levou ",end,"segundos")

Esse processo levou  39.19 segundos


In [11]:
# But before creating a PCA model using scikit-learn let's create
# vectors for our each vector
x_vecs = np.asarray([sequencer.textToVector(" ".join(seq)) for seq in coluna_selecionada])
print(x_vecs.shape)

(3600, 50000)


In [12]:
pca_model = PCA(n_components=50)
pca_model.fit(x_vecs)
print("Sum of variance ratios: ",sum(pca_model.explained_variance_ratio_))

Sum of variance ratios:  0.998551387436596


In [13]:
x_comps = pca_model.transform(x_vecs)
x_comps.shape

(3600, 50)

### Salvando modelo Word2Vec

In [16]:
X = pd.DataFrame(x_comps)

In [17]:
X.to_csv("data/corpus_labeled/iguais/bases_tcc/05_word2vec_model_creation_base.csv", sep=",", index=False)