In [1]:
read_file = "../data/"
write_file = "../data/"

with open("../data/index.cfg", "r") as config_file:
    for line in config_file.readlines():
        instruction, filename = line.split("=")
        filename = filename.strip()
        
        if instruction == "LEIA":
            read_file += filename
        elif instruction == "ESCREVA":
            write_file += filename

## Definições de termos

* N - número total de documentos na coleção
* nj - número de documentos que contém um termo j
* tf(i,j) - a frequência "bruta" de um termo j em um documento i
* t - quantidade de termos distintos na coleção

In [2]:
from tqdm.notebook import tqdm, trange
import time
import pandas as pd
import numpy as np

def get_term_document_matrix():
    inverted_list = pd.read_csv('../data/tokens.csv', sep=';', converters={"Appearance": pd.eval})
    matrix = pd.DataFrame(inverted_list["Token"])
    matrix.set_index(["Token"], inplace=True)
    shape = (matrix.shape[0], 1)
    for token, docs in inverted_list.itertuples(index=False):
        for doc in docs:
            if str(doc) in matrix.columns:
                matrix.at[token, str(doc)] += 1
            else:
                zeros = pd.DataFrame(np.zeros(shape), index=inverted_list["Token"], columns=[str(doc)])
                matrix = pd.concat([matrix, zeros], axis=1)
                matrix.at[token, str(doc)] = 1
    return matrix

In [3]:
def get_n(matrix):
    N = len(matrix.columns)
    return N

In [4]:
def get_nj(token, matrix):
    row = matrix.loc[token]
    return row.astype(bool).sum()  # Tudo que tiver algum valor diferente de zero será 1, 
                                   # e então somamos todas as colunas

In [5]:
def get_tf(token, document, matrix):
    return int(matrix.loc[token, str(document)])

In [6]:
def get_tfn(token, document, matrix):
    tf = get_tf(token, document, matrix)
    biggest_tf = int(matrix.loc[:, str(document)].max())
    return tf / biggest_tf

In [7]:
from math import log10

def get_idf(token, matrix):
    return log10(get_n(matrix) / get_nj(token, matrix))

In [8]:
def get_wij(token, document, matrix, normalized=True):
    if normalized:
        tf = get_tf(token, document, matrix)
    else:
        tf = get_tfn(token, document, matrix)
        
    wij = tf * get_idf(token, matrix)
    return wij

In [14]:
def get_model(matrix):
    weights = matrix.copy()
    for token in tqdm(weights.index):
        idf = get_idf(token, weights)
        for document in weights.columns:
            tf = get_tf(token, document, matrix)
            wij = tf * idf
            weights.loc[token, str(document)] = wij
    return weights

In [10]:
def save_matrix(path):
    matrix = get_term_document_matrix()
    matrix.to_csv(path, sep=";")

In [11]:
save_matrix("../data/matriz.csv")

In [15]:
def save_model(path):
    matrix = get_term_document_matrix()
    model = get_model(matrix)
    model.to_csv(path, sep=";")

In [16]:
save_model("../data/model.csv")

  0%|          | 0/6278 [00:00<?, ?it/s]