# **Tratamiento de Datos, Encoding y similitud usando BERT-FINBERT**

## **0 - IMPORT y Objetos Auxiliares:**

### Imports:

In [1]:
# BASICS:
import pandas as pd
import numpy as np
import scipy
import matplotlib.pyplot as plt
import re
import string
from string import digits
import time

# PROCESADO DE TEXTO
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet, stopwords

#!pip install cyhunspell
from hunspell import Hunspell # SOLO LINUX

# AUXILIARES
import os
import joblib
import tqdm
import contextlib
import pandarallel
pandarallel.pandarallel.initialize()

# MODELOS y ML:
import transformers
import finbert_embedding as femb
import torch

INFO: Pandarallel will run on 6 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.

https://nalepae.github.io/pandarallel/troubleshooting/


In [2]:
# ENTORNO:
t_ini = time.time()

# Número de nucleos disponibles por el ordenador:
cores = max(1, os.cpu_count() - 1)
print(cores)

11


### Downloads:

In [3]:
# Downloads de NLTK -  Solo en caso de no tenerlas ya
already_downloaded = True

if already_downloaded:
    None
else:
    nltk.download('averaged_perceptron_tagger')
    nltk.download('punkt')
    nltk.download('words'); 
    nltk.download('wordnet'); 
    nltk.download('omw-1.4'); 
    nltk.download('brown'); 
    nltk.download('movie_reviews'); 
    nltk.download('punkt'); 
    nltk.download('treebank'); 
    nltk.download('stopwords');
    

    

### F. Auxiliares y otros objetos:

Para poder ver la barra de evolución en el multiproceso

In [4]:
@contextlib.contextmanager
def tqdm_joblib(tqdm_object):
    """Context manager to patch joblib to report into tqdm progress bar given as argument"""
    class TqdmBatchCompletionCallback(joblib.parallel.BatchCompletionCallBack):
        def _call_(self, *args, **kwargs):
            tqdm_object.update(n=self.batch_size)
            return super()._call_(*args, **kwargs)

    old_batch_callback = joblib.parallel.BatchCompletionCallBack
    joblib.parallel.BatchCompletionCallBack = TqdmBatchCompletionCallback
    try:
        yield tqdm_object
    finally:
        joblib.parallel.BatchCompletionCallBack = old_batch_callback
        tqdm_object.close()

## **1 - Lectura y análisis inicial de los datos**

In [5]:
raw_data = pd.read_pickle("../../data/filings/dataframe_10k_20_21_22.pkl")

In [6]:
raw_data.head()

Unnamed: 0,ticker,link_to_html,item_1,filing_date
0,NPHC,https://www.sec.gov/Archives/edgar/data/111964...,Item 1. Business Introduction We were incorpo...,2021
1,CACC,https://www.sec.gov/Archives/edgar/data/885550...,"ITEM 1. BUSINESS General Since 1972, Credit A...",2020
2,HII,https://www.sec.gov/Archives/edgar/data/150158...,ITEM 1. BUSINESS History and Organization Hun...,2020
3,HR,https://www.sec.gov/Archives/edgar/data/899749...,Item 1. Business Healthcare Realty Trust Inco...,2020
4,MNKPF,https://www.sec.gov/Archives/edgar/data/156789...,Item 1. Business. Overview We are a global bu...,2020


**Existen fillings duplicados. Los eliminamos**

In [7]:
num_10k_ticker = pd.pivot_table(raw_data[["ticker", "filing_date"]], index="ticker", values="filing_date", aggfunc="count")
num_10k_ticker.sort_values("filing_date", ascending=False, inplace=True)
num_10k_ticker.where(num_10k_ticker.values > 3).dropna()

Unnamed: 0_level_0,filing_date
ticker,Unnamed: 1_level_1
SIPN,14.0
CNCN,11.0
PBAJ,9.0
SVMB,9.0
CGSI,8.0
...,...
IPTK,4.0
GNRS,4.0
CWGL,4.0
ADN,4.0


In [8]:
raw_data_unique = raw_data.drop_duplicates(subset=["ticker", "filing_date"])
raw_data_unique.shape

(15842, 4)

In [9]:
num_10k_ticker_unique = pd.pivot_table(raw_data_unique[["ticker", "filing_date"]], index="ticker", values="filing_date", aggfunc="count")
num_10k_ticker_unique.sort_values("filing_date", ascending=False, inplace=True)
num_10k_ticker_unique.where(num_10k_ticker.values > 3).dropna()

Unnamed: 0_level_0,filing_date
ticker,Unnamed: 1_level_1
KBSR,3.0
MSN,3.0
MTCH,3.0
MTBC,3.0
MTB,3.0
...,...
ONEM,3.0
ONDS,3.0
ONCT,3.0
ONB,3.0


### Separamos los datos por año

In [10]:
# separamos por año y eliminamos la columna fillin_date

raw_2020 = raw_data_unique[raw_data_unique["filing_date"] == "2020"].iloc[:, :-1]
raw_2021 = raw_data_unique[raw_data_unique["filing_date"] == "2021"].iloc[:, :-1]
raw_2022 = raw_data_unique[raw_data_unique["filing_date"] == "2022"].iloc[:, :-1]

In [11]:
# Los guardamos:

raw_2020.to_pickle("../../data/filings/processed/raw_2020.pkl")
raw_2021.to_pickle("../../data/filings/processed/raw_2021.pkl")
raw_2022.to_pickle("../../data/filings/processed/raw_2022.pkl")

In [12]:
print(f' El año 2020 tiene {raw_2020.shape[0]} ficheros 10-K')
print(f' El año 2021 tiene {raw_2021.shape[0]} ficheros 10-K')
print(f' El año 2022 tiene {raw_2022.shape[0]} ficheros 10-K')


 El año 2020 tiene 4703 ficheros 10-K
 El año 2021 tiene 5296 ficheros 10-K
 El año 2022 tiene 5843 ficheros 10-K


### Texto en Frases:

In [13]:
corpus = [2020, 2021, 2022]

In [14]:
# Pasar textos a frases
def frasear_textos(corpus):
    return nltk.tokenize.sent_tokenize(corpus)

In [15]:
sentences = []
for year in corpus:
    longitud = globals()[f"raw_{year}"].shape[0]
    textos = globals()[f"raw_{year}"].item_1
    
    # CÁLCULO CON MULTIPROCESO
    with tqdm_joblib(tqdm.tqdm(desc="Pasar a Frases", total=longitud)) as progress_bar:
        resultados = joblib.Parallel(n_jobs=cores)(joblib.delayed(frasear_textos)
                                                (corpus = i) for i in textos)
    

    globals()[f"raw_sentences_{year}"] = pd.DataFrame((globals()[f"raw_{year}"].ticker.values,
                                                       resultados)).transpose()
    globals()[f"raw_sentences_{year}"].columns = ['ticker','item_1']

Pasar a Frases:   0%|          | 0/4703 [00:07<?, ?it/s]
Pasar a Frases:   0%|          | 0/5296 [00:05<?, ?it/s]
Pasar a Frases:   0%|          | 0/5843 [00:06<?, ?it/s]


In [16]:
raw_sentences_2020 # Check

Unnamed: 0,ticker,sentences
0,CACC,"[ ITEM 1. BUSINESS General Since 1972, Credit ..."
1,HII,[ ITEM 1. BUSINESS History and Organization Hu...
2,HR,[ Item 1. Business Healthcare Realty Trust Inc...
3,MNKPF,"[ Item 1. Business., Overview We are a global ..."
4,TRCK,"[ Item 1. Business Track Group, Inc., (the Com..."
...,...,...
4698,ALTX,"[ Item 1. Business., Altex Industries, Inc. (o..."
4699,ACST,[ Item 1. Business Overview We are a biopharm...
4700,SNOA,[ ITEM 1. Business Corporate Information We ...
4701,ROL,"[ Item 1. Business General Rollins, Inc. (the ..."


In [17]:
# Guardamos
raw_sentences_2020.to_pickle("../../data/filings/processed/raw_sentences_2020.pkl")
raw_sentences_2021.to_pickle("../../data/filings/processed/raw_sentences_2021.pkl")
raw_sentences_2022.to_pickle("../../data/filings/processed/raw_sentences_2022.pkl")

### Limpieza inicial de los textos del año 2022

#### STOPWORDS, PUNTUACIÓN Y MAYÚSCULAS:

In [18]:
def limpieza_basica(corpus, is_string=False, is_list=False):
    '''
    Función que limpia un corpus (grupo de textos) y lo devuelve.
    - Quita puntuación
    - Quita dígitos
    - Transforma a minúsculas
    - Elimina espacios vacíos al principio y final
    - Elimina stopwords
    
    INPUT: Serie(columna) o lista de textos.
    OUTPUT: Idem
    '''
    
    import string
    from string import digits
    from nltk.corpus import wordnet, stopwords
    if is_list:
        corpus=pd.Series(corpus)
    
    if is_string: # Si se le pasa una cadena de texto solo
        # Copia para emplazar los datos limpios:
        corp_copy = corpus

        # Eliminación de puntuación:
        corp_copy = corp_copy.translate(str.maketrans('', '',string.punctuation))

        # Eliminación de cifras:
        corp_copy = corp_copy.translate(str.maketrans('', '', digits))

        # Minúsculas:
        corp_copy = corp_copy.lower()

        # Limpieza de huecos al principio y final:
        corp_copy = corp_copy.strip()

        # Delimitación de stopwords inglesas
        stop_words = set(stopwords.words('english'))

        # Eliminación de Stopwords:
        corp_copy = ' '.join([word for word in corp_copy.split() if word not in (stop_words)])
    
    else: # Si se le pasa una columna
        
        # Copia para emplazar los datos limpios:
        corp_copy = corpus.copy()

        # Eliminación de puntuación:
        corp_copy = corp_copy.str.translate(str.maketrans('', '',string.punctuation))

        # Eliminación de cifras:
        corp_copy = corp_copy.str.translate(str.maketrans('', '', digits))

        # Minúsculas:
        corp_copy = corp_copy.str.lower()

        # Limpieza de huecos al principio y final:
        corp_copy = corp_copy.str.strip()

        # Delimitación de stopwords inglesas
        stop_words = set(stopwords.words('english'))

        # Eliminación de Stopwords:
        corp_copy = corp_copy.apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))
    
    return corp_copy

In [19]:
# LIMPIEZA DE DATOS:
clean = raw_data.copy()
for year in corpus:
    globals()[f"clean_{year}"] = globals()[f"raw_{year}"].copy()
    globals()[f"clean_sentences_{year}"] = globals()[f"raw_sentences_{year}"].copy()
    
    globals()[f"clean_{year}"].item_1 = limpieza_basica(globals()[f"clean_{year}"].item_1)
    globals()[f"clean_sentences_{year}"].item_1 = globals()[f"clean_sentences_{year}"].item_1.apply(
        lambda x: list(limpieza_basica(x,False,True)))
    
    

In [20]:
print(clean_2022.shape)
clean_2022.head()

(5843, 3)


Unnamed: 0,ticker,link_to_html,item_1
13,FSP,https://www.sec.gov/Archives/edgar/data/103131...,item business history company franklin street ...
14,NEWH,https://www.sec.gov/Archives/edgar/data/137112...,item business overview developer clean energy ...
15,NSP,https://www.sec.gov/Archives/edgar/data/100075...,item business general provide array human reso...
16,MDV,https://www.sec.gov/Archives/edgar/data/164587...,item business company modiv internallymanaged ...
28,AKRO,https://www.sec.gov/Archives/edgar/data/174465...,item businessoverviewwe clinicalstage company ...


#### Limpieza de palabras usando un diccionario

In [21]:
h = Hunspell()

clean_2022["item_1"] = clean_2022["item_1"].apply(
    lambda x: ' '.join(
        [word for word in x.split() 
        if h.spell(word)]
        )
    )

Aplicamos a todos:

In [22]:
def check_word(textos):
    import hunspell
    h = hunspell.Hunspell()
    return ' '.join([word for word in textos.split()
             if h.spell(word)])

In [23]:
for year in corpus:
    
    globals()[f"clean_{year}"].item_1 = globals()[f"clean_{year}"].item_1.apply(
        lambda x: ' '.join(
            [word for word in x.split()
             if h.spell(word)]))
    

    globals()[f"clean_sentences_{year}"].item_1.apply(
        lambda x: list(pd.Series(x).apply(
            lambda y: ' '.join(
                [word for word in y.split()
                 if h.spell(word)]))))

#### LEMATIZACIÓN:

In [24]:
# Para que el multiproceso funcione le tienes que pasar todo lo que necesita.
# Esto incluye las funciones (si son más de una) y a veces los import.

def lemmatize_sentence(sentence):
    # IMPORTS:
    import nltk
    from nltk.tokenize import RegexpTokenizer
    from nltk.stem import WordNetLemmatizer
    from nltk.corpus import wordnet, stopwords
    
    # INICIALIZAR LEMATIZADOR:
    lemmatizer = WordNetLemmatizer()
    
    
    # CONVERSIÓN DE TAG A OBJETO WORDNET:
    def nltk_pos_tagger(nltk_tag):
        if nltk_tag.startswith('J'):
            return wordnet.ADJ
        elif nltk_tag.startswith('V'):
            return wordnet.VERB
        elif nltk_tag.startswith('N'):
            return wordnet.NOUN
        elif nltk_tag.startswith('R'):
            return wordnet.ADV
        else:          
            return None

    # PROCESO INICIAL:
    nltk_tagged = nltk.pos_tag(nltk.word_tokenize(sentence))  
    wordnet_tagged = map(lambda x: (x[0], nltk_pos_tagger(x[1])), nltk_tagged)
    lemmatized_sentence = []
    
    # BUCLE DE LIMPIEZA DE TEXTO
    for word, tag in wordnet_tagged:
        if tag is None:
            lemmatized_sentence.append(word)
        else:        
            lemmatized_sentence.append(lemmatizer.lemmatize(word, tag))
    return " ".join(lemmatized_sentence)

In [25]:
# EJEMPLO DE TEXTO LEMATIZADO
lemmatize_sentence(clean_2022["item_1"].iloc[0])

'item business history company franklin street property corp refer corp company corporation operate manner intend qualify real estate investment trust federal income tax purpose common stock trade symbol corp successor franklin street partner limit partnership partnership originally form general partnership successor general partnership form partnership convert corp refer conversion result conversion partnership cease exist succeed business partnership conversion unit general limited partnership interest partnership convert one share common stock result conversion hold directly indirectly interest three former subsidiary partnership investment property management holding operate business subsidiary business focus commercial real estate investment primarily office market currently operate one segment real estate operation principal revenue source real estate operation include rental income real estate lease interest income secure loan make office property property disposition fee income

In [26]:
for year in corpus:
    print(year)
    full = globals()[f"clean_{year}"].item_1
    sentences = globals()[f"clean_sentences_{year}"].item_1
    
    with tqdm_joblib(tqdm.tqdm(desc=f"Lematizando_full_{year}", total=full.shape[0])) as progress_bar:
        resultados_full = joblib.Parallel(n_jobs=cores)(joblib.delayed(lemmatize_sentence)
                                                        (sentence = i) for i in full)
    
    with tqdm_joblib(tqdm.tqdm(desc=f"Lematizando_frase_{year}", total=sentences.shape[0])) as progress_bar:
        resultados_sentences = joblib.Parallel(n_jobs=cores)(joblib.delayed(lambda x: list(pd.Series(x).apply(lemmatize_sentence)))
                                                        (x = i) for i in sentences)
    
    globals()[f"clean_{year}"].item_1 = resultados_full
    globals()[f"clean_sentences_{year}"].item_1 = resultados_sentences


2020


Lematizando_full_2020:   0%|          | 0/4703 [02:13<?, ?it/s]
Lematizando_frase_2020:   0%|          | 0/4703 [04:07<?, ?it/s]


2021


Lematizando_full_2021:   0%|          | 0/5296 [02:42<?, ?it/s]
Lematizando_frase_2021:   0%|          | 0/5296 [05:04<?, ?it/s]


2022


Lematizando_full_2022:   0%|          | 0/5843 [03:15<?, ?it/s]
Lematizando_frase_2022:   0%|          | 0/5843 [05:58<?, ?it/s]


In [27]:
# Guardamos
clean_2020.loc[:, clean_2021.columns != 'link_to_html'].to_pickle("../../data/filings/processed/clean_2020.pkl")
clean_2021.loc[:, clean_2021.columns != 'link_to_html'].to_pickle("../../data/filings/processed/clean_2021.pkl")
clean_2022.loc[:, clean_2021.columns != 'link_to_html'].to_pickle("../../data/filings/processed/clean_2022.pkl")

clean_sentences_2020.to_pickle("../../data/filings/processed/clean_sentences_2020.pkl")
clean_sentences_2021.to_pickle("../../data/filings/processed/clean_sentences_2021.pkl")
clean_sentences_2022.to_pickle("../../data/filings/processed/clean_sentences_2022.pkl")

In [28]:
t_tot = time.time() - t_ini

print(f'Se han tardado {round(t_tot/60,2)} minutos.')

Se han tardado 27.22 minutos.
