# 1. Bibliotecas, paquetes y funciones

In [2]:
import pandas as pd
import re
from transformers import pipeline, TFAutoModelForTokenClassification, AutoTokenizer




In [3]:
def extraer_entidades(texto, ner_model):
    # Run the NER
    ner_output = ner_model(texto)
    
    entities = []
    i = 0
    while i < len(ner_output):
        ent = ner_output[i]
        if ent['entity'] == 'B-PER' or ent['entity'] == 'I-PER':
            # Start building the word
            word = ent['word']
            i += 1
            # Continue building the word if it starts with ##
            while i < len(ner_output) and ner_output[i]['word'].startswith('##'):
                word += re.sub('##', '', ner_output[i]['word'])
                i += 1
            entities.append(word)
        else:
            i += 1
    
    # Get unique entities
    unique_entities = list(set(entities))
    
    return unique_entities

def extraer_paises(texto, country_lst):
    paises = []
    for ent in ner_model(texto):
        if ent['entity'] == 'B-LOC' or ent['entity'] == 'I-LOC':
            if ent['word'] in country_lst:
                paises.append(ent['word'])
    return list(set(paises))

# 2. Ingeniería de datos

In [31]:
#read json.dumps file as df
path = r"C:\Users\asarr\Documents\Projects\llm-linterna-verde\data\results\docs.csv"
monitoreos = pd.read_csv(path)
monitoreos.head()

Unnamed: 0,Monitoreo,Text
0,Report No 5. 0617-0630 2019,June 28 - June 30: Conservatives Blame OSF of ...
1,Report No 6. 0701-0714 2019,July 3: Big Profile Brazilian Conservatives Am...
2,Report No 8. 0729-0811 2019,August 6: Far-Rights Attacks to OSF From Europ...
3,Report No 9. 0812-0825 2019,August 23: An Old Conspiratorial Piece Denounc...
4,Report No 10. 0826-0908 2019,August 26: Christian Groups Point Out at Mr. S...


In [33]:
monitoreos.shape

(82, 2)

## Fechas

In [34]:
month_lst = ['june', 'july', 'august', 'september', 'october', 'november', 'december', 'january', 'february', 'march', 'april', 'may']

tracker = len(month_lst)

while tracker < 64:
    month_lst += month_lst
    tracker = len(month_lst)

month_lst = month_lst[:64]
len(month_lst)

64

In [35]:
month_lst_2 = ['september', 'october', 'november', 'december', 'january', 'february', 'march', 'april', 'may', 'june', 'july', 'august']

tracker = len(month_lst)
while tracker < 82:
    month_lst += month_lst_2
    tracker = len(month_lst)

month_lst = month_lst[:82]

In [36]:
monitoreos['month'] = month_lst
monitoreos

Unnamed: 0,Monitoreo,Text,month
0,Report No 5. 0617-0630 2019,June 28 - June 30: Conservatives Blame OSF of ...,june
1,Report No 6. 0701-0714 2019,July 3: Big Profile Brazilian Conservatives Am...,july
2,Report No 8. 0729-0811 2019,August 6: Far-Rights Attacks to OSF From Europ...,august
3,Report No 9. 0812-0825 2019,August 23: An Old Conspiratorial Piece Denounc...,september
4,Report No 10. 0826-0908 2019,August 26: Christian Groups Point Out at Mr. S...,october
...,...,...,...
77,Soros Monitor 3.0 - No. 14,"in Brazil, NGO work is questioned over OSF’s f...",october
78,Soros Monitor 3.0 - No. 15,Perú The Peruvian Prosecutor’s Office has orde...,november
79,Soros Monitor 3.0 - No. 16,echoes of US student protests in Latam Last Ap...,december
80,Soros Monitor 3.0 - No. 17,Soros-naming is used to disregard human rights...,january


In [53]:
year = [2019] * 12 + [2020] * 27 + [2021] * 12 + [2022] * 13 + [2023] * 5 + [2024] * 12 + [2023]

monitoreos['year'] = year
monitoreos

Unnamed: 0,Monitoreo,Text,month,year
0,Report No 5. 0617-0630 2019,June 28 - June 30: Conservatives Blame OSF of ...,june,2019
1,Report No 6. 0701-0714 2019,July 3: Big Profile Brazilian Conservatives Am...,july,2019
2,Report No 8. 0729-0811 2019,August 6: Far-Rights Attacks to OSF From Europ...,august,2019
3,Report No 9. 0812-0825 2019,August 23: An Old Conspiratorial Piece Denounc...,september,2019
4,Report No 10. 0826-0908 2019,August 26: Christian Groups Point Out at Mr. S...,october,2019
...,...,...,...,...
77,Soros Monitor 3.0 - No. 14,"in Brazil, NGO work is questioned over OSF’s f...",october,2024
78,Soros Monitor 3.0 - No. 15,Perú The Peruvian Prosecutor’s Office has orde...,november,2024
79,Soros Monitor 3.0 - No. 16,echoes of US student protests in Latam Last Ap...,december,2024
80,Soros Monitor 3.0 - No. 17,Soros-naming is used to disregard human rights...,january,2024


## Reconocimiento de entidades

### Personas

In [54]:
tokenizer = AutoTokenizer.from_pretrained("dslim/bert-base-NER")
model = TFAutoModelForTokenClassification.from_pretrained("dslim/bert-base-NER")

ner_model = pipeline('ner', model=model, tokenizer=tokenizer)

monitoreos['entities'] = monitoreos['Text'].apply(lambda x: extraer_entidades(x, ner_model))




All PyTorch model weights were used when initializing TFBertForTokenClassification.

All the weights of TFBertForTokenClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForTokenClassification for predictions without further training.


### Paises

In [None]:
countries = ['Cuba', 'Venezuela', 'Montenegro', 'France', 'Perú', 'Peru', 'India', 'United States', 'US', 'UK', 'Colombia',
             'Mexico', 'Brazil', 'Argentina', 'Chile', 'Ecuador', 'Bolivia', 'Paraguay', 'Uruguay', 'Guyana', 'Suriname',
             'Salvador', 'Honduras', 'Nicaragua', 'Costa Rica', 'Panama', 'Jamaica', 'Haiti', 'Dominican Republic', 'Puerto Rico', 'USA',
             'Russia', 'China', 'Spain' 'Korea', 'Palestine', 'Italy', 'Guatemala', 'Israel', 'Brasil', 'Georgia', 'Ukraine']

monitoreos['Countries'] = monitoreos['Text'].apply(lambda x: extraer_paises(x, countries))
monitoreos.head()

Unnamed: 0,Monitoreo,Text,entities,Countries
0,Report No 10. 0826-0908 2019.pdf,August 26: Christian Groups Point Out at Mr. S...,"[Carvalho, David, Sorosive, Francis, Soros, Ju...","[Honduras, Peru, Colombia, Mexico]"
1,Report No 11. 0909-0921 2019.pdf,September 10: Former Brazilian Legislator Jean...,"[Jean, ##querdizada, Greta, Soros, Wyllys, Tru...","[China, Brazil]"
2,Report No 12. 0922-1006 2019.pdf,September 23-26: Attacks to Greta Thunberg by ...,"[”, Jair, Greta, Spike, Marie, Soros, Ernesto,...","[Brazil, Mexico]"
3,Report No 13. 1007-1020 2019.pdf,"October 11, October 20: Mr. Soros’ Alleged Rol...",[Soros],"[Cuba, Ecuador, Colombia, Chile, Venezuela]"
4,Report No 14. 1021-1104 2019.pdf,October 27-31: Colombian Hard-Right See a Mr. ...,"[Iván, Duque, Uribe, Ávaro, Soros, Daniel, U, ...","[Argentina, Colombia, Chile]"


## Reconocimiento de temas con topic modelling