# 1. Bibliotecas y paquetes

In [1]:
import pandas as pd
import os
import json

import io
import pdfminer.pdfinterp
import pdfminer.converter
import pdfminer.layout
import pdfminer.pdfpage
import re

# 2. Funciones

In [2]:
def clean_document_m3(file):
    """
    Clean the raw text of a Soros Monitor 3 document.

    Args:
        file (str): raw text of the document.

    Returns:
        str: the text after all substitutions below were applied in order.
    """
    # (pattern, replacement) pairs. Order matters: line breaks must be gone
    # before the greedy '.*' header pattern runs, since '.' stops at '\n'.
    substitutions = (
        (r'\n', ' '),                       # line breaks -> spaces
        (r'\s+', ' '),                      # collapse runs of whitespace
        (r'http\S+', ' '),                  # drop URLs
        (r'.*This Month: ', ''),            # drop everything up to "This Month: "
        (r'\s*\*\s*\*\s*\*\s*PÁG\.', ''),   # drop the "* * * PÁG." marker
    )

    doc = file
    for pattern, replacement in substitutions:
        doc = re.sub(pattern, replacement, doc)
    return doc

def clean_document_m2(file):
    """
    Clean the raw text of a Soros Monitor 1 or 2 document.

    Args:
        file (str): raw text of the document.

    Returns:
        str: the text after all substitutions below were applied in order.
    """
    # (pattern, replacement) pairs, applied sequentially; order matters.
    substitutions = (
        (r'\n', ' '),                    # line breaks -> spaces
        (r'\s+', ' '),                   # collapse runs of whitespace
        (r' ●', ' '),                    # bullet markers
        (r' ➔', '.'),                    # arrow markers become a full stop
        (r'\u200bEl', '.'),              # zero-width space followed by "El"
        (r'Clip\u200b', ''),             # "Clip" followed by a zero-width space
        (r'http\S+', ' '),               # drop URLs
        (r'.*Contents', ''),             # drop everything up to "Contents"
        (r'^[^a-zA-Z]*', ''),            # drop leading non-letter characters
        (r'\s*\*\s*\*\s*\*\s*', ''),     # drop "* * *" separators
    )

    doc = file
    for pattern, replacement in substitutions:
        doc = re.sub(pattern, replacement, doc)
    return doc

def readPDF(pdfFile):
    """
    Extract the text of a PDF with pdfminer.

    Based on code from http://stackoverflow.com/a/20905381/4955164

    Args:
        pdfFile: binary file-like object containing the PDF (the caller in
            this notebook passes an io.BytesIO).

    Returns:
        str: everything pdfminer's TextConverter wrote for the processed pages.
    """
    rsrcmgr = pdfminer.pdfinterp.PDFResourceManager()
    layoutParams = pdfminer.layout.LAParams()
    password = ''
    # NOTE(review): maxpages=0 and an empty pagenos set presumably mean
    # "no limit" / "all pages" in pdfminer - confirm against PDFPage.get_pages.
    maxpages = 0
    caching = True
    pagenos = set()

    # The buffer and the converter are released even if a page fails to parse.
    with io.StringIO() as retstr:
        device = pdfminer.converter.TextConverter(rsrcmgr, retstr, laparams=layoutParams)
        try:
            # The interpreter renders each page into the converter device.
            interpreter = pdfminer.pdfinterp.PDFPageInterpreter(rsrcmgr, device)
            pages = pdfminer.pdfpage.PDFPage.get_pages(
                pdfFile,
                pagenos,
                maxpages=maxpages,
                password=password,
                caching=caching,
                check_extractable=True,
            )
            for page in pages:
                interpreter.process_page(page)
        finally:
            device.close()
        # Read the text before the buffer is closed by the with-block.
        return retstr.getvalue()

def import_raw_pdf(file_path):
    """
    Read a PDF from disk and return its text.

    Args:
        file_path: path of the PDF file, opened in binary mode.

    Returns:
        str: the text produced by readPDF for that file.
    """
    # Load the whole file into memory so the handle is closed before parsing.
    with open(file_path, 'rb') as pdf_handle:
        pdf_buffer = io.BytesIO(pdf_handle.read())
    return readPDF(pdf_buffer)
    
def further_cleaning(text):
    """
    Reduce a Monitor 3 report name to its issue number when possible.

    Removes the 'Soros Monitor 3.0 - No. ' prefix and any ' v<digit>' marker,
    then tries to convert what is left to an integer.

    Args:
        text (str): report name.

    Returns:
        int | str: the issue number if the remainder parses as an integer,
        otherwise the partially cleaned string.
    """
    # Dots are escaped so they match a literal '.' instead of any character.
    text = re.sub(r'Soros Monitor 3\.0 - No\. ', ' ', text)
    text = re.sub(r' v\d', ' ', text)
    try:
        # int() tolerates the spaces left behind by the substitutions.
        return int(text)
    except ValueError:
        # Not a plain number: keep the cleaned string. Only ValueError is
        # caught so unrelated errors are no longer silently swallowed.
        return text

# 3. Importar texto y limpieza inicial

In [3]:
# Accumulator for all documents: parallel lists of file names and cleaned texts.
doc_dict = dict(Monitoreo=[], Text=[])

## Monitor 1 y 2

In [4]:
# Initial variables

# Folder holding the Monitor 1 and 2 PDF reports.
# NOTE(review): absolute, machine-specific path - the notebook only runs as-is on this machine.
file_path = r"C:\Users\asarr\Documents\Projects\llm-linterna-verde\data\raw\viejo"


In [5]:
# Parse and clean every PDF report in the folder. Entries are visited in
# sorted order (os.listdir gives no ordering guarantee) and anything that is
# not a .pdf file (e.g. OS metadata files or sub-folders) is skipped instead
# of being handed to the PDF parser.
for file in sorted(os.listdir(file_path)):
    if not file.lower().endswith('.pdf'):
        continue
    doc = import_raw_pdf(os.path.join(file_path, file))
    doc = clean_document_m2(doc)
    doc_dict['Monitoreo'].append(file)
    doc_dict['Text'].append(doc)

## Monitor 3

In [6]:
# Folder holding the Monitor 3 documents. This rebinds file_path, which
# previously pointed at the Monitor 1 and 2 folder.
file_path = r"C:\Users\asarr\Documents\Projects\llm-linterna-verde\data\raw\v3"

In [7]:
# Import and process the Monitor 3 files.
# NOTE(review): the files are opened without an explicit encoding, so the
# platform default is used - confirm it matches how the files were saved.

for file in os.listdir(file_path):
    source_path = os.path.join(file_path, file)
    with open(source_path, 'r') as handle:
        clean_text = clean_document_m3(handle.read())
    doc_dict['Monitoreo'].append(file)
    doc_dict['Text'].append(clean_text)

## Unión

In [64]:
# Combine both monitor generations into one table: one row per document,
# with the file name in 'Monitoreo' and the cleaned text in 'Text'.
docs = pd.DataFrame.from_dict(doc_dict)
print(docs.shape)
docs.head()

(82, 2)


Unnamed: 0,Monitoreo,Text
0,Report No 10. 0826-0908 2019.pdf,August 26: Christian Groups Point Out at Mr. S...
1,Report No 11. 0909-0921 2019.pdf,September 10: Former Brazilian Legislator Jean...
2,Report No 12. 0922-1006 2019.pdf,September 23-26: Attacks to Greta Thunberg by ...
3,Report No 13. 1007-1020 2019.pdf,"October 11, October 20: Mr. Soros’ Alleged Rol..."
4,Report No 14. 1021-1104 2019.pdf,October 27-31: Colombian Hard-Right See a Mr. ...


In [65]:
# Strip a trailing .txt / .pdf extension from the report names.
# The regex is explicit and anchored: the default of `regex` in
# Series.str.replace differs between pandas versions, and when '.txt' is
# treated as a regex its unescaped '.' matches any character anywhere in
# the name.
docs['Monitoreo'] = docs['Monitoreo'].str.replace(r'\.(?:txt|pdf)$', '', regex=True)

In [66]:
# Sort the documents by report name.
docs = docs.sort_values(by='Monitoreo')
# String sorting leaves the rows at positions 8-11 out of place; move them to
# the front (per the displayed result these are reports No 5, 6, 8 and 9,
# which sort after the two-digit report numbers).
# NOTE(review): the positions 8, 12 and 64 are tied to the current set of
# files - re-check them whenever documents are added or removed.
docs = pd.concat([docs.iloc[8:12], docs.iloc[:8], docs.iloc[12:]])

# Rows from position 64 up to (not including) the last one are the Monitor 3
# documents; the last row is handled separately. .copy() gives an independent
# frame, so the column assignments below do not trigger
# SettingWithCopyWarning.
docs_v3 = docs.iloc[64:-1].copy()
special_cases = docs.iloc[-1]

# Sort Monitor 3 by issue number (numeric, not string), then restore the name.
docs_v3['Monitoreo'] = docs_v3['Monitoreo'].apply(further_cleaning)
docs_v3 = docs_v3.sort_values(by='Monitoreo')
docs_v3['Monitoreo'] = docs_v3['Monitoreo'].apply(lambda x: 'Soros Monitor 3.0 - No. ' + str(x))

# Reassemble: first 64 rows, ordered Monitor 3 rows, then the special case last.
docs = pd.concat([docs.iloc[:64], docs_v3])
docs.loc[-1] = special_cases

docs = docs.reset_index(drop=True)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  docs_v3['Monitoreo'] = docs_v3['Monitoreo'].apply(further_cleaning)


In [67]:
# Display the reordered table to check the final document order.
docs

Unnamed: 0,Monitoreo,Text
0,Report No 5. 0617-0630 2019,June 28 - June 30: Conservatives Blame OSF of ...
1,Report No 6. 0701-0714 2019,July 3: Big Profile Brazilian Conservatives Am...
2,Report No 8. 0729-0811 2019,August 6: Far-Rights Attacks to OSF From Europ...
3,Report No 9. 0812-0825 2019,August 23: An Old Conspiratorial Piece Denounc...
4,Report No 10. 0826-0908 2019,August 26: Christian Groups Point Out at Mr. S...
...,...,...
77,Soros Monitor 3.0 - No. 14,"in Brazil, NGO work is questioned over OSF’s f..."
78,Soros Monitor 3.0 - No. 15,Perú The Peruvian Prosecutor’s Office has orde...
79,Soros Monitor 3.0 - No. 16,echoes of US student protests in Latam Last Ap...
80,Soros Monitor 3.0 - No. 17,Soros-naming is used to disregard human rights...


# 4. Preparar datos para construir el dataset

In [77]:
# One dict per row of docs, ready to be written out line by line as JSON.
docs_records = docs.to_dict(orient='records')

In [78]:
path = r"C:\Users\asarr\Documents\Projects\llm-linterna-verde\data\results"

# Write one JSON object per line (JSON Lines). The file name is joined with
# os.path.join: the previous '\docs.jsonl' literal relied on the invalid
# escape sequence '\d', which newer Python versions warn about.
with open(os.path.join(path, 'docs.jsonl'), 'w') as f:
    for record in docs_records:
        f.write(json.dumps(record) + '\n')

# 5. Guardar datos

In [68]:
# Save the final table as CSV, without writing the index column.
# NOTE(review): absolute, machine-specific output path.
docs.to_csv(r"C:\Users\asarr\Documents\Projects\llm-linterna-verde\data\results\docs.csv", index=False)