In [1]:
import spacy
import numpy as np
from docx import Document


########################################### 1

nlp = spacy.load("es_core_news_sm")

def getText(filename):
    doc = Document(filename)
    fullText = []
    for para in doc.paragraphs:
        fullText.append(para.text)
    return '\n'.join(fullText)

def fixText(txt):
    txt = txt.lower()
    txt = txt.replace(',', '').replace('.', '').replace(')', '').replace('(', '')
    txt = txt.replace(';', '').replace(':', '').replace('?', '').replace('¿', '')
    txt = txt.replace('!', '').replace('%', '').replace('¡', '').replace('-', '')
    txt = txt.replace('»', '').replace('"', '').replace('\n', '')
    for i in range(0,9):
        txt = txt.replace(str(i), '').replace('  ',' ')
        
    doc = nlp(txt)            #eliminate stopwords
    
    filtered_words = [token.text for token in doc if not token.is_stop]
    #txt = " ".join(filtered_words)
    
    txt = filtered_words
    #txt.remove(" ")
    return txt
        
doc_list = ['doc01.docx','doc02.docx','doc03.docx','doc04.docx',
            'doc05.docx','doc06.docx','doc07.docx','doc08.docx',
            'doc09.docx','doc10.docx','doc11.docx','doc12.docx',]

docs = []
for d in range(len(doc_list)):
    docs.append(fixText(getText(doc_list[d])))
    

print(docs[0])


['smartphone', 'alta', 'gama', 'promete', 'revolucionar', 'mercadohoy', 'anunció', 'lanzamiento', 'smartphone', 'reconocida', 'marca', 'tecnophone', 'dispositivo', 'denominado', 'tecnophone', 'xz', 'promete', 'características', 'sobresalientes', 'seguramente', 'causarán', 'revuelo', 'industria', 'móvil', 'incorpora', 'pantalla', 'oled', 'pulgadas', 'procesador', 'generación', 'memoria', 'ram', 'gb', 'amantes', 'fotografía', 'tecnophone', 'xz', 'posee', 'cámara', 'triple', 'resolución', 'megapíxeles', 'expertos', 'calificado', 'móvil', 'año', 'esperan', 'impacto', 'mercado', 'meses']


In [2]:
############################ 2
def get_vocab(docs):
    vocab = set()
    for doc in docs:
        for token in doc:
            vocab.add(token)
    vocab = list(vocab)
    return vocab

vocab = get_vocab(docs)
print('vocabulary size: ' + str(len(vocab)))

def BagofWord(docs, vocab):
    freqword = np.zeros((len(docs),len(vocab)))
    print('matrix size: ' + str(freqword.shape))
    for doc in docs:
        for token in doc:
            i = docs.index(doc)
            j = vocab.index(token)
            freqword[i][j] += 1
        
    return freqword

freqword = BagofWord(docs, vocab)
print(freqword)

#print(len(docs[0]))
#print(np.sum(freqword[0][:]))

vocabulary size: 379
matrix size: (12, 379)
[[0. 0. 0. ... 0. 3. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 [0. 0. 1. ... 1. 0. 1.]]


In [3]:
##############################3

def similitud(freqword):
    cossim = np.zeros((len(docs),len(docs)))
    for i in range(cossim.shape[0]):
        for j in range(cossim.shape[0]):
            cossim[i][j] = -1
    for i in range(freqword.shape[0]):
        for j in range(i+1,freqword.shape[0]):     ##############
            dotpr = np.dot(freqword[i][:],freqword[j][:])
            norm = np.linalg.norm(freqword[i][:]) * np.linalg.norm(freqword[j][:])
            cossim[i][j] = dotpr/norm
    return cossim        #triangular matrix
        
cossim = similitud(freqword)
print(cossim)

[[-1.          0.05357143  0.          0.05718696  0.01976424  0.12134067
   0.0372678   0.0175035   0.          0.0372678   0.15        0.08908708]
 [-1.         -1.          0.10798985  0.          0.          0.07924289
   0.12777531  0.040008    0.          0.10647943  0.21428571  0.10181381]
 [-1.         -1.         -1.          0.          0.          0.03494283
   0.11268723  0.01764187  0.          0.09390603  0.          0.05611959]
 [-1.         -1.         -1.         -1.          0.07233642  0.04229549
   0.          0.1494785   0.21354071  0.          0.13724871  0.27171305]
 [-1.         -1.         -1.         -1.         -1.          0.
   0.          0.19926335  0.17712298  0.02357023  0.09486833  0.02817181]
 [-1.         -1.         -1.         -1.         -1.         -1.
   0.10336228  0.03883678  0.          0.04134491  0.12480754  0.08647909]
 [-1.         -1.         -1.         -1.         -1.         -1.
  -1.          0.02087414  0.          0.04444444  0.149

In [4]:
#################################### 4
flat_matrix = cossim.flatten()

# Get the indices of the largest 5 elements using argsort
largest_indices = flat_matrix.argsort()[-5:][::-1]

# Get the values of the largest 5 elements
largest_values = flat_matrix[largest_indices]

# Calculate the row and column indices from the flattened indices
num_rows, num_cols = cossim.shape
row_indices = largest_indices // num_cols
col_indices = largest_indices % num_cols
largest_elements = [(value, row, col) for value, row, col in zip(largest_values, row_indices, col_indices)]

# Print the largest 5 elements and their indices
print("5 most similar pairs:")
for i, (value, row, col) in enumerate(largest_elements):
    print(f"value = {value}, position = ({row},{col})")
    
print('\n')
print("MOST SIMILAR TEXTS")
print('\n')
print(getText(doc_list[largest_elements[0][1]]))
print(getText(doc_list[largest_elements[0][2]]))

############################
    
# Get the indices of the smallest 5 elements 
positive_matrix = flat_matrix[flat_matrix >= 0]


smallest_indices = positive_matrix.argsort()[:5]
smallest_values = positive_matrix[smallest_indices]
num_rows, num_cols = cossim.shape
row_indices = smallest_indices // num_cols
col_indices = smallest_indices % num_cols
smallest_elements = [(value, row, col) for value, row, col in zip(smallest_values, row_indices, col_indices)]

# Print the largest 5 elements and their indices
print("5 least similar pairs:")
for i, (value, row, col) in enumerate(smallest_elements):
    print(f"value = {value}, position = ({row},{col})")
    
print('\n')
print("LEAST SIMILAR TEXTS")
print('\n')

print(getText(doc_list[smallest_elements[0][1]]))
print(getText(doc_list[smallest_elements[0][2]]))
    


5 most similar pairs:
value = 0.3118671619021332, position = (8,11)
value = 0.31180478223116176, position = (10,11)
value = 0.2717130486464075, position = (3,11)
value = 0.21428571428571427, position = (1,10)
value = 0.21354071087949103, position = (3,8)


MOST SIMILAR TEXTS


Los supercargadores: carga rápida para coches eléctricos

Uno de los mayores desafíos de los vehículos eléctricos ha sido el tiempo de carga. Sin embargo, la empresa PowerCharge ha anunciado una nueva red de supercargadores capaces de cargar un coche eléctrico en solo 20 minutos. Estos supercargadores serán instalados en estaciones de servicio y centros comerciales, permitiendo a los conductores recargar sus vehículos en el tiempo que les lleva tomar un café. Es un paso más hacia la adopción masiva de la movilidad eléctrica.

App móvil revoluciona la carga de vehículos eléctricos en la ciudad

La carga de coches eléctricos en áreas urbanas puede ser un desafío debido a la disponibilidad y distribución de puntos d