In [27]:
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
import nltk
import keras
nltk.download('stopwords')
import re
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import euclidean_distances

# Sample corpus
documents = ['Machine learning is the study of computer algorithms that improve automatically through experience.\
Machine learning algorithms build a mathematical model based on sample data, known as training data.\
The discipline of machine learning employs various approaches to teach computers to accomplish tasks \
where no fully satisfactory algorithm is available.',
'Machine learning is closely related to computational statistics, which focuses on making predictions using computers.\
The study of mathematical optimization delivers methods, theory and application domains to the field of machine learning.',
'Machine learning involves computers discovering how they can perform tasks without being explicitly programmed to do so. \
It involves computers learning from data provided so that they carry out certain tasks.',
'Machine learning approaches are traditionally divided into three broad categories, depending on the nature of the "signal"\
or "feedback" available to the learning system: Supervised, Unsupervised and Reinforcement',
'Software engineering is the systematic application of engineering approaches to the development of software.\
Software engineering is a computing discipline.',
'A software engineer creates programs based on logic for the computer to execute. A software engineer has to be more concerned\
about the correctness of the program in all the cases. Meanwhile, a data scientist is comfortable with uncertainty and variability.\
Developing a machine learning application is more iterative and explorative process than software engineering.'
]

documents_df=pd.DataFrame(documents,columns=['documents'])

# removing special characters and stop words from the text
stop_words_l=stopwords.words('english')
documents_df['documents_cleaned']=documents_df.documents.apply(lambda x: " ".join(re.sub(r'[^a-zA-Z]',' ',w).lower() for w in x.split() if re.sub(r'[^a-zA-Z]',' ',w).lower() not in stop_words_l) )

tfidfvectoriser=TfidfVectorizer()
tfidfvectoriser.fit(documents_df.documents_cleaned)
tfidf_vectors=tfidfvectoriser.transform(documents_df.documents_cleaned)

pairwise_similarities=np.dot(tfidf_vectors,tfidf_vectors.T).toarray()
pairwise_differences=euclidean_distances(tfidf_vectors)

def most_similar(doc_id,similarity_matrix,matrix):
    print (f'Document: {documents_df.iloc[doc_id]["documents"]}')
    print ('\n')
    print ('Similar Documents:')
    if matrix=='Cosine Similarity':
        similar_ix=np.argsort(similarity_matrix[doc_id])[::-1]
    elif matrix=='Euclidean Distance':
        similar_ix=np.argsort(similarity_matrix[doc_id])
    for ix in similar_ix:
        if ix==doc_id:
            continue
        print('\n')
        print (f'Document: {documents_df.iloc[ix]["documents"]}')
        print (f'{matrix} : {similarity_matrix[doc_id][ix]}')

most_similar(0,pairwise_similarities,'Cosine Similarity')
most_similar(0,pairwise_differences,'Euclidean Distance')   

print (tfidf_vectors[0].toarray())
print (pairwise_similarities.shape)
print (pairwise_similarities[0][:])



Document: Machine learning is the study of computer algorithms that improve automatically through experience.Machine learning algorithms build a mathematical model based on sample data, known as training data.The discipline of machine learning employs various approaches to teach computers to accomplish tasks where no fully satisfactory algorithm is available.


Similar Documents:


Document: Machine learning is closely related to computational statistics, which focuses on making predictions using computers.The study of mathematical optimization delivers methods, theory and application domains to the field of machine learning.
Cosine Similarity : 0.22860560787391593


Document: Machine learning involves computers discovering how they can perform tasks without being explicitly programmed to do so. It involves computers learning from data provided so that they carry out certain tasks.
Cosine Similarity : 0.22581304743529423


Document: Machine learning approaches are traditionally divided

[nltk_data] Downloading package stopwords to /Users/cero/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [28]:

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
tokenizer=Tokenizer()
tokenizer.fit_on_texts(documents_df.documents_cleaned)
tokenized_documents=tokenizer.texts_to_sequences(documents_df.documents_cleaned)
tokenized_paded_documents=pad_sequences(tokenized_documents,maxlen=64,padding='post')
vocab_size=len(tokenizer.word_index)+1
print (tokenized_paded_documents[0])

[ 2  1 10 11 12 20 21 22  2  1 12 23 13 24 14 25  4 26 27  4 15 16  2  1
 28 29  7 30  5 31  8 32 33 34 17  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]


In [3]:
import os
import gzip

# Nombre del archivo comprimido que quieres cargar
nombre_archivo_comprimido = "GoogleNews-vectors-negative300.bin.gz"

# Obtener la ruta del directorio actual de trabajo
directorio_actual = os.getcwd()

# Ruta completa al archivo comprimido
ruta_completa_comprimida = os.path.join(directorio_actual, nombre_archivo_comprimido)

# Ruta al archivo descomprimido (sin la extensión .gz)
ruta_descomprimida = ruta_completa_comprimida[:-3]

# Verificar si el archivo comprimido existe
if os.path.exists(ruta_completa_comprimida):
    # Descomprimir el archivo .bin.gz si no existe el archivo descomprimido
    if not os.path.exists(ruta_descomprimida):
        print("Descomprimiendo el archivo...")
        try:
            with gzip.open(ruta_completa_comprimida, 'rb') as archivo_comprimido:
                contenido_comprimido = archivo_comprimido.read()
            with open(ruta_descomprimida, 'wb') as archivo_descomprimido:
                archivo_descomprimido.write(contenido_comprimido)
            print("Descompresión completa.")
        except gzip.BadGzipFile:
            print("El archivo descargado no está en formato gzip. Verifica la integridad del archivo.")
else:
    print("El archivo comprimido no se encontró en el directorio actual de trabajo.")


In [4]:

from gensim.models import KeyedVectors
t_model = KeyedVectors.load_word2vec_format(ruta_descomprimida, binary=True)
print("Modelo Word2Vec cargado exitosamente.")

Modelo Word2Vec cargado exitosamente.


In [19]:
# creating embedding matrix, every row is a vector representation from the vocabulary indexed by the tokenizer index. 
embedding_matrix=np.zeros((vocab_size,300))
for word,i in tokenizer.word_index.items():
    if word in t_model:
        embedding_matrix[i]=t_model[word]
# creating document-word embeddings
document_word_embeddings=np.zeros((len(tokenized_paded_documents),64,300))
for i in range(len(tokenized_paded_documents)):
    for j in range(len(tokenized_paded_documents[0])):
        document_word_embeddings[i][j]=embedding_matrix[tokenized_paded_documents[i][j]]
document_word_embeddings.shape

(6, 64, 300)

In [24]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances

# Crear y ajustar el vectorizador TF-IDF
tfidfvectorizer = TfidfVectorizer()
tokenized_paded_documents_list = [' '.join(map(str, tokens)) for tokens in tokenized_paded_documents]
tfidf_vectors = tfidfvectorizer.fit_transform(tokenized_paded_documents_list)

# Calculando embeddings de documentos
document_embeddings = np.zeros((len(tokenized_paded_documents), 300))
words = tfidfvectorizer.get_feature_names_out()

for i in range(len(tokenized_paded_documents)):
    for j in range(len(words)):
        if j in tokenizer.index_word:
            word = tokenizer.index_word[j]  # Assuming index_word exists
            document_embeddings[i] += embedding_matrix[tokenizer.word_index[word]] * tfidf_vectors[i, j]

print(document_embeddings.shape)

# Calculando similitudes y distancias entre pares
pairwise_similarities = cosine_similarity(document_embeddings)
pairwise_differences = euclidean_distances(document_embeddings)

# Encontrando documentos más similares
most_similar(0, pairwise_similarities, 'Cosine Similarity')
most_similar(0, pairwise_differences, 'Euclidean Distance')


(6, 300)
Document: Machine learning is the study of computer algorithms that improve automatically through experience.Machine learning algorithms build a mathematical model based on sample data, known as training data.The discipline of machine learning employs various approaches to teach computers to accomplish tasks where no fully satisfactory algorithm is available.


Similar Documents:


Document: Machine learning is closely related to computational statistics, which focuses on making predictions using computers.The study of mathematical optimization delivers methods, theory and application domains to the field of machine learning.
Cosine Similarity : 0.8006697835271764


Document: A software engineer creates programs based on logic for the computer to execute. A software engineer has to be more concernedabout the correctness of the program in all the cases. Meanwhile, a data scientist is comfortable with uncertainty and variability.Developing a machine learning application is more 

In [31]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Cargar los documentos y tokenizarlos
tokenizer = Tokenizer()
tokenizer.fit_on_texts(documents_df.documents_cleaned)
tokenized_documents = tokenizer.texts_to_sequences(documents_df.documents_cleaned)
tokenized_paded_documents = pad_sequences(tokenized_documents, maxlen=64, padding='post')

# Obtener el tamaño del vocabulario
vocab_size = len(tokenizer.word_index) + 1

# Cargar los vectores de palabras pre-entrenados desde el archivo GloVe
embeddings_index = {}
with open('glove.6B.100d.txt') as file:
    for line in file:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

# Crear la matriz de incrustaciones
embedding_matrix = np.zeros((vocab_size, 100))
for word, i in tokenizer.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

# Calcular embeddings de documentos ponderados por TF-IDF
document_embeddings = np.zeros((len(tokenized_paded_documents), 100))
words = tfidfvectoriser.get_feature_names_out()

for i in range(len(tokenized_paded_documents)):
    for j in range(len(words)):
        if words[j] in tokenizer.word_index:
            word_index = tokenizer.word_index[words[j]]
            embedding_vector = embedding_matrix[word_index]
            tfidf_value = tfidf_vectors[i, j]
            document_embeddings[i] += embedding_vector * tfidf_value

print(document_embeddings.shape)

# Calcular similitudes y distancias entre pares
pairwise_similarities = cosine_similarity(document_embeddings)
pairwise_differences = euclidean_distances(document_embeddings)

# Encontrar los documentos más similares
most_similar(0, pairwise_similarities, 'Cosine Similarity')
most_similar(0, pairwise_differences, 'Euclidean Distance')


(6, 100)
Document: Machine learning is the study of computer algorithms that improve automatically through experience.Machine learning algorithms build a mathematical model based on sample data, known as training data.The discipline of machine learning employs various approaches to teach computers to accomplish tasks where no fully satisfactory algorithm is available.


Similar Documents:


Document: Machine learning is closely related to computational statistics, which focuses on making predictions using computers.The study of mathematical optimization delivers methods, theory and application domains to the field of machine learning.
Cosine Similarity : 0.945627697475007


Document: Machine learning involves computers discovering how they can perform tasks without being explicitly programmed to do so. It involves computers learning from data provided so that they carry out certain tasks.
Cosine Similarity : 0.9311285719195275


Document: A software engineer creates programs based on l

In [32]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')

tagged_data = [TaggedDocument(words=word_tokenize(doc), tags=[i]) for i, doc in enumerate(documents_df.documents_cleaned)]
model_d2v = Doc2Vec(vector_size=100,alpha=0.025, min_count=1)
  
model_d2v.build_vocab(tagged_data)

for epoch in range(100):
    model_d2v.train(tagged_data,
                total_examples=model_d2v.corpus_count,
                epochs=model_d2v.epochs)
    
document_embeddings=np.zeros((documents_df.shape[0],100))

for i in range(len(document_embeddings)):
    document_embeddings[i]=model_d2v.docvecs[i]
    
    
pairwise_similarities=cosine_similarity(document_embeddings)
pairwise_differences=euclidean_distances(document_embeddings)

most_similar(0,pairwise_similarities,'Cosine Similarity')
most_similar(0,pairwise_differences,'Euclidean Distance')

[nltk_data] Downloading package punkt to /Users/cero/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Document: Machine learning is the study of computer algorithms that improve automatically through experience.Machine learning algorithms build a mathematical model based on sample data, known as training data.The discipline of machine learning employs various approaches to teach computers to accomplish tasks where no fully satisfactory algorithm is available.


Similar Documents:


Document: Software engineering is the systematic application of engineering approaches to the development of software.Software engineering is a computing discipline.
Cosine Similarity : 0.27748628960897265


Document: Machine learning involves computers discovering how they can perform tasks without being explicitly programmed to do so. It involves computers learning from data provided so that they carry out certain tasks.
Cosine Similarity : 0.2604616989808754


Document: Machine learning approaches are traditionally divided into three broad categories, depending on the nature of the "signal"or "feedback" a

  document_embeddings[i]=model_d2v.docvecs[i]


In [34]:
from sentence_transformers import SentenceTransformer
sbert_model = SentenceTransformer('bert-base-nli-mean-tokens')

document_embeddings = sbert_model.encode(documents_df['documents_cleaned'])

pairwise_similarities=cosine_similarity(document_embeddings)
pairwise_differences=euclidean_distances(document_embeddings)

most_similar(0,pairwise_similarities,'Cosine Similarity')
most_similar(0,pairwise_differences,'Euclidean Distance')

  from .autonotebook import tqdm as notebook_tqdm


Document: Machine learning is the study of computer algorithms that improve automatically through experience.Machine learning algorithms build a mathematical model based on sample data, known as training data.The discipline of machine learning employs various approaches to teach computers to accomplish tasks where no fully satisfactory algorithm is available.


Similar Documents:


Document: Machine learning is closely related to computational statistics, which focuses on making predictions using computers.The study of mathematical optimization delivers methods, theory and application domains to the field of machine learning.
Cosine Similarity : 0.8365410566329956


Document: A software engineer creates programs based on logic for the computer to execute. A software engineer has to be more concernedabout the correctness of the program in all the cases. Meanwhile, a data scientist is comfortable with uncertainty and variability.Developing a machine learning application is more iterative

In [None]:
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')
import re
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import euclidean_distances
import os

# Directorio que contiene los archivos de texto
directorio = 'ruta/a/tu/directorio'  # Cambiar a la ubicación adecuada

# Lista para almacenar los documentos
documentos = []

# Iterar sobre los archivos en el directorio
for archivo in os.listdir(directorio):
    if archivo.endswith('.txt'):  # Asegurarse de que solo se toman en cuenta archivos de texto
        with open(os.path.join(directorio, archivo), 'r', encoding='utf-8') as f:
            documentos.append(f.read())

# DataFrame de pandas para almacenar los documentos
documentos_df = pd.DataFrame(documentos, columns=['documentos'])

# Remover caracteres especiales y palabras vacías del texto
stop_words_es = stopwords.words('spanish')
documentos_df['documentos_limpios'] = documentos_df.documentos.apply(lambda x: " ".join(re.sub(r'[^a-zA-ZáéíóúÁÉÍÓÚñÑ]', ' ', w).lower() for w in x.split() if re.sub(r'[^a-zA-ZáéíóúÁÉÍÓÚñÑ]', ' ', w).lower() not in stop_words_es))

# Inicializar y ajustar el vectorizador TF-IDF
vectorizador_tfidf = TfidfVectorizer()
vectorizador_tfidf.fit(documentos_df.documentos_limpios)
vectores_tfidf = vectorizador_tfidf.transform(documentos_df.documentos_limpios)

# Calcular la similitud del coseno y la distancia euclidiana entre los vectores TF-IDF
similitudes_coseno = np.dot(vectores_tfidf, vectores_tfidf.T).toarray()
distancias_euclidianas = euclidean_distances(vectores_tfidf)

# Función para encontrar los documentos más similares
def mas_similares(doc_id, matriz_similitud, matriz):
    print (f'Documento: {documentos_df.iloc[doc_id]["documentos"]}')
    print ('\n')
    print ('Documentos Similares:')
    if matriz == 'Similitud del Coseno':
        indices_similares = np.argsort(matriz_similitud[doc_id])[::-1]
    elif matriz == 'Distancia Euclidiana':
        indices_similares = np.argsort(matriz_similitud[doc_id])
    for indice in indices_similares:
        if indice == doc_id:
            continue
        print('\n')
        print (f'Documento: {documentos_df.iloc[indice]["documentos"]}')
        print (f'{matriz} : {matriz_similitud[doc_id][indice]}')

# Mostrar los documentos más similares al primer documento utilizando la similitud del coseno y la distancia euclidiana
mas_similares(0, similitudes_coseno, 'Similitud del Coseno')
mas_similares(0, distancias_euclidianas, 'Distancia Euclidiana')
