## Import libraries

In [None]:
# Importing required libraries
import pandas as pd
import numpy as np

# For preprocessing
import nltk
import string
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
nltk.download('punkt')
nltk.download('stopwords')

# Word2Vec
from gensim.models import Word2Vec

# Metrics
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import euclidean_distances

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Preprocessing function

In [None]:
def clean_text(sentence):
  """Normalise raw text into a space-separated string of content words.

  Steps, in order: delete punctuation, replace remaining non-word characters
  and digits with spaces, lowercase, tokenize with NLTK, and drop English
  stop words.

  Parameters
  ----------
  sentence : str
      Raw text to clean.

  Returns
  -------
  str
      The surviving tokens joined by single spaces.
  """
  # Translation table that deletes every character in string.punctuation
  # (characters are removed outright, not replaced by a space).
  punctuation_table = str.maketrans('', '', string.punctuation)

  # NLTK's English stop-word list, as a set for fast membership tests.
  english_stops = set(stopwords.words('english'))

  text = sentence.translate(punctuation_table)

  # Blank out non-word characters first, then digits (one space per character).
  for pattern in (r'\W', r'\d'):
    text = re.sub(pattern, ' ', text)

  # Lowercase before tokenizing so the stop-word comparison is on lowercase tokens.
  kept = []
  for token in word_tokenize(text.lower()):
    if token not in english_stops:
      kept.append(token)

  return ' '.join(kept)


## Embedding function

In [None]:
# Function to calculate average word vector
def average_word_vector(tokens, model, vocabulary, num_features):
    """Return the mean embedding of the tokens that are in the vocabulary.

    Every occurrence of an in-vocabulary token contributes its vector once,
    and the sum is divided by the number of contributing occurrences, so the
    result is a true per-document average.

    Parameters
    ----------
    tokens : iterable of str
        Tokens of one document (repeats allowed).
    model : object
        Anything exposing ``model.wv[token]`` that yields a vector of length
        ``num_features`` (e.g. a trained gensim ``Word2Vec`` model).
    vocabulary : collection of str
        Words for which ``model.wv`` has a vector; other tokens are skipped.
    num_features : int
        Dimensionality of the word vectors.

    Returns
    -------
    numpy.ndarray
        ``float64`` array of shape ``(num_features,)``. All zeros when no
        token is in the vocabulary (including empty ``tokens``).
    """
    # A set gives O(1) membership tests; the vocabulary is typically a list.
    known_words = set(vocabulary)

    feature_vector = np.zeros((num_features,), dtype="float64")
    matched = 0

    for token in tokens:
        if token in known_words:
            feature_vector += model.wv[token]
            matched += 1

    # Divide by the number of vectors actually summed; skipping the division
    # when nothing matched leaves the zero vector and avoids a 0/0.
    if matched:
        feature_vector /= matched

    return feature_vector

In [None]:
def _read_csv_with_fallback(path):
  """Read a CSV as UTF-8, retrying with Latin-1 if UTF-8 decoding fails."""
  try:
    return pd.read_csv(path, encoding='utf-8')
  except UnicodeDecodeError:
    return pd.read_csv(path, encoding='latin-1')


def word_embedding(docname1, docname2, vector_size=100, seed=1, workers=3):
  """Train a Word2Vec model on two CSV documents and embed each document.

  Each CSV is loaded into a DataFrame, rendered to text with ``to_string()``,
  cleaned with ``clean_text`` and tokenized. A single Word2Vec model is
  trained on the two token lists (one "sentence" per document), and each
  document is represented by ``average_word_vector`` over its tokens.

  Parameters
  ----------
  docname1, docname2 : str or path-like
      Paths of the CSV files to compare.
  vector_size : int, default 100
      Dimensionality of the word vectors.
  seed : int, default 1
      Seed passed to Word2Vec.
  workers : int, default 3
      Number of training threads. Per the gensim documentation, a fully
      reproducible run needs ``workers=1`` (and a fixed ``PYTHONHASHSEED``).

  Returns
  -------
  tuple
      ``(model, vocabulary, embedding1, embedding2, tokens1, tokens2)`` where
      ``vocabulary`` is ``model.wv.index_to_key``.

  Raises
  ------
  ValueError
      If both documents are empty after cleaning, so there is nothing to
      train on.
  """
  # Read files
  doc1 = _read_csv_with_fallback(docname1)
  doc2 = _read_csv_with_fallback(docname2)

  # Clean documents
  clean_doc1 = clean_text(doc1.to_string())
  clean_doc2 = clean_text(doc2.to_string())

  # Tokenize documents
  tokens1 = word_tokenize(clean_doc1)
  tokens2 = word_tokenize(clean_doc2)

  if not tokens1 and not tokens2:
    raise ValueError("Both documents are empty after cleaning; cannot train Word2Vec.")

  # Create a Word2Vec model; min_count=1 keeps every word, which matters on
  # a corpus this small.
  # NOTE(review): gensim is understood to truncate sentences longer than
  # 10000 tokens - confirm whether the documents can exceed that.
  model = Word2Vec(
      [tokens1, tokens2],
      vector_size=vector_size,
      min_count=1,
      seed=seed,
      workers=workers,
  )

  # Get the vocabulary
  vocabulary = model.wv.index_to_key

  # Take the dimensionality from the trained model so it cannot drift from
  # the size the model was built with.
  num_features = model.wv.vector_size

  # Get the document embeddings
  embedding1 = average_word_vector(tokens1, model, vocabulary, num_features=num_features)
  embedding2 = average_word_vector(tokens2, model, vocabulary, num_features=num_features)

  return model, vocabulary, embedding1, embedding2, tokens1, tokens2

## Similarity metrics function

In [None]:
# Function to calculate similarity metrics
def calculate_similarity(vector1, vector2, tokens1, tokens2, name1="python", name2="java"):
    """Print cosine, Jaccard and Euclidean scores for two documents.

    Parameters
    ----------
    vector1, vector2 : numpy.ndarray
        1-D document embeddings of equal length.
    tokens1, tokens2 : iterable of str
        Tokens of each document; only the distinct tokens are used, for the
        Jaccard similarity.
    name1, name2 : str, default "python" and "java"
        Labels shown in the printed banner. Pass the real document names
        when comparing a different pair.

    Returns
    -------
    None
        The scores are printed, not returned.
    """
    # Convert the token lists to sets of distinct words
    s1 = set(tokens1)
    s2 = set(tokens2)

    # Find the intersection and the union of the two sets
    _intersection = s1 & s2
    _union = s1 | s2

    # Jaccard similarity = |intersection| / |union|. Two empty documents have
    # an empty union; report 0.0 instead of raising ZeroDivisionError.
    jaccard_sim = len(_intersection) / len(_union) if _union else 0.0

    # scikit-learn's pairwise functions expect 2-D input: one row per sample.
    row1 = vector1.reshape(1, -1)
    row2 = vector2.reshape(1, -1)

    cosine_sim = cosine_similarity(row1, row2)[0][0]
    euclidean_dist = euclidean_distances(row1, row2)[0][0]

    # Print similarity metrics
    print(f'''----- Scores of similarity between documents: "{name1}" and "{name2}" -----
    ------------------- Cosine similarity:  {cosine_sim} -------------------
    ------------------- Jaccard similarity:  {jaccard_sim} -------------------
    ------------------- Euclidean Distance: {euclidean_dist} ------------------''')

## Task 1

In [None]:
# File names of the CSV documents compared in the tasks below
# (nothing is read here; the files are loaded inside word_embedding).
python, python2, java = 'python.csv', 'python2.csv', 'java.csv'

In [None]:
# Task 1: train the embedding model on the two Python documents
model, vocabulary, embedding1, embedding2, tokens1, tokens2 = word_embedding(python, python2)

# Show the learned vector for the first five vocabulary entries
for word in vocabulary[:5]:
    print(f"{word}: {model.wv[word]}")

python: [-0.00077108  0.00051207  0.00520512  0.00910442 -0.0091779  -0.00749649
  0.00668652  0.00967212 -0.00529421 -0.00399131  0.007214   -0.00192312
 -0.00473666  0.00639873 -0.00488523 -0.00205207  0.00294557  0.00065835
 -0.00829692 -0.00992456  0.00719405  0.00520265  0.00701148  0.00048711
  0.00624098 -0.00329236 -0.00126264  0.00548296 -0.0077379  -0.0038523
 -0.00735089 -0.00081551  0.00984462 -0.00743861 -0.00271434 -0.0015621
  0.00806709 -0.00624697 -0.00017764 -0.00533942 -0.00949866  0.00449026
 -0.00884654 -0.00433464  0.0001733  -0.00038111 -0.00779567  0.00937303
  0.005176    0.00926685 -0.0077611   0.0042539  -0.00399676  0.00092271
  0.00821121 -0.00413223  0.00452585 -0.00687609 -0.00407459  0.009441
 -0.00144991  0.00047071 -0.00401306 -0.00788953 -0.00190438  0.002545
 -0.00065995  0.0059661  -0.00323246  0.00256702  0.00517464  0.00849781
 -0.00118983 -0.00935283  0.00470852  0.00087295  0.00745849 -0.00084218
 -0.00285913 -0.00863171 -0.0009456   0.00290662 

## Task 2

In [None]:
# Task2

# Calculate similarity metrics for the embeddings and tokens currently bound
# to embedding1/embedding2/tokens1/tokens2 (the python.csv / python2.csv pair
# when the notebook is run top to bottom).
# NOTE(review): the banner printed by this call reads "python" and "java",
# which does not name the pair actually being compared here.
calculate_similarity(embedding1, embedding2, tokens1, tokens2)

----- Scores of similarity between documents: "python" and "java" -----
    ------------------- Cosine similarity:  0.8409571288941643 -------------------
    ------------------- Jaccard similarity:  0.2765957446808511 -------------------
    ------------------- Euclidean Distance: 0.004803549169020451 ------------------


- Cosine similarity is high because it takes into consideration the importance of each word, while the Jaccard similarity is lower because it only considers whether words occurred or not and ignores the frequency, importance, and context of words. For this kind of application, cosine similarity is therefore the better metric to use.

In [None]:
# Train a fresh model on python.csv vs java.csv. This rebinds model,
# vocabulary, embedding1, embedding2, tokens1 and tokens2, replacing the
# values produced for the earlier python/python2 comparison.
model, vocabulary, embedding1, embedding2, tokens1, tokens2 = word_embedding(python, java)

# Calculate similarity metrics
calculate_similarity(embedding1, embedding2, tokens1, tokens2)

----- Scores of similarity between documents: "python" and "java" -----
    ------------------- Cosine similarity:  0.34333152680708895 -------------------
    ------------------- Jaccard similarity:  0.17407407407407408 -------------------
    ------------------- Euclidean Distance: 0.009474620319046397 ------------------
