<a href="https://colab.research.google.com/github/anushaaa07/jio_internship/blob/main/w2v_finetuning_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Gensim is a Python library for topic modelling, document indexing and similarity retrieval with large corpora.
# Target audience is the natural language processing (NLP) and information retrieval (IR) community.
# Transformers provides thousands of pretrained models to perform tasks on different modalities such as text, vision, and audio.
!pip install gensim PyPDF2 transformers

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1


In [None]:
import os
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from google.colab import files
import PyPDF2

# Prompt for a file upload from the local machine via Colab's `files.upload()`.
# The processing loop further down treats the keys of the returned mapping as filenames.
uploaded = files.upload()

# Tokenize a raw string with gensim's simple_preprocess, then discard every
# token that is listed in gensim's predefined STOPWORDS collection.

def preprocess(text):
    """Return the non-stopword tokens of `text`, in their original order."""
    kept = []
    for word in simple_preprocess(text):
        if word in STOPWORDS:
            continue  # stopwords are dropped
        kept.append(word)
    return kept

def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF file as one string.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The extracted text of all pages, in page order, separated by newlines.
        A page for which no text is extracted contributes an empty string.
    """
    with open(pdf_path, 'rb') as file:  # PDFs are read in binary mode
        # PdfReader is used instead of the deprecated PdfFileReader
        reader = PyPDF2.PdfReader(file)
        # Extraction happens inside the `with` block because the reader pulls
        # page data from the still-open file. `or ""` guards against a page
        # that yields no text.
        page_texts = [page.extract_text() or "" for page in reader.pages]
    # Joining with a newline keeps the last word of one page from fusing with
    # the first word of the next page, which would corrupt tokenization.
    return "\n".join(page_texts)

# Build the corpus: one list of preprocessed tokens per uploaded PDF.
documents = []
for filename in uploaded:  # iterating the mapping yields its keys (the filenames)
    text = extract_text_from_pdf(filename)  # raw text of the whole PDF
    documents += [preprocess(text)]  # store the cleaned token list

print("Preprocessing completed.")

Saving research.pdf to research.pdf
Preprocessing completed.


In [None]:
import gensim.downloader as api

# Load the pre-trained Word2Vec vectors directly through Gensim's downloader API.
# "word2vec-google-news-300": pre-trained vectors trained on a part of the Google News
# dataset (about 100 billion words); 300-dimensional vectors for 3 million words and phrases.
model_name = "word2vec-google-news-300"
model = api.load(model_name)

print("Pre-trained model loaded.")

Pre-trained model loaded.


In [None]:
from gensim.models import Word2Vec

# Space-joined form of each tokenized document. Training below uses the token
# lists in `documents` directly (joining and re-splitting them is a no-op);
# `sentences` is still defined in case a later cell reads it.
sentences = [" ".join(doc) for doc in documents]

# Create the model WITHOUT a corpus: passing sentences to Word2Vec(...) already
# trains the model, so a following train() call would run the epochs twice.
# vector_size=300 matches the dimensionality of the pre-trained vectors.
new_model = Word2Vec(vector_size=300, window=5, min_count=1, workers=4)

# Build the vocabulary from the research-paper tokens (this also allocates the
# randomly initialised weight matrix).
new_model.build_vocab(documents)

# Seed every vocabulary word that the pre-trained model knows with its
# pre-trained vector, so that training fine-tunes those vectors rather than
# learning an unrelated vector space from scratch. Words missing from the
# pre-trained model keep their random initialisation.
for word, index in new_model.wv.key_to_index.items():
    if word in model:
        new_model.wv.vectors[index] = model[word]

# Single training pass over the corpus, starting from the seeded vectors
new_model.train(corpus_iterable=documents, total_examples=new_model.corpus_count, epochs=new_model.epochs)

# Save the fine-tuned model
new_model.save('word2vec_finetuned.model')

print("Fine-tuning completed.")



Fine-tuning completed.


In [None]:
from numpy import dot # dot product of two arrays.
from numpy.linalg import norm # norm is a way to measure the size of a vector
import numpy as np

# Fixed example sentence whose averaged word vector is compared between the two models
fixed_sentence = "The Falcon model outperforms previous models achieving 70% accuracy with human evaluation on Spider dataset and it achieves competitive 75% accuracy with human evaluation on WikiSQL dataset."
# Tokenize it with the same preprocess() that was applied to the PDF text
tokens = preprocess(fixed_sentence)

# Average the vectors of the tokens that a vector lookup knows about
def get_average_vector(model, tokens):
    """Return the element-wise mean of the vectors of the known tokens.

    Args:
        model: Object supporting `token in model` and `model[token]`
            (for example a Word2Vec vector lookup).
        tokens: Iterable of tokens to look up.

    Returns:
        The mean vector over all tokens found in `model`, or None when
        none of the tokens is found.
    """
    known = []
    for tok in tokens:
        if tok in model:
            known.append(model[tok])
    if not known:
        return None
    return np.mean(known, axis=0)

# Sentence vector from the pre-trained model
baseline_vector = get_average_vector(model, tokens)

# Reload the saved model from disk and compute the sentence vector from its
# word vectors (exposed under `.wv`)
finetuned_model = Word2Vec.load('word2vec_finetuned.model')
finetuned_vector = get_average_vector(finetuned_model.wv, tokens)

# Show each vector followed by its length (the text "None" when no vector exists)
print("Baseline Vector:", baseline_vector)
print(
    "Dimensions/Size of Baseline Vector:",
    "None" if baseline_vector is None else baseline_vector.shape[0],
)
print("Fine-tuned Vector:", finetuned_vector)
print(
    "Dimensions/Size of Fine-tuned Vector:",
    "None" if finetuned_vector is None else finetuned_vector.shape[0],
)

# Cosine similarity is only defined when both sentence vectors exist
if baseline_vector is None or finetuned_vector is None:
    print("One or both vectors are None, cannot compute cosine similarity.")
else:
    cosine_similarity = dot(baseline_vector, finetuned_vector) / (
        norm(baseline_vector) * norm(finetuned_vector)
    )
    print("Cosine Similarity between baseline and fine-tuned vectors:", cosine_similarity)

Baseline Vector: [-5.12345247e-02  4.41993251e-02  7.76403071e-03  4.71370928e-02
  4.37909290e-02  1.65692493e-02  8.07333589e-02 -9.72684994e-02
  1.44114777e-01  1.05676986e-01 -9.51645821e-02 -1.50282919e-01
  9.81445312e-02  2.76166126e-02  4.65303287e-03  1.66661873e-01
 -8.50327462e-02  4.46813256e-02 -5.90106733e-02 -1.56120747e-01
 -1.47051647e-01 -4.40027565e-02 -1.69390514e-01  1.23352051e-01
 -2.70367786e-02 -1.05341688e-01 -2.70593971e-01  6.58605248e-02
  1.48064112e-02 -6.98314011e-02 -3.25137861e-02 -1.13913141e-01
 -6.20691627e-02  2.52283886e-02 -9.08992961e-02 -1.28055457e-02
 -5.47377653e-02 -2.59596873e-02  1.89287975e-01  2.68985517e-02
  1.06089875e-01  7.16409087e-02 -5.29856980e-02  2.29391664e-01
 -3.80572141e-03 -1.38420552e-01 -2.95561627e-02  4.36446251e-04
  9.26298276e-02  3.43233012e-02 -3.04170493e-02 -1.89855248e-02
 -7.76151791e-02 -4.99016270e-02 -4.27658968e-02  6.06043190e-02
  3.26430388e-02 -1.42937154e-01  9.02674049e-02  6.89194649e-02
  8.8572