In [None]:
!pip install gensim



In [None]:
! pip install PyPDF2

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1


In [118]:
import tensorflow_hub as hub
import tensorflow as tf
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import PyPDF2
import gensim
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [102]:
import gensim.downloader as api

wv = api.load('word2vec-google-news-300')

In [None]:
# Load the ELMO model
elmo = hub.load("https://tfhub.dev/google/elmo/3")

In [None]:
def extract_text_from_pdf(pdf_file):

    """Extracts text from a PDF file.

    Args:
        pdf_file: The path to the PDF file.

    Returns:
        A string containing the extracted text.

    """

    text = ""
    with open(pdf_file, 'rb') as pdf_reader:
        reader = PyPDF2.PdfReader(pdf_reader)
        for page in reader.pages:
            text += page.extract_text()
    return text

In [None]:
# Load the PDF file

pdf_file = "/content/sample_data/Comparativestudyofwordembeddingalgorithm.pdf"
text = extract_text_from_pdf(pdf_file)

In [None]:
text

'ScienceDirect\nAvailable online at www.sciencedirect.com\nProcedia Computer Science 112 (2017)  340–349\n1877-0509 © 2017 The Authors. Published by Elsevier B.V.\nPeer-review under responsibility of KES International\n10.1016/j.procs.2017.08.009\n10.1016/j.procs.2017.08.009© 2017 The Authors. Published by Elsevier B.V .\nPeer-review under responsibility of KES International\n1877-0509Available online at www.sciencedirect.com\nProcedia Computer Science 00 (2017) 000–000\nwww.elsevier.com/ locate /procedia\nInternational Conference on Knowledge Based and Intelligent Information and Engineering\nSystems, KES2017, 6-8 September 2017, Marseille, France\nComparative study of word embedding methods in topic\nsegmentation\nMarwa Naili∗, Anja Habacha Chaibi, Henda Hajjami Ben Ghezala\nRIADI laboratory, National School of computer Science (ENSI),\nUniversity of Mannouba 2010, Tunisia\nAbstract\nThe vector representations of words are very useful in di ﬀerent natural language processing tasks in

In [None]:
# Access the appropriate signature for embedding
embed_fn = elmo.signatures['default']

# Embed the text using the embed_fn. Pass text as positional argument
embeddings = embed_fn(tf.constant([text]))['elmo']

In [None]:
print(embeddings)


tf.Tensor(
[[[-0.5185611  -0.5409341   0.07054564 ...  0.78896505 -0.12693155
   -0.33742332]
  [-0.06468721  0.9545266   0.20303619 ... -0.37890467 -0.7762271
   -0.41686863]
  [ 0.09154911  0.17872748 -0.15596014 ... -0.5123843  -0.467335
   -0.3965827 ]
  ...
  [-1.0815887  -0.80924284  0.43927002 ...  0.66765636 -0.23668984
   -0.68361384]
  [-0.7567288  -0.30398047 -0.2940536  ...  0.21035153  0.80204123
   -1.4575136 ]
  [-0.44408035  0.7996168  -0.7304639  ... -0.8548847  -0.16584936
   -0.9443433 ]]], shape=(1, 11382, 1024), dtype=float32)


In [None]:
# advanced way to convert pdf into cleaned multiple chunks


In [105]:
def extract_text_from_pdf(pdf_file):

    """Extracts text from a PDF file.

    Args:
        pdf_file: The path to the PDF file.

    Returns:
        A string containing the extracted text.

    """

    text = ""
    with open(pdf_file, 'rb') as pdf_reader:
        reader = PyPDF2.PdfReader(pdf_reader)
        for page in reader.pages:
            text += page.extract_text()
    return text

In [106]:
def preprocess_text(text):

    """Preprocesses text by removing punctuation, stop words, stemming, and lemmatization.

    Args:
        text: The text to preprocess.

    Returns:
        A list of preprocessed tokens.
    """

    # Tokenize the text
    tokens = nltk.word_tokenize(text)

    # Remove punctuation
    tokens = [word.lower() for word in tokens if word.isalnum()]

    # Remove stop words
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]

    # Stem or lemmatize words
    # Choose either stemming or lemmatization based on your preference
    # stemmer = PorterStemmer()
    # tokens = [stemmer.stem(word) for word in tokens]
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]

    return tokens

In [107]:
def chunk_text(text, chunk_size=300):

    """Chunks the text into smaller segments based on sentence boundaries.

    Args:
        text: The preprocessed text.
        chunk_size: The desired maximum size of each chunk.

    Returns:
        A list of tuples, each containing a chunk and its starting index in the original text.
    """

    sentences = nltk.sent_tokenize(text)
    chunks = []
    current_chunk = []
    start_index = 0
    for sentence in sentences:
        if len(current_chunk) + len(sentence) > chunk_size:
            chunks.append((" ".join(current_chunk), start_index))
            current_chunk = []
            start_index += len(" ".join(current_chunk)) + 1  # Account for the space between sentences
        current_chunk.append(sentence)
    if current_chunk:
        chunks.append((" ".join(current_chunk), start_index))
    return chunks

In [167]:
def embed_pdf(pdf_file, elmo):

    """Embeds a PDF document using ELMO.

    Args:
        pdf_file: The path to the PDF file.
        wv: The Word2Vec model.

    Returns:
        A list of tuples, each containing a chunk embedding and its starting index in the original text.
    """

    text = extract_text_from_pdf(pdf_file)
    chunks = chunk_text(text) # Pass the original 'text'

    chunk_embeddings = []
    for chunk, start_index in chunks:
        chunk_tokens = preprocess_text(chunk)
        # Use the 'elmo' object to generate embeddings
        chunk_embedding = elmo.signatures["default"](tf.constant([chunk]))['elmo']  # Assuming the model has a 'default' signature
        chunk_embedding = np.mean(chunk_embedding, axis=0)  # Calculate the average vector

        # Added a check if chunk_embedding is not None:
        if chunk_embedding is not None:
            chunk_embeddings.append((chunk_embedding, start_index))

    return chunk_embeddings , chunks

In [168]:
# Load the PDF file

pdf_file = "/content/sample_data/Comparativestudyofwordembeddingalgorithm.pdf"

In [170]:
# Embed the PDF

chunk_embeddings,chunks = embed_pdf(pdf_file, elmo)

In [171]:
len(chunks)

24

In [172]:
chunks[0]

('ScienceDirect\nAvailable online at www.sciencedirect.com\nProcedia Computer Science 112 (2017)  340–349\n1877-0509 © 2017 The Authors. Published by Elsevier B.V.\nPeer-review under responsibility of KES International\n10.1016/j.procs.2017.08.009\n10.1016/j.procs.2017.08.009© 2017 The Authors. Published by Elsevier B.V .',
 0)

In [173]:
chunks[23]

('Approach Arabic topic segmenter WindowDiﬀ\nEndogenous ArabC991436.10%\nArabTextTiling1482.5%\nExogenous ToSe-LSA 29.9%\nToSe-Word2Vec 29.52%\nToSe-GloVe 28.43%\nBased on this evaluation, we can conclude that exogenous topic segmenters are much way better than endogenous\ntopic segmenters for both Arabic and English languages. This can be explained by the fact that adding external\nknowledge enhances the quality of topic segmentation. Furthermore, we notice that prediction-based embedding\nmethods improve topic segmentation.6. Conclusions\nIn this paper, we investigated topic segmentation by using word embedding as representational basis. For this\nreason we used the well known methods: LSA, Wor2Vec and GloVe. The aim of this study is to identify which method\nis more eﬀective to learn word vector representations that provide the semantic meaning of words for both English\nand Arabic languages. Yet, compared to other methods, Word2Vec is the most complicate one because of its di ﬀeren

In [174]:
len(chunk_embeddings)

24

In [175]:
chunk_embeddings[0][0]

array([[-0.5185611 , -0.5409341 ,  0.07054564, ...,  0.87855375,
        -0.15627274, -0.33428386],
       [-0.06468721,  0.9545266 ,  0.20303619, ..., -0.3320887 ,
        -0.8282112 , -0.34949598],
       [ 0.09154911,  0.17872748, -0.15596014, ..., -0.4465177 ,
        -0.5123299 , -0.36933926],
       ...,
       [-0.2739171 , -0.2760446 ,  0.34321856, ..., -0.36861664,
         0.21666196, -1.2448204 ],
       [-0.32666162, -0.31426767, -0.35524338, ..., -0.11997335,
        -0.49596033, -0.17764708],
       [-0.5422448 , -0.38210356, -0.7529255 , ..., -0.080252  ,
         0.03607737,  0.11280698]], dtype=float32)

In [176]:
chunk_embeddings[23][0]

array([[-0.18196136, -0.15308137,  0.35556495, ..., -0.00455654,
        -0.20667076, -0.22041866],
       [-0.38237062,  0.368627  ,  0.36678028, ..., -0.3826543 ,
         0.48623466, -0.38074473],
       [-0.14679319,  0.37798053, -0.07863915, ..., -0.19774854,
         0.10006976,  0.02729857],
       ...,
       [-1.1159974 , -0.783383  ,  0.4140091 , ...,  0.6676561 ,
        -0.23669007, -0.6836139 ],
       [-0.725764  , -0.29312533, -0.30130523, ...,  0.21035105,
         0.80204165, -1.4575149 ],
       [-0.4280275 ,  0.8132723 , -0.7421303 , ..., -0.8548844 ,
        -0.16584948, -0.94434357]], dtype=float32)

In [177]:
chunk_embeddings

[(array([[-0.5185611 , -0.5409341 ,  0.07054564, ...,  0.87855375,
          -0.15627274, -0.33428386],
         [-0.06468721,  0.9545266 ,  0.20303619, ..., -0.3320887 ,
          -0.8282112 , -0.34949598],
         [ 0.09154911,  0.17872748, -0.15596014, ..., -0.4465177 ,
          -0.5123299 , -0.36933926],
         ...,
         [-0.2739171 , -0.2760446 ,  0.34321856, ..., -0.36861664,
           0.21666196, -1.2448204 ],
         [-0.32666162, -0.31426767, -0.35524338, ..., -0.11997335,
          -0.49596033, -0.17764708],
         [-0.5422448 , -0.38210356, -0.7529255 , ..., -0.080252  ,
           0.03607737,  0.11280698]], dtype=float32),
  0),
 (array([[-0.76515806, -0.05140942,  0.1016481 , ...,  0.5739049 ,
           0.27230364, -0.465231  ],
         [-0.630808  , -0.7205056 , -0.63053197, ..., -0.65991485,
           0.3434443 , -0.07031652],
         [-0.34732002, -0.6586375 , -0.33295983, ..., -0.61724716,
           0.41304094, -0.38706958],
         ...,
         [-0.

In [178]:
def find_similar_paragraphs(query, chunk_embeddings, top_n=3):
    """Finds the most similar paragraphs to a query.

    Args:
        query: The query text.
        chunk_embeddings: A list of tuples, each containing a chunk embedding and its starting index.
        top_n: The number of most similar paragraphs to return.

    Returns:
        A list of tuples, each containing a similar paragraph and its starting index in the original text.
    """
    # Convert the query to a tensor before passing it to the ELMo model
    query_tensor = tf.constant([query])

    # Pass the input as a dictionary to the SavedModel signature
    query_embedding = elmo.signatures["default"](text=query_tensor)["elmo"].numpy()[0]

    # Reshape embeddings to 2D by averaging along sequence length dimension before padding
    # Modification: Ensure reshaped_embeddings is a 2D numpy array
    # Correctly extract and reshape embeddings, assuming they are the first element of each tuple and are 3D
    reshaped_embeddings = np.array([np.mean(embedding[0], axis=0) for embedding in chunk_embeddings])

    # **Ensure query_embedding is 2D as well**
    query_embedding = np.mean(query_embedding, axis=0).reshape(1, -1) # Reshape to (1, embedding_dim)

    # Calculate similarities and get top indices using reshaped embeddings
    # Ensure both inputs to cosine_similarity are 2D arrays
    similarities = cosine_similarity(query_embedding, reshaped_embeddings)[0]  # Pass query_embedding directly
    top_indices = similarities.argsort()[-top_n:][::-1]

    return [chunk_embeddings[i] for i in top_indices]

In [179]:
# Example usage:

query = "What are Word2Vectors?"

similar_chunks = find_similar_paragraphs(query, chunk_embeddings, top_n=2)
for chunk_embedding, start_index in similar_chunks:
    print(f"Chunk starting at index {start_index}: {chunk_embedding}")

Chunk starting at index 3: [[ 0.55567    -0.10357156 -0.09091133 ... -0.1921097  -0.16911836
  -0.3021638 ]
 [ 0.15349415 -0.1058057  -0.0350397  ... -0.43798766  0.32634234
   0.02079748]
 [ 0.11167665 -0.1210684   0.65949196 ... -0.08617122  0.3678538
  -0.88598096]
 ...
 [-0.08763288  0.33711058  0.5016888  ...  0.3158012  -0.08167155
  -0.76893723]
 [ 0.04882842  0.17753729 -0.56318796 ...  0.6333717  -0.06196661
  -0.08275204]
 [-0.05533883  0.45851326 -0.5346844  ...  0.49408522 -0.6061249
   0.06973888]]
Chunk starting at index 2: [[ 0.55567    -0.10357156 -0.09091133 ... -0.1921097  -0.16911836
  -0.3021638 ]
 [ 0.15349415 -0.1058057  -0.0350397  ... -0.43798766  0.32634234
   0.02079748]
 [ 0.11167665 -0.1210684   0.65949196 ... -0.08617122  0.3678538
  -0.88598096]
 ...
 [-0.08763288  0.33711058  0.5016888  ...  0.3158012  -0.08167155
  -0.76893723]
 [ 0.04882842  0.17753729 -0.56318796 ...  0.6333717  -0.06196661
  -0.08275204]
 [-0.05533883  0.45851326 -0.5346844  ...  0.49

In [181]:
chunks[3]

('Published by Elsevier B.V.\nPeer-review under responsibility of KES International.Available online at www.sciencedirect.com\nProcedia Computer Science 00 (2017) 000–000\nwww.elsevier.com/ locate /procedia\nInternational Conference on Knowledge Based and Intelligent Information and Engineering\nSystems, KES2017, 6-8 September 2017, Marseille, France\nComparative study of word embedding methods in topic\nsegmentation\nMarwa Naili∗, Anja Habacha Chaibi, Henda Hajjami Ben Ghezala\nRIADI laboratory, National School of computer Science (ENSI),\nUniversity of Mannouba 2010, Tunisia\nAbstract\nThe vector representations of words are very useful in di ﬀerent natural language processing tasks in order to capture the semantic\nmeaning of words. In this context, the three known methods are: LSA, Word2Vec and GloVe. In this paper, these methods will\nbe investigated in the ﬁeld of topic segmentation for both languages Arabic and English. Moreover, Word2Vec is studied in depth\nby using diﬀerent m

In [182]:
chunks[2]

('Published by Elsevier B.V.\nPeer-review under responsibility of KES International.Available online at www.sciencedirect.com\nProcedia Computer Science 00 (2017) 000–000\nwww.elsevier.com/ locate /procedia\nInternational Conference on Knowledge Based and Intelligent Information and Engineering\nSystems, KES2017, 6-8 September 2017, Marseille, France\nComparative study of word embedding methods in topic\nsegmentation\nMarwa Naili∗, Anja Habacha Chaibi, Henda Hajjami Ben Ghezala\nRIADI laboratory, National School of computer Science (ENSI),\nUniversity of Mannouba 2010, Tunisia\nAbstract\nThe vector representations of words are very useful in di ﬀerent natural language processing tasks in order to capture the semantic\nmeaning of words. In this context, the three known methods are: LSA, Word2Vec and GloVe. In this paper, these methods will\nbe investigated in the ﬁeld of topic segmentation for both languages Arabic and English. Moreover, Word2Vec is studied in depth\nby using diﬀerent m

In [183]:
# Example usage:

query = "Evaluation of Word2Vec parameters"

similar_chunks = find_similar_paragraphs(query, chunk_embeddings, top_n=2)
for chunk_embedding, start_index in similar_chunks:
    print(f"Chunk starting at index {start_index}: {chunk_embedding}")

Chunk starting at index 21: [[-0.18196136 -0.15308137  0.35556495 ... -0.01119897 -0.18268442
  -0.22321653]
 [-0.38237062  0.368627    0.36678028 ... -0.39189094  0.49804285
  -0.3891988 ]
 [-0.14679319  0.37798053 -0.07863915 ... -0.2098047   0.11431462
   0.00982361]
 ...
 [ 0.30335885  0.27720588 -0.11012789 ... -1.6538963   0.28629324
  -0.22746965]
 [ 0.17726679  0.56010616  0.1456542  ...  0.1119245  -0.26514205
   0.20545009]
 [-0.14412907  0.7589239  -0.3291497  ... -0.21720408  0.6805029
   0.20901448]]
Chunk starting at index 23: [[-0.18196136 -0.15308137  0.35556495 ... -0.00455654 -0.20667076
  -0.22041866]
 [-0.38237062  0.368627    0.36678028 ... -0.3826543   0.48623466
  -0.38074473]
 [-0.14679319  0.37798053 -0.07863915 ... -0.19774854  0.10006976
   0.02729857]
 ...
 [-1.1159974  -0.783383    0.4140091  ...  0.6676561  -0.23669007
  -0.6836139 ]
 [-0.725764   -0.29312533 -0.30130523 ...  0.21035105  0.80204165
  -1.4575149 ]
 [-0.4280275   0.8132723  -0.7421303  ... -

In [184]:
chunks[21]

('Approach Arabic topic segmenter WindowDiﬀ\nEndogenous ArabC991436.10%\nArabTextTiling1482.5%\nExogenous ToSe-LSA 29.9%\nToSe-Word2Vec 29.52%\nToSe-GloVe 28.43%\nBased on this evaluation, we can conclude that exogenous topic segmenters are much way better than endogenous\ntopic segmenters for both Arabic and English languages. This can be explained by the fact that adding externalknowledge enhances the quality of topic segmentation. Furthermore, we notice that prediction-based embedding\nmethods improve topic segmentation.6. Conclusions\nIn this paper, we investigated topic segmentation by using word embedding as representational basis. For this\nreason we used the well known methods: LSA, Wor2Vec and GloVe. The aim of this study is to identify which method\nis more eﬀective to learn word vector representations that provide the semantic meaning of words for both English\nand Arabic languages. Yet, compared to other methods, Word2Vec is the most complicate one because of its di ﬀerent\

In [185]:
chunks[23]

('Approach Arabic topic segmenter WindowDiﬀ\nEndogenous ArabC991436.10%\nArabTextTiling1482.5%\nExogenous ToSe-LSA 29.9%\nToSe-Word2Vec 29.52%\nToSe-GloVe 28.43%\nBased on this evaluation, we can conclude that exogenous topic segmenters are much way better than endogenous\ntopic segmenters for both Arabic and English languages. This can be explained by the fact that adding external\nknowledge enhances the quality of topic segmentation. Furthermore, we notice that prediction-based embedding\nmethods improve topic segmentation.6. Conclusions\nIn this paper, we investigated topic segmentation by using word embedding as representational basis. For this\nreason we used the well known methods: LSA, Wor2Vec and GloVe. The aim of this study is to identify which method\nis more eﬀective to learn word vector representations that provide the semantic meaning of words for both English\nand Arabic languages. Yet, compared to other methods, Word2Vec is the most complicate one because of its di ﬀeren