In [1]:
import pypdf                # PDF reader
from tqdm import tqdm       # Progress bar bc I'm impatient
import os                   # Navigate folders
import time                 # Timing
import tracemalloc          # Memory Usage

import re                   # Text preprocessing stuff
import string               # More text preprocessing
import nltk                 # Tokenization



from sentence_transformers import SentenceTransformer       # Embedding Model

from collections import Counter                             # Simple counting dictionary


nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')


  from .autonotebook import tqdm as notebook_tqdm





[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\kibbl\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\kibbl\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\kibbl\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [2]:
# Read in Roland Notes
# reader = pypdf.PdfReader('Roland_Notes.pdf')

# notes = ''

# for page in tqdm(reader.pages):
#     notes += page.extract_text()

In [3]:
# Read in slides
texts = []

for doc in tqdm(os.listdir('Slides')):
    reader = pypdf.PdfReader(f'Slides/{doc}')

    text = ''

    for page in reader.pages:
        text += page.extract_text()

    texts.append(text)

text = ' '.join(texts)

100%|██████████| 13/13 [00:02<00:00,  4.73it/s]


In [4]:
# Helper preprocessing functions

def normalize_text(text, case_senstive=False):

    # Normalizes case if need be
    if case_senstive:
        text = text.lower()

    # Removes whitespace
    text = text.replace('\n', ' ').strip()

    return text

def remove_stopwords(tokens):
    stop_words = set(nltk.corpus.stopwords.words("english"))
    return [token for token in tokens if token.lower() not in stop_words]

In [5]:
def preprocess_text(text, method='word'):

    # Very basic text normalization
    text = normalize_text(text)

    if method == 'word':
    
        # Tokenization
        tokens = nltk.tokenize.word_tokenize(text)

        # Remove stopwords if need be
        tokens = remove_stopwords(tokens)

        # Replaces wacky symbols (like stylized bullets) with <SYM> token if need be
        tokens = ["<SYM>" if re.fullmatch(r"[^\w\d" + re.escape(string.punctuation) + "]", token) else token for token in tokens]

        # Replaces words that show up only once with <UNK> token if need be
        # rare = [item[0] for item in Counter(tokens).items() if item[1] == 1]
        # tokens = ['<UNK>' if token in rare else token for token in tokens]

        # Replaces pure numbers with <NUM> token if need be
        tokens = ['<NUM>' if token.isdigit() else token for token in tokens]

        # Removes punctuation marks
        # tokens = [token for token in tokens if token not in string.punctuation]

    elif method == 'sent':

        # Tokenization
        tokens = nltk.tokenize.sent_tokenize(text)

        # Preprocessing similar to regular word preprocessing if need be
        for i in range(len(tokens)):
            sent = tokens[i]
            sent = ' '.join(preprocess_text(sent))
            tokens[i] = sent
            


    return tokens

# Function for chunking text
def chunk_text(text, chunk_size, overlap=0):
    chunks = []
    for start in range(len(text) // (chunk_size-overlap) + 1):
        chunks.append(text[start * (chunk_size-overlap) : (start+1) * (chunk_size-overlap)])
    return chunks

tokens = preprocess_text(text, 'sent')
chunks = chunk_text(text, 1000)

In [6]:
# Embedding function
def embed_tokens(chunks, model):    
    embeddings = []
    for chunk in tqdm(chunks):
        slay = model.encode(chunks)
        embeddings.append(slay)

    return embeddings

# Test model 1

tracemalloc.start()
start_time = time.time()

# model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')
embeddings = embed_tokens(chunks[:20], model)


elapsed = time.time() - start_time
current, peak = tracemalloc.get_traced_memory()
tracemalloc.stop()

print(f'Time elapsed: {round(elapsed, 4)} seconds')
print(f"Peak memory usage: {peak / 1024**2:.2f} MiB")

# Test model 2
# model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')
# embeddings = embed_tokens(chunks[:20], model)

# Test model 3
# model = SentenceTransformer("hkunlp/instructor-xl")
# embeddings = []
# for chunk in tqdm(chunks[:10]):
#     instruction = 'Represent the Data Science sentence for retrieval: '
#     corpus = [[instruction, sent] for sent in chunk]
#     embed = model.encode(corpus)

#     embeddings.append(embed)

100%|██████████| 20/20 [02:13<00:00,  6.68s/it]

Time elapsed: 136.1569 seconds
Peak memory usage: 12.43 MiB



