In [5]:
# pip install PyPDF2
# pip install pdfplumber

In [25]:
import pdfplumber
from pdfminer.layout import LAParams
import re
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModel
from sklearn.metrics.pairwise import cosine_similarity

In [4]:
# Hugging Face Hub checkpoint identifiers, one per pipeline stage. Each id is
# used for both the tokenizer and the model so the two cannot drift apart.
SUMMARIZER_CHECKPOINT = "facebook/bart-large-cnn"
QA_CHECKPOINT = "potsawee/t5-large-generation-squad-QuestionAnswer"
EMBEDDING_CHECKPOINT = "bert-base-uncased"

# Stage 1: summarisation (sequence-to-sequence).
summarizer_tokenizer = AutoTokenizer.from_pretrained(SUMMARIZER_CHECKPOINT)
summarizer_model = AutoModelForSeq2SeqLM.from_pretrained(SUMMARIZER_CHECKPOINT)

# Stage 2: question/answer generation (sequence-to-sequence).
qa_tokenizer = AutoTokenizer.from_pretrained(QA_CHECKPOINT)
qa_model = AutoModelForSeq2SeqLM.from_pretrained(QA_CHECKPOINT)

# Stage 3: encoder used to embed text for the relevance score.
bert_tokenizer = AutoTokenizer.from_pretrained(EMBEDDING_CHECKPOINT)
bert_model = AutoModel.from_pretrained(EMBEDDING_CHECKPOINT)

In [26]:
# pdfplumber

def extract_text_from_pdf(pdf_path):
    """Extract the text of every page of a PDF and return it as one string.

    Args:
        pdf_path: Location of the PDF, handed unchanged to ``pdfplumber.open``.

    Returns:
        str: The per-page texts in page order, joined with a newline. A page
        for which ``extract_text`` yields nothing contributes an empty string.
        A PDF without pages yields ``""``.
    """
    # NOTE(review): pdfplumber documents ``laparams`` as an option of
    # ``pdfplumber.open`` rather than of ``Page.extract_text`` -- confirm that
    # passing it here has any effect. It is kept so extraction itself is
    # unchanged by this edit.
    laparams = LAParams(line_margin=0.1)

    page_texts = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            page_text = page.extract_text(x_tolerance=2, y_tolerance=3, laparams=laparams)
            # A falsy result (None or "") must not reach string concatenation:
            # ``str + None`` raises TypeError.
            page_texts.append(page_text or "")

    # Join with a newline so the last word of one page is not fused with the
    # first word of the next page.
    return "\n".join(page_texts)

# pdf_path = 'D:/dell data/rutgers/nlp/slides/slide 10 - transformers.pdf'
# pdf_text = extract_text_from_pdf(pdf_path)

In [6]:
def clean_text(text):
    """Normalise whitespace and strip page-marker boilerplate from ``text``.

    Args:
        text: Raw text, e.g. the output of PDF extraction.

    Returns:
        str: ``text`` with every whitespace run collapsed to one space, every
        occurrence of ``Page <digits>``, ``Header text`` and ``Footer text``
        removed, and no leading or trailing whitespace.
    """
    # Collapse whitespace first so a marker split across lines ("Page\n12")
    # still matches the pattern below, which expects a single literal space.
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'Page \d+|Header text|Footer text', '', text)
    # Deleting a marker from the middle of the text leaves the spaces that
    # surrounded it side by side; collapse them again.
    text = re.sub(r' {2,}', ' ', text)
    return text.strip()

In [18]:
def split_text_with_sentence_overlap(text, chunk_size=512):
    """Split ``text`` into word-bounded chunks that overlap by one sentence.

    Sentences are detected by splitting after ``.``, ``!`` or ``?`` followed by
    one or more spaces. Sentences are packed into a chunk until adding the next
    one would exceed ``chunk_size`` words; the chunk is then emitted and the
    next chunk starts with the last sentence of the emitted one, provided that
    sentence and the incoming sentence fit together within ``chunk_size``.

    Args:
        text: Text to split.
        chunk_size: Maximum number of whitespace-separated words per chunk.

    Returns:
        list[str]: The chunks in order. Empty or whitespace-only text yields
        ``[]``. A single sentence longer than ``chunk_size`` is emitted as its
        own, oversized chunk rather than being cut.
    """
    # Drop empty fragments so that empty input does not produce [''].
    sentences = [s for s in re.split(r'(?<=[.!?]) +', text) if s.strip()]

    chunks = []
    current_chunk = []
    current_length = 0

    for sentence in sentences:
        sentence_length = len(sentence.split())

        if current_chunk and current_length + sentence_length > chunk_size:
            chunks.append(' '.join(current_chunk))

            overlap = current_chunk[-1]
            overlap_length = len(overlap.split())
            if overlap_length + sentence_length <= chunk_size:
                # Carry the last sentence over as context for the next chunk.
                current_chunk = [overlap]
                current_length = overlap_length
            else:
                # Carrying the overlap would immediately push the next chunk
                # past chunk_size, so start the next chunk without it.
                current_chunk = []
                current_length = 0

        current_chunk.append(sentence)
        current_length += sentence_length

    if current_chunk:
        chunks.append(' '.join(current_chunk))

    return chunks


In [19]:
def summarize_text(text):
    """Summarise ``text`` with the notebook-level summarizer model.

    The input is tokenized with truncation at 1024 tokens, a summary is
    generated by beam search with the options below, and the first returned
    sequence is decoded with special tokens removed.

    Args:
        text: Text to summarise.

    Returns:
        The decoded summary string.
    """
    encoded = summarizer_tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        max_length=1024,
    )
    generation_options = dict(
        max_length=250,
        min_length=40,
        length_penalty=2.0,
        num_beams=4,
        early_stopping=True,
    )
    generated = summarizer_model.generate(encoded['input_ids'], **generation_options)
    best_sequence = generated[0]
    return summarizer_tokenizer.decode(best_sequence, skip_special_tokens=True)

In [11]:
def generate_qa(context):
    """Generate a question/answer string for ``context`` with the QA model.

    Generation uses sampling (``do_sample=True``) and no seed is set here, so
    repeated calls on the same context may return different strings.

    Args:
        context: Passage to generate a question and answer from.

    Returns:
        The first generated sequence, decoded with special tokens removed.
    """
    encoded = qa_tokenizer(context, return_tensors="pt")
    sampling_options = {
        "max_length": 100,
        "do_sample": True,
        "top_k": 50,
        "top_p": 0.95,
        "temperature": 0.7,
    }
    generated = qa_model.generate(**encoded, **sampling_options)
    # NOTE(review): if this checkpoint marks the question/answer boundary with
    # a special separator token, skip_special_tokens=True removes it and the
    # two parts come back fused in one string -- confirm against the model card.
    return qa_tokenizer.decode(generated[0], skip_special_tokens=True)

In [12]:
def get_bert_embeddings(text):
    """Embed ``text`` by mean-pooling the encoder's last hidden state.

    The text is tokenized with truncation at 512 tokens and run through the
    notebook-level encoder with gradient tracking disabled. The result is the
    mean of ``last_hidden_state`` over dimension 1 (presumably the token
    axis -- confirm against the model's output layout).

    Args:
        text: Text to embed.

    Returns:
        A tensor holding the pooled embedding.
    """
    encoded = bert_tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
    with torch.no_grad():
        token_states = bert_model(**encoded).last_hidden_state
    pooled = token_states.mean(dim=1)
    return pooled

In [13]:
def compute_similarity(qa_pair, context):
    """Score how close a generated QA string is to its context.

    Both texts are embedded with ``get_bert_embeddings`` (context first, then
    the QA string) and compared with ``cosine_similarity``.

    Args:
        qa_pair: Generated question/answer string.
        context: Text the QA string was generated from.

    Returns:
        The ``[0][0]`` entry of the similarity matrix.
    """
    context_embedding = get_bert_embeddings(context)
    qa_embedding = get_bert_embeddings(qa_pair)
    similarity_matrix = cosine_similarity(context_embedding, qa_embedding)
    return similarity_matrix[0][0]

In [28]:
# Run extraction -> cleaning -> chunking on a single PDF. Each intermediate is
# kept in its own notebook-level variable.
# NOTE(review): absolute, machine-specific path; the notebook cannot be re-run
# on another machine without editing this line.
pdf_path = 'D:/dell data/rutgers/data viz/assignment5/9.pdf'
pdf_text = extract_text_from_pdf(pdf_path)
cleaned_text = clean_text(pdf_text)
# No chunk_size is passed, so the function's default applies.
sections = split_text_with_sentence_overlap(cleaned_text)

In [29]:
# Bare expression: the whole list of chunks is shown as this cell's output.
sections

['Speech and Language Processing. Daniel Jurafsky & James H. Martin. Copyright © 2024. All rights reserved. Draft of August 20, 2024. CHAPTER 9 The Transformer “The true art of memory is the art of attention ” Samuel Johnson, Idler #74, September 1759 In this chapter we introduce the transformer, the standard architecture for build- ing large language models. Transformer-based large language models have com- pletely changed the field of speech and language processing. Indeed, every subse- quent chapter in this textbook will make use of them. We’ll focus for now on left- to-right (sometimes called causal or autoregressive) language modeling, in which we are given a sequence of input tokens and predict output tokens one by one by conditioning on the prior context. The transformer is a neural network with a specific structure that includes a mechanismcalledself-attentionormulti-headattention.1 Attentioncanbethought of as a way to build contextual representations of a token’s meaning by at

In [30]:
# For every chunk: summarise it, generate a question/answer string from the
# summary, then score that string against the same summary. One dict per chunk
# is collected in qa_results.
qa_results = []
for section in sections:
    summarized_section = summarize_text(section)
    # The QA string is generated from the summary, not from the full chunk.
    qa_pair = generate_qa(summarized_section)
    # The score compares the QA string with the summary it was generated from.
    similarity = compute_similarity(qa_pair, summarized_section)
    qa_results.append({"summary": summarized_section, "qa_pair": qa_pair, "similarity": similarity})

In [31]:
# Print one four-line report per result: summary, QA string, relevance score
# (four decimals) and a 50-dash divider.
for result in qa_results:
    print(
        f"Summary: {result['summary']}",
        f"QA Pair: {result['qa_pair']}",
        f"Relevance Score: {result['similarity']:.4f}",
        "-" * 50,
        sep="\n",
    )

Summary: Speech and Language Processing. Daniel Jurafsky & James H. Martin. Draft of August 20, 2024. CHAPTER 9 The Transformer “The true art of memory is the art of attention ” Samuel Johnson, Idler #74, September 1759.
QA Pair: What is the true art of memory? the art of attention
Relevance Score: 0.7084
--------------------------------------------------
Summary: In Chapter 23, we look at how language models are pretrained and how tokens are generated via sampling. Chapter 11 introduces masked language modeling and the BERT family of bidirectional transformer encoder models. Chapter 13 will introduce machine translation with the encoder-decoder architecture.
QA Pair: In Chapter 23, we look at how language models are pretrained and how tokens are generated via what? sampling
Relevance Score: 0.8649
--------------------------------------------------
Summary: Transformers can build contextual representations of word meaning, contex- contextual tual embeddings, by integrating the meaning 

In [69]:
# Bare expression: shows the current contents of qa_results as cell output.
# NOTE(review): the saved output below contains records with only a 'summary'
# key, unlike the three-key records appended in the loop cell -- it looks like
# it came from a different kernel state; confirm by re-running from a fresh kernel.
qa_results

[{'summary': 'ELMo representations are a function of all internal layers of biLM. Linear combination of the vectors stacked above each input word Improves performance over just using the top LSTM layerTransfer Learning Natural Language Processing with Deep Learning CS224N/Ling284, Chris Manning, StanfordULMfit Universal Language Model.'},
 {'summary': 'Intuition: a representation of meaning of a word should be different in different contexts. Each word has a different vector that expresses different meanings depending on the surrounding words. We say that a word "attends to" some neighboring words more than others.'},
 {'summary': 'We’ve given the intuition of self-attention (as a way to compute representations of a) The core intuition of attention is the idea of comparing an item of interest to a word at a given layer by integrating information from words at the previous layer. For example, returning Chapter 9.2, the computation of a is based on a set of comparisons between the 3 coll

In [81]:
import pandas as pd

# Load a question/answer CSV addressed through an "hf://" URL.
# NOTE(review): presumably this needs the huggingface_hub package installed so
# that pandas can resolve the hf:// scheme -- confirm in a fresh environment.
df = pd.read_csv("hf://datasets/soufyane/DATA_SCIENCE_QA/data (1).csv")

In [85]:
# Bare expression: renders the loaded DataFrame as this cell's output.
df

Unnamed: 0.1,Unnamed: 0,Question,Answer
0,0,What is under-fitting and overfitting in machi...,"Underfitting is when a model is too simple, an..."
1,1,Can you explain what a false positive and a fa...,A false positive incorrectly indicates a condi...
2,2,Clarify the concept of Phase IV.,"Phase IV studies, also known as post-marketing..."
3,3,What is semi-supervised learning described in ...,Semi-supervised learning integrates both label...
4,4,Discuss the parallelization of training in gra...,Parallelizing training of a gradient boosting ...
...,...,...,...
1065,1065,Define the ACID property in SQL and its signif...,ACID principles maintain database integrity by...
1066,1066,What are the different types of data warehouses?,"Data warehouses vary by scope and function, wi..."
1067,1067,What are the key stages in a data mining project?,A data mining project starts with understandin...
1068,1068,What is information extraction?,Information extraction systematically identifi...


In [1]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline

# Checkpoint id used for both the tokenizer and the token-classification model.
model_name = "dbmdz/bert-large-cased-finetuned-conll03-english"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)

# Wrap model and tokenizer in a "ner" pipeline and run it on one sample sentence.
ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer)
text = "Scikit-learn supports K-means clustering."
# The saved output of this cell is an empty list: no entities were returned for
# this sentence.
print(ner_pipeline(text))


tokenizer_config.json:   0%|          | 0.00/60.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config.json:   0%|          | 0.00/998 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.33G [00:00<?, ?B/s]

Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


[]


In [2]:
# Rebuilds the same pipeline and repeats the same call as the cell above. It
# relies on `model` and `tokenizer` already being defined in the kernel, and
# its saved output is again an empty list.
ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer)
text = "Scikit-learn supports K-means clustering."
print(ner_pipeline(text))

[]
