In [1]:
import torch
from transformers import BertTokenizer, BertModel
from nltk.tokenize import sent_tokenize

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
model = BertModel.from_pretrained('bert-base-uncased')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

Downloading (…)lve/main/config.json: 100%|██████████| 570/570 [00:00<00:00, 1.77MB/s]
Downloading model.safetensors: 100%|██████████| 440M/440M [01:04<00:00, 6.83MB/s] 
Downloading (…)solve/main/vocab.txt: 100%|██████████| 232k/232k [00:00<00:00, 2.40MB/s]
Downloading (…)okenizer_config.json: 100%|██████████| 28.0/28.0 [00:00<00:00, 78.3kB/s]


In [38]:
text = """Machine Translation — Machine translation, sometimes referred to by the abbreviation MT, is a sub-field of computational linguistics that investigates the use of software to translate text or speech from one language to another. Google’s language translator is a good example that uses neural machine translation to translate text in one language to every possible language!
Text Summarization —Text Summarization is the process of shortening a set of data computationally, to create a subset (a summary) that represents the most important or relevant information within the original content.
Sentiment Analysis — Sentiment analysis is the use of natural language processing, text analysis, computational linguistics, and biometrics to systematically identify, extract, quantify, and study affective states and subjective information.
Text Classification —Text Classification is the task of assigning predefined categories to free-text documents. It can provide conceptual views of document collections and has important applications in the real world. Text classification helps to assign predefined categories to a document to help you find the information you need or simplify some activities. For example, an application of text classification is spam filtering in email."""

sentences = sent_tokenize(text)

In [45]:
tokenized_sentences = [tokenizer.encode(sent, add_special_tokens=True) for sent in sentences]
len(tokenized_sentences[7])

17

In [46]:
max_len = 0
for i in tokenized_sentences:
    if len(i) > max_len:
        max_len = len(i)

padded_sentences = []
for i in tokenized_sentences:
    while len(i) < max_len:
        i.append(0)
    padded_sentences.append(i)

input_ids = torch.tensor(padded_sentences)

In [47]:
with torch.no_grad():
    last_hidden_states = model(input_ids)[0]

sentence_embeddings = []
for i in range(len(sentences)):
    sentence_embeddings.append(torch.mean(last_hidden_states[i], dim=0).numpy())

[array([-1.75942227e-01,  1.58388957e-01, -1.02147460e-01, -2.45035231e-01,
         1.82655975e-01, -1.42343104e-01,  1.76205873e-01,  4.41955388e-01,
        -3.41456197e-03,  1.00367993e-01, -2.51272410e-01, -2.89686590e-01,
        -3.94230425e-01,  1.12142004e-01, -2.30996892e-01,  4.71301496e-01,
         2.04180628e-01, -5.80609962e-02, -2.45629735e-02,  1.67173117e-01,
        -1.81305096e-01,  1.83153063e-01,  3.76091735e-03,  3.22551221e-01,
         4.21469808e-01, -1.75586239e-01, -5.78855090e-02,  3.94118756e-01,
        -2.88884938e-01, -3.99599999e-01,  6.10926151e-01, -1.96166858e-02,
        -3.25043440e-01, -1.12365603e-01,  1.06496856e-01,  2.53619999e-01,
        -2.63613611e-01, -3.63790691e-01, -1.38799101e-01,  1.70093283e-01,
        -4.28090155e-01, -2.73021758e-01, -1.38970837e-01, -1.27028808e-01,
        -1.41158760e-01, -2.16053516e-01,  2.51070969e-02,  1.51390601e-02,
        -2.77337849e-01,  1.95866842e-02, -8.23661745e-01,  2.70813584e-01,
        -8.9

In [50]:
from sklearn.metrics.pairwise import cosine_similarity

# Compute the similarity matrix
similarity_matrix = cosine_similarity(sentence_embeddings)

# Generate the summary
num_sentences = 7
summary_sentences = []
for i in range(num_sentences):
    sentence_scores = list(enumerate(similarity_matrix[i]))
    
    sentence_scores = sorted(sentence_scores, key=lambda x: x[1], reverse=True)
    summary_sentences.append(sentences[sentence_scores[1][0]])

summary = ' '.join(summary_sentences)
print(summary)  

Sentiment Analysis — Sentiment analysis is the use of natural language processing, text analysis, computational linguistics, and biometrics to systematically identify, extract, quantify, and study affective states and subjective information. Text Classification —Text Classification is the task of assigning predefined categories to free-text documents. Sentiment Analysis — Sentiment analysis is the use of natural language processing, text analysis, computational linguistics, and biometrics to systematically identify, extract, quantify, and study affective states and subjective information. Text Summarization —Text Summarization is the process of shortening a set of data computationally, to create a subset (a summary) that represents the most important or relevant information within the original content. It can provide conceptual views of document collections and has important applications in the real world. For example, an application of text classification is spam filtering in email. T

In [55]:
from transformers import BertTokenizer, BertForSequenceClassification

# Input text
input_text = """
Machine Translation — Machine translation, sometimes referred to by the abbreviation MT, is a sub-field of computational linguistics that investigates the use of software to translate text or speech from one language to another. Google’s language translator is a good example that uses neural machine translation to translate text in one language to every possible language!\n
Text Summarization —Text Summarization is the process of shortening a set of data computationally, to create a subset (a summary) that represents the most important or relevant information within the original content.\n
Sentiment Analysis — Sentiment analysis is the use of natural language processing, text analysis, computational linguistics, and biometrics to systematically identify, extract, quantify, and study affective states and subjective information.\n
Text Classification —Text Classification is the task of assigning predefined categories to free-text documents. It can provide conceptual views of document collections and has important applications in the real world. Text classification helps to assign predefined categories to a document to help you find the information you need or simplify some activities. For example, an application of text classification is spam filtering in email.\n
"""

# Load the tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Load the BERT model for sequence classification
model = BertForSequenceClassification.from_pretrained('bert-base-uncased')

# Tokenize the input text
inputs = tokenizer.encode_plus(
    input_text,
    add_special_tokens=True,
    return_tensors="pt",
    max_length=512,  # Adjust this value based on your input text length
    truncation=True
)

# Get the model output
with torch.no_grad():
    outputs = model(**inputs)

# Extract the logits (scores for each class)
logits = outputs.logits

# Convert logits to probabilities
import torch.nn.functional as F
probs = F.softmax(logits, dim=-1)

# Calculate the importance score for each sentence
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize

sentences = sent_tokenize(input_text)
num_sentences = len(sentences)

sentence_scores = []
for i in range(num_sentences):
    score = probs[0, i].item()
    sentence_scores.append((i, sentences[i], score))

# Sort sentences based on importance score (highest to lowest)
sentence_scores = sorted(sentence_scores, key=lambda x: x[2], reverse=True)

# Set the number of sentences you want to include in the summary
num_summary_sentences = 2
summary_sentences = [sentence for _, sentence, _ in sentence_scores[:num_summary_sentences]]

# Create the summary by joining the selected sentences
summary = " ".join(summary_sentences)

print("Summary:")
print(summary)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[nltk_data] Downloading package punkt to /home/yhbedoya/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


TypeError: expected string or bytes-like object