In [6]:

#  Below, we are comparing and contrasting two types of Large Language Models (LLMs):
#  GPT2LMHeadModel
#  BertForTokenClassification

#  GPT2LMHeadModel:

#  Purpose:
#  GPT2LMHeadModel is primarily used for autoregressive language modeling tasks, such as text generation, 
#  text completion, and next-token prediction.
#  It predicts the likelihood of the next token in a sequence given the preceding tokens.

#  Architecture:
#  GPT2LMHeadModel is based on the Transformer architecture, specifically the decoder part of the Transformer.
#  It consists of multiple layers of self-attention mechanisms and feed-forward neural networks.

#  Output:
#  The output of GPT2LMHeadModel is a probability distribution over the vocabulary, indicating the 
#  likelihood of each token being the next token in the sequence.

#  Example Applications:
#  Text generation
#  Dialog systems
#  Language translation (in combination with an encoder-decoder architecture)


#  BertForTokenClassification:

#  Purpose:
#  BertForTokenClassification is designed for token-level classification tasks, such as Named Entity 
#  Recognition (NER), part-of-speech tagging, and semantic role labeling.
#  It assigns a label to each token in the input sequence.

#  Architecture:
#  BertForTokenClassification is based on the Transformer architecture, specifically the encoder part 
#  of the Transformer.
#  It uses a token-level classification head on top of the pre-trained BERT encoder.

#  Output:
#  The output of BertForTokenClassification is a sequence of logits, one for each token in the input 
#  sequence, indicating the probability of each token belonging to each class.

#  Example Applications:
#  Named Entity Recognition (NER)
#  Part-of-speech tagging
#  Semantic role labeling


In [7]:

# GPT2LMHeadModel stands for "Generative Pre-trained Transformer 2 Language Model with a Language 
# Modeling Head."

# Generative: This refers to the ability of the model to generate new text based on a given prompt or context.
 
# Pre-trained: The model is pre-trained on a large corpus of text data before being fine-tuned on a 
# specific task. 

# Transformer: Transformers are a type of neural network architecture that relies on self-attention 
# mechanisms to capture long-range dependencies in sequences more effectively than traditional recurrent 
# neural networks (RNNs) or convolutional neural networks (CNNs).

# 2: GPT-2 is the second version of the Generative Pre-trained Transformer model developed by OpenAI.

# Language Model: GPT-2 is primarily a language model, which means it is trained to predict the 
# likelihood of a word or token occurring in a sequence given the context of the preceding words. 

# LMHeadModel: the LMHeadModel consists of a linear transformation followed by a softmax activation 
# function, which generates probability distributions over the vocabulary space to predict the next 
# token in a sequence.


In [8]:

from faker import Faker
import random

# Initialize Faker to generate fake data
faker = Faker()

# Define the number of samples to generate
num_samples = 1000

# Generate synthetic data for text generation, summarization, and sentiment analysis
text_data = [faker.text(max_nb_chars=random.randint(100, 500)) for _ in range(num_samples)]
summary_data = [faker.sentence() for _ in range(num_samples)]
sentiment_data = [random.choice(['positive', 'negative', 'neutral']) for _ in range(num_samples)]

# Save synthetic data to files
with open('C:\\Users\\ryan_\\text_generation_data.txt', 'w') as file:
    file.write('\n'.join(text_data))

with open('C:\\Users\\ryan_\\summarization_data.txt', 'w') as file:
    file.write('\n'.join(summary_data))

with open('C:\\Users\\ryan_\\sentiment_analysis_data.txt', 'w') as file:
    file.write('\n'.join(sentiment_data))


In [9]:

import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer, TextDataset, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments

# Load pre-trained GPT-2 model and tokenizer
model_name = "gpt2-medium"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

# Define training dataset
train_dataset = TextDataset(
    tokenizer=tokenizer,
    file_path="C:\\Users\\ryan_\\text_generation_data.txt",  # Path to your training data file
    block_size=128  # Adjust according to your dataset and memory constraints
)

# Define data collator
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)

# Define training arguments
training_args = TrainingArguments(
    output_dir="C:\\Users\\ryan_\\",
    overwrite_output_dir=True,
    num_train_epochs=2,  # Number of training epochs
    per_device_train_batch_size=4,  # Batch size per GPU/CPU
    save_steps=2000,
    save_total_limit=2,
    prediction_loss_only=True,
    logging_dir="C:\\Users\\ryan_\\",
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
)


# Fine-tune the model
trainer.train()


# Save the fine-tuned model
model.save_pretrained("C:\\Users\\ryan_\\gpt2-finetuned")
tokenizer.save_pretrained("C:\\Users\\ryan_\\gpt2-finetuned")


# Finally...
from transformers import pipeline

# Load the fine-tuned model
fine_tuned_model = GPT2LMHeadModel.from_pretrained("C:\\Users\\ryan_\\gpt2-finetuned")

# Create a text generation pipeline
text_generator = pipeline("text-generation", model=fine_tuned_model, tokenizer=tokenizer)


# Example usage
generated_text = text_generator("I am hungry", max_length=50, num_return_sequences=1)

print(generated_text)


Step,Training Loss


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': 'I am hungry when.\nSo far you.\nDoctor today will.\nWear today my.\nValue she we.\nStand away us your.\nSeal rate us.\nTreat someone tonight.\nFight off for choice.'}]


In [None]:

# [{'generated_text': 'I am hungry in whom. Bring along of whom.\nWorry worker thus increase land. Build 
# away throughout purpose.\nKnow evidence upon.\nDoctor claim action throughout. Hear analysis recognize 
# surface. Eat surface truth give.\nDoctor analysis prove.'}]


# This output is what the fine-tuned GPT-2 model generated based on the prompt "I am hungry" and 
# its own learned patterns from the training data.


In [5]:

# Result:

# The prompt was "I am hungry", and the model generated a sequence of text following that prompt.

# Explanation:
# The text generation process involves the model predicting the next word or token based on the input prompt and the 
# context learned during training.

# The model generates text sequentially, word by word, based on the probability distribution of the next token given the 
# preceding context.

# The output is a single sequence of generated text.

# The number of sequences returned (num_return_sequences) was set to 1, so there is one generated sequence.

# The maximum length of the generated text (max_length) was set to 50 tokens, limiting the length of the generated text.


In [None]:

# BERT (Bidirectional Encoder Representations from Transformers) is a transformer-based language model 
# architecture. It consists of multiple layers of self-attention and feed-forward neural networks. 
# BERT utilizes a bidirectional approach to capture contextual information from preceding and following 
# words in a sentence.

# ForSequenceClassification: Sequence classification refers to tasks where the goal is to classify an 
# entire sequence of tokens (such as a sentence or paragraph) into one or more predefined categories or 
# labels. Examples of sequence classification tasks include sentiment analysis, text categorization, natural 
# language inference, and more.


In [2]:

import torch
import pandas as pd
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.preprocessing import LabelEncoder


# Preprocess the data
df = pd.read_csv("C:\\Users\\ryan_\\Desktop\\sentiment_analysis.csv", encoding='latin-1')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

# Convert df.text.values to a list of strings
texts = [str(text) for text in df.text.values]

# Encode the data
encoded_data = tokenizer.batch_encode_plus(texts, add_special_tokens=True, return_attention_mask=True, pad_to_max_length=True, max_length=256, return_tensors='pt')

input_ids = encoded_data['input_ids']
attention_masks = encoded_data['attention_mask']


# Initialize label encoder
label_encoder = LabelEncoder()

# Encode string labels to numerical labels
df['encoded_sentiment'] = label_encoder.fit_transform(df['sentiment'])

# Convert numerical labels to tensor
labels = torch.tensor(df['encoded_sentiment'].values)


# Load the pre-trained BERT model
# num_labels=3: positive, neutral, negative
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3, output_attentions=False, output_hidden_states=False)

# Define the training parameters
batch_size = 24
epochs = 3
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)


# Train the model
for epoch in range(epochs):
    model.train()
    for i in range(0, input_ids.size(0), batch_size):
        optimizer.zero_grad()
        outputs = model(input_ids[i:i+batch_size], attention_mask=attention_masks[i:i+batch_size], labels=labels[i:i+batch_size].long())  # Convert labels to torch.long
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        
# Evaluate the model
model.eval()
with torch.no_grad():
    outputs = model(input_ids, attention_mask=attention_masks)
    predictions = torch.argmax(outputs[0], dim=1).flatten()
    accuracy = torch.sum(predictions == labels) / len(labels)

print("Accuracy:", accuracy.item())


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Accuracy: 0.8629999756813049


In [4]:

# Preprocess new text data
new_text = "This is a great product! I love it."
encoded_new_data = tokenizer(new_text, return_tensors='pt')

# Pass data through the model
model.eval()
with torch.no_grad():
    outputs = model(**encoded_new_data)
    logits = outputs.logits

# Post-process predictions
predicted_class = torch.argmax(logits, dim=1).item()
predicted_sentiment = label_encoder.inverse_transform([predicted_class])[0]

print("Predicted Sentiment:", predicted_sentiment)


Predicted Sentiment: positive


In [5]:

from transformers import pipeline

# Load pre-trained GPT model and tokenizer
generator = pipeline("text-generation", model="gpt2")

# Generate text
generated_text = generator("Once upon a time")

print(generated_text)





Downloading tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': "Once upon a time, I became very comfortable as a person of nature. I can see what's happening in my body, on the ice. When I was in hospital and I was diagnosed with cancer, the first treatments were to try to kill you"}]


In [None]:

# Result:

# The result is a generated text based on the input prompt "Once upon a time" using a pre-trained GPT (Generative Pre-trained 
# Transformer) model.

# Interpretation:

# The generated text starts with the common phrase "Once upon a time," which often indicates the beginning of a story.

# The continuation of the text appears to describe a personal experience or reflection. It mentions becoming comfortable as 
# a person of nature and observing one's body.

# There is a sudden shift in the narrative when it mentions being in the hospital and being diagnosed with cancer. The text 
# describes the initial treatments aimed at addressing the diagnosis.

# Context:
# The context and coherence of the generated text may seem fragmented or nonsensical. This is because the GPT model generates 
# text based on patterns it learned from a large corpus of text data. While it can produce coherent and contextually 
# relevant text, it may also produce unexpected or unrelated sequences, especially when prompted with short or ambiguous input.

# Evaluation:
# The quality of generated text can vary based on factors such as the specific prompt provided, the complexity of the 
# language model, and the intended use case.

# In this case, the generated text appears to include elements of storytelling and personal reflection, but it may not 
# necessarily align with coherent storytelling or factual accuracy.
    

In [1]:

#  Similarities between GPT2LMHeadModel and BertForTokenClassification:
#  Both based on Transformer: Both GPT2LMHeadModel and BertForTokenClassification are based on the 
#  Transformer architecture, which is known for its effectiveness in modeling sequential data.

#  Pre-trained Models: Both models are typically pre-trained on large corpora of text data using 
#  self-supervised learning objectives before being fine-tuned on downstream tasks.

#  Hugging Face Transformers Library: Both models are part of the Transformers library developed by 
#  Hugging Face, providing a consistent interface for working with a wide range of transformer-based models.

#  Differences between GPT2LMHeadModel and BertForTokenClassification:
#  Task: The primary difference between the two models is the task they are designed for. GPT2LMHeadModel 
#  is used for autoregressive language modeling, while BertForTokenClassification is used for token-level 
#  classification tasks.

#  Output: GPT2LMHeadModel outputs a probability distribution over the vocabulary for generating the next 
#  token, whereas BertForTokenClassification outputs a sequence of logits for classifying each token.

#  Architecture Focus: While both models are based on the Transformer architecture, GPT2LMHeadModel 
#  focuses on the decoder part of the Transformer, whereas BertForTokenClassification focuses on the 
#  encoder part.

#  In summary, GPT2LMHeadModel and BertForTokenClassification are both powerful models based on the 
#  Transformer architecture, but they are designed for different tasks and have different output formats. 
#  GPT2LMHeadModel is used for autoregressive language modeling, while BertForTokenClassification is 
#  used for token-level classification tasks such as Named Entity Recognition (NER). NEW is a natural 
#  language processing (NLP) task that involves identifying and classifying named entities in text into 
#  predefined categories such as the names of persons, organizations, locations, dates, quantities, 
#  monetary values, etc.


In [3]:

# NOTICE: Both GPT2LMHeadModel and BertForTokenClassification can be relatively slow during inference, 
# especially when using large models or processing long sequences of text. This is because these models 
# are computationally intensive and require significant computational resources to generate predictions 
# or classify tokens accurately.

# To improve the speed of inference or training, these models can be run on specialized hardware 
# accelerators such as GPUs (Graphics Processing Units) or TPUs (Tensor Processing Units). GPUs and TPUs 
# are optimized for parallel processing tasks like deep learning, allowing them to perform computations 
# much faster than traditional CPUs (Central Processing Units).


In [None]:

# END...
