In [None]:
import pandas as pd
from datasets import load_dataset, Dataset
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments, AutoTokenizer
import torch

# Check if CUDA is available
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Training on: {device}")

# Load the dataset from CSV
df = pd.read_csv('association_dataset.csv')

# Combine all text data into a single column for fine-tuning
text_data = df['Content'].tolist()

# Convert text data into a dataset format suitable for transformers
dataset = Dataset.from_dict({"text": text_data})

# Load the GPT-2 tokenizer and model
model_name = 'gpt2'
tokenizer = AutoTokenizer.from_pretrained(model_name, clean_up_tokenization_spaces=True)

# Add padding token if needed (GPT-2 does not have a padding token by default)
tokenizer.pad_token = tokenizer.eos_token

model = GPT2LMHeadModel.from_pretrained(model_name).to(device)

# Tokenize the dataset
def tokenize_function(examples):
    return tokenizer(examples['text'], padding='max_length', truncation=True, max_length=512)

tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=["text"])

# Define training arguments
training_args = TrainingArguments(
    per_device_train_batch_size=4,
    num_train_epochs=3,
    logging_dir='./logs',
    logging_steps=10,
    save_steps=10_000,
    output_dir='./model',
    overwrite_output_dir=True,
    no_cuda=(device == "cpu"),  # Set this to True if CUDA is not available
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
)

# Train the model
trainer.train()

# Save the trained model
model.save_pretrained('./model')
tokenizer.save_pretrained('./model')


In [15]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Load the tokenizer and model from the directory where you saved them
model_dir = "D:/stud bot/model"  # Replace with the path to the downloaded model folder

tokenizer = GPT2Tokenizer.from_pretrained(model_dir)
model = GPT2LMHeadModel.from_pretrained(model_dir)

# Generate text to evaluate the model
input_text = "What are the main goals of the student association?"
inputs = tokenizer(input_text, return_tensors="pt")

# Generate text with the model
outputs = model.generate(
    inputs['input_ids'],
    max_length=100,
    num_return_sequences=1,
    do_sample=True, 
    top_k=50,
    top_p=0.95
)

generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(generated_text)


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


What are the main goals of the student association?


Our main objectives are to:

Recruit top students from diverse disciplines within the community. We will ensure that our students and instructors are willing to mentor and mentor other students.


Develop new initiatives and policies aimed at improving student success.


Develop an impact statement regarding the specific objectives of the student association.


Work with and participate in local leadership and other community initiatives.


Work with local, state and federal law
