In [1]:
import os
import re

def load_and_preprocess_text(folder_path):
    """Loads and preprocesses text files from a given folder."""
    all_text = ""
    for filename in os.listdir(folder_path):
        if filename.endswith(".txt"):  # Check if it's a text file
            file_path = os.path.join(folder_path, filename)
            try:
                with open(file_path, "r", encoding="utf-8") as file:
                    text = file.read()

                    # Basic preprocessing (remove extra whitespace, etc.)
                    text = re.sub(r'\s+', ' ', text).strip() #Remove extra whitespace.
                    # Add more preprocessing steps as needed (e.g., removing HTML tags, punctuation)
                    all_text += text + "\n"  # Add a newline between files
            except Exception as e:
                print(f"Error processing {filename}: {e}")
    return all_text

folder_path = r"C:\Users\Lenovo\Downloads\End-to-end-LLM-main\End-to-end-LLM-main\annotations\web_text"
processed_text = load_and_preprocess_text(folder_path)

#Save the combined text to a file.
with open("combined_text.txt", "w", encoding = "utf-8") as outfile:
    outfile.write(processed_text)

print("Text files loaded and preprocessed.")

Text files loaded and preprocessed.


In [2]:
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
torch.cuda.is_available()

True

In [4]:
device

device(type='cuda')

In [None]:
print(torch.cuda.is_available())  # Should print True if CUDA is available
print(torch.cuda.get_device_name(0)) #prints the name of the cuda device.

True
NVIDIA GeForce RTX 4050 Laptop GPU


In [7]:
from transformers import AutoTokenizer, TextDataset, DataCollatorForLanguageModeling, Trainer, TrainingArguments, AutoModelForCausalLM, pipeline

In [None]:
import os
import re
import torch
from transformers import AutoTokenizer, TextDataset, DataCollatorForLanguageModeling, Trainer, TrainingArguments, AutoModelForCausalLM, pipeline

# 1. Data Loading and Preprocessing
def load_and_preprocess_text(folder_path):
    all_text = ""
    for filename in os.listdir(folder_path):
        if filename.endswith(".txt"):
            file_path = os.path.join(folder_path, filename)
            try:
                with open(file_path, "r", encoding="utf-8") as file:
                    text = file.read()
                    text = re.sub(r'\s+', ' ', text).strip()
                    all_text += text + "\n"
            except Exception as e:
                print(f"Error processing {filename}: {e}")
    return all_text

folder_path = r"C:\Users\Lenovo\Downloads\End-to-end-LLM-main\End-to-end-LLM-main\annotations\web_text"
processed_text = load_and_preprocess_text(folder_path)
with open("combined_text.txt", "w", encoding="utf-8") as outfile:
    outfile.write(processed_text)
print("Text files loaded and preprocessed.")

# 2. Model Training (Fine-tuning)
model_name = "gpt2" 
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

train_dataset = TextDataset(
    tokenizer=tokenizer,
    file_path="combined_text.txt",
    block_size=128,
)

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

training_args = TrainingArguments(
    output_dir="./results",
    overwrite_output_dir=True,
    num_train_epochs=10,
    per_device_train_batch_size=4,
    save_steps=10_000,
    save_total_limit=2,
)

model = AutoModelForCausalLM.from_pretrained(model_name)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
)

trainer.train()

Text files loaded and preprocessed.




In [11]:
# Save the model locally
save_path = "./local_model"
if not os.path.exists(save_path):
    os.makedirs(save_path)

model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)

print("Model saved locally at:", save_path)

Model saved locally at: ./local_model


In [None]:
import pandas as pd
import re
from transformers import pipeline
import  json

def load_test_data(csv_path):
    try:
        # Assuming there is only one column in your CSV file
        column_names = ['prompt']
        df = pd.read_csv(csv_path, names=column_names, header=None)
        return df
    except Exception as e:
        print(f"Error loading CSV: {e}")

def preprocess_test_data(df, column_name):
    try:
        df[column_name] = df[column_name].apply(lambda x: re.sub(r'\s+', ' ', str(x)).strip())
        return df
    except KeyError:
        print(f"Column '{column_name}' not found in DataFrame.")
        return None

def generate_text(model_name, test_prompts):
    generator = pipeline('text-generation', model=model_name)
    generated_texts = []
    
    for prompt in test_prompts:
        output = generator(prompt, max_length=50, num_return_sequences=1)
        generated_texts.append(output[0]['generated_text'])
    
    return generated_texts

if __name__ == "__main__":
    csv_path = "test_set.csv"
    column_name = 'prompt'
    
    # Load test data
    df = load_test_data(csv_path)
    
    # Check column names
    print("Available column names:", df.columns)
    
    # Preprocess test data
    df = preprocess_test_data(df, column_name)
    
    if df is not None:
        # Generate text
        model_name = "local_model"  # Path to your trained model
        test_prompts = df[column_name].tolist()
        generated_texts = generate_text(model_name, test_prompts)
        a = {}
        for prompt, generated_text in zip(test_prompts, generated_texts):
            a[prompt] = generated_text
            print(f"Prompt: {prompt}")
            print(f"Generated Text: {generated_text}\n")
        with open('data.json', 'w') as file:
            json.dump(a, file, indent=4)
    else:
        print("Preprocessing failed. Please check column names.")


Device set to use cuda:0


Available column names: Index(['prompt'], dtype='object')


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Prompt: Where is the Family Nature Hike held?
Generated Text: Where is the Family Nature Hike held? A: The Family Nature Hike is an annual tradition in Pittsburgh where participants run a race through Schenley Park, featuring diverse heats for visitors to enjoy various outdoor adventures. Q: What are the weather

Prompt: Which festival celebrates Latin-American culture in Pittsburgh?
Generated Text: Which festival celebrates Latin-American culture in Pittsburgh? A: The Pittsburgh Picklesburgh festival, which takes place from September 30 to October 2, 2024, showcases pickle-themed beverages, food, and crafts. Q: What is Pickle Juice

Prompt: What public art project by the Pittsburgh Cultural Trust is named after a sudden meteorological event?
Generated Text: What public art project by the Pittsburgh Cultural Trust is named after a sudden meteorological event? A: The Pittsburgh Cultural Trust is named after Carnegie Museum of Natural History member Dr. Evgeni St. Pierre (1883-1912) who 