In [3]:
import zipfile
import os

def extract_zip(zip_path, extract_to='.'):
    """Unpack every member of a ZIP archive into a directory.

    Args:
        zip_path: Path of the ZIP archive to read.
        extract_to: Directory that receives the extracted members
            (defaults to the current working directory).
    """
    archive = zipfile.ZipFile(zip_path, mode='r')
    try:
        archive.extractall(path=extract_to)
    finally:
        # Release the archive's file handle even if extraction fails part-way.
        archive.close()

# Example usage: unpack the archive at `zip_path` into the `extract_to` directory.
# `zip_path` and `extract_to` are notebook-level names; the text-loading cell
# further down assigns them again with the same values.
zip_path = '/content/PDF Files.zip'
extract_to = '/content/extracted_files'
extract_zip(zip_path, extract_to)

In [2]:
!pip install PyPDF2

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 232.6/232.6 kB 3.5 MB/s eta 0:00:00
Installing collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1


In [None]:
import PyPDF2
import os

def read_file(file_path):
    """Return the text content of a ``.pdf`` or ``.txt`` file.

    The file type is chosen from the extension, compared case-insensitively so
    that names such as ``REPORT.PDF`` or ``Notes.TXT`` are not silently skipped.

    Args:
        file_path: Path of the file to read.

    Returns:
        For PDFs, the text of all pages concatenated in page order (no
        separator is inserted between pages); for text files, the file content
        decoded as UTF-8; for any other extension, an empty string.
    """
    lowered_path = file_path.lower()
    if lowered_path.endswith('.pdf'):
        with open(file_path, 'rb') as file:
            reader = PyPDF2.PdfReader(file)
            # `or ''` keeps the join safe if a page yields no text at all
            # (e.g. None instead of a string).
            return ''.join(page.extract_text() or '' for page in reader.pages)
    if lowered_path.endswith('.txt'):
        with open(file_path, 'r', encoding='utf-8') as file:
            return file.read()
    # Unsupported file type: nothing to extract.
    return ''

def load_text_from_extracted_files(extracted_to):
    """Read every file under a directory tree into a list of texts.

    Directories and files are visited in sorted order so the returned list is
    the same on every run, regardless of the order in which the filesystem
    reports entries. Files for which ``read_file`` returns no text (empty or
    whitespace-only) are left out so they do not become blank entries.

    Args:
        extracted_to: Root directory to walk recursively.

    Returns:
        A list with one string per file that produced text.
    """
    all_texts = []
    for root, dirs, files in os.walk(extracted_to):
        # Sorting in place steers the order in which os.walk descends.
        dirs.sort()
        for file in sorted(files):
            text = read_file(os.path.join(root, file))
            if text.strip():
                all_texts.append(text)
    return all_texts

# 3. Process the ZIP and load text
zip_path = '/content/PDF Files.zip'
extract_to = '/content/extracted_files'
extract_zip(zip_path, extract_to)

# One entry per extracted file; this list feeds the dataset built later.
all_texts = load_text_from_extracted_files(extract_to)

# Single-string view of the same corpus. It is derived from `all_texts`
# instead of walking the directory and parsing every file a second time.
extracted_to = '/content/extracted_files'
all_text = ''.join(all_texts)

In [None]:
!pip install datasets

In [None]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments
from datasets import Dataset

# 5. Build a dataset with one row per loaded document, then tokenize it
data = {'text': all_texts}
dataset = Dataset.from_dict(data)

tokenizer = GPT2Tokenizer.from_pretrained('distilgpt2')

# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

def tokenize_function(examples):
    """Tokenize a batch of rows, padding or truncating each text to 512 tokens.

    Args:
        examples: Batch mapping with a 'text' entry holding the row texts.

    Returns:
        The tokenizer output for the batch.
    """
    texts = examples['text']
    return tokenizer(
        texts,
        max_length=512,
        truncation=True,
        padding="max_length",
    )

tokenized_dataset = dataset.map(tokenize_function, batched=True)

# 6. Expose only the model inputs, as torch tensors
tokenized_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask'])


In [None]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments

# Define the model: load the 'distilgpt2' checkpoint into a GPT-2
# language-modeling-head model. This notebook-level `model` is the object
# handed to the Trainer in the next cell.
model = GPT2LMHeadModel.from_pretrained('distilgpt2')

In [None]:
# Imported in this cell so the fix does not depend on the import cells above.
from transformers import DataCollatorForLanguageModeling

training_args = TrainingArguments(
    output_dir='./results',
    # No evaluation strategy is set: the Trainer below receives no
    # eval_dataset, and asking for per-epoch evaluation without one makes the
    # Trainer raise (at construction or at the first evaluation, depending on
    # the installed transformers version).
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    num_train_epochs=200,
    weight_decay=0.01,
)

# The tokenized dataset only carries input_ids and attention_mask. For causal
# language modeling (mlm=False) this collator adds the `labels` the model
# needs in order to return a loss.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator,
)

# Before training, print a sample from the dataset for verification
print(tokenized_dataset[0])

# Run the fine-tuning; without this call the model saved afterwards would
# still hold the unmodified pretrained weights.
trainer.train()


In [None]:
# Persist the in-memory model and tokenizer side by side in one directory so
# both can be reloaded from that path with from_pretrained.
model.save_pretrained('./fine-tuned-distilgpt')
tokenizer.save_pretrained('./fine-tuned-distilgpt')

In [None]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Load the fine-tuned DistilGPT-2 model and tokenizer from the directory
# written by the save cell. This rebinds the notebook-level `model` and
# `tokenizer` names, so the generation cell uses the copies read from disk.
model = GPT2LMHeadModel.from_pretrained('./fine-tuned-distilgpt')
tokenizer = GPT2Tokenizer.from_pretrained('./fine-tuned-distilgpt')

In [None]:
input_text = "What are the standards that talk about EMC?"

# Calling the tokenizer (instead of .encode) also yields the attention mask,
# which is passed to generate() explicitly below.
encoded_prompt = tokenizer(input_text, return_tensors="pt")
input_ids = encoded_prompt["input_ids"]

# Generate text
output = model.generate(
    input_ids,
    attention_mask=encoded_prompt["attention_mask"],
    # Cap the total length (prompt + continuation) at the model's configured
    # maximum number of positions; the previous value of 5,000,000 is far
    # beyond what the model can index.
    max_length=model.config.n_positions,
    num_return_sequences=1,
    no_repeat_ngram_size=2,
    # temperature only takes effect when sampling is enabled.
    do_sample=True,
    temperature=0.7,
    # Pad explicitly with end-of-sequence instead of relying on a fallback.
    pad_token_id=tokenizer.eos_token_id,
)
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
print(generated_text)