<a href="https://colab.research.google.com/github/agupta-123/amank/blob/main/PlagiarismCheckerAI1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [7]:

# Create dataset.txt in the current working directory
with open("dataset.txt", "w") as f:
    f.write("""The cat sleeps on the mat.
    The dog runs in the park.
The bird sings a sweet song.
The sun shines brightly today.
The moon glows at night.
The rain falls gently on the ground.
The wind blows through the trees.
The flowers bloom in spring.
The stars twinkle in the sky.
The river flows quietly downstream.""")

In [9]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer, TextDataset, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments

# Load tokenizer and model (using a small GPT-2 for this example)
tokenizer = GPT2Tokenizer.from_pretrained("distilgpt2")
model = GPT2LMHeadModel.from_pretrained("distilgpt2")

# Load dataset (replace 'dataset.txt' with your dataset file)
train_dataset = TextDataset(
    tokenizer=tokenizer,
    file_path="dataset.txt",
    block_size=128
)

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)

# Set training arguments
training_args = TrainingArguments(
    output_dir="./slm_output",
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_device_train_batch_size=4,
    save_steps=10_000,
    save_total_limit=2,
)

# Initialize trainer
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
)

# Train the model
trainer.train()

# Generate output
input_text = "Example prompt: "
inputs = tokenizer(input_text, return_tensors="pt")
outputs = model.generate(inputs["input_ids"], max_length=50)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))



ValueError: num_samples should be a positive integer value, but got num_samples=0

In [20]:
with open("dataset.txt", "w") as f:
    f.write("""The cat sleeps on the mat every morning.
The dog runs in the park with joy.
The bird sings a sweet song at dawn.
The sun shines brightly today in the sky.
The moon glows at night with a soft light.
The rain falls gently on the ground below.
The wind blows through the trees with force.
The flowers bloom in spring with vibrant colors.
The stars twinkle in the sky on clear nights.
The river flows quietly downstream through the valley.
A child plays with a ball in the yard.
The teacher writes on the board during class.
The car drives down the road at high speed.
The airplane flies above the clouds in the sky.
The fish swims in the ocean with other sea creatures.
The cat sleeps on the mat every morning.
The dog runs in the park with joy.
The bird sings a sweet song at dawn.
The sun shines brightly today in the sky.
The moon glows at night with a soft light.
A child plays with a ball in the yard.
The teacher writes on the board during class.
The car drives down the road at high speed.
The airplane flies above the clouds in the sky.
The fish swims in the ocean with other sea creatures.
The wind blows through the trees with force.
The flowers bloom in spring with vibrant colors.
The stars twinkle in the sky on clear nights.
The river flows quietly downstream through the valley.
A child plays with a ball in the yard.
The teacher writes on the board during class.
The sun shines brightly today in the sky.
The moon glows at night with a soft light.
The rain falls gently on the ground below.
The wind blows through the trees with force.
The flowers bloom in spring with vibrant colors.
The stars twinkle in the sky on clear nights.
The river flows quietly downstream through the valley.
A child plays with a ball in the yard.
The teacher writes on the board during class.
The car drives down the road at high speed.
The airplane flies above the clouds in the sky.
The fish swims in the ocean with other sea creatures.
The cat sleeps on the mat every morning.
The dog runs in the park with joy.
The bird sings a sweet song at dawn.""")

In [21]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer, DataCollatorForLanguageModeling, Trainer, TrainingArguments
from datasets import load_dataset
import os

# Print the current working directory to verify
print("Current working directory:", os.getcwd())

# Load tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained("distilgpt2")
model = GPT2LMHeadModel.from_pretrained("distilgpt2")

# Set pad_token to eos_token to avoid warnings
tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = tokenizer.eos_token_id

# Load dataset using the Datasets library
dataset = load_dataset("text", data_files={"train": "dataset.txt"})

# Tokenize the dataset
def tokenize_function(examples):
    return tokenizer(examples["text"], truncation=True, padding="max_length", max_length=32)

tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=["text"])

# Prepare dataset for training
train_dataset = tokenized_dataset["train"]

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)

# Set training arguments with more epochs
training_args = TrainingArguments(
    output_dir="./slm_output",
    overwrite_output_dir=True,
    num_train_epochs=5,  # Ensure 5 epochs
    per_device_train_batch_size=4,
    save_steps=10_000,
    save_total_limit=2,
)

# Initialize trainer
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
)

# Train the model
trainer.train()

# Generate output with improved parameters
input_text = "The sun shines"
inputs = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True, max_length=32)
outputs = model.generate(
    inputs["input_ids"],
    attention_mask=inputs["attention_mask"],
    max_length=50,
    pad_token_id=tokenizer.eos_token_id,
    do_sample=True,
    top_k=50,
    top_p=0.95,
    num_return_sequences=1
)
print("Generated text:", tokenizer.decode(outputs[0], skip_special_tokens=True))

Current working directory: /content


Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/46 [00:00<?, ? examples/s]

Step,Training Loss


Generated text: The sun shines today in the sky.












































In [5]:
with open("dataset1.txt", "w") as f:
    f.write("""The cat sleeps on the mat every morning.
The dog runs in the park with joy.
The bird sings a sweet song at dawn.
The sun shines brightly today in the sky.
The moon glows at night with a soft light.
The rain falls gently on the ground below.
The wind blows through the trees with force.
The flowers bloom in spring with vibrant colors.
The stars twinkle in the sky on clear nights.
The river flows quietly downstream through the valley.
A child plays with a ball in the yard.
The teacher writes on the board during class.
The car drives down the road at high speed.
The airplane flies above the clouds in the sky.
The fish swims in the ocean with other sea creatures.
The cat sleeps on the mat every morning.
The dog runs in the park with joy.
The bird sings a sweet song at dawn.
The sun shines brightly today in the sky.
The moon glows at night with a soft light.
A child plays with a ball in the yard.
The teacher writes on the board during class.
The car drives down the road at high speed.
The airplane flies above the clouds in the sky.
The fish swims in the ocean with other sea creatures.
The wind blows through the trees with force.
The flowers bloom in spring with vibrant colors.
The stars twinkle in the sky on clear nights.
The river flows quietly downstream through the valley.
A child plays with a ball in the yard.
The teacher writes on the board during class.
The sun shines brightly today in the sky.
The moon glows at night with a soft light.
The rain falls gently on the ground below.
The wind blows through the trees with force.
The flowers bloom in spring with vibrant colors.
The stars twinkle in the sky on clear nights.
The river flows quietly downstream through the valley.
A child plays with a ball in the yard.
The teacher writes on the board during class.
The car drives down the road at high speed.
The airplane flies above the clouds in the sky.
The fish swims in the ocean with other sea creatures.
The cat sleeps on the mat every morning.
The dog runs in the park with joy.
The bird sings a sweet song at dawn.""")

In [6]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer, DataCollatorForLanguageModeling, Trainer, TrainingArguments
from datasets import load_dataset
import os

# Print the current working directory to verify
print("Current working directory:", os.getcwd())

# Load tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained("distilgpt2")
model = GPT2LMHeadModel.from_pretrained("distilgpt2")

# Set pad_token to eos_token to avoid warnings
tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = tokenizer.eos_token_id

# Load dataset using the Datasets library
dataset = load_dataset("text", data_files={"train": "dataset1.txt"})

# Tokenize the dataset
def tokenize_function(examples):
    return tokenizer(examples["text"], truncation=True, padding="max_length", max_length=32)

tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=["text"])

# Prepare dataset for training
train_dataset = tokenized_dataset["train"]

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)

# Set training arguments with more epochs
training_args = TrainingArguments(
    output_dir="./slm_output",
    overwrite_output_dir=True,
    num_train_epochs=5,  # Ensure 5 epochs
    per_device_train_batch_size=4,
    save_steps=10_000,
    save_total_limit=2,
)

# Initialize trainer
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
)

# Train the model
trainer.train()

# Generate output with improved parameters
input_text = "The sun set"
inputs = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True, max_length=32)
outputs = model.generate(
    inputs["input_ids"],
    attention_mask=inputs["attention_mask"],
    max_length=50,
    pad_token_id=tokenizer.eos_token_id,
    do_sample=True,
    top_k=50,
    top_p=0.95,
    num_return_sequences=1
)
print("Generated text:", tokenizer.decode(outputs[0], skip_special_tokens=True))

Current working directory: /content


Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/46 [00:00<?, ? examples/s]

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33maman-gupta-7238[0m ([33maman-gupta-7238-rbmi[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss


Generated text: The sun set in the sky on clear nights.










































In [2]:
pip install datasets

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.12.0-py3-none-any.wh