### Importing Libraies


In [1]:
# Importing the Dataset class from the datasets library
from datasets import Dataset

# Importing the DataCollatorWithPadding class from the transformers library
from transformers import DataCollatorWithPadding

# Importing the Trainer and TrainingArguments classes, GPT2LMHeadModel and GPT2Tokenizer from the transformers library
from transformers import Trainer, TrainingArguments, GPT2LMHeadModel, GPT2Tokenizer

# Importing the PorterStemmer class from the nltk.stem module
from nltk.stem import PorterStemmer

# Importing the string module
import string

# Importing the torch library for tensor computations
import torch

# Importing the pandas library for data manipulation
import pandas as pd

# Importing the SnowballStemmer class from the nltk.stem module
from nltk.stem import SnowballStemmer

# Importing the spacy library for advanced NLP tasks
import spacy

# Importing the train_test_split function from the sklearn.model_selection module
from sklearn.model_selection import train_test_split

# Importing the re module for regular expression operations
import re


import spacy


import gc

# Importing the warnings module to handle warnings
import warnings

# Ignoring any warnings that might be generated when running the code
warnings.filterwarnings('ignore')


2024-03-08 14:54:04.371814: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-08 14:54:04.371896: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-08 14:54:04.373531: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


### Combining Data In one DF

In [2]:

def load_and_preprocess_data(source_file, target_file):
    # Load the data
    with open(source_file, 'r') as f:
        source = f.read().splitlines()

    with open(target_file, 'r') as f:
        target = f.read().splitlines()

    # Combine the data into one DataFrame
    df = pd.DataFrame({
        'source': source,
        'target': target
    })

    # Extract tags and create a new column for them
    df['tag'] = df['source'].apply(lambda x: re.findall(r'\[(.*?)\]', x))

    # Remove the tags from the 'source' column
    df['source'] = df['source'].apply(lambda x: re.sub(r'\[(.*?)\]', '', x))

    # Remove any rows with missing values
    df = df.dropna()
    
    return df


# Load and preprocess the data
df=load_and_preprocess_data('/kaggle/input/writing-prompts/writingPrompts/valid.wp_source','/kaggle/input/writing-prompts/writingPrompts/valid.wp_target')



In [3]:
df

Unnamed: 0,source,target,tag
0,Every person in the world undergoes a `` good...,"Clancy Marguerian , 154 , private first class ...",[ WP ]
1,Space mining is on the rise . The Space tanke...,„… and the little duckling will never be able ...,[ WP ]
2,`` I wo n't have time to explain all of this ...,I wo n't have the time to explain all of this ...,[ WP ]
3,Write about a song . Each sentence must start...,* '' [ Sally ] ( https : //www.youtube.com/wat...,[ CW ]
4,You live in Skyrim . It is your job to keep l...,Light is a marvelous thing . It alone can turn...,[ EU ]
...,...,...,...
15615,You are a teenager with the ability to measur...,I decided to go with a 1-15 scale instead of 1...,[ WP ]
15616,"As your dying wish , you ask that your body i...",The shock hit me hard as my lungs filled with ...,[ WP ]
15617,A young child stumbles upon a serial killer d...,`` Your mommy and daddy did n't raise you righ...,[ WP ]
15618,Write from the perspective of a dog who think...,She wants me to get into the car . It 's just ...,[ WP ]


### Text Normalization: This includes converting all text to lower case, which can help ensure that your algorithm does not treat the same words in different cases as different words.

In [4]:
df['source'] = df['source'].str.lower()
df['target'] = df['target'].str.lower()


In [5]:
df['target'] = df['target'].str.replace('„', '')
df['target'] = df['target'].str.replace('”', '')
df['target'] = df['target'].str.replace('< newlin >', ' ')


### Removing Punctuation: Punctuation can provide less value when training language models, and removing it can reduce the size of the vocabulary your model needs to learn.

In [6]:
df['source'] = df['source'].str.translate(str.maketrans('', '', string.punctuation))
df['target'] = df['target'].str.translate(str.maketrans('', '', string.punctuation))


In [7]:
df

Unnamed: 0,source,target,tag
0,every person in the world undergoes a goodne...,clancy marguerian 154 private first class of...,[ WP ]
1,space mining is on the rise the space tanker...,… and the little duckling will never be able t...,[ WP ]
2,i wo nt have time to explain all of this to ...,i wo nt have the time to explain all of this t...,[ WP ]
3,write about a song each sentence must start ...,sally https wwwyoutubecomwatch v6qyvil0...,[ CW ]
4,you live in skyrim it is your job to keep li...,light is a marvelous thing it alone can turn ...,[ EU ]
...,...,...,...
15615,you are a teenager with the ability to measur...,i decided to go with a 115 scale instead of 11...,[ WP ]
15616,as your dying wish you ask that your body is...,the shock hit me hard as my lungs filled with ...,[ WP ]
15617,a young child stumbles upon a serial killer d...,your mommy and daddy did nt raise you right ...,[ WP ]
15618,write from the perspective of a dog who think...,she wants me to get into the car it s just so...,[ WP ]


In [8]:
df['target'] = df['target'].str.replace('newline', '')
df['source'] = df['source'].str.replace('newline', '')

### Lemmatization: These techniques are used to reduce words to their root form. This can help your model generalize better to variations of words.

In [None]:
nlp = spacy.load('en_core_web_sm')

# Apply lemmatization to each word in the 'source' and 'target' columns of your DataFrame
df['source'] = df['source'].apply(lambda x: ' '.join([token.lemma_ for token in nlp(x)]))
df['target'] = df['target'].apply(lambda x: ' '.join([token.lemma_ for token in nlp(x)]))



In [9]:
train_df, valid_df = train_test_split(df, test_size=0.2, random_state=42)

In [10]:
# Clear GPU memory
torch.cuda.empty_cache()


In [11]:
# Force garbage collection
gc.collect()


287

In [None]:
def load_dataset(train_df, valid_df, tokenizer):
    train_dataset = Dataset.from_pandas(train_df)
    valid_dataset = Dataset.from_pandas(valid_df)

    def tokenize_function(examples):
        tokenized_inputs = tokenizer(examples["source"], truncation=True, padding="max_length")
        tokenized_inputs["labels"] = tokenized_inputs["input_ids"].copy()
        return tokenized_inputs

    train_dataset = train_dataset.map(tokenize_function, batched=True)
    valid_dataset = valid_dataset.map(tokenize_function, batched=True)

    return train_dataset, valid_dataset

training_args = TrainingArguments(
    output_dir="./distilgpt2_story_gen",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=1,
    save_steps=10_000,
    save_total_limit=2,
)

tokenizer = GPT2Tokenizer.from_pretrained("distilgpt2")
tokenizer.pad_token = tokenizer.eos_token

# Make sure to define train_df and valid_df before this line
train_dataset, valid_dataset = load_dataset(train_df, valid_df, tokenizer)

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

model = GPT2LMHeadModel.from_pretrained("distilgpt2")

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    data_collator=data_collator,
)

trainer.train()


  0%|          | 0/13 [00:00<?, ?ba/s]

  0%|          | 0/4 [00:00<?, ?ba/s]

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Step,Training Loss


In [None]:
# Save the trained model
trainer.save_model("./creative_writing_distilgpt2_story_gen")

# Save the tokenizer
tokenizer.save_pretrained("./creative_writing_distilgpt2_story_gen")
