In [1]:
import pandas as pd
import transformers
from datasets import Dataset

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the raw CSV and reduce it to the working training frame.
file_path = "filtered_final.csv"  # Update this path if necessary
RANDOM_SEED = 42          # fixed so the shuffle, and therefore the kept subset, is reproducible
MAX_TRAIN_ROWS = 100_000  # upper bound on the number of examples kept

train_df_doc = (
    pd.read_csv(file_path)
    .dropna()                                   # drop rows with a null in any column
    .sample(frac=1, random_state=RANDOM_SEED)   # shuffle all rows deterministically
    .reset_index(drop=True)                     # discard the pre-shuffle index
    .head(MAX_TRAIN_ROWS)                       # keep at most MAX_TRAIN_ROWS rows
)

In [3]:
# Sanity check: report the size of the frame produced by the loading cell.
n_rows = len(train_df_doc)
print(f"Number of rows in training set with docstring: {n_rows}")

Number of rows in training set with docstring: 100000


In [4]:
# Build the two training-text variants, one per row. Both wrap the sample in
# <|im_start|> ... <|im_end|>; they differ only in whether the triple-quoted
# docstring is placed in front of the function signature and body.
train_df_doc['input_text_doc'] = (
    '<|im_start|>' + '"""' + train_df_doc['docstring'] + '\n"""' + '\n'
    + train_df_doc['function_signature'] + '\n' + train_df_doc['function_body'] + '<|im_end|>'
)
train_df_doc['input_text'] = (
    '<|im_start|>'
    + train_df_doc['function_signature'] + '\n' + train_df_doc['function_body'] + '<|im_end|>'
)

# Plain Python lists handed to the tokenizer by the per-model cells.
# The name must match the content: the cells labelled "with docstring" read
# `trainingSetDocInputs`, and the cells labelled "without docstring" read
# `trainingSetInputs`.
trainingSetDocInputs = train_df_doc['input_text_doc'].tolist()  # docstring + function
trainingSetInputs = train_df_doc['input_text'].tolist()         # function only


In [5]:
def makeDataset(input_text, tokenizer, max_length=1024):
    """Tokenize texts into a fixed-length dataset with causal-LM labels.

    Args:
        input_text: Text (or list of texts) passed unchanged to ``tokenizer``.
        tokenizer: Callable tokenizer exposing ``pad_token`` and ``eos_token``.
            Side effect: if it has no pad token, ``pad_token`` is set to its
            ``eos_token``.
        max_length: Length every sequence is padded or truncated to. Defaults
            to 1024, the value that used to be hard-coded.

    Returns:
        A ``Dataset`` with the columns ``input_ids``, ``attention_mask`` and
        ``labels``. ``labels`` is a copy of ``input_ids`` in which every
        position with ``attention_mask == 0`` is replaced by -100.
    """
    # Set padding token if not already set
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    # Pad/truncate every sample to exactly `max_length` tokens.
    input_encodings = tokenizer(
        input_text,
        padding='max_length',
        truncation=True,
        return_tensors='pt',
        max_length=max_length,
    )

    input_ids = input_encodings['input_ids']
    attention_mask = input_encodings['attention_mask']

    # Labels mirror the inputs, but padded positions get -100, the index that
    # PyTorch / Transformers losses ignore, so the model is not trained to
    # emit padding. The mask is taken from attention_mask rather than from the
    # pad token id because the pad token may be the EOS token, and a genuine
    # EOS must stay a training target.
    labels = input_ids.clone()
    labels[attention_mask == 0] = -100

    return Dataset.from_dict({
        'input_ids': input_ids,
        'attention_mask': attention_mask,
        'labels': labels
    })

# 135M

In [6]:
# SmolLM-135M, variant labelled "without docstring":
# tokenize `trainingSetInputs` with this checkpoint's tokenizer and store the
# resulting dataset on disk.
model_name = 'HuggingFaceTB/SmolLM-135M'
tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)
trainingSet = makeDataset(input_text=trainingSetInputs, tokenizer=tokenizer)
trainingSet.save_to_disk("./datasets/135MTrainSet")


Saving the dataset (3/3 shards): 100%|██████████| 100000/100000 [00:01<00:00, 93305.56 examples/s]


In [7]:
# SmolLM-135M, variant labelled "with docstring":
# tokenize `trainingSetDocInputs` with this checkpoint's tokenizer and store
# the resulting dataset on disk.
model_name = 'HuggingFaceTB/SmolLM-135M'
tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)
trainingSetDoc = makeDataset(input_text=trainingSetDocInputs, tokenizer=tokenizer)
trainingSetDoc.save_to_disk("./datasets/135MDocTrainSet")

Saving the dataset (3/3 shards): 100%|██████████| 100000/100000 [00:00<00:00, 119261.02 examples/s]


# 360M

In [8]:
# SmolLM-360M, variant labelled "without docstring":
# tokenize `trainingSetInputs` with this checkpoint's tokenizer and store the
# resulting dataset on disk.
model_name = 'HuggingFaceTB/SmolLM-360M'
tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)
trainingSet = makeDataset(input_text=trainingSetInputs, tokenizer=tokenizer)
trainingSet.save_to_disk("./datasets/360MTrainSet")

Saving the dataset (3/3 shards): 100%|██████████| 100000/100000 [00:01<00:00, 92081.74 examples/s]


In [9]:
# SmolLM-360M, variant labelled "with docstring":
# tokenize `trainingSetDocInputs` with this checkpoint's tokenizer and store
# the resulting dataset on disk.
model_name = 'HuggingFaceTB/SmolLM-360M'
tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)
trainingSetDoc = makeDataset(input_text=trainingSetDocInputs, tokenizer=tokenizer)
trainingSetDoc.save_to_disk("./datasets/360MDocTrainSet")

Saving the dataset (3/3 shards): 100%|██████████| 100000/100000 [00:00<00:00, 122069.38 examples/s]


# 1.7B

In [10]:
# SmolLM-1.7B, variant labelled "without docstring":
# tokenize `trainingSetInputs` with this checkpoint's tokenizer and store the
# resulting dataset on disk.
model_name = 'HuggingFaceTB/SmolLM-1.7B'
tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)
trainingSet = makeDataset(input_text=trainingSetInputs, tokenizer=tokenizer)
trainingSet.save_to_disk("./datasets/1.7BTrainSet")

Saving the dataset (3/3 shards): 100%|██████████| 100000/100000 [00:01<00:00, 59057.79 examples/s]


In [11]:
# SmolLM-1.7B, variant labelled "with docstring":
# tokenize `trainingSetDocInputs` with this checkpoint's tokenizer and store
# the resulting dataset on disk.
model_name = 'HuggingFaceTB/SmolLM-1.7B'
tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)
trainingSetDoc = makeDataset(input_text=trainingSetDocInputs, tokenizer=tokenizer)
trainingSetDoc.save_to_disk("./datasets/1.7BDocTrainSet")

Saving the dataset (3/3 shards): 100%|██████████| 100000/100000 [00:01<00:00, 79562.96 examples/s]
