# Importing Necessary Libraries

In [1]:
!pip install transformers datasets



In [2]:
import torch
from datasets import load_dataset, DatasetDict
from transformers import AutoTokenizer, GPT2LMHeadModel, AutoConfig, DataCollatorForLanguageModeling, Trainer, TrainingArguments, pipeline
import warnings
warnings.filterwarnings("ignore")

# Preparing the Data

In [3]:
ds_train = load_dataset("huggingface-course/codeparrot-ds-train", split="train")
ds_valid = load_dataset("huggingface-course/codeparrot-ds-valid", split="validation")

raw_datasets = DatasetDict(
    {
        "train": ds_train.shuffle().select(range(50000)),
        "validation": ds_valid.shuffle().select(range(1000))
    }
)

codeparrot-ds-train.jsonl:   0%|          | 0.00/8.25G [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

codeparrot-ds-valid.jsonl:   0%|          | 0.00/46.1M [00:00<?, ?B/s]

Generating validation split:   0%|          | 0/3322 [00:00<?, ? examples/s]

In [4]:
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['repo_name', 'path', 'copies', 'size', 'content', 'license'],
        num_rows: 50000
    })
    validation: Dataset({
        features: ['repo_name', 'path', 'copies', 'size', 'content', 'license'],
        num_rows: 1000
    })
})

In [5]:
raw_datasets["train"][0]

{'repo_name': 'procoder317/scikit-learn',
 'path': 'examples/svm/plot_separating_hyperplane.py',
 'copies': '294',
 'size': '1273',
 'license': 'bsd-3-clause'}

# Preprocessing

In [6]:
tokenizer = AutoTokenizer.from_pretrained("huggingface-course/code-search-net-tokenizer")

tokenizer_config.json:   0%|          | 0.00/265 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/789k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/448k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.34M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/90.0 [00:00<?, ?B/s]

In [7]:
context_length = 128

def tokenize(element):
    outputs = tokenizer(
        element["content"],
        truncation=True,
        max_length=context_length,
        return_overflowing_tokens=True,
        return_length=True,
    )
    input_batch = []
    for length, input_ids in zip(outputs["length"], outputs["input_ids"]):
        if length == context_length:
            input_batch.append(input_ids)
    return {"input_ids": input_batch}

In [8]:
tokenized_datasets = raw_datasets.map(
    tokenize, batched=True, remove_columns=raw_datasets["train"].column_names
)

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [9]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids'],
        num_rows: 1354946
    })
    validation: Dataset({
        features: ['input_ids'],
        num_rows: 29767
    })
})

In [10]:
tokenized_datasets["train"][0]

{'input_ids': [280,
  173,
  6829,
  1750,
  29,
  173,
  46575,
  26,
  7183,
  9047,
  27225,
  41476,
  173,
  6829,
  1750,
  29,
  173,
  173,
  7465,
  256,
  2759,
  9047,
  27225,
  41476,
  2709,
  231,
  2030,
  13,
  692,
  173,
  4559,
  463,
  1918,
  1312,
  231,
  14387,
  10717,
  22186,
  11459,
  461,
  173,
  5349,
  4044,
  14,
  173,
  280,
  173,
  1647,
  4508,
  1276,
  3485,
  173,
  173,
  2745,
  1601,
  442,
  635,
  173,
  2745,
  4855,
  14,
  11032,
  442,
  2564,
  173,
  973,
  15673,
  978,
  36697,
  173,
  173,
  3,
  649,
  969,
  8102,
  2756,
  463,
  2157,
  173,
  1075,
  14,
  2437,
  14,
  4544,
  8,
  16,
  9,
  173,
  56,
  233,
  635,
  14,
  82,
  7285,
  1075,
  14,
  2437,
  14,
  16670,
  8,
  2368,
  12,
  554,
  9,
  415,
  404,
  18,
  12,
  554,
  547,
  635,
  14,
  2437,
  14,
  16670,
  8,
  2368,
  12,
  554,
  9,
  382,
  404,
  18,
  12,
  554,
  1730,
  173,
  57]}

# Initializing and Training a new Model

In [11]:
config = AutoConfig.from_pretrained(
    "gpt2",
    vocab_size=len(tokenizer),
    n_ctx=context_length,
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
)

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

In [12]:
model = GPT2LMHeadModel(config)
model_size = sum(t.numel() for t in model.parameters())
print(f"GPT-2 size: {model_size/1000**2:.1f}M parameters")

GPT-2 size: 124.2M parameters


In [13]:
tokenizer.pad_token = tokenizer.eos_token
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

In [14]:
model_name = "gpt2-clm-from-scratch"

In [15]:
args = TrainingArguments(
    output_dir=f"./{model_name}",
    report_to="none",
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    evaluation_strategy="steps",
    eval_steps=5_000,
    logging_steps=5_000,
    gradient_accumulation_steps=8,
    num_train_epochs=1,
    weight_decay=0.1,
    warmup_steps=1_000,
    lr_scheduler_type="cosine",
    learning_rate=5e-4,
    save_steps=5_000,
    fp16=True,
)

trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
)

In [16]:
trainer.train()

Step,Training Loss,Validation Loss


TrainOutput(global_step=2646, training_loss=2.8212191235000943, metrics={'train_runtime': 41536.5358, 'train_samples_per_second': 32.621, 'train_steps_per_second': 0.064, 'total_flos': 8.8496475734016e+16, 'train_loss': 2.8212191235000943, 'epoch': 0.9998110712261478})

In [17]:
trainer.save_model(f"./{model_name}")

# Testing the Model

In [18]:
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [19]:
pipe = pipeline(
    "text-generation", model=f"./{model_name}", device=device
)

In [20]:
txt = """\
# create some data
x = np.random.randn(100)
y = np.random.randn(100)

# create scatter plot with x, y
"""
print(pipe(txt, num_return_sequences=1)[0]["generated_text"])

# create some data
x = np.random.randn(100)
y = np.random.randn(100)

# create scatter plot with x, y
x, y = np.mean(x, 0), np


In [21]:
txt = """\
# create some data
x = np.random.randn(100)
y = np.random.randn(100)

# create dataframe from x and y
"""
print(pipe(txt, num_return_sequences=1)[0]["generated_text"])

# create some data
x = np.random.randn(100)
y = np.random.randn(100)

# create dataframe from x and y
df = pd.DataFrame(y, columns=['X', 'Y


In [22]:
txt = """\
# dataframe with profession, income and name
df = pd.DataFrame({'profession': x, 'income':y, 'name': z})

# calculate the mean income per profession
"""
print(pipe(txt, num_return_sequences=1)[0]["generated_text"])

# dataframe with profession, income and name
df = pd.DataFrame({'profession': x, 'income':y, 'name': z})

# calculate the mean income per profession
# and the mean income


In [23]:
txt = """
# import random forest regressor from scikit-learn
from sklearn.ensemble import RandomForestRegressor

# fit random forest model with 300 estimators on X, y:
"""
print(pipe(txt, num_return_sequences=1)[0]["generated_text"])


# import random forest regressor from scikit-learn
from sklearn.ensemble import RandomForestRegressor

# fit random forest model with 300 estimators on X, y:

forest = RandomForestRegressor(n_estimators=100, n_
