In [7]:
import torch
import torch.nn as nn
import torch.optim as optim
import json
from datasets import load_dataset

In [5]:
def gopher_rules_pass(sample) -> bool:
    """Return True if ``sample`` passes the Gopher-style quality rules below.

    Args:
        sample: Mapping with a ``"quality_signals"`` entry holding a JSON string.
            Each signal decodes to a list of entries, and the value of an entry
            is read from its third element (index 2).
            NOTE(review): the first two elements are presumably character
            offsets of the span the value refers to -- confirm against the
            dataset documentation.

    Returns:
        False as soon as one rule is violated, True if all rules hold.

    Raises:
        KeyError: if ``sample`` or the decoded signals lack a required key.
    """
    signals = json.loads(sample["quality_signals"])

    # rule 1: number of words between 50 and 10'000
    word_count = signals["rps_doc_word_count"][0][2]
    if word_count < 50 or word_count > 10_000:
        return False

    # rule 2: mean word length between 3 and 10
    mean_word_length = signals["rps_doc_mean_word_length"][0][2]
    if mean_word_length < 3 or mean_word_length > 10:
        return False

    # rule 3: symbol to word ratio of at most 0.1
    symbol_word_ratio = signals["rps_doc_symbol_to_word_ratio"][0][2]
    if symbol_word_ratio > 0.1:
        return False

    # rule 4: at most 90% of the lines may start with a bullet point
    n_lines = signals["ccnet_nlines"][0][2]
    if n_lines <= 0:
        # A document without lines cannot be assessed; reject it instead of
        # dividing by zero below.
        return False
    n_lines_bulletpoint_start = sum(
        entry[2] for entry in signals["rps_lines_start_with_bulletpoint"]
    )
    if n_lines_bulletpoint_start / n_lines > 0.9:
        return False

    # rule 5: the ratio between characters in the most frequent 2-gram and the
    # total number of characters must be at most 0.2
    top_2_gram_frac = signals["rps_doc_frac_chars_top_2gram"][0][2]
    if top_2_gram_frac > 0.2:
        return False

    # Further Gopher rules are not implemented here.

    return True

In [6]:


# Stream the dataset instead of downloading it; samples are produced lazily
# while iterating over the "train" split.
ds_iterator = load_dataset(
    "togethercomputer/RedPajama-Data-V2",
    name="default",
    partition="head_middle",
    # snapshots=["2023-06", "2022-49"],
    languages=["en", "de", "fr", "es", "it"],
    streaming=True,
)

# Scan for the first sample that passes the quality filter, show it and stop.
# `sample` is deliberately left bound to that document after the loop.
for sample in ds_iterator["train"]:
    if not gopher_rules_pass(sample):
        continue

    # The document text is a plain string under "raw_content"; only
    # "quality_signals" is JSON-encoded, so there is nothing to decode here.
    print(sample["raw_content"])
    break
    

NameError: name 'json' is not defined

In [10]:
# Show the text of the document that `sample` is currently bound to.
# NOTE(review): relies on `sample` still being set by the streaming loop in an
# earlier cell -- run that cell first on a fresh kernel.
print(sample["raw_content"])

ABOUT AWB
KIDS ARE KIDS
JOIN THE CAST
<< Back to AWB News
Christine Rouse is honored on the “Today Show”
The executive director of Acting Without Boundaries (AWB), Christine Rouse, was featured on the NBC Today Show with “Kathie Lee and Hoda” on March 1, 2012. The monthly segment, called “Everyone Has A Story,” features one ordinary person that has had a life-changing experience in their own life. Christine submitted an essay describing her life’s mission of increasing awareness of and support for people with disabilities. She described the process of creating the two non-profits she manages – “Kids are Kids,” which provides disability awareness workshops and AWB which provides theater arts opportunities for children, youth and young adults with physical disabilities . Christine talked about the importance of both in increasing inclusion for people, especially young people, with physical disabilities.
The March “Everyone Has A Story” segment featured Christine, her mother, and her brot

In [None]:
class Distiller:
    """Container for the settings, models and data of a distillation run.

    Args:
        params: Hyper-parameters, either as a mapping (anything with ``.get``)
            or as an attribute-style object such as ``argparse.Namespace``.
            ``n_epoch`` and ``batch_size`` are required; every other entry
            falls back to the default listed in ``__init__``.
        student_model: Model to be trained; stored as-is.
        teacher_model: Model providing the targets; stored as-is.
        dataset: Training data; stored as-is.

    Raises:
        KeyError: if ``n_epoch`` or ``batch_size`` is missing from ``params``.
        AssertionError: if ``word_rand + word_keep + word_mask`` is not 1.
    """

    def __init__(self, params, student_model, teacher_model, dataset):
        # Normalise to a mapping so that every option is read the same way,
        # regardless of whether a dict or a namespace object was passed in.
        cfg = params if hasattr(params, "get") else vars(params)

        # Loss weights. NOTE(review): presumably one weight per loss term
        # (distillation CE, MLM, CLM, MSE, cosine) -- confirm once the
        # training step exists.
        self.alpha_ce = cfg.get("alpha_ce", 0.5)
        self.alpha_mlm = cfg.get("alpha_mlm", 0.0)
        self.alpha_clm = cfg.get("alpha_clm", 0.5)
        self.alpha_mse = cfg.get("alpha_mse", 0.0)
        # Defaults to 0.0 so that it behaves like the other optional weights.
        self.alpha_cos = cfg.get("alpha_cos", 0.0)

        self.temperature = cfg.get("temperature", 2.0)

        # Masking options; the three word_* values must form a distribution.
        self.mlm_mask_prob = cfg.get("mask_prob", 0.15)
        self.word_rand = cfg.get("word_rand", 0.1)
        self.word_keep = cfg.get("word_keep", 0.1)
        self.word_mask = cfg.get("word_mask", 0.8)
        # Compare with a tolerance: sums of decimal fractions are rarely exact
        # in binary floating point.
        word_prob_total = self.word_rand + self.word_keep + self.word_mask
        assert abs(word_prob_total - 1.0) < 1e-9, (
            f"word_rand + word_keep + word_mask must sum to 1, got {word_prob_total}"
        )

        # Required settings: there is no sensible default for these.
        self.n_epoch = cfg["n_epoch"]
        self.batch_size = cfg["batch_size"]
        self.gradient_accumulation_steps = cfg.get("gradient_accumulation_steps", 50)

        # Optimisation settings.
        self.warmup_prop = cfg.get("warmup_prop", 0.05)
        self.weight_decay = cfg.get("weight_decay", 0.0)
        self.learning_rate = cfg.get("learning_rate", 5e-4)
        self.adam_epsilon = cfg.get("adam_epsilon", 1e-6)
        self.max_grad_norm = cfg.get("max_grad_norm", 5.0)
        self.initializer_range = cfg.get("initializer_range", 0.02)

        self.student_model = student_model
        self.teacher_model = teacher_model

        self.dataset = dataset





In [None]:
def train_knowledge_distillation(teacher, student, train_loader, epochs, learning_rate, T, soft_target_loss_weight, ce_loss_weight, device):
    """Train ``student`` on hard labels plus the temperature-softened teacher outputs.

    For every batch the loss is
    ``soft_target_loss_weight * soft_targets_loss + ce_loss_weight * label_loss``,
    where ``label_loss`` is the cross-entropy of the student logits against the
    labels and ``soft_targets_loss`` is the cross-entropy between the teacher's
    softened distribution and the student's softened log-probabilities, scaled
    by ``T**2``. The student is optimised with Adam; the teacher is only run
    in evaluation mode without gradients. The mean batch loss is printed after
    each epoch.

    Args:
        teacher: Callable model mapping inputs to logits (not updated).
        student: Callable model mapping inputs to logits (updated in place).
        train_loader: Iterable of ``(inputs, labels)`` batches, re-iterated
            once per epoch. It does not need to support ``len()``.
        epochs: Number of passes over ``train_loader``.
        learning_rate: Learning rate for Adam.
        T: Temperature dividing the logits before (log-)softmax.
        soft_target_loss_weight: Weight of the distillation term.
        ce_loss_weight: Weight of the hard-label term.
        device: Device the batches are moved to.

    Returns:
        None.
    """
    ce_loss = nn.CrossEntropyLoss()
    optimizer = optim.Adam(student.parameters(), lr=learning_rate)

    teacher.eval()  # Teacher set to evaluation mode
    student.train() # Student to train mode

    for epoch in range(epochs):
        running_loss = 0.0
        # Count batches while iterating: streaming / iterable loaders do not
        # implement len(), and an empty loader must not divide by zero.
        n_batches = 0
        for inputs, labels in train_loader:
            inputs, labels = inputs.to(device), labels.to(device)

            optimizer.zero_grad()

            # Teacher forward pass without gradient tracking; its weights are never changed
            with torch.no_grad():
                teacher_logits = teacher(inputs)

            # Forward pass with the student model
            student_logits = student(inputs)

            # Soften both outputs with the temperature: probabilities for the
            # teacher, log-probabilities for the student.
            # assumes the class dimension is the last one -- TODO confirm for non-2D logits
            soft_targets = nn.functional.softmax(teacher_logits / T, dim=-1)
            soft_prob = nn.functional.log_softmax(student_logits / T, dim=-1)

            # Soft-target loss, averaged over the first dimension and scaled by T**2 as
            # suggested by the authors of "Distilling the knowledge in a neural network"
            soft_targets_loss = -torch.sum(soft_targets * soft_prob) / soft_prob.size()[0] * (T**2)

            # Calculate the true label loss
            label_loss = ce_loss(student_logits, labels)

            # Weighted sum of the two losses
            loss = soft_target_loss_weight * soft_targets_loss + ce_loss_weight * label_loss

            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            n_batches += 1

        # Report NaN rather than crash when the loader produced no batches.
        mean_loss = running_loss / n_batches if n_batches else float("nan")
        print(f"Epoch {epoch+1}/{epochs}, Loss: {mean_loss}")

# Distil the deep teacher into the fresh light student at temperature 2.
# The loss weights are an arbitrary choice: 0.75 for the hard-label CE term
# and 0.25 for the distillation term.
train_knowledge_distillation(
    teacher=nn_deep,
    student=new_nn_light,
    train_loader=train_loader,
    epochs=10,
    learning_rate=0.001,
    T=2,
    soft_target_loss_weight=0.25,
    ce_loss_weight=0.75,
    device=device,
)
test_accuracy_light_ce_and_kd = test(new_nn_light, test_loader, device)

# Side-by-side report: teacher, student trained alone, student trained with distillation
print(f"Teacher accuracy: {test_accuracy_deep:.2f}%")
print(f"Student accuracy without teacher: {test_accuracy_light_ce:.2f}%")
print(f"Student accuracy with CE + KD: {test_accuracy_light_ce_and_kd:.2f}%")
