<a href="https://colab.research.google.com/github/Yash-invic/Architecting-LLMs-WiDS/blob/main/week2/week2_assignment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#E01

In [None]:
# the initial setup
# I have used Andrej Karpathy's names.txt file for building and training the trigram model
# yes, the trigram model improves significantly over the bigram model
import torch
import torch.nn.functional as F

# Load the name corpus. The context manager closes the file handle as soon as
# the contents have been read instead of leaving it to the garbage collector.
with open('names.txt', 'r') as names_file:
    words = names_file.read().splitlines()

# Character vocabulary: index 0 is reserved for the '.' boundary token and the
# remaining characters are numbered from 1 in sorted order.
chars = sorted(set(''.join(words)))
stoi = {s: i + 1 for i, s in enumerate(chars)}
stoi['.'] = 0
itos = {i: s for s, i in stoi.items()}

# Build the trigram examples. Every word is padded with two leading '.' and one
# trailing '.'; the two context characters are folded into a single row index
# (ix1 * 27 + ix2) so the model can be a plain (729, 27) lookup table.
xs, ys = [], []
for w in words:
    chs = ['.', '.'] + list(w) + ['.']
    for ch1, ch2, ch3 in zip(chs, chs[1:], chs[2:]):
        xs.append(stoi[ch1] * 27 + stoi[ch2])
        ys.append(stoi[ch3])

xs = torch.tensor(xs)
ys = torch.tensor(ys)
num = xs.nelement()
print(f'Dataset built. {num} examples')

# One row of logits per two-character context, drawn from a seeded generator
# so that repeated runs start from the same weights.
g = torch.Generator().manual_seed(2147483647)
W = torch.randn((729, 27), generator=g, requires_grad=True)

In [None]:
# actual model
# The one-hot encoding of the inputs and the row indices used to pick out the
# target probabilities do not depend on W, so they are built once here rather
# than re-allocated on every one of the 1000 steps.
xenc = F.one_hot(xs, num_classes=729).float()
row_ids = torch.arange(num)
for l in range(1000):
    # forward pass
    logits = xenc @ W
    # can use simply logits = W[xs]
    counts = logits.exp()
    probs = counts / counts.sum(1, keepdims=True)
    # loss (mean negative log-likelihood of the correct next character)
    loss = -probs[row_ids, ys].log().mean()
    # backward pass
    W.grad = None
    loss.backward()
    # plain gradient-descent step with learning rate 50
    W.data += -50 * W.grad
# Reports the loss computed at the start of the final step.
print(f'Step {l}: Loss is {loss.item()}')

In [None]:
# generating names
# Sample ten names from the trigram table W, one character at a time, using a
# seeded generator so the output is reproducible.
g_sample = torch.Generator().manual_seed(2147483647)
for _ in range(10):
    out = []
    # Row 0 is the ('.', '.') context that precedes the first character.
    context_idx = 0
    while True:
        logits = W[context_idx]
        probs = F.softmax(logits, dim=0)
        ix_next = torch.multinomial(probs, num_samples=1, generator=g_sample).item()
        if ix_next == 0:
            # Index 0 is the '.' token, which marks the end of the name.
            break
        out.append(itos[ix_next])
        # Slide the window: the previous second character (context_idx % 27)
        # becomes the first, and the sampled character becomes the second.
        context_idx = (context_idx % 27) * 27 + ix_next
    print(''.join(out))

#E02

In [None]:
# when we split the data, we observe the generalization gap
# the losses on the training and dev sets are similar, with the dev loss being very slightly greater than the training loss
# the test loss is close to the dev loss
import random
def build_dataset(words_list, char_to_ix=None):
    """Encode words as trigram training examples.

    Each word is padded with two leading '.' and one trailing '.'. For every
    three consecutive characters, the first two are folded into one context
    index (ix1 * base + ix2) and the third becomes the target.

    Args:
        words_list: iterable of strings to encode.
        char_to_ix: optional mapping from character (including '.') to an
            integer index. When omitted, the notebook-level ``stoi`` mapping
            is used together with base 27, exactly as before. When given,
            the base is ``len(char_to_ix)``, which assumes the indices are
            0 .. len(char_to_ix) - 1.

    Returns:
        A pair ``(X, Y)`` of 1-D int64 tensors: context indices and targets.

    Raises:
        KeyError: if a character is missing from the mapping.
    """
    if char_to_ix is None:
        # Default path: identical to the original hard-coded behaviour.
        char_to_ix, base = stoi, 27
    else:
        base = len(char_to_ix)

    X, Y = [], []
    for w in words_list:
        chs = ['.', '.'] + list(w) + ['.']
        for ch1, ch2, ch3 in zip(chs, chs[1:], chs[2:]):
            X.append(char_to_ix[ch1] * base + char_to_ix[ch2])
            Y.append(char_to_ix[ch3])

    # An explicit dtype keeps empty inputs as integer tensors; torch.tensor([])
    # would otherwise default to float32, which cannot be used as an index.
    X = torch.tensor(X, dtype=torch.long)
    Y = torch.tensor(Y, dtype=torch.long)
    print(f'Dataset shape: {X.shape}')
    return X, Y

# Shuffle the corpus in place with a fixed seed, then carve it into
# 80% training, 10% dev and 10% test words.
random.seed(42)
random.shuffle(words)

total_words = len(words)
n1 = int(0.8 * total_words)
n2 = int(0.9 * total_words)
train_words, dev_words, test_words = words[:n1], words[n1:n2], words[n2:]

print("Training Set:")
Xtr, Ytr = build_dataset(train_words)

print("Dev(Validation) Set:")
Xdev, Ydev = build_dataset(dev_words)

print("Test Set:")
Xte, Yte = build_dataset(test_words)

In [None]:
def _mean_nll(weights, inputs, targets):
    """Return the mean negative log-likelihood of targets under softmax(weights[inputs])."""
    counts = weights[inputs].exp()
    probs = counts / counts.sum(1, keepdims=True)
    return -probs[torch.arange(len(inputs)), targets].log().mean()


# Fresh, seeded weight table so this experiment starts from scratch.
g = torch.Generator().manual_seed(2147483647)
W = torch.randn((729, 27), generator=g).requires_grad_()

# Fit on the training split only: 1000 full-batch gradient-descent steps.
for k in range(1000):
    loss = _mean_nll(W, Xtr, Ytr)
    W.grad = None
    loss.backward()
    W.data += -50 * W.grad
print(f'Final training loss (Xtr): {loss.item()}')

# Evaluate on the held-out splits without tracking gradients.
with torch.no_grad():
    dev_loss = _mean_nll(W, Xdev, Ydev)
print(f'Validation(dev) loss (Xdev) : {dev_loss.item()}')

with torch.no_grad():
    test_loss = _mean_nll(W, Xte, Yte)
print(f'Test set loss (Xte): {test_loss.item()}')

#E03

In [None]:
# As smoothing increases, train loss increases (underfitting). Dev loss goes down initially, then rises again.

# Sweep over L2 penalty strengths, keeping the weights with the lowest dev loss.
regularization_strengths = [0.1, 0.01, 0.001, 0.0]
results = {}
best_loss, best_reg, best_W = float('inf'), None, None

for reg in regularization_strengths:
    # Re-seed for every run so each strength starts from identical weights.
    g = torch.Generator().manual_seed(2147483647)
    W_temp = torch.randn((729, 27), generator=g, requires_grad=True)

    for k in range(500):
        data_term = F.cross_entropy(W_temp[Xtr], Ytr)
        penalty_term = reg * (W_temp ** 2).mean()
        loss = data_term + penalty_term
        W_temp.grad = None
        loss.backward()
        W_temp.data += -50 * W_temp.grad

    # Model selection uses the unregularized dev loss.
    with torch.no_grad():
        dev_loss = F.cross_entropy(W_temp[Xdev], Ydev).item()
        results[reg] = dev_loss
        print(f"Reg Strength {reg}: Dev Loss = {dev_loss:.4f}")
        improved = dev_loss < best_loss
        if improved:
            best_loss, best_reg = dev_loss, reg
            # Cloned under no_grad so the snapshot carries no autograd history.
            best_W = W_temp.clone()

print(f"Best smoothing strength: {best_reg}")

# The test split is touched once, with the weights chosen on the dev split.
with torch.no_grad():
    test_loss = F.cross_entropy(best_W[Xte], Yte).item()
print(f"Final test loss (Xte) using reg={best_reg}: {test_loss:.4f}")
# achieved a test loss of approx 2.26, which is very close to the Dev Loss.

#E04

In [None]:
# yes we can delete the use of F.one_hot by simply indexing the rows of W.
# the code is as follows-
# Indexing W with the integer context ids picks out the same rows that the
# one-hot product (F.one_hot(xs, num_classes=729).float() @ W) would produce.
example_rows = torch.arange(num)
for l in range(500):
    # forward pass: row lookup followed by a manual softmax
    counts = torch.exp(W[xs])
    probs = counts / counts.sum(dim=1, keepdim=True)
    # loss: mean negative log-likelihood of the observed next characters
    loss = -torch.log(probs[example_rows, ys]).mean()
    # backward pass and gradient-descent update
    W.grad = None
    loss.backward()
    W.data += -50 * W.grad
print(f'Step {l}: Loss is {loss.item()}')

#E05

In [None]:
# using F.cross_entropy
# we prefer to use F.cross_entropy as a replacement for the manual negative log likelihood
# it combines all the math done manually (softmax, log, mean) and directly gives the final negative log likelihood
# it is simple, efficient, and numerically stable: it works on the logits directly, so a probability that underflows to 0 does not produce log(0)
import torch.nn.functional as F

# Re-initialise the weight table from the fixed seed and train it on the
# training split with cross-entropy plus a small L2 penalty on the weights.
g = torch.Generator().manual_seed(2147483647)
W = torch.randn((729, 27), generator=g, requires_grad=True)

for l in range(500):
    nll_term = F.cross_entropy(W[Xtr], Ytr)
    weight_penalty = 0.01 * (W ** 2).mean()
    loss = nll_term + weight_penalty
    W.grad = None
    loss.backward()
    W.data += -50 * W.grad

print(f'Final Loss: {loss.item()}')
# Sample ten names from the trigram table W with a seeded generator so the
# output is reproducible.
g_sample = torch.Generator().manual_seed(2147483647)
for _ in range(10):
    out = []
    # Row 0 is the ('.', '.') context that precedes the first character.
    context_idx = 0
    while True:
        logits = W[context_idx]
        probs = F.softmax(logits, dim=0)
        ix_next = torch.multinomial(probs, num_samples=1, generator=g_sample).item()
        if ix_next == 0:
            # Index 0 is the '.' token, which marks the end of the name.
            break
        out.append(itos[ix_next])
        # Slide the window: the previous second character (context_idx % 27)
        # becomes the first, and the sampled character becomes the second.
        context_idx = (context_idx % 27) * 27 + ix_next
    print(''.join(out))

#E06

In [None]:
# Car name generator trigram model
import torch
import torch.nn.functional as F
# Training corpus: lower-case car model names with no spaces or punctuation.
car_brands = [
    'sierra', 'estate', 'sumo', 'safari', 'indica', 'indigo', 'nano', 'aria',
    'manza', 'venture', 'zest', 'bolt', 'tiago', 'tigor', 'hexa', 'nexon',
    'harrier', 'altroz', 'safarinew', 'punch', 'curvv', 'armada', 'commander',
    'marshal', 'major', 'legend', 'bolero', 'scorpio', 'thar', 'xylo', 'quanto',
    'verito', 'veritovibe', 'nuvosport', 'marazzo', 'scorpion', 'tharroxx',
    'omni', 'gypsy', 'zen', 'esteem', 'balenosedan', 'wagonr', 'alto', 'versa',
    'swift', 'zenestilo', 'dzire', 'astar', 'ritz', 'eeco', 'kizashi', 'ertiga',
    'celerio', 'ciaz', 'scross', 'balenohatchback', 'vitarabrezza', 'ignis',
    'spresso', 'grandvitara', 'fronx', 'jimny', 'invicto', 'evitara', 'landmaster',
    'ambassador', 'contessa', 'trekka', 'veer', 'trax', 'gama', 'cruiser', 'toofan',
    'gurkha', 'one', 'citiline', 'traveller', 'urbania', 'padmini', 'rio', 'qute',
    'defy', 'veer', 'shul', 'ekonk', 'testarossa', 'enzo', 'california', 'ff',
    'laferrari', 'portofino', 'roma', 'purosangue', 'miura', 'espada', 'islero',
    'jarama', 'urraco', 'countach', 'silhouette', 'jalpa', 'diablo', 'murcilago',
    'gallardo', 'reventn', 'aventador', 'sestoelemento', 'veneno', 'huracn',
    'centenario', 'urus', 'revuelto', 'temerario', 'senna', 'speedtail', 'gt',
    'elva', 'artura', 'solusgt', 'gts', 'veyron', 'chiron', 'divo', 'centodieci',
    'bolide', 'mistral', 'tourbillon', 'cc', 'ccr', 'ccx', 'ccxr', 'trevita',
    'agera', 'agerar', 'ageras', 'regera', 'jesko', 'gemera', 'zonda', 'huayra',
    'utopia', 'imola', 'boxster', 'cayman', 'cayenne', 'carreragt', 'panamera',
    'macan', 'taycan', 'silverghost', 'phantom', 'twenty', 'wraith', 'silverwraith',
    'silverdawn', 'silvercloud', 'silvershadow', 'corniche', 'camargue',
    'silverspirit', 'silverspur', 'silverseraph', 'ghost', 'wraithmodern', 'dawn',
    'cullinan', 'spectre', 'droptail', 'speedsix', 'markv', 'markvi', 'rtype',
    'mulsanne', 'eight', 'turbor', 'continentalr', 'continentalt', 'azure',
    'arnage', 'brooklands', 'continentalgt', 'flyingspur', 'bentayga', 'bacalar',
    'batur', 'coalscuttle', 'dbmkiii', 'dbs', 'lagonda', 'virage', 'rapide',
    'cygnet', 'vulcan', 'dbssuperleggera', 'dbx', 'valkyrie', 'valhalla',
    'valiant', 'sebring', 'mistral', 'quattroporte', 'mexico', 'ghibli', 'indy',
    'bora', 'merak', 'khamsin', 'kyalami', 'biturbo', 'karif', 'shamal', 'coup',
    'spyder', 'granturismo', 'grancabrio', 'levante', 'grecale', 'granturismonew',
    'seven', 'elite', 'elan', 'cortina', 'europa', 'clat', 'esprit', 'excel',
    'elise', 'exige', 'evora', 'evija', 'emira', 'eletre', 'emeya', 'conceptone',
    'nevera', 'zeppelin', 'landaulet', 'exelero'
]

# Vocabulary built from the corpus itself: index 0 is the '.' boundary token,
# the remaining characters are numbered from 1 in sorted order.
chars_cars = sorted(set(''.join(car_brands)))
stoi_cars = {s: i + 1 for i, s in enumerate(chars_cars)}
stoi_cars['.'] = 0
itos_cars = {i: s for s, i in stoi_cars.items()}
num_classes_cars = len(stoi_cars)

# Trigram examples: two context characters folded into one row index
# (ix1 * num_classes_cars + ix2), with the third character as the target.
X_cars, Y_cars = [], []
for w in car_brands:
    chs = ['.', '.'] + list(w) + ['.']
    for ch1, ch2, ch3 in zip(chs, chs[1:], chs[2:]):
        ix1 = stoi_cars[ch1]
        ix2 = stoi_cars[ch2]
        ix3 = stoi_cars[ch3]
        X_cars.append(ix1 * num_classes_cars + ix2)
        Y_cars.append(ix3)
X_cars = torch.tensor(X_cars)
Y_cars = torch.tensor(Y_cars)

# One row of logits per two-character context. The tensor is created directly
# as a leaf on the default (CPU) device: the earlier `.to(device)` referred to
# a `device` name that is never defined in this notebook (NameError), and a
# `.to()` that actually moves a tensor returns a non-leaf copy whose `.grad`
# is not populated by `backward()`. The inputs above live on the CPU as well.
g_cars = torch.Generator().manual_seed(2147483647)
W_cars = torch.randn(
    (num_classes_cars * num_classes_cars, num_classes_cars),
    generator=g_cars,
    requires_grad=True,
)

# Full-batch gradient descent on cross-entropy plus a small L2 weight penalty.
for k in range(1000):
    logits = W_cars[X_cars]
    loss = F.cross_entropy(logits, Y_cars) + 0.01 * (W_cars ** 2).mean()
    W_cars.grad = None
    loss.backward()
    W_cars.data += -0.5 * W_cars.grad
print(f"Car model trained. Final loss: {loss.item():.4f}")

# Sample ten new names. No generator is passed to multinomial, so the global
# RNG is used and the output differs from run to run.
print("YOUR NEW CAR NAMES-")
for i in range(10):
    out = []
    # Row 0 is the ('.', '.') context that precedes the first character.
    context_idx = 0
    while True:
        logits = W_cars[context_idx]
        probs = F.softmax(logits, dim=0)
        ix_next = torch.multinomial(probs, num_samples=1).item()
        if ix_next == 0:
            # Index 0 is the '.' token, which ends the name.
            break
        out.append(itos_cars[ix_next])
        # Slide the window: keep the last character, append the sampled one.
        context_idx = (context_idx % num_classes_cars) * num_classes_cars + ix_next
    print(''.join(out))