In [1]:
import random
import numpy as np
import torch
import torch.nn as nn
from torch.nn import functional as F

In [2]:
# ---- hyperparameters ----------------------------------------------------
# Batching: number of sequences per step and tokens of context per sequence.
batch_size, block_size = 64, 256

# Optimisation schedule: total steps, how often to evaluate, and how many
# batches each evaluation averages over.
max_iters, eval_interval, eval_iters = 2500, 500, 200
learning_rate = 3e-4

# Model size: embedding width, attention heads per block, number of blocks.
# Each head works on n_embd // n_head channels.
n_embd, n_head, n_layer = 140, 4, 4
dropout = 0.1  # dropout probability used throughout the model

# Run on the GPU when one is visible to torch, otherwise on the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# --------------------------------------------------------------------------

In [3]:
# Show which device string was selected in the hyperparameter cell.
print(device)

cpu


In [4]:
# Seed every random number generator this notebook draws from, not only
# torch: the Markov-chain cells sample with `random.choice` and
# `np.random.choice`, which would otherwise differ on every run.
random.seed(1337)
np.random.seed(1337)
# Kept last so the cell still displays the torch Generator as its output.
torch.manual_seed(1337)

<torch._C.Generator at 0x1083902f0>

In [5]:
# Read the whole melody corpus into one string; it is encoded into token ids
# further down when the train/validation tensors are built.
with open('../finalAssignment_musicDataset/inputMelodiesAugmented.txt', 'r', encoding='utf-8') as f:
    text = f.read()

In [6]:
# Print the first 2 lines of the dataset.
# Lines are read one at a time so the whole file is not loaded just for a
# preview, and no global named `data` is left behind (that name is used for
# the encoded tensor later in the notebook).
with open('../finalAssignment_musicDataset/inputMelodiesAugmented.txt', 'r', encoding='utf-8') as file:
    for _ in range(2):
        line = file.readline()
        if not line:  # empty string means end of file (a blank line is '\n')
            break
        print(line.strip())

R R R R R R R R R R R R R B c E E c c c c E E d R R R R R B B c E E c c c c E E d R R R R R c d E E d B g d d c c c B c B E E g g f g g A B c d E f R R R R R B B c E E c c c c E E d R R R R R B B c E E c c c c E E d R R R R R E E E E d B g d d c c c B c B E E g g f g g A B c d E f B B B g B f E g B f E g B f E c g A g g f B g B f E B g B f E B g B f E E g A g g f B g B f E B g B f E B g B f E B g B f E B g B f E R R g g B d E R R R R R R R R R R R R R R R R R R R R R R R d E d E d d g d c c c B c B g E g A E f g A B c E g f B g B f E g B f E g B f E c g A g g f B g B f E B g B f E B g B f E E g A g g f B g B f E B g B f E B g B f E B g B f E B g B f E G G E G E G G E G G E G G E G A B R R E G G E G E G G E G G E G G E D A G G G A f B B c E E c c c c E E d R R E B A g B B c E E c c c c E E d R R B B B g B f E g B f E g B f E c g A g g f B g B f E B g B f E B g B f E E g A g g f B g B f g B g f E G B g f E g E A g f B g c c E R R R g A B d E
R R R R R R R R R R R R R R R R R R R R R R R 

In [7]:
# Read the corpus as a list of lines, one melody per line.
# The encoding is stated explicitly so this read decodes the file the same
# way as the `text` read above, independent of the platform default.
with open('../finalAssignment_musicDataset/inputMelodiesAugmented.txt', 'r', encoding='utf-8') as file:
    melodies = file.readlines()

In [8]:
# Split every melody line into its whitespace-separated tokens; a no-argument
# split() already discards leading/trailing whitespace and the newline.
melody_tokens = [melody.split() for melody in melodies]

In [9]:
# Inspect the token list of the first melody (a one-element slice of the corpus).
print(melody_tokens[:1])

[['R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'B', 'c', 'E', 'E', 'c', 'c', 'c', 'c', 'E', 'E', 'd', 'R', 'R', 'R', 'R', 'R', 'B', 'B', 'c', 'E', 'E', 'c', 'c', 'c', 'c', 'E', 'E', 'd', 'R', 'R', 'R', 'R', 'R', 'c', 'd', 'E', 'E', 'd', 'B', 'g', 'd', 'd', 'c', 'c', 'c', 'B', 'c', 'B', 'E', 'E', 'g', 'g', 'f', 'g', 'g', 'A', 'B', 'c', 'd', 'E', 'f', 'R', 'R', 'R', 'R', 'R', 'B', 'B', 'c', 'E', 'E', 'c', 'c', 'c', 'c', 'E', 'E', 'd', 'R', 'R', 'R', 'R', 'R', 'B', 'B', 'c', 'E', 'E', 'c', 'c', 'c', 'c', 'E', 'E', 'd', 'R', 'R', 'R', 'R', 'R', 'E', 'E', 'E', 'E', 'd', 'B', 'g', 'd', 'd', 'c', 'c', 'c', 'B', 'c', 'B', 'E', 'E', 'g', 'g', 'f', 'g', 'g', 'A', 'B', 'c', 'd', 'E', 'f', 'B', 'B', 'B', 'g', 'B', 'f', 'E', 'g', 'B', 'f', 'E', 'g', 'B', 'f', 'E', 'c', 'g', 'A', 'g', 'g', 'f', 'B', 'g', 'B', 'f', 'E', 'B', 'g', 'B', 'f', 'E', 'B', 'g', 'B', 'f', 'E', 'E', 'g', 'A', 'g', 'g', 'f', 'B', 'g', 'B', 'f', 'E', 'B', 'g', 'B', 'f', 'E', 'B', 'g', 'B', 'f', 'E', 'B', 'g'

In [10]:
# Collect every distinct token that appears in any melody, in sorted order,
# so the id assigned to each token is deterministic.
distinct_tokens = set()
for melody in melody_tokens:
    distinct_tokens.update(melody)
vocabulary = sorted(distinct_tokens)
vocab_size = len(vocabulary)

# Two lookup tables: token -> integer id and integer id -> token.
token_to_id = {token: idx for idx, token in enumerate(vocabulary)}
id_to_token = dict(enumerate(vocabulary))

In [11]:
# Display the sorted vocabulary; a token's position in this list is its integer id.
vocabulary

['A', 'B', 'C', 'D', 'E', 'F', 'G', 'R', 'a', 'c', 'd', 'f', 'g']

In [12]:
def encode(s):
    """Turn a whitespace-separated token string into a list of integer ids.

    Raises KeyError if a token is not present in `token_to_id`.
    """
    return [token_to_id[token] for token in s.split()]


def decode(l):
    """Turn an iterable of integer ids back into a space-separated token string."""
    return ' '.join(id_to_token[i] for i in l)

In [13]:
# Encode the whole corpus into one flat tensor of token ids. `encode` splits
# on any whitespace, so line breaks between melodies are not represented.
data = torch.tensor(encode(text), dtype=torch.long)
n = int(0.9 * len(data))  # index where the validation part starts
# First 90% of the token stream is used for training, the remainder for validation.
train_data, val_data = data[:n], data[n:]

In [14]:
# data loading
def get_batch(split):
    """Sample a random batch of (input, target) windows from one data split.

    split: 'train' selects `train_data`; any other value selects `val_data`.
    Returns two tensors of shape (batch_size, block_size) on `device`; the
    target window is the input window shifted one position to the right.
    """
    source = train_data if split == 'train' else val_data
    # Random start offsets, chosen so that the shifted target window still fits.
    starts = torch.randint(len(source) - block_size, (batch_size,))
    inputs, targets = [], []
    for start in starts:
        inputs.append(source[start:start + block_size])
        targets.append(source[start + 1:start + block_size + 1])
    return torch.stack(inputs).to(device), torch.stack(targets).to(device)

In [15]:
@torch.no_grad()
def estimate_loss():
    """Average the model loss over `eval_iters` random batches per split.

    Switches the global `model` to eval mode for the measurement and back to
    train mode afterwards. Returns a dict with keys 'train' and 'val', each
    holding the mean loss as a 0-dim tensor.
    """
    model.eval()
    averages = {}
    for split in ('train', 'val'):
        per_batch = torch.zeros(eval_iters)
        for step in range(eval_iters):
            _, batch_loss = model(*get_batch(split))
            per_batch[step] = batch_loss.item()
        averages[split] = per_batch.mean()
    model.train()
    return averages

In [16]:
class Head(nn.Module):
    """ a single causal self-attention head """

    def __init__(self, head_size):
        super().__init__()
        # Linear projections from the embedding width to this head's width.
        self.key = nn.Linear(n_embd, head_size, bias=True)
        self.query = nn.Linear(n_embd, head_size, bias=True)
        self.value = nn.Linear(n_embd, head_size, bias=True)
        # Lower-triangular matrix of ones; zeros mark the future positions.
        causal_mask = torch.tril(torch.ones(block_size, block_size))
        self.register_buffer('tril', causal_mask)

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        # x:      (batch, time-step, channels)
        # result: (batch, time-step, head size)
        _, seq_len, _ = x.shape
        keys = self.key(x)        # (B,T,hs)
        queries = self.query(x)   # (B,T,hs)
        values = self.value(x)    # (B,T,hs)

        # Pairwise affinities, scaled by 1/sqrt(head size).
        scale = keys.shape[-1] ** -0.5
        scores = (queries @ keys.transpose(-2, -1)) * scale  # (B,T,T)

        # A position may only attend to itself and earlier positions.
        is_future = self.tril[:seq_len, :seq_len] == 0
        scores = scores.masked_fill(is_future, float('-inf'))

        weights = self.dropout(F.softmax(scores, dim=-1))  # (B,T,T)
        # Weighted sum of the values: (B,T,T) @ (B,T,hs) -> (B,T,hs)
        return weights @ values

In [17]:
class MultiHeadAttention(nn.Module):
    """ several self-attention heads run side by side, then projected back to n_embd """

    def __init__(self, num_heads, head_size):
        super().__init__()
        heads = [Head(head_size) for _ in range(num_heads)]
        self.heads = nn.ModuleList(heads)
        # Maps the concatenated head outputs back to the embedding width.
        self.proj = nn.Linear(head_size * num_heads, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        per_head = [head(x) for head in self.heads]
        concatenated = torch.cat(per_head, dim=-1)  # join along the channel axis
        projected = self.proj(concatenated)
        return self.dropout(projected)

In [18]:
class FeedFoward(nn.Module):
    """ position-wise MLP: expand to 4x width, ReLU, project back, dropout """

    def __init__(self, n_embd):
        super().__init__()
        hidden = 4 * n_embd
        layers = [
            nn.Linear(n_embd, hidden),
            nn.ReLU(),
            nn.Linear(hidden, n_embd),
            nn.Dropout(dropout),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        return self.net(x)

In [19]:
class Block(nn.Module):
    """ Transformer block: self-attention, then feed-forward, each with a residual connection """

    def __init__(self, n_embd, n_head):
        # n_embd: embedding dimension, n_head: number of attention heads
        super().__init__()
        # The embedding width is divided between the heads (integer division).
        self.sa = MultiHeadAttention(n_head, n_embd // n_head)
        self.ffwd = FeedFoward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        # Layer norm is applied to the input of each sub-layer, and the
        # sub-layer output is added back onto the unnormalised stream.
        attended = self.sa(self.ln1(x))
        x = x + attended
        transformed = self.ffwd(self.ln2(x))
        return x + transformed

In [20]:
class GPTMelodyModel(nn.Module):
    """Transformer language model over melody token ids.

    Token and learned position embeddings are summed, passed through
    `n_layer` transformer blocks and a final layer norm, and projected to
    `vocab_size` logits per position. Sizes come from the notebook-level
    hyperparameters (`vocab_size`, `n_embd`, `n_head`, `n_layer`, `block_size`).
    """

    def __init__(self):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd)
        self.lm_head = nn.Linear(n_embd, vocab_size)
        # Re-initialise every Linear/Embedding in the model (see _init_weights).
        self.apply(self._init_weights)

    def _init_weights(self, module):
        """Initialise Linear/Embedding weights from N(0, 0.02) and Linear biases to zero."""
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, idx, targets=None):
        """Compute next-token logits and, when targets are given, the loss.

        idx:     (B, T) tensor of token ids, with T at most `block_size`.
        targets: optional (B, T) tensor of token ids to predict.

        Returns (logits, loss). Without targets, logits has shape (B, T, C)
        and loss is None. With targets, logits is flattened to (B*T, C) and
        loss is the cross-entropy against the flattened targets.
        """
        B, T = idx.shape
        tok_emb = self.token_embedding_table(idx)
        # Build the position indices on the same device as the input rather
        # than the notebook-level `device`, so the model keeps working if it
        # and its inputs are moved somewhere else.
        pos_emb = self.position_embedding_table(torch.arange(T, device=idx.device))
        x = tok_emb + pos_emb
        x = self.blocks(x)
        x = self.ln_f(x)
        logits = self.lm_head(x)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B * T, C)
            targets = targets.view(B * T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    @torch.no_grad()
    def generate(self, idx, max_new_tokens):
        """Sample `max_new_tokens` tokens autoregressively after the context `idx`.

        idx: (B, T) tensor of token ids used as the starting context.
        Returns a (B, T + max_new_tokens) tensor: the context followed by the
        sampled tokens.

        Sampling runs without gradient tracking and in eval mode, so dropout
        does not perturb the predicted distribution; the module's previous
        train/eval mode is restored afterwards.
        """
        was_training = self.training
        self.eval()
        try:
            for _ in range(max_new_tokens):
                # The position table only covers block_size positions, so
                # condition on at most the last block_size tokens.
                idx_cond = idx[:, -block_size:]
                logits, _ = self(idx_cond)
                logits = logits[:, -1, :]  # keep only the last time step
                probs = F.softmax(logits, dim=-1)
                idx_next = torch.multinomial(probs, num_samples=1)
                idx = torch.cat((idx, idx_next), dim=1)
        finally:
            self.train(was_training)
        return idx

In [21]:
# Instantiate the model and move its parameters to the selected device.
model = GPTMelodyModel()
m = model.to(device)  # nn.Module.to returns the module itself, so `m` is the same object as `model`
# Report the total number of parameters, in millions.
print(sum(p.numel() for p in m.parameters()) / 1e6, 'M parameters')

0.987853 M parameters


In [22]:
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

# `step` is used instead of `iter` so the builtin is not shadowed, and the
# per-batch perplexity is kept in `batch_perplexity` so it does not share a
# global name with the `perplexity` function defined later in the notebook.
for step in range(max_iters):
    # Evaluate and log every `eval_interval` steps and on the final step.
    is_log_step = step % eval_interval == 0 or step == max_iters - 1

    if is_log_step:
        losses = estimate_loss()
        print(f"step {step}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # Sample a training batch and compute its loss.
    xb, yb = get_batch('train')
    logits, loss = model(xb, yb)

    if is_log_step:
        # Perplexity is exp(cross-entropy); it is only needed for logging.
        batch_perplexity = torch.exp(loss.detach())
        print(f"Step {step}: Loss = {loss.item():.4f}, Perplexity = {batch_perplexity.item():.4f}")

    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

step 0: train loss 2.6006, val loss 2.6026
Step 0: Loss = 2.5963, Perplexity = 13.4137
step 500: train loss 1.5016, val loss 1.5164
Step 500: Loss = 1.5403, Perplexity = 4.6660
step 1000: train loss 1.4162, val loss 1.4495
Step 1000: Loss = 1.4228, Perplexity = 4.1485
step 1500: train loss 1.2234, val loss 1.2521
Step 1500: Loss = 1.2973, Perplexity = 3.6592
step 2000: train loss 1.1457, val loss 1.1770
Step 2000: Loss = 1.2207, Perplexity = 3.3895
step 2499: train loss 1.0847, val loss 1.1264
Step 2499: Loss = 1.1754, Perplexity = 3.2393


In [23]:
# Start from a single token with id 0 (batch of one) and sample 500 more tokens,
# then decode the resulting ids back into a token string.
context = torch.zeros((1, 1), dtype=torch.long, device=device)
print(decode(m.generate(context, max_new_tokens=500)[0].tolist()))

A g a B B B B B B B g B B B B B B B B B B B B C B B B B B B B B B B B C B B B B B B B B B B A A A A A A A A g g a a B B B B B B B C B B B B B B C B B B B B B A A A A A A A A A A A g g g a g g g g g g g g g a B B B B B B B B B B B B B C B B B B B B B C B B B B C B B B C B A A A g g g f g g a B R D B g B B B R R g g g g R R g B g g f f R g f R E g A B A B R R R g f E g A B A B R R R R R G E E E B F E E E B E B E E R R D D E B f E E E B R A B R E E B a A A R B A R B A E f R g A f g g d d d f E d E f E d d f D R R E E E E B R R R R R E E E E c E R E E E E E E E E f E E E E E E E E B c E E f f f f R R R g g g d f R f f f E d E d f E d d f d R R R R R g g g f f R R E f f d E d g f R g f f f E d c B R f f R R E c c B B B g a B a B A a g g B a g E d d a R d F d d d F R F d d C d g F F R G g d F G d G C R G g g F d G R R g a C d d a R a a d C d C C d F R G R g g g g F d F G R R g a C d d d a a g g G F R R R R R R R R R R R R R R R R R R R R R R R R R R R R R R R R R R R R R R R R R R R R R R R 

In [24]:
class MarkovChainMelodyModel:
    """First-order Markov chain over melody tokens.

    `transition_matrix[i][j]` is the estimated probability that token id `j`
    follows token id `i`, obtained from bigram counts inside each melody.
    """

    def __init__(self, melody_tokens, token_to_id=None):
        """Estimate transition probabilities from a tokenised corpus.

        melody_tokens: iterable of melodies, each a list of token strings.
        token_to_id:   optional token -> id mapping. When omitted, ids are
                       assigned by sorted token order over `melody_tokens`,
                       which is how the notebook-level mapping is built from
                       the same corpus.

        Raises KeyError if a melody contains a token missing from `token_to_id`.
        """
        if token_to_id is None:
            vocabulary = sorted({token for melody in melody_tokens for token in melody})
            token_to_id = {token: idx for idx, token in enumerate(vocabulary)}
        self.token_to_id = token_to_id
        self.id_to_token = {idx: token for token, idx in token_to_id.items()}
        self.vocab_size = len(token_to_id)

        # float64 counts: float32 stops registering `+= 1` once a count
        # reaches 2**24, which would silently distort very frequent bigrams.
        counts = np.zeros((self.vocab_size, self.vocab_size), dtype=np.float64)
        for melody in melody_tokens:
            ids = [self.token_to_id[token] for token in melody]
            for current_token, next_token in zip(ids, ids[1:]):
                counts[current_token, next_token] += 1

        # Normalise each row to a probability distribution.
        row_sums = counts.sum(axis=1, keepdims=True)
        probabilities = counts / np.where(row_sums > 0, row_sums, 1.0)
        if self.vocab_size:
            # A token that never has a successor would leave an all-zero row,
            # which np.random.choice rejects; fall back to a uniform choice.
            probabilities[row_sums[:, 0] == 0] = 1.0 / self.vocab_size
        self.transition_matrix = probabilities

    def generate(self, start_token, max_new_tokens):
        """Sample a sequence of token ids starting from `start_token`.

        The returned list has `max_new_tokens` entries in total, counting the
        start token (so `max_new_tokens - 1` tokens are sampled). Sampling
        uses NumPy's global random state.
        """
        sequence = [start_token]
        for _ in range(max_new_tokens - 1):
            next_token_probs = self.transition_matrix[sequence[-1]]
            next_token = np.random.choice(self.vocab_size, p=next_token_probs)
            sequence.append(int(next_token))
        return sequence

# Fit the Markov chain on the tokenised melodies.
markov_model = MarkovChainMelodyModel(melody_tokens)

# Pick a random starting token id and sample a sequence of 500 ids in total.
start_token = random.choice(range(vocab_size))
generated_sequence = markov_model.generate(start_token, max_new_tokens=500)

# Convert the sampled ids back to token strings for display.
generated_melody = decode(generated_sequence)
print(generated_melody)

B B B A G A B f c D c B f f g d c B B c d D c c f g A f E B B d d d c R R R R R R R R R R R f R R R R R f f E A C D C C a a d d F d F d d F F F F E E f A C C F c c c C F F R f R R R R R R R R A A G G d d d C D B d F d d f f D C F G d C a D E E c D B B B E c c d C a g R R R R B B c g A R d C C a c d a R R R F C a a G F F D c d F F E R R g a g B d c c c D F G G g f f c c C a A A A D B B B B G G G F C C C d d d g B B A G F F d R R R R d E g c a g g a F G G A D C G a C G G f g f f D C C d c d F d c c d F G G A A C a a F F R R f g C G E D R R R R R R R R R c R R R F F F d d f f E F R R a B B G G A A A B g g g g f A f G G G E g A R R R R R R E C d d f A E f A R A A G F F D E g g a a F F c f E D E E G F R R a a c E E A R d F R R R R R R C B B B B c a a g F f E R R R R d f E D D C D C F F R R a a C C c B B c c c c B A G g g f f a a g A B f f f D E F F R R R R F R R a d a R R R R R R R R R D F A G G F d F E c c E F d c D D C B E D d a a d d d C a C D F d F F F F R R R A A G F D G f f d C G A E


In [25]:
def jaccard_similarity(sequence1, sequence2):
    """Jaccard index of the distinct elements of two sequences.

    Both sequences are first cut to the length of the shorter one. The score
    is |A & B| / |A | B| over the sets of distinct elements, so order and
    repetition within the sequences do not affect it. Returns 0.0 when both
    truncated sequences are empty.
    """
    shared_len = min(len(sequence1), len(sequence2))
    first = set(sequence1[:shared_len])
    second = set(sequence2[:shared_len])

    combined = first | second
    if not combined:
        return 0.0
    return len(first & second) / len(combined)

# Draw two independent GPT samples from the same one-token context (id 0).
context = torch.zeros((1, 1), dtype=torch.long, device=device)
gpt_generated_sequence1 = m.generate(context, max_new_tokens=500)[0].tolist()
gpt_generated_sequence2 = m.generate(context, max_new_tokens=500)[0].tolist()

# Draw two Markov samples that share one randomly chosen start token.
start_token = random.choice(range(vocab_size))
markov_generated_sequence1 = markov_model.generate(start_token, max_new_tokens=500)
markov_generated_sequence2 = markov_model.generate(start_token, max_new_tokens=500)

# Compare the sets of distinct tokens used by the two GPT samples.
jaccard_sim = jaccard_similarity(gpt_generated_sequence1, gpt_generated_sequence2)
print(f"Jaccard Similarity of GPT Generated Sequence: {jaccard_sim:.2f}")

Jaccard Similarity of GPT Generated Sequence: 0.92


In [26]:
# Same comparison for the two Markov samples drawn in the previous cell.
jaccard_sim_markov = jaccard_similarity(markov_generated_sequence1, markov_generated_sequence2)
print(f"Jaccard Similarity of Markov Generated Sequence: {jaccard_sim_markov:.2f}")

Jaccard Similarity of Markov Generated Sequence: 1.00


In [27]:
def perplexity(sequence, markov_model):
    """Perplexity of a token-id sequence under a Markov model's transitions.

    sequence:     sequence of integer token ids (at least two entries).
    markov_model: object exposing `transition_matrix[prev][curr]` as the
                  probability of `curr` following `prev`.

    Returns 2 ** (-mean log2 transition probability) over the
    len(sequence) - 1 transitions. Transitions with probability 0 are
    floored at 1e-10 so the logarithm stays finite.

    Raises ValueError if the sequence has fewer than two tokens, because
    there is then no transition to score.
    """
    num_transitions = len(sequence) - 1
    if num_transitions < 1:
        raise ValueError("perplexity needs a sequence of at least two tokens")

    log_prob_sum = 0.0
    for prev_token, curr_token in zip(sequence, sequence[1:]):
        transition_prob = float(markov_model.transition_matrix[prev_token][curr_token])

        if transition_prob == 0:
            transition_prob = 1e-10

        log_prob_sum += np.log2(transition_prob)

    return float(2 ** (-log_prob_sum / num_transitions))
    
# Score the Markov sample `generated_sequence` under the Markov model's own
# transition probabilities. (A previous line here replaced
# `markov_generated_sequence1` with uniformly random tokens that were never
# used; it has been dropped so that variable keeps holding a Markov sample.)
perplexity_markov = perplexity(generated_sequence, markov_model)

print(f"Perplexity of Markov generated sequence: {perplexity_markov:.2f}")

Perplexity of Markov generated sequence: 7.38
