In [1]:
import torch
import torch.nn.functional as F

# Nilai aktual
y_actual = torch.tensor([-0.006289])  # nilai aktual

# Prediksi dari model
y_pred = torch.tensor([2.71153], requires_grad=True)  # prediksi, dengan gradien diizinkan

# Derajat kebebasan
nu = 3.0  # nilai tetap

# Parameter normalisasi
Z = 0.0  # misalnya

# Hitung selisih
y_diff = y_actual - y_pred  # menghitung selisih antara nilai aktual dan prediksi

# Hitung NLL
nll = -0.5 * (nu + 1) * torch.log1p((y_diff ** 2) / nu) - Z  # Menggunakan log1p untuk stabilitas

# Hitung propagasi mundur
nll.backward()  # menghitung gradien dari NLL terhadap semua parameter yang memiliki requires_grad=True

# Ambil gradien dari prediksi
grad_y_pred = y_pred.grad  # mendapatkan gradien terhadap prediksi

# Tentukan learning rate
learning_rate = 0.01

# Perbarui parameter prediksi
y_pred.data -= learning_rate * grad_y_pred  # memperbarui nilai prediksi berdasarkan gradien

# Tampilkan hasil
print("Gradien terhadap prediksi:", grad_y_pred.item())
print("Prediksi setelah pembaruan:", y_pred.data.item())
print("Nilai NLL:", nll.item())


Gradien terhadap prediksi: -1.0466697216033936
Prediksi setelah pembaruan: 2.721996784210205
Nilai NLL: -2.4837968349456787


In [2]:
import torch
import torch.distributions as dist

# Data
# y_actual = torch.tensor([0.7738])
# y_pred = torch.tensor([0.8000, 0.5400, 0.5500, 0.6800])
y_actual = torch.tensor([-0.006289])  # nilai aktual

# Prediksi dari model
y_pred = torch.tensor([2.71153], requires_grad=True) 
print(y_actual*y_pred)
variance = torch.tensor([2.11])
beta = 0.0

# Distribusi Normal untuk setiap prediksi
nll_losses = []
for i in range(len(y_actual)):
    studentt = dist.StudentT(loc=y_pred[i], scale=torch.sqrt(variance[i]), df=3)
    print(studentt)
    nll = -studentt.log_prob(y_actual)
    print(nll)
    
    # Jika β-weighting digunakan
    if beta > 0:
        nll = nll * (variance[i] ** beta)
    print(nll.item())
    nll_losses.append(nll.item())

# Print hasil NLL untuk setiap baris
for i, nll in enumerate(nll_losses):
    print(f"NLL untuk baris {i+1}: {nll:.6f}")

# Total NLL (sum atau average bisa dipilih)
total_nll = sum(nll_losses)
print(f"Total NLL: {total_nll:.6f}")


tensor([-0.0171], grad_fn=<MulBackward0>)


StudentT(df: 3.0, loc: 2.7115299701690674, scale: 1.452583909034729)
tensor([2.9208], grad_fn=<NegBackward0>)
2.920837163925171
NLL untuk baris 1: 2.920837
Total NLL: 2.920837


In [4]:
import torch
import torch.nn as nn
import numpy as np

# Data Input (8 data pertama) dan Target (2 data terakhir)
data = torch.tensor([
    0.00165, 0.007652, 0.010573, 0.015362, 
    0.008502, 0.010746, -0.000135, -0.003359, 
    -0.003548, -0.006289
], dtype=torch.float32)

X = data[:8]  # Input
print(X)
Y_actual = data[8:]  # Target
print(Y_actual)

# Simple Decoder-Only Model (Lag-Llama Style)
class LagLlama(nn.Module):
    def __init__(self, input_size):
        super(LagLlama, self).__init__()
        self.rmsnorm = nn.LayerNorm(input_size)
        self.attention = nn.MultiheadAttention(embed_dim=input_size, num_heads=1)
        self.fc = nn.Linear(input_size, 1)  # Output layer

    def forward(self, x):
        x = self.rmsnorm(x)
        attn_output, _ = self.attention(x, x, x)
        output = self.fc(attn_output)
        return output

# Inisialisasi Model
model = LagLlama(input_size=8)

# Prediksi Pertama (x₉)
X_input = X.unsqueeze(0)  # Tambahkan batch dimension
print(X_input)
pred_9 = model(X_input)
pred_9 = pred_9.squeeze(-1)  # Hilangkan dimensi ekstra jika ada
print(f"Prediksi x₉: {pred_9.item():.6f}")

# Prediksi Kedua (x₁₀)
# Pastikan pred_9 memiliki dimensi yang konsisten dengan X[1:]
X_input_next = torch.cat([X[1:], pred_9.view(1)], dim=0).unsqueeze(0)  # Shift input
print(X_input_next)

pred_10 = model(X_input_next)
print(f"Prediksi x₁₀: {pred_10.item():.6f}")

tensor([ 0.0016,  0.0077,  0.0106,  0.0154,  0.0085,  0.0107, -0.0001, -0.0034])
tensor([-0.0035, -0.0063])
tensor([[ 0.0016,  0.0077,  0.0106,  0.0154,  0.0085,  0.0107, -0.0001, -0.0034]])
Prediksi x₉: 0.326751
tensor([[ 7.6520e-03,  1.0573e-02,  1.5362e-02,  8.5020e-03,  1.0746e-02,
         -1.3500e-04, -3.3590e-03,  3.2675e-01]], grad_fn=<UnsqueezeBackward0>)
Prediksi x₁₀: 0.741542


In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

# Data token (7 timesteps, 4 fitur per timestep)
tokens = torch.tensor([[ 0.4697,  0.2380, -0.2380],
        [ 0.8495,  0.4697,  0.2380],
        [ 0.3054,  0.8495,  0.4697],
        [ 0.4834,  0.3054,  0.8495],
        [-0.3796,  0.4834,  0.3054],
        [-0.6353, -0.3796,  0.4834],
        [-0.6503, -0.6353, -0.3796]], dtype=torch.float32)  # Shape: (7, 4)

# Definisikan layer embedding linear
embedding_layer = nn.Linear(3, 4)  # Proyeksi dari 4 fitur ke 8 dimensi
print(embedding_layer)

# Terapkan embedding
embedded_tokens = embedding_layer(tokens)
print("Embedded Tokens:\n", embedded_tokens)
print("Shape:", embedded_tokens.shape)

Linear(in_features=3, out_features=4, bias=True)
Embedded Tokens:
 tensor([[ 0.4167,  0.0723,  0.3051, -0.2716],
        [ 0.3404,  0.1797,  0.1877, -0.1961],
        [ 0.3814, -0.2512,  0.1169,  0.1060],
        [ 0.3181,  0.0702,  0.4226, -0.1643],
        [ 0.4571, -0.4543,  0.4006,  0.0442],
        [ 0.4488, -0.2273,  0.9171, -0.3145],
        [ 0.5233, -0.1446,  0.9295, -0.4971]], grad_fn=<AddmmBackward0>)
Shape: torch.Size([7, 4])


In [None]:
class RMSNorm(nn.Module):
    """Root Mean Square Layer Normalization."""
    def __init__(self, size: int, eps: float = 1e-5):
        super().__init__()
        self.scale = nn.Parameter(torch.ones(size))  # Parameter skala (trainable)
        self.eps = eps  # Untuk mencegah pembagian nol

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # Hitung RMS (Root Mean Square)
        norm_x = x.pow(2).mean(dim=-1, keepdim=True)
        rms_x = torch.sqrt(norm_x + self.eps)
        
        # Normalisasi dan terapkan skala trainable
        x_normed = x / rms_x
        return self.scale * x_normed

# Contoh RMSNorm untuk embedded tokens
rmsnorm = RMSNorm(size=4)
print(rmsnorm)

normalized_tokens = rmsnorm(embedded_tokens)
print("Normalized Tokens:\n", normalized_tokens)
print("Shape:", normalized_tokens.shape)

RMSNorm()
Normalized Tokens:
 tensor([[ 1.4173,  0.2459,  1.0378, -0.9238],
        [ 1.4454,  0.7628,  0.7970, -0.8325],
        [ 1.5784, -1.0398,  0.4838,  0.4388],
        [ 1.1395,  0.2514,  1.5137, -0.5886],
        [ 1.2026, -1.1954,  1.0540,  0.1163],
        [ 0.8217, -0.4162,  1.6792, -0.5759],
        [ 0.8828, -0.2439,  1.5679, -0.8384]], grad_fn=<MulBackward0>)
Shape: torch.Size([7, 4])


In [None]:
class QKVProjection(nn.Module):
    def __init__(self, emb_dim, head_dim):
        super().__init__()
        self.q_proj = nn.Linear(emb_dim, head_dim, bias=False)
        self.k_proj = nn.Linear(emb_dim, head_dim, bias=False)
        self.v_proj = nn.Linear(emb_dim, head_dim, bias=False)

    def forward(self, x):
        Q = self.q_proj(x)  # Proyeksi query
        K = self.k_proj(x)  # Proyeksi key
        V = self.v_proj(x)  # Proyeksi value
        return Q, K, V

# Inisialisasi dengan ukuran embedding 8 dan head dimension 8
qkv_layer = QKVProjection(emb_dim=4, head_dim=4)
print(qkv_layer)

Q, K, V = qkv_layer(normalized_tokens)

print("Matriks Q:\n", Q)
print("Matriks K:\n", K)
print("Matriks V:\n", V)
print(Q.shape)
print(K.shape)
print(V.shape)

QKVProjection(
  (q_proj): Linear(in_features=4, out_features=4, bias=False)
  (k_proj): Linear(in_features=4, out_features=4, bias=False)
  (v_proj): Linear(in_features=4, out_features=4, bias=False)
)
Matriks Q:
 tensor([[ 0.8554, -0.0846,  0.0809,  0.3194],
        [ 1.0166,  0.1040,  0.2562,  0.2669],
        [ 0.6709,  0.5092,  0.2853,  0.9109],
        [ 0.9918, -0.2330, -0.1799,  0.5199],
        [ 0.5214,  0.0289, -0.1140,  0.8195],
        [ 0.6369, -0.4936, -0.4433,  0.5148],
        [ 0.6128, -0.5068, -0.3746,  0.3720]], grad_fn=<MmBackward0>)
Matriks K:
 tensor([[-0.1401,  0.4000,  0.1707,  0.1956],
        [-0.2047,  0.4241, -0.1946,  0.2855],
        [ 0.6553,  0.3508, -0.1717, -0.6181],
        [-0.2861,  0.2350,  0.2838,  0.4305],
        [ 0.3966,  0.2196,  0.3934, -0.3465],
        [-0.2144,  0.1241,  0.7472,  0.3202],
        [-0.2629,  0.1821,  0.7336,  0.3420]], grad_fn=<MmBackward0>)
Matriks V:
 tensor([[ 0.7864,  0.8289, -0.6233, -0.5611],
        [ 0.8741,  0.68

In [None]:
def rotate_half(x):
    """Pisahkan separuh dimensi dan rotasi bagian kedua."""
    x1 = x[..., : x.shape[-1] // 2]
    x2 = x[..., x.shape[-1] // 2 :]
    return torch.cat((-x2, x1), dim=-1)

def apply_rope(q, k, cos, sin, position_ids):
    """Terapkan RoPE dengan cosine dan sine pada Q dan K."""
    cos = cos.squeeze(1).squeeze(0)  # [seq_len, dim]
    sin = sin.squeeze(1).squeeze(0)  # [seq_len, dim]
    cos = cos[position_ids].unsqueeze(1)  # [bs, 1, seq_len, dim]
    sin = sin[position_ids].unsqueeze(1)
    print(cos)
    print(sin)  # [bs, 1, seq_len, dim]
    q_embed = (q * cos) + (rotate_half(q) * sin)
    k_embed = (k * cos) + (rotate_half(k) * sin)
    return q_embed, k_embed

# Terapkan RoPE pada Q dan K
# Contoh cosine dan sine
cos = torch.cos(torch.arange(0, 10).unsqueeze(1))  # Simulasi cosine untuk 10 posisi
sin = torch.sin(torch.arange(0, 10).unsqueeze(1))  # Simulasi sine untuk 10 posisi

# Terapkan RoPE pada Q dan K
position_ids = torch.arange(Q.shape[0])  # [0, 1, 2, ..., 9]
Q_rope, K_rope = apply_rope(Q, K, cos, sin, position_ids)

print("Q dengan RoPE:\n", Q_rope)
print("K dengan RoPE:\n", K_rope)
print(Q_rope.shape, K_rope.shape)

tensor([[ 1.0000],
        [ 0.5403],
        [-0.4161],
        [-0.9900],
        [-0.6536],
        [ 0.2837],
        [ 0.9602]])
tensor([[ 0.0000],
        [ 0.8415],
        [ 0.9093],
        [ 0.1411],
        [-0.7568],
        [-0.9589],
        [-0.2794]])
Q dengan RoPE:
 tensor([[ 0.8554, -0.0846,  0.0809,  0.3194],
        [ 0.3337, -0.1684,  0.9939,  0.2317],
        [-0.5386, -1.0402,  0.4913,  0.0840],
        [-0.9565,  0.1573,  0.3181, -0.5476],
        [-0.4271,  0.6013, -0.3201, -0.5575],
        [-0.2445,  0.3536, -0.7365,  0.6193],
        [ 0.4837, -0.3827, -0.5309,  0.4988]], grad_fn=<AddBackward0>)
K dengan RoPE:
 tensor([[-0.1401,  0.4000,  0.1707,  0.1956],
        [ 0.0532, -0.0111, -0.2774,  0.5112],
        [-0.1166,  0.4160,  0.6673,  0.5762],
        [ 0.2432, -0.2934, -0.3213, -0.3930],
        [ 0.0385, -0.4058, -0.5573,  0.0603],
        [ 0.6557,  0.3423,  0.4175, -0.0281],
        [-0.0474,  0.2704,  0.7779,  0.2775]], grad_fn=<AddBackward0>)
torch.

In [None]:
def masked_causal_attention(Q, K, V, mask=None):
    """
    Implementasi Masked Causal Self-Attention.
    """
    # Ukuran batch dan sequence length
    batch_size, seq_len, head_dim = Q.size()

    # Langkah 1: Hitung skor perhatian (Q @ K^T)
    attention_scores = torch.matmul(Q, K.transpose(-2, -1)) / torch.sqrt(torch.tensor(head_dim, dtype=torch.float32))

    # Langkah 2: Terapkan masking (optional)
    if mask is not None:
        attention_scores = attention_scores.masked_fill(mask == 0, float('-inf'))

    # Langkah 3: Softmax untuk normalisasi skor
    attention_weights = F.softmax(attention_scores, dim=-1)

    # Langkah 4: Kalikan dengan V untuk mendapatkan output
    attention_output = torch.matmul(attention_weights, V)

    return attention_output

# Contoh input: Matriks Q, K, dan V
batch_size, seq_len, head_dim = 1, 7, 4  # Satu batch, 7 timestep, 8 dimensi per token

Q_rope = torch.randn(batch_size, seq_len, head_dim)  # Query dengan RoPE
K_rope = torch.randn(batch_size, seq_len, head_dim)  # Key dengan RoPE
V = torch.randn(batch_size, seq_len, head_dim)       # Value

# Buat mask segitiga bawah untuk causal masking
mask = torch.tril(torch.ones(seq_len, seq_len)).unsqueeze(0).unsqueeze(0)  # (1, 1, 7, 7)
print(mask)
# Hitung attention output
attention_output = masked_causal_attention(Q_rope, K_rope, V, mask=mask)

print("Attention Output:\n", attention_output)
print("Shape:", attention_output.shape)

tensor([[[[1., 0., 0., 0., 0., 0., 0.],
          [1., 1., 0., 0., 0., 0., 0.],
          [1., 1., 1., 0., 0., 0., 0.],
          [1., 1., 1., 1., 0., 0., 0.],
          [1., 1., 1., 1., 1., 0., 0.],
          [1., 1., 1., 1., 1., 1., 0.],
          [1., 1., 1., 1., 1., 1., 1.]]]])
Attention Output:
 tensor([[[[ 1.2906, -0.0791, -0.3988,  1.7156],
          [ 0.6659,  0.2351, -0.2672,  0.2181],
          [-0.1676,  0.0192, -0.0077,  0.4206],
          [-0.7717,  0.1047,  0.8664,  0.0109],
          [-0.4598,  0.4282,  1.5111, -0.0337],
          [-0.1040,  0.4174,  0.7301, -0.1205],
          [-0.6456,  0.5434,  1.3861, -0.1357]]]])
Shape: torch.Size([1, 1, 7, 4])


In [None]:
class ResidualLayer(nn.Module):
    def __init__(self, emb_dim):
        super().__init__()
        self.rms_norm = RMSNorm(size=emb_dim)  # RMSNorm setelah residual connection

    def forward(self, input_tokens, attention_output):
        # Residual connection: Tambahkan input ke attention output
        residual_output = input_tokens + attention_output

        # Terapkan RMSNorm
        normalized_output = self.rms_norm(residual_output)
        return normalized_output

# Inisialisasi residual layer
residual_layer = ResidualLayer(emb_dim=4)  # Dimensi embedding = 8

# Input awal (embedded tokens) dan attention output
# embedded_tokens = torch.randn(1, 7, 4)  # Contoh embedded tokens (batch=1, timesteps=7, emb_dim=8)
# attention_output = torch.randn(1, 7, 4)  # Output dari masked causal self-attention

# Terapkan residual connection dan RMSNorm
residual_output = residual_layer(embedded_tokens, attention_output)

print("Residual Output:\n", residual_output)
print("Shape:", residual_output.shape)

Residual Output:
 tensor([[[[ 1.5257, -0.0061, -0.0837,  1.2904],
          [ 1.8438,  0.7600, -0.1457,  0.0403],
          [ 0.6855, -0.7443,  0.3501,  1.6891],
          [-0.6544,  0.2523,  1.8598, -0.2213],
          [-0.0029, -0.0273,  1.9998,  0.0110],
          [ 0.3944,  0.2174,  1.8840, -0.4976],
          [-0.1004,  0.3274,  1.9008, -0.5194]]]], grad_fn=<MulBackward0>)
Shape: torch.Size([1, 1, 7, 4])


In [None]:
class SwiGLU(nn.Module):
    def __init__(self, emb_dim, hidden_dim):
        super().__init__()
        # Dua layer linear untuk proyeksi
        self.fc1 = nn.Linear(emb_dim, hidden_dim)  # W1 dan b1
        self.fc2 = nn.Linear(emb_dim, hidden_dim)  # W2 dan b2
        self.fc_out = nn.Linear(hidden_dim, emb_dim)  # Output layer

    def forward(self, x):
        # Proyeksi pertama dengan SiLU
        x1 = F.silu(self.fc1(x))  # SiLU(W1 * x + b1)
        # Proyeksi kedua linear
        x2 = self.fc2(x)  # W2 * x + b2

        # Elemen-wise multiplication
        out = x1 * x2  # SwiGLU
        return self.fc_out(out)  # Output layer

# Inisialisasi layer SwiGLU
swiglu_layer = SwiGLU(emb_dim=4, hidden_dim=1)  # Embedding dim 8, hidden dim 32

# Contoh input: Output dari residual connection sebelumnya
# residual_output = torch.randn(1, 7, 4)  # Batch=1, Sequence Length=7, Embedding Dim=8

# Terapkan SwiGLU
feed_forward_output = swiglu_layer(residual_output)

print("Feed Forward Output (SwiGLU):\n", feed_forward_output)
print("Shape:", feed_forward_output.shape)


Feed Forward Output (SwiGLU):
 tensor([[[ 0.4490, -0.4811,  0.8380, -0.4820],
         [ 0.5467, -0.3862,  0.9281, -0.6283],
         [ 0.5068, -0.4250,  0.8913, -0.5685],
         [ 0.3958, -0.5327,  0.7891, -0.4026],
         [ 0.5174, -0.4147,  0.9011, -0.5844],
         [ 0.5551, -0.3781,  0.9358, -0.6408],
         [ 0.4588, -0.4716,  0.8471, -0.4967]]], grad_fn=<ViewBackward0>)
Shape: torch.Size([1, 7, 4])


In [None]:
class RMSNorm(nn.Module):
    """Root Mean Square Layer Normalization."""
    def __init__(self, size: int, eps: float = 1e-5):
        super().__init__()
        self.scale = nn.Parameter(torch.ones(size))  # Skala trainable
        self.eps = eps  # Untuk mencegah pembagian nol

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # Hitung RMS (Root Mean Square)
        norm_x = x.pow(2).mean(dim=-1, keepdim=True)
        rms_x = torch.sqrt(norm_x + self.eps)

        # Normalisasi dan skala
        x_normed = x / rms_x
        return self.scale * x_normed

class ResidualWithRMSNorm(nn.Module):
    def __init__(self, emb_dim):
        super().__init__()
        self.rms_norm = RMSNorm(size=emb_dim)

    def forward(self, input_tokens, feed_forward_output):
        # Residual Connection
        residual_output = input_tokens + feed_forward_output

        # RMSNorm
        normalized_output = self.rms_norm(residual_output)
        return normalized_output

# Inisialisasi residual layer dengan RMSNorm
residual_with_rmsnorm_layer = ResidualWithRMSNorm(emb_dim=4)

# Input: Output dari residual sebelumnya dan feed-forward output
# residual_input = torch.randn(1, 7, 4)  # (Batch=1, Seq Len=7, Emb Dim=8)
# feed_forward_output = torch.randn(1, 7, 4)  # Hasil dari SwiGLU

# Terapkan residual connection dan RMSNorm
final_output = residual_with_rmsnorm_layer(residual_input, feed_forward_output)

print("Final Output:\n", final_output)
print("Shape:", final_output.shape)


Final Output:
 tensor([[[ 1.4249,  0.2248,  1.3836,  0.0692],
         [ 0.0111, -0.8981,  0.5018, -1.7151],
         [ 1.8350, -0.5532, -0.5143, -0.2494],
         [ 0.1115, -0.7550, -0.0445, -1.8481],
         [-0.0155,  0.5651,  1.7255,  0.8385],
         [ 0.4516, -1.2546,  0.6572, -1.3379],
         [-0.3666, -1.6895,  0.8242, -0.5762]]], grad_fn=<MulBackward0>)
Shape: torch.Size([1, 7, 4])


In [None]:
final_output.view(-1,4).shape

torch.Size([7, 4])

In [None]:
from gluonts.torch.distributions import StudentTOutput

class DistributionHead(nn.Module):
    def __init__(self, emb_dim):
        super().__init__()
        self.param_proj = nn.Linear(emb_dim, 3)
        self.distr_output = StudentTOutput()

    def forward(self, x):
        distr_params = self.param_proj(x)

        df = F.softplus(distr_params[..., 0]) + 1e-5
        loc = distr_params[..., 1]
        scale = F.softplus(distr_params[..., 2])

        distr = self.distr_output.distribution(df=df, loc=loc, scale=scale)

        return distr, loc, scale

In [None]:
distribution_head = DistributionHead(emb_dim=4)
final_output = final_output

distr, loc, scale = distribution_head(final_output.view(-1,4))
predictions = distr.sample()

KITA PERBARUI

In [None]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim

# Contoh data time series
data = {
    'date': pd.date_range(start='2013-02-06', periods=10, freq='D'),
    'value': [-0.23802, 0.23802, 0.46969, 0.84952, 0.30543, 
              0.48341, -0.37959, -0.6353, -0.65029, -0.86769]
}
df = pd.DataFrame(data)

# Membuat lagged features
def create_lagged_features(df, lag):
    for i in range(1, lag + 1):
        df[f'lag_{i}'] = df['value'].shift(i)
    df = df.dropna()  # Hapus nilai NaN
    return df

print(df)

# Menambahkan 3 lag features
df_lagged = create_lagged_features(df, lag=3)
print(df_lagged)

# Data fitur (X) dan target (y)
X = df_lagged.drop(columns=['date', 'value']).values
y = df_lagged['value'].values

# Mengonversi ke tensor
X_tensor = torch.tensor(X, dtype=torch.float32)
y_tensor = torch.tensor(y, dtype=torch.float32)
print('X_tensor:', X_tensor)
print('y_tensor:', y_tensor)


        date    value
0 2013-02-06 -0.23802
1 2013-02-07  0.23802
2 2013-02-08  0.46969
3 2013-02-09  0.84952
4 2013-02-10  0.30543
5 2013-02-11  0.48341
6 2013-02-12 -0.37959
7 2013-02-13 -0.63530
8 2013-02-14 -0.65029
9 2013-02-15 -0.86769
        date    value    lag_1    lag_2    lag_3
3 2013-02-09  0.84952  0.46969  0.23802 -0.23802
4 2013-02-10  0.30543  0.84952  0.46969  0.23802
5 2013-02-11  0.48341  0.30543  0.84952  0.46969
6 2013-02-12 -0.37959  0.48341  0.30543  0.84952
7 2013-02-13 -0.63530 -0.37959  0.48341  0.30543
8 2013-02-14 -0.65029 -0.63530 -0.37959  0.48341
9 2013-02-15 -0.86769 -0.65029 -0.63530 -0.37959
X_tensor: tensor([[ 0.4697,  0.2380, -0.2380],
        [ 0.8495,  0.4697,  0.2380],
        [ 0.3054,  0.8495,  0.4697],
        [ 0.4834,  0.3054,  0.8495],
        [-0.3796,  0.4834,  0.3054],
        [-0.6353, -0.3796,  0.4834],
        [-0.6503, -0.6353, -0.3796]])
y_tensor: tensor([ 0.8495,  0.3054,  0.4834, -0.3796, -0.6353, -0.6503, -0.8677])


In [None]:
class DecoderBlock(nn.Module):
    def __init__(self, embedding_dim):
        super(DecoderBlock, self).__init__()
        self.self_attn = nn.MulheadAttention(embed_dim=embedding_dim, num_heads=4)
        self.fc = nn.Linear(embedding_dim, embedding_dim)
        self.norm1 = nn.LayerNorm(embedding_dim)
        self.norm2 = nn.LayerNorm(embedding_dim)
        self.dropout = nn.Dropout(0.1)

    def forward(self, x):
        # Self-attention
        attn_output, _ = self.self_attn(x, x, x)
        x = x + self.dropout(attn_output)  # Residual connection
        x = self.norm1(x)

        # Feedforward
        ff_output = self.fc(x)
        x = x + self.dropout(ff_output)  # Residual connection
        x = self.norm2(x)

        return x


In [None]:
class LLAMA(nn.Module):
    def __init__(self, input_size, embedding_dim, num_decoder_layers):
        super(LLAMA, self).__init__()
        self.embedding = nn.Linear(input_size, embedding_dim)  # Layer embedding
        self.rmsnorm = nn.LayerNorm(embedding_dim)  # Normalisasi
        self.decoder_blocks = nn.ModuleList([DecoderBlock(embedding_dim) for _ in range(num_decoder_layers)])
        self.fc = nn.Linear(embedding_dim, 1)  # Layer output untuk prediksi

    def forward(self, x):
        x = self.embedding(x)  # Proses embedding
        x = self.rmsnorm(x)    # Normalisasi
        for decoder in self.decoder_blocks:
            x = decoder(x)
        x = self.fc(x)         # Layer output
        return x

# Parameter model
input_size = X_tensor.shape[1]
print(input_size)  # Jumlah fitur
embedding_dim = 8             # Dimensi embedding
model = LLAMA(input_size, embedding_dim, 3)

def get_embeddings(model, data):
    model.eval()  # Set model ke evaluasi
    with torch.no_grad():
        embeddings = model.embedding(data)  # Mendapatkan hasil embedding
        return embeddings

# Mendapatkan hasil embedding untuk data fitur
embeddings = get_embeddings(model, X_tensor)

# Menampilkan hasil embedding
print("Hasil Embedding:")
print(embeddings)

3
Hasil Embedding:
tensor([[-0.6979,  0.5118, -0.0960,  0.1352,  0.7720,  0.3822,  0.3115,  0.4134],
        [-0.5246,  0.8096, -0.1240,  0.1079,  0.6297,  0.5786,  0.4924,  0.7130],
        [-0.4350,  0.7897, -0.1868, -0.1055,  0.2342,  0.3228,  0.7014,  0.3502],
        [-0.2262,  0.6745,  0.0572, -0.2512,  0.2114,  0.6158,  0.7440,  0.7986],
        [-0.4469,  0.3568, -0.0412, -0.3047,  0.1281,  0.1122,  0.6536,  0.0475],
        [-0.2809, -0.0806,  0.3098, -0.5506,  0.1044,  0.2718,  0.6142,  0.3015],
        [-0.6369, -0.2960,  0.2602, -0.2688,  0.5860,  0.1362,  0.2395,  0.0709]])


In [None]:
import torch

# Data harga saham yang diberikan
data = {
    "2013-02-06": 0.00165,
    "2013-02-07": 0.007652,
    "2013-02-08": 0.010573,
    "2013-02-09": 0.015362,
    "2013-02-10": 0.008502,
    "2013-02-11": 0.010746,
    "2013-02-12": -0.000135,
    "2013-02-13": -0.003359,
    "2013-02-14": -0.003548,
    "2013-02-15": -0.006289
}

# Mengkonversi data ke tensor PyTorch
dates = list(data.keys())
print(dates)
values = list(data.values())
print(values)
values_tensor = torch.tensor(values, dtype=torch.float32)
print(values_tensor)

# Menentukan lag sequence
lags_seq = [1, 2, 3]  # menggunakan lag 1, 2, dan 3 hari

def lagged_sequence_values(lags_seq, values_tensor):
    n = len(values_tensor)
    max_lag = max(lags_seq)
    print(n,max_lag)
    # Inisialisasi tensor untuk menyimpan lagged features
    lagged_features = torch.zeros((n-max_lag, len(lags_seq)))
    print(lagged_features)
    for idx, lag in enumerate(lags_seq):
        print(idx,lag)
        lagged_features[:, idx] = values_tensor[max_lag-lag:n-lag]
        print(lagged_features)
    return lagged_features

# Mendapatkan fitur lag
lag_features = lagged_sequence_values(lags_seq, values_tensor)

print("Lag features (tokenized input):")
print(lag_features)


['2013-02-06', '2013-02-07', '2013-02-08', '2013-02-09', '2013-02-10', '2013-02-11', '2013-02-12', '2013-02-13', '2013-02-14', '2013-02-15']
[0.00165, 0.007652, 0.010573, 0.015362, 0.008502, 0.010746, -0.000135, -0.003359, -0.003548, -0.006289]
tensor([ 0.0016,  0.0077,  0.0106,  0.0154,  0.0085,  0.0107, -0.0001, -0.0034,
        -0.0035, -0.0063])
10 3
tensor([[0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.]])
0 1
tensor([[ 0.0106,  0.0000,  0.0000],
        [ 0.0154,  0.0000,  0.0000],
        [ 0.0085,  0.0000,  0.0000],
        [ 0.0107,  0.0000,  0.0000],
        [-0.0001,  0.0000,  0.0000],
        [-0.0034,  0.0000,  0.0000],
        [-0.0035,  0.0000,  0.0000]])
1 2
tensor([[ 0.0106,  0.0077,  0.0000],
        [ 0.0154,  0.0106,  0.0000],
        [ 0.0085,  0.0154,  0.0000],
        [ 0.0107,  0.0085,  0.0000],
        [-0.0001,  0.0107,  0.0000],
        [-0.0034, -0.0001,  0.0000]

# EMBEDDING

In [None]:
import torch
from torch import nn
import math

class LlamaRotaryEmbedding(nn.Module):
    def __init__(self, dim, max_position_embeddings=2048, base=10000):
        super().__init__()
        self.dim = dim
        self.max_position_embeddings = max_position_embeddings
        self.base = base
        inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2).float() / self.dim))
        self.register_buffer("inv_freq", inv_freq, persistent=False)

        self._set_cos_sin_cache(
            seq_len=max_position_embeddings,
            device=self.inv_freq.device,
            dtype=torch.get_default_dtype(),
        )

    def _set_cos_sin_cache(self, seq_len, device, dtype):
        self.max_seq_len_cached = seq_len
        t = torch.arange(
            self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype
        )

        freqs = torch.einsum("n,d->nd", t, self.inv_freq)
        # Different from paper, but it uses a different permutation in order to obtain the same calculation
        emb = torch.cat((freqs, freqs), dim=-1)
        self.register_buffer(
            "cos_cached", emb.cos()[None, None, :, :].to(dtype), persistent=False
        )
        self.register_buffer(
            "sin_cached", emb.sin()[None, None, :, :].to(dtype), persistent=False
        )

    def forward(self, device, dtype, seq_len=None):
        # x: [bs, num_attention_heads, seq_len, head_size]
        if seq_len > self.max_seq_len_cached:
            self._set_cos_sin_cache(seq_len=seq_len, device=device, dtype=dtype)

        return (
            self.cos_cached[:, :, :seq_len, ...].to(dtype=dtype),
            self.sin_cached[:, :, :seq_len, ...].to(dtype=dtype),
        )


In [None]:
class LagLlamaModelEmb(nn.Module):
    def __init__(self, input_size, embedding_dim):
        super(LagLlamaModelEmb, self).__init__()
        self.input_projection = nn.Linear(input_size, embedding_dim)  # Layer linear untuk embedding
        self.embedding_dim = embedding_dim
        self.rotary_embedding = LlamaRotaryEmbedding(embedding_dim)

    def forward(self, x):
        x = self.input_projection(x)  # Aplikasi embedding linear
        cos, sin = self.rotary_embedding(x.device, x.dtype, x.shape[1])
        return x, cos, sin


In [None]:
# Pengaturan model
input_size = 3  # Misalkan input memiliki 3 fitur
embedding_dim = 1  # Ukuran embedding yang diinginkan
model = LagLlamaModelEmb(input_size, embedding_dim)

# Input yang diberikan
lag_features_new = lag_features.unsqueeze(0)
print(lag_features_new)  # Menambahkan dimensi batch untuk konsistensi

# Penggunaan model
output, cos, sin = model(lag_features)
print("Output of Rotary Embedding:", output)
print("Cosine Embedding:", cos)
print("Sine Embedding:", sin)

tensor([[[ 0.0106,  0.0077,  0.0016],
         [ 0.0154,  0.0106,  0.0077],
         [ 0.0085,  0.0154,  0.0106],
         [ 0.0107,  0.0085,  0.0154],
         [-0.0001,  0.0107,  0.0085],
         [-0.0034, -0.0001,  0.0107],
         [-0.0035, -0.0034, -0.0001]]])
Output of Rotary Embedding: tensor([[0.0716],
        [0.0753],
        [0.0750],
        [0.0770],
        [0.0718],
        [0.0712],
        [0.0668]], grad_fn=<AddmmBackward0>)
Cosine Embedding: tensor([[[[1., 1.]]]])
Sine Embedding: tensor([[[[0., 0.]]]])


# DECODER

RMSNorm 1

In [None]:
class RMSNorm(nn.Module):
    def __init__(self, size, eps=1e-5):
        super(RMSNorm, self).__init__()
        self.size = size
        self.eps = eps
        self.scale = nn.Parameter(torch.ones(size))

    def forward(self, x):
        # Menghitung mean square
        mean_square = x.pow(2).mean(-1, keepdim=True)
        # Normalisasi
        normalized_x = x * torch.rsqrt(mean_square + self.eps)
        return self.scale * normalized_x

In [None]:
class LagLlamaModelNorm1(nn.Module):
    def __init__(self, input_size, embedding_dim):
        super(LagLlamaModelNorm1, self).__init__()
        self.input_projection = nn.Linear(input_size, embedding_dim)  # Layer linear untuk embedding
        self.embedding_dim = embedding_dim
        self.rotary_embedding = LlamaRotaryEmbedding(embedding_dim)
        self.rms_norm = RMSNorm(embedding_dim)

    def forward(self, x):
        x = self.input_projection(x)  # Aplikasi embedding linear
        x = self.rms_norm(x)
        cos, sin = self.rotary_embedding(x.device, x.dtype, x.shape[1])
        return x, cos, sin


In [None]:
model = LagLlamaModelNorm1(input_size=3, embedding_dim=8)  # Misalnya batch size 1, sequence length 7, feature size 3
output_rms_1 = model(lag_features_new)
print("Output after RMSNorm:", output_rms_1)

Output after RMSNorm: (tensor([[[-1.3549, -1.0243, -1.2932, -1.4913,  0.1329,  1.0596,  0.2657,
           0.0835],
         [-1.3541, -1.0176, -1.2958, -1.4855,  0.1396,  1.0681,  0.2737,
           0.0951],
         [-1.3721, -1.0078, -1.2946, -1.4787,  0.1280,  1.0649,  0.2840,
           0.0870],
         [-1.3671, -1.0144, -1.2945, -1.4760,  0.1437,  1.0666,  0.2824,
           0.0944],
         [-1.3817, -1.0133, -1.2906, -1.4793,  0.1225,  1.0539,  0.2804,
           0.0728],
         [-1.3808, -1.0256, -1.2878, -1.4787,  0.1351,  1.0477,  0.2740,
           0.0705],
         [-1.3691, -1.0370, -1.2861, -1.4915,  0.1297,  1.0409,  0.2583,
           0.0617]]], grad_fn=<MulBackward0>), tensor([[[[ 1.0000,  1.0000,  1.0000,  1.0000,  1.0000,  1.0000,  1.0000,
            1.0000],
          [ 0.5403,  0.9950,  0.9999,  1.0000,  0.5403,  0.9950,  0.9999,
            1.0000],
          [-0.4161,  0.9801,  0.9998,  1.0000, -0.4161,  0.9801,  0.9998,
            1.0000],
          [-0.

CausalSelfAtt

In [None]:
def rotate_half(x):
    x1 = x[..., :x.shape[-1] // 2]
    x2 = x[..., x.shape[-1] // 2:]
    return torch.cat((-x2, x1), dim=-1)

def apply_rotary_pos_emb(q, k, pos_emb):
    cos, sin = pos_emb[..., :q.shape[-1] // 2], pos_emb[..., q.shape[-1] // 2:]
    q_rot = (q * cos) + (rotate_half(q) * sin)
    k_rot = (k * cos) + (rotate_half(k) * sin)
    return q_rot, k_rot

In [None]:
class CausalSelfAttention(nn.Module):
    def __init__(self, embed_size, heads):
        super().__init__()
        self.embed_size = embed_size
        self.heads = heads
        self.head_dim = embed_size // heads
        assert self.head_dim * heads == self.embed_size, "Embed size needs to be divisible by heads"
        self.queries = nn.Linear(self.head_dim, self.head_dim, bias=False)
        self.keys = nn.Linear(self.head_dim, self.head_dim, bias=False)
        self.values = nn.Linear(self.head_dim, self.head_dim, bias=False)
        self.fc_out = nn.Linear(heads * self.head_dim, embed_size)
        self.rotary_embedding = LlamaRotaryEmbedding(self.head_dim)

    def forward(self, values, keys, query, mask):
        N, value_len, key_len, query_len = values.shape[0], values.shape[1], keys.shape[1], query.shape[1]
        print("Initial shapes - Queries:", query.shape, "Keys:", keys.shape, "Values:", values.shape)
        
        values = self.values(values)
        keys = self.keys(keys)
        queries = self.queries(query)

        # Apply rotary positional embedding to queries and keys
        pos_emb = self.rotary_embedding(query, query_len)
        queries, keys = apply_rotary_pos_emb(queries, keys, pos_emb)

        energy = torch.einsum("nqhd,nkhd->nhqk", [queries, keys])
        if mask is not None:
            energy = energy.masked_fill(mask == 0, float("-1e20"))
        
        attention = torch.softmax(energy / (self.embed_size ** (1 / 2)), dim=3)
        out = torch.einsum("nhql,nlhd->nqhd", [attention, values]).reshape(N, query_len, self.heads * self.head_dim)
        out = self.fc_out(out)
        return out

In [None]:
class LagLlamaModelCaus(nn.Module):
    def __init__(self, input_size, embedding_dim, heads):
        super(LagLlamaModelCaus, self).__init__()
        self.input_projection = nn.Linear(input_size, embedding_dim)  # Layer linear untuk embedding
        self.embedding_dim = embedding_dim
        self.rotary_embedding = LlamaRotaryEmbedding(embedding_dim)
        self.rms_norm = RMSNorm(embedding_dim)
        self.causal_attention = CausalSelfAttention(embedding_dim, heads)

    def forward(self, x):
        x = self.input_projection(x)  # Aplikasi embedding linear
        x = self.rms_norm(x)
        cos, sin = self.rotary_embedding(x.device, x.dtype, x.shape[1])
        x = self.causal_attention(x,x,x,None)
        return x, cos, sin


In [None]:
heads = 8
model = LagLlamaModelCaus(input_size, embedding_dim, heads)
output = model(lag_features_new)
print("Output after CausalSelfAttention:", output)