# Package Import & File Checking

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import torch
import torch.nn as nn
import torch.nn.functional as Fun
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Simple Transformer

- Paper About Transformer: https://arxiv.org/pdf/1706.03762.pdf <br>
- The Transformer code is adapted from: https://www.youtube.com/watch?v=U0s0f995w14


### Attention (Scaled Dot-Product Attention)

In [2]:
class SelfAttention(nn.Module):
    def __init__(self, embed_size, heads):
        super(SelfAttention, self).__init__()
        self.embed_size = embed_size
        self.heads = heads
        self.head_dim = embed_size // heads
        
        assert (self.head_dim * heads == embed_size), "Embed Size needs to be div by heads"
        
        self.values = nn.Linear(self.head_dim, self.head_dim, bias=False)
        self.keys = nn.Linear(self.head_dim, self.head_dim, bias=False)
        self.queries = nn.Linear(self.head_dim, self.head_dim, bias=False)
        self.fc_out = nn.Linear(heads*self.head_dim, embed_size)
        
    def forward(self, values, keys, query, mask):
        N = query.shape[0]
        value_len, key_len, query_len = values.shape[1], keys.shape[1], query.shape[1]
        
        values = values.reshape(N, value_len, self.heads, self.head_dim)
        keys = keys.reshape(N, key_len, self.heads, self.head_dim)
        queries = query.reshape(N, query_len, self.heads, self.head_dim)
        
        energy = torch.einsum("nqhd,nkhd->nhqk", [queries, keys])  #E = QK^T
        #queries shape: (N, query_len, heads, heads_dim), ......
        
        
        if mask is not None:
            energy = energy.masked_fill(mask==0, float("-1e20"))   #-infinity
        
        attention = torch.softmax(energy / (self.embed_size ** (1/2)), dim=3)
        
        out = torch.einsum("nhqk,nkhd->nqhd", [attention, values]).reshape(
            N, query_len, self.heads*self.head_dim
        )
        #attention shape: (N, heads, query_len, key_len)
        #values shape: (N, value_len, heads, heads_dim)
        # (N, query_len, heads, head_dim) (key_len = query_len)
        out  = self.fc_out(out)
        return out
        
        

### Transformer Block

In [3]:
class TransformerBlock(nn.Module):
    def __init__(self, embed_size, heads, dropout, forward_expansion):
        super(TransformerBlock, self).__init__()
        self.attention = SelfAttention(embed_size, heads)
        self.norm1 = nn.LayerNorm(embed_size)
        self.norm2 = nn.LayerNorm(embed_size)
        
        self.feed_forward = nn.Sequential(
            nn.Linear(embed_size, forward_expansion*embed_size),
            nn.ReLU(),
            nn.Linear(forward_expansion*embed_size, embed_size)
        )
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, value, key, query, mask):
        attention = self.attention.forward(value, key, query, mask)
            
        x = self.dropout(self.norm1(attention+query))
        forward = self.feed_forward(x)
        out = self.dropout(self.norm2(forward+x))
        return out

### Encoder

In [4]:
class Encoder(nn.Module):
    def __init__(
        self, 
        src_vocab_size,
        embed_size,
        num_layers,
        heads,
        device,
        forward_expansion,
        dropout,
        max_length    #Avoid Sentences are too long
    ):
        super(Encoder, self).__init__()
        self.embed_size = embed_size
        self.device = device
        self.word_embedding = nn.Embedding(src_vocab_size, embed_size)
        print("embed_size", embed_size)
        self.position_embedding = nn.Embedding(max_length, embed_size)
        
        self.layers = nn.ModuleList(
            [
                TransformerBlock(
                    embed_size,
                    heads,
                    dropout=dropout,
                    forward_expansion=forward_expansion
                )
            ]
        )
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, x, mask):
        N, seq_length = x.shape
        positions = torch.arange(0, seq_length).expand(N,seq_length).to(self.device)
            
        out = self.dropout(self.word_embedding(x) + self.position_embedding(positions))
        for layer in self.layers:
            out = layer(out, out, out, mask)
        
                
        return out

### Decoder

In [5]:
class DecoderBlock(nn.Module):
    def __init__(self, embed_size, heads, forward_expansion, dropout, device):
        super(DecoderBlock, self).__init__()
        self.attention = SelfAttention(embed_size, heads)
        self.norm = nn.LayerNorm(embed_size)
        self.transformer_block = TransformerBlock(
            embed_size, heads, dropout, forward_expansion, 
        )
        self.dropout = nn.Dropout(dropout)
    def forward(self, x, value, key, src_mask, trg_mask):
        attention = self.attention(x,x,x,trg_mask)
        query = self.dropout(self.norm(attention+x))
        out = self.transformer_block(value, key, query, src_mask)
        return out
            

In [6]:
class Decoder(nn.Module):
    def __init__(self,
                 trg_vocab_size,
                 embed_size,
                 num_layers,
                 heads,
                 forward_expansion,
                 dropout,
                 device,
                 max_length
                ):
        super(Decoder, self).__init__()
        self.device = device
        self.word_embedding = nn.Embedding(trg_vocab_size, embed_size)
        self.position_embedding = nn.Embedding(max_length, embed_size)
        self.layers = nn.ModuleList(
            [DecoderBlock(embed_size, heads, forward_expansion, dropout, device)
                for _ in range(num_layers)]
        )
        self.fc_out = nn.Linear(embed_size, trg_vocab_size)
        self.dropout = nn.Dropout(dropout)
    
    def forward(self, x, enc_out, src_mask, trg_mask):
        N, seq_length = x.shape
        positions = torch.arange(0, seq_length).expand(N, seq_length).to(self.device)
        x = self.dropout((self.word_embedding(x)+self.position_embedding(positions)))
        
        for layer in self.layers:
            x = layer(x,enc_out, enc_out, src_mask, trg_mask)
            
        out = self.fc_out(x)
        return out

### Transformer (Whole)

In [7]:
class Transformer(nn.Module):
    def __init__(
        self,
        src_vocab_size,
        trg_vocab_size,
        src_pad_idx,
        trg_pad_idx,
        embed_size=256,
        num_layers=6,
        forward_expansion=4,
        heads=8,
        dropout=0,
        device='cuda',
        max_length=100
    ):
        super(Transformer, self).__init__()
        self.encoder = Encoder(
            src_vocab_size,
            embed_size,
            num_layers,
            heads,
            device,
            forward_expansion,
            dropout,
            max_length
        )
        self.decoder = Decoder(
            trg_vocab_size,
            embed_size,
            num_layers,
            heads,
            forward_expansion,
            dropout,
            device,
            max_length
        )
        self.src_pad_idx = src_pad_idx
        self.trg_pad_idx = trg_pad_idx
        self.device = device
        
    def make_src_mask(self, src):
        src_mask = (src != self.src_pad_idx).unsqueeze(1).unsqueeze(2)
        #(N,1,1,src_len)
        return src_mask.to(self.device)
    def make_trg_mask(self, trg):
        N, trg_len = trg.shape
        trg_mask = torch.tril(torch.ones((trg_len,trg_len))).expand(
            N,1,trg_len, trg_len
        )
        return trg_mask.to(self.device)
    def forward(self, src, trg):
        src_mask = self.make_src_mask(src)
        trg_mask = self.make_trg_mask(trg)
        enc_src = self.encoder(src, src_mask)
        out = self.decoder(trg, enc_src, src_mask, trg_mask)
        return out

# True Data

### Import Data

In [8]:
summaries_train_df=pd.read_csv("input/summaries_train.csv")
summaries_test_df=pd.read_csv("input/summaries_test.csv")
prompts_train_df=pd.read_csv("input/prompts_train.csv")
prompts_test_df=pd.read_csv("input/prompts_test.csv")
sample_submission_df=pd.read_csv("input/sample_submission.csv")

In [9]:
import os
import sys
problems_train = prompts_train_df.values.tolist()
train_set = summaries_train_df[['prompt_id','text','content','wording']].values.tolist()
train_set = train_set[:100]   #Now, just use 100 training data, 0: prompt_id, 1: text, 2: content, 3:wording, 4: problems
val_set = train_set[-20:]   #Now, just use 100 training data, 0: prompt_id, 1: text, 2: content, 3:wording, 4: problems
#Append problem in train_set
problems = []
answers = []
scores = []
padding_length = 300
for sheet in train_set:
    for problem in problems_train:
        if sheet[0] == problem[0]:
            sheet.append(problem[1])
            break
    sentence = sheet[4].split(" ")
    if len(sentence) > padding_length:
        print("length", len(sentence))
        print("Senetence",sentence)
        print("Length is bigger then padding_length!!")
        sys.exit(1)
    for _ in range(padding_length-len(sentence)):
        sentence.append("0")
    problems.append(sentence)
    sentence = sheet[1].split(" ")
    if len(sentence) > padding_length:
        print("length", len(sentence))
        print("Sentence", sentence)
        print("Length is bigger then padding_length!")
        sys.exit(1)
    for _ in range(padding_length-len(sentence)):
        sentence.append("0")
    answers.append(sentence)
    scores.append(sheet[2:4])

val_problems = []
val_answers = []
val_scores = []
for sheet in val_set:
    for problem in problems_train:
        if sheet[0] == problem[0]:
            sheet.append(problem[1])
            break
    sentence = sheet[4].split(" ")
    if len(sentence) > padding_length:
        print("length", len(sentence))
        print("Senetence",sentence)
        print("Length is bigger then padding_length!!")
        sys.exit(1)
    for _ in range(padding_length-len(sentence)):
        sentence.append("0")
    val_problems.append(sentence)
    sentence = sheet[1].split(" ")
    if len(sentence) > padding_length:
        print("length", len(sentence))
        print("Sentence", sentence)
        print("Length is bigger then padding_length!")
        sys.exit(1)
    for _ in range(padding_length-len(sentence)):
        sentence.append("0")
    val_answers.append(sentence)
    val_scores.append(sheet[2:4])

"""
print("Problems:",problems)
print("Answers:",answers)
print("Scores:",scores)
"""

"""
print("val Problems:",len(val_problems))
print("val Answers:",len(val_answers))
print("val Scores:",scores)
"""



'\nprint("val Problems:",len(val_problems))\nprint("val Answers:",len(val_answers))\nprint("val Scores:",scores)\n'

In [10]:
from sklearn.preprocessing import LabelEncoder
all_words = sum(problems,[])+sum(answers,[])+sum(val_problems, [])+sum(val_answers, [])

label_encoder = LabelEncoder()
temp = label_encoder.fit_transform(all_words)
max_encoded_value = np.max(temp)
for i in range(len(train_set)):
    problems[i] = label_encoder.transform(problems[i]).tolist()
    answers[i] = label_encoder.transform(answers[i]).tolist()
    
for i in range(len(val_set)):
    val_problems[i] = label_encoder.transform(val_problems[i]).tolist()
    val_answers[i] = label_encoder.transform(val_answers[i]).tolist()



In [11]:
print("Src_vocab_size", np.max(temp)+1)

Src_vocab_size 1808


#### Idea of the Model: 
- Only use encoder: <br>
Problems -> Encoder -> enc_pro
                                  -> Neuron Network -> out <- MSE Loss -> true_scores
Answers -> Encoder -> enc->ans <br>
- Call the total process: Measure

## Measure Class

In [12]:
import random
import torch
import torch.nn as nn
import torch.nn.functional as Fun
import numpy as np


class Measure(nn.Module):
    def __init__(
        self,
        src_vocab_size,
        word_length,
        src_pad_idx,
        embed_size=256,
        num_layers=6,
        forward_expansion=4,
        heads=8,
        dropout=0,
        device='cuda',
        max_length=500
        ):
        super(Measure, self).__init__()
        self.problem = Encoder(
            src_vocab_size,
            embed_size,
            num_layers,
            heads,
            device,
            forward_expansion,
            dropout,
            max_length
        )
        self.answer = Encoder(
            src_vocab_size,
            embed_size,
            num_layers,
            heads,
            device,
            forward_expansion,
            dropout,
            max_length
        )
        self.word_length = word_length
        self.feed_forward = nn.Sequential(
            nn.Linear(self.word_length*(embed_size+embed_size), 2*forward_expansion*embed_size),
            nn.ReLU(),
            nn.Linear(2*forward_expansion*embed_size, embed_size),
            nn.ReLU(),
            nn.Linear(embed_size, 2)
        )
        self.src_pad_idx = src_pad_idx
        self.device = device
        
    def make_src_mask(self, src):
        src_mask = (src != self.src_pad_idx).unsqueeze(1).unsqueeze(2)
        #(N,1,1,src_len)
        return src_mask.to(self.device)
    def forward(self, pro, ans):
        pro_mask = self.make_src_mask(pro)
        ans_mask = self.make_src_mask(ans)
        pro_src = self.problem(pro, pro_mask)
        ans_src = self.answer(ans, ans_mask)
        #ans_src = ans_src.unsqueeze(1)
        
        x = torch.cat((pro_src, ans_src), dim=1)
        x = x.reshape(x.shape[0], x.shape[1]*x.shape[2])
        out = self.feed_forward(x)
        return out

## Config & Start Training

In [13]:
src_vocab_size = max_encoded_value+1
num_layers = 3
src_pad_idx = 0
word_length = padding_length
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
epochs = 2
dropout = 0
embed_size = 256
print("Now using:", device)

Now using: cpu


In [None]:
problems = torch.tensor(problems).to(device)
answers = torch.tensor(answers).to(device)
scores = torch.tensor(scores).to(device)
val_problems = torch.tensor(problems).to(device)
val_answers = torch.tensor(answers).to(device)
val_scores = torch.tensor(scores).to(device)
#problems = torch.tensor([[1,5,6,4,3,9,5,2,0], [1,8,7,3,4,5,6,7,2]]).to(device)
#answers = torch.tensor([[1,7,4,2,3,5,9,2,0],[1,5,6,1,2,4,7,6,2]]).to(device)
#scores = torch.tensor([[0.2, 0.3], [0.1, 0.2]]).to(device)
model = Measure(src_vocab_size,word_length,src_pad_idx,embed_size=embed_size,num_layers=num_layers,device=device, dropout=dropout).to(device)
loss_fn = torch.nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.00001)
for i in range(epochs):
    out = model(problems, answers)
    loss = loss_fn(out, scores)
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()
    
    print("Loss of training "+str(i)+": ", loss.item())
    with torch.no_grad():
        out = model(val_problems, val_answers)
        val_loss = loss_fn(out, val_scores)
        print("Loss of valid "+str(i)+": ", val_loss.item())



embed_size 256
embed_size 256


  val_problems = torch.tensor(problems).to(device)
  val_answers = torch.tensor(answers).to(device)
  val_scores = torch.tensor(scores).to(device)
