#Anything Goes Implementation

In [2]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, Model
from tensorflow import keras
import os
import re
import random
import io
from math import log2
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, Flatten
import pickle

In [3]:
def encod_decod(data):
  inputlen=10
  vocab = set([c for a in data[0] for c in a])
  vocab.add('<PAD>')
  encoder = dict((c,i) for i,c in enumerate(vocab))
  decoder = dict((i,c) for i,c in enumerate(vocab))
  return inputlen,vocab,encoder,decoder

In [4]:
def inp_out(vocab,encoder,decoder):
  X = []
  y = []
  inputlen = 10
  for a in data[0]:
      Xenc = [encoder['<PAD>']]*inputlen
      for c in a:
          X.append(Xenc.copy())
          y.append(encoder[c])
          Xenc.pop(0)
          Xenc.append(encoder[c])
      X.append(Xenc.copy())
      y.append(encoder['<PAD>'])
      
  X = np.array(X)
  y = to_categorical(y, num_classes=len(vocab))
  return X,y

In [5]:
def split_function(X,y):
  X_train, X_dev, y_train, y_dev = train_test_split(X, y, test_size=0.1, random_state = 42)
  return X_train,X_dev,y_train,y_dev

In [6]:
def model(vocab,inputlen,X_train,y_train,X_dev,y_dev):
  emb_dim = 10
  model = Sequential()
  model.add(Embedding(input_dim=len(vocab), output_dim=emb_dim, input_length=inputlen))
  model.add(Flatten())
  model.add(Dense(40))
  model.add(Dense(len(vocab), activation='softmax'))
  model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
  model.summary()
  model.fit(X_train, y_train, batch_size=16384, epochs=50, validation_data=(X_dev, y_dev),verbose=2)
  return model

In [7]:
def anything_goes_model(char,model,encoder):
  inputlen=10
  Xout = [encoder['<PAD>']]*inputlen
  Xout.pop(0)
  Xout.append(encoder[char])
  preds = model.predict(np.array([Xout]), verbose=0)
  # print("Probabilities sum at each step: ",sum(preds[0]))

  max_prob = max(preds[0])
  # print(max_prob)
  return max_prob



In [8]:
def evaluate_one(lang,model,encoder):
  testfile = open(lang+'-test.txt', 'r')
  max_history = 100
  history = []
  loss_anything_goes = 0
  #loss_from_scratch = 0
  count = 0
  while True:
    c = testfile.read(1)
    #print(c)
    if not c:
      break
    count += 1
    loss_anything_goes -= log2(anything_goes_model(c, model,encoder))
    #loss_from_scratch -= log2(from_scratch_model(lang, c, history))
    if len(history) == max_history:
      history.pop(0)
    history.append(c)
    #print(loss_anything_goes)
  return [loss_anything_goes/count]

#kwere Execution

In [21]:
data=open("cwe-train.txt",'r',encoding = "UTF-8").readlines()
inputlen,vocab,encoder,decoder=encod_decod(data)
X,y=inp_out(vocab,encoder,decoder)
X_train, X_dev, y_train, y_dev=split_function(X,y)
model=model(vocab,inputlen,X_train,y_train,X_dev,y_dev)

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 10, 10)            330       
                                                                 
 flatten_1 (Flatten)         (None, 100)               0         
                                                                 
 dense_2 (Dense)             (None, 40)                4040      
                                                                 
 dense_3 (Dense)             (None, 33)                1353      
                                                                 
Total params: 5,723
Trainable params: 5,723
Non-trainable params: 0
_________________________________________________________________
Epoch 1/50
67/67 - 1s - loss: 2.7144 - accuracy: 0.4584 - val_loss: 2.0019 - val_accuracy: 0.5008 - 969ms/epoch - 14ms/step
Epoch 2/50
67/67 - 0s - loss: 1.8133 - accuracy: 0.4999 - va

In [None]:
kwere_results=evaluate_one('cwe',model,encoder)

Save Model

In [35]:
with open('/content/drive/MyDrive/Colab Notebooks/kwere_results_final', 'wb') as file:
  pickle.dump(kwere_results, file)

Load Model

In [25]:
with open('/content/drive/MyDrive/Colab Notebooks/kwere_results_final','rb') as file:
    kwere_results = pickle.load(file)

Cross Entropy Loss

In [26]:
print("CWE Cross Entropy Loss: ",kwere_results)

CWE Cross Entropy Loss:  [3.5078298424387603e-05]


#Swahili Execution

In [None]:
data=open("sw-train.txt",'r',encoding = "UTF-8").readlines()
inputlen,vocab,encoder,decoder=encod_decod(data)
X,y=inp_out(vocab,encoder,decoder)
X_train, X_dev, y_train, y_dev=split_function(X,y)
sw_model=model(vocab,inputlen,X_train,y_train,X_dev,y_dev)

In [None]:
swahili_results=evaluate_one('sw',sw_model,encoder)

Save Model

In [None]:
with open('/content/drive/MyDrive/Colab Notebooks/swahili_results_final', 'wb') as file:
  pickle.dump(swahili_results, file)

Load Model

In [None]:
with open('/content/drive/MyDrive/Colab Notebooks/swahili_results_final','rb') as file:
    swahili_results = pickle.load(file)

Cross Entropy Loss

In [None]:
print("CWE Cross Entropy Loss: ",swahili_results)