Anything Goes Implementation

In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, Model
from tensorflow import keras
import os
import re
import random
import io
from math import log2
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, Flatten
import pickle

In [2]:
def input_data(path):
  data=open(path,'r',encoding = "UTF-8").readlines()
  chars = sorted(list(set(data[0])))
  print("Total disctinct chars:", len(chars))
  return data,chars

In [3]:
def inp_out_structure(data):
  maxlen = 30
  step = 2
  input = []
  output = []
  for i in range(0, len(data[0]) - maxlen, step):
      input.append(data[0][i : i + maxlen])
      output.append(data[0][i + maxlen])
  print("Length of sequences:", len(input))
  return input,output,maxlen

In [4]:
def show_input_output_seq(input,output):
  print("Input characters along with their original next character\n")
  for i in range(5):
    print( input[i]," ", output[i])

In [5]:
def encod_decod(data):
  char_indices={}
  indices_char={}
  for i,c in enumerate(chars):
    indices_char[i]=c
  for i,c in enumerate(chars):
    char_indices[c]=i
  return char_indices,indices_char

In [6]:
def inp_out(chars,char_indices,output,maxlen):
  x = np.zeros((len(input), maxlen, len(chars)), dtype=np.bool)
  y = np.zeros((len(input), len(chars)), dtype=np.bool)
  for i, s in enumerate(input):
      for t, char in enumerate(s):
          x[i, t, char_indices[char]] = 1
      y[i, char_indices[output[i]]] = 1
  return x,y


In [7]:
def LSTM_Model(maxlen,chars,x,y):
  model = keras.Sequential(
      [
          keras.Input(shape=(maxlen, len(chars))),
          layers.LSTM(128),
          layers.Dense(len(chars), activation="softmax"),
      ]
  )
  optimizer = keras.optimizers.RMSprop(learning_rate=0.01)
  model.compile(loss="categorical_crossentropy", optimizer=optimizer, metrics=['accuracy'])
  model.fit(x, y, batch_size=128, epochs=1)
  return model

In [15]:
def anything_goes_model(sentence,c,char_indices,chars,model,maxlen):
  #print("sentence collection: ",sentence[-1])
  x_pred = np.zeros((1, maxlen, len(chars)))
  for t, char in enumerate(sentence[:-1]):             #sentence is the fixed history of 30 characters from training set
                                                       #excluding the last character from the sentence because that character is the character passed from test set which i am passing as a history.   
      x_pred[0, t, char_indices[char]] = 1.0
  preds = model.predict(x_pred, verbose=0)[0]
  #print(preds[char_indices[sentence[-1]]])
  return preds[char_indices[c]]       #chosing the probability of the word passed from test set

In [16]:
def evaluate_one(lang,model,char_indices,chars,maxlen):
  testfile = open(lang+'-test.txt', 'r')
  max_history = 30
  history=''
  loss_anything_goes = 0
  count = 0
  while True:
    c = testfile.read(1)
    if not c:
      break
    count += 1
    history+=(c)              #Appending the character to history string to maintain a history of max 30 characters
    #print(history)
    loss_anything_goes -= log2(anything_goes_model(history,c,char_indices,chars,model,maxlen))
    if len(history) == max_history:        #Upon reaching max history limit I would skip the first character
      history=history[1:]
    #history+=c
    #print(loss_anything_goes/count)
  return loss_anything_goes/count

#kwere Execution

In [10]:
path="/content/drive/MyDrive/Colab Notebooks/DeepLearning/cwe-train.txt"
data,chars=input_data(path)
input,output,maxlen=inp_out_structure(data)
show_input_output_seq(input,output)
char_indices,indices_char=encod_decod(data)
x,y=inp_out(chars,char_indices,output,maxlen)
model=LSTM_Model(maxlen,chars,x,y)

Total disctinct chars: 32
Length of sequences: 301701
Input characters along with their original next character

chikale vinogile fana viya wan   h
ikale vinogile fana viya wanhu    
ale vinogile fana viya wanhu w   o
e vinogile fana viya wanhu woc   h
vinogile fana viya wanhu wochi   k


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  This is separate from the ipykernel package so we can avoid doing imports until




In [17]:
kwere_results=evaluate_one('cwe',model,char_indices,chars,maxlen)

In [18]:
kwere_results

6.585875142796728

In [19]:
with open('/content/drive/MyDrive/Colab Notebooks/kwere_results', 'wb') as file:
  pickle.dump(kwere_results, file)

In [20]:
with open('/content/drive/MyDrive/Colab Notebooks/kwere_results','rb') as file:
    kwere_results = pickle.load(file)

#Swahili Execution

In [None]:
path="/content/drive/MyDrive/Colab Notebooks/DeepLearning/sw-train.txt"
data,chars=input_data(path)
input,output,maxlen=inp_out_structure(data)
show_input_output_seq(input,output)
char_indices,indices_char=encod_decod(data)
x,y=inp_out(chars,char_indices,output,maxlen)
model=LSTM_Model(maxlen,chars,x,y)

In [None]:
swahili_results=evaluate_one('sw',model,char_indices,chars,maxlen)

In [None]:
with open('/content/drive/MyDrive/Colab Notebooks/swahili_results', 'wb') as file:
  pickle.dump(swahili_results, file)

In [None]:
with open('/content/drive/MyDrive/Colab Notebooks/swahili_results','rb') as file:
    swahili_results = pickle.load(file)