In [62]:
import numpy as np
import pandas as pd
import re
import tensorflow as tf
from tensorflow.keras import Model, Sequential
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.callbacks import ModelCheckpoint, ReduceLROnPlateau, LearningRateScheduler, EarlyStopping
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import ModelCheckpoint, ReduceLROnPlateau, LearningRateScheduler, EarlyStopping

import matplotlib.pyplot as plt

In [31]:
def get_vocab(path):
    """Read a vocabulary file into a list, one entry per line.

    Args:
        path: Location of a UTF-8 text file holding one word per line.

    Returns:
        list[str]: The lines of the file in file order with the line
        terminator removed. Blank lines are kept as empty strings.
    """
    # The context manager closes the handle, so no explicit close() is needed.
    with open(path, 'r', encoding='utf-8') as file:
        return [line.rstrip("\n") for line in file]

In [41]:
def get_embedding(vocab, path, dim=300):
    """Load the vectors of the requested words from a whitespace-separated text file.

    Each useful line is expected to look like ``<word> <v1> ... <v_dim>``.
    Lines whose field count is not ``dim + 1`` (blank lines, a
    ``<count> <dim>`` header, truncated rows) are skipped.

    Args:
        vocab: Iterable of words to keep.
        path: Location of the UTF-8 encoded embedding file.
        dim: Number of vector components expected per word. Defaults to 300.

    Returns:
        dict: Maps each word found in both ``vocab`` and the file to a
        ``float32`` numpy array of length ``dim``. If a word occurs several
        times in the file, the last occurrence wins.
    """
    # Membership tests against a set are O(1); against the original list they
    # cost O(len(vocab)) for every row of the embedding file.
    wanted = set(vocab)
    embedding_dict = dict()
    with open(path, 'r', encoding='utf-8') as file:
        for row in file:
            fields = row.split()
            # Checking the length first also protects against blank lines,
            # where indexing fields[0] would raise IndexError.
            if len(fields) != dim + 1:
                continue
            word = fields[0]
            if word in wanted:
                embedding_dict[word] = np.asarray(fields[1:], dtype='float32')
    return embedding_dict

In [46]:
def prepare_data(vocab, embedding_dict1, embedding_dict2):
    """Build index-aligned input/target lists for the words in ``vocab``.

    Args:
        vocab: Words to look up, in the order the rows should appear.
        embedding_dict1: Mapping word -> vector used for the inputs.
        embedding_dict2: Mapping word -> vector used for the targets.

    Returns:
        tuple[list, list]: ``(x_train, y_train)``, each with one entry per
        word of ``vocab``. A word missing from a mapping contributes ``None``
        at its position, so both lists stay aligned with ``vocab``.
    """
    words = list(vocab)
    x_train = [embedding_dict1.get(token) for token in words]
    y_train = [embedding_dict2.get(token) for token in words]
    return x_train, y_train

In [66]:
def write_to_file(path, embedding_matrix, word_list):
    """Write words and their vectors to a text file, one word per line.

    Every line has the form ``<word> <v1> <v2> ... <vn>`` with fields
    separated by a single space, so it can be parsed both by
    whitespace-splitting readers and by readers that split on one space.

    Args:
        path: Destination file; it is created or overwritten.
        embedding_matrix: Indexable collection of vectors, where row ``i``
            belongs to ``word_list[i]``.
        word_list: Words to write, in output order.

    Raises:
        IndexError: If ``embedding_matrix`` has fewer rows than ``word_list``.
    """
    # Explicit utf-8 matches the readers in this notebook and avoids relying
    # on the platform default encoding for non-ASCII words. The context
    # manager guarantees the handle is closed even if a row fails to format.
    with open(path, "w", encoding="utf-8") as file:
        for i, word in enumerate(word_list):
            # join() avoids the quadratic cost of repeated string concatenation.
            values = " ".join(str(vi) for vi in embedding_matrix[i])
            file.write(word + " " + values + "\n")

In [33]:
# Vocabulary used for training, read one word per line.
# Presumably the words present in both embedding sets (inferred from the file name) — confirm.
train_vocab = get_vocab("vocabulary/ds-indic-intersect.txt")
print(len(train_vocab))

30691


In [34]:
# Vocabulary reserved for prediction, read one word per line.
# Presumably the words that only the indic embeddings cover (inferred from the file name) — confirm.
predict_vocab = get_vocab("vocabulary/indic-minus-ds.txt")
print(len(predict_vocab))

227723


In [43]:
# Load the vectors of the training vocabulary from the "ds" fastText .vec file.
# NOTE(review): this is a machine-specific absolute path; the cell will not run
# elsewhere unless the file exists at the same location.
ds_embedding_dict = get_embedding(train_vocab, "/home/eastwind/word-embeddings/fasttext/TechDofication.mr.raw.complete.ft.skipgram.new.d300.vec")
len(ds_embedding_dict)

30691

In [44]:
# Load the vectors of the training vocabulary from the "indic" fastText .vec file.
# NOTE(review): this is a machine-specific absolute path; the cell will not run
# elsewhere unless the file exists at the same location.
indic_embedding_dict = get_embedding(train_vocab, "/home/eastwind/word-embeddings/fasttext/indicnlp.ft.mr.300.vec")
len(indic_embedding_dict)

30691

In [82]:
# Inputs come from the indic embeddings and targets from the ds embeddings;
# both lists follow the order of train_vocab.
x_train_embedding, y_train_embedding = prepare_data(train_vocab, indic_embedding_dict, ds_embedding_dict)

In [83]:
# Convert the lists of vectors into float32 arrays and show their shapes.
# The names are rebound in place, so this cell replaces the list versions.
x_train_embedding = np.asarray(x_train_embedding, dtype='float32')
y_train_embedding = np.asarray(y_train_embedding, dtype='float32')
print(x_train_embedding.shape)
print(y_train_embedding.shape)

(30691, 300)
(30691, 300)


In [69]:
# Export the full input matrix, whose rows follow the order of train_vocab.
# `x_train` must not be used here: it is only assigned by the later
# train_test_split cell, where it is a shuffled subset that no longer lines up
# with train_vocab.
write_to_file("vocabulary/indic-ds-intersection-indic-embeddings.vec", x_train_embedding, train_vocab)

In [70]:
# Export the full target matrix, whose rows follow the order of train_vocab.
# `y_train` must not be used here: it is only assigned by the later
# train_test_split cell, where it is a shuffled subset that no longer lines up
# with train_vocab.
write_to_file("vocabulary/indic-ds-intersection-ds-embeddings.vec", y_train_embedding, train_vocab)

In [85]:
# Hold out 20% of the aligned (input, target) pairs for validation.
# A fixed random_state makes the shuffle, and therefore every metric computed
# downstream, reproducible across kernel restarts.
x_train, x_val, y_train, y_val = train_test_split(x_train_embedding,
                                                  y_train_embedding,
                                                  test_size=0.2,
                                                  shuffle=True,
                                                  random_state=42)
print(x_train.shape)
print(y_train.shape)
print(x_val.shape)
print(y_val.shape)

(24552, 300)
(24552, 300)
(6139, 300)
(6139, 300)


In [89]:
# Feed-forward regressor: 300-d input -> Dense(300, relu) -> Dense(300, tanh).
src = Input(shape=(300,))
Dense1 = Dense(300, activation='relu')(src)
# tanh bounds every output component to [-1, 1].
# NOTE(review): confirm the target embedding values stay inside that range.
out = Dense(300, activation='tanh')(Dense1)
regressor = Model(src, out)
# NOTE(review): 'acc' is tracked next to an MSE loss on continuous targets.
# The checkpoint callback monitors it as 'val_acc', but accuracy is probably
# not a meaningful measure for this regression — confirm before relying on it.
regressor.compile(optimizer='adam', loss='mse', metrics=['acc'])
regressor.summary()

Model: "model_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         [(None, 300)]             0         
_________________________________________________________________
dense_2 (Dense)              (None, 300)               90300     
_________________________________________________________________
dense_3 (Dense)              (None, 300)               90300     
Total params: 180,600
Trainable params: 180,600
Non-trainable params: 0
_________________________________________________________________


In [90]:
# ModelCheckpoint callback: save weights only, and only for epochs where
# val_acc improves on the best value seen so far (mode='max').
# The file name embeds the epoch number and the val_acc value.
# NOTE(review): `02f` means "minimum width 2, zero padded" with the default six
# decimals; if two decimals were intended the spec would be `.2f` — confirm.
# NOTE(review): the target directory ../models/regressor/ is assumed to exist.
checkpoint_filepath = "../models/regressor/regressor-epoch-{epoch:02d}-val-acc-{val_acc:02f}.h5"
model_checkpoint_callback = ModelCheckpoint(filepath=checkpoint_filepath,
                                            save_weights_only=True,
                                            monitor='val_acc',
                                            mode='max',
                                            save_freq = 'epoch',
                                            save_best_only=True)

# ReduceLROnPlateau callback: scale the learning rate by 0.1 after 2 epochs
# without val_loss improvement, never going below min_lr.
# NOTE(review): the optimizer is the default 'adam'; if its starting rate is
# 0.001, min_lr=0.0005 allows only a single halving — confirm this is intended.
reduce_lr_callback = ReduceLROnPlateau( monitor='val_loss', 
                                       factor=0.1, 
                                       patience=2, 
                                       min_lr=0.0005, 
                                       verbose=2)
#myCB = myCallbacks(metrics='acc', threshold=0.97)

In [91]:
# Train for 10 epochs in batches of 256, evaluating on the held-out split after
# each epoch. Both callbacks run during training; the value returned by fit()
# is kept in `history`.
history = regressor.fit(x_train,
                        y_train,
                        epochs=10,
                        batch_size=256,
                        validation_data=(x_val, y_val),
                        callbacks=[model_checkpoint_callback, reduce_lr_callback])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [107]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics.pairwise import cosine_similarity

In [100]:
# Fit a scikit-learn LinearRegression on the same training split.
# `regressor_model` is a different object from the Keras model named `regressor`.
regressor_model = LinearRegression().fit(x_train, y_train)

In [105]:
def classification_report(y_test, results):
    """Compute accuracy plus per-class precision, recall and F1.

    Args:
        y_test: Ground-truth labels.
        results: Predicted labels, aligned with ``y_test``.

    Returns:
        tuple: ``(acc, precision, recall, f1)``. ``acc`` is the value returned
        by ``accuracy_score``; the other three are the values returned by
        ``precision_score``, ``recall_score`` and ``f1_score`` when called
        with ``average=None``.
    """
    overall_accuracy = accuracy_score(y_test, results)
    # Same evaluation order as listed in the return value.
    per_class_scores = tuple(
        scorer(y_test, results, average=None)
        for scorer in (precision_score, recall_score, f1_score)
    )
    return (overall_accuracy,) + per_class_scores

In [None]:
# Print accuracy and per-class precision / recall / F1 for the validation split,
# followed by the mean of each per-class array.
# NOTE(review): `results` is not assigned in any visible cell, so this cell
# raises NameError on a fresh kernel unless predictions are created first.
# NOTE(review): y_val holds continuous embedding vectors, while these are
# classification metrics; presumably they reject such targets — confirm, and
# consider a regression measure instead.
acc, precision, recall, f1 = classification_report(y_val, results)
print("Validation Accuracy: ", acc)
print("\nPrecision: ", precision)
print("Average Precision: ", np.mean(precision))
print("\nRecall: ", recall)
print("Average Recall: ", np.mean(recall))
print("\nF1-Score: ", f1)
print("Average F1-Score: ", np.mean(f1))