#Data Loading and preprocessing

##Mounting Google Drive

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


##Unzipping

In [2]:
#!unzip '/content/drive/MyDrive/SMAI Proj/Copy of glove.twitter.27B.zip' -d '/content/drive/MyDrive/SMAI Proj/'

##Loading the glove data

In [3]:
import os
import numpy as np

root_path = '/content/drive/MyDrive/SMAI Proj/'

wordVecLength = 50

embeddings_index = {} #initialize dictionary
f = open(os.path.join(root_path,'glove.twitter.27B.'+str(wordVecLength)+'d.txt'))
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float64')
    embeddings_index[word] = coefs
f.close()

## Loading the json data

In [4]:
import json
f = open('/content/drive/MyDrive/SMAI Proj/Copy of train_data.json')
train_data = json.load(f)
g= open('/content/drive/MyDrive/SMAI Proj/Copy of test_data.json')
test_data = json.load(g)

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from string import punctuation


english_articles_train = []
english_articles_test = []


for d in train_data:
  if d['meta_lang_1'] == 'en' and d['meta_lang_2'] == 'en':
    english_articles_train.append(d)

for d in test_data:
  if d['meta_lang_1'] == 'en' and d['meta_lang_2'] == 'en':
    english_articles_test.append(d)


total_train_articles_odd = []
total_train_articles_even = []
total_test_articles_odd = []
total_test_articles_even = []

y_train = []
y_test = []

for eng in english_articles_train:
  total_train_articles_odd.append(eng['text_1'])
  total_train_articles_even.append(eng['text_2'])
  y_train.append(eng['score'])

for eng in english_articles_test:
  total_test_articles_odd.append(eng['text_1'])
  total_test_articles_even.append(eng['text_2'])
  y_test.append(eng['score'])




total_articles= total_train_articles_odd+total_train_articles_even+total_test_articles_odd+total_test_articles_even


y_train = np.array(y_train)


##Tokenizing the articles

In [6]:
import tensorflow as tf
from tensorflow import keras

In [7]:
from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences

t = Tokenizer()

t.fit_on_texts(total_articles)
vocab_size = len(t.word_index) + 1

encoded_docs = t.texts_to_sequences(total_articles)

max_length = 500
X = pad_sequences(encoded_docs, maxlen=max_length, padding='post')
#y = np.array(input['label'])
#print(X.shape, y.shape)
print(X.shape)


encoded_docs_train_odd = t.texts_to_sequences(total_train_articles_odd)
encoded_docs_train_even = t.texts_to_sequences(total_train_articles_even)
encoded_docs_test_odd = t.texts_to_sequences(total_test_articles_odd)
encoded_docs_test_even = t.texts_to_sequences(total_test_articles_even)
X1 = pad_sequences(encoded_docs_train_odd, maxlen=max_length, padding='post')
X2 = pad_sequences(encoded_docs_train_even, maxlen=max_length, padding='post')
X3 = pad_sequences(encoded_docs_test_odd, maxlen=max_length, padding='post')
X4 = pad_sequences(encoded_docs_test_even, maxlen=max_length, padding='post')


(3046, 500)


##Making Embedding Matrix

In [8]:
embedding_matrix = np.zeros((vocab_size, wordVecLength))
for word, i in t.word_index.items():
	embedding_vector = embeddings_index.get(word)
	if embedding_vector is not None:
		embedding_matrix[i] = embedding_vector
    

embedding_matrix.shape

(62550, 50)

#Model 1

In [9]:
from tensorflow.keras.layers import Embedding, Input, Flatten, LSTM, concatenate, Dense, Bidirectional
from tensorflow.keras.models import Model
import numpy as np


In [10]:
def model1():
    input_1 = Input(shape=(max_length, ))
    input_2 = Input(shape=(max_length, ))
    e = Embedding(vocab_size, wordVecLength, weights=[embedding_matrix], input_length=max_length, trainable=False)

    embedded_input_1 = e(input_1)
    embedded_input_1 = e(input_2)


    #model.add(Bidirectional(LSTM(50, dropout=0.2, return_sequences=True)))
    lstm_1 = LSTM(100, )(embedded_input_1)
    lstm_2 = LSTM(100, )(embedded_input_1)
    merge = concatenate([lstm_1, lstm_2])
    dense = Dense(1)(merge)
    
    model = Model([input_1, input_2], dense)
    model.compile(optimizer='adam', loss='MSE', metrics=['accuracy'])
    print(model.summary())

    return(model)

In [11]:
model = model1()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_2 (InputLayer)           [(None, 500)]        0           []                               
                                                                                                  
 embedding (Embedding)          (None, 500, 50)      3127500     ['input_2[0][0]']                
                                                                                                  
 lstm (LSTM)                    (None, 100)          60400       ['embedding[1][0]']              
                                                                                                  
 lstm_1 (LSTM)                  (None, 100)          60400       ['embedding[1][0]']              
                                                                                              

In [12]:
history = model.fit([X1,X2], y_train, epochs=10, batch_size=32, validation_split=0.1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [13]:
y_pred = model.predict([X3,X4])



In [14]:
from sklearn.metrics import mean_absolute_error,mean_squared_error
from sklearn.feature_selection import r_regression
from scipy.stats import pearsonr
print('The mean absolute error for the model is ', mean_absolute_error(y_test,y_pred))
print('The mean squared error for the model is ', mean_squared_error(y_test,y_pred))
print('The pcc error for testing is',pearsonr(y_test,y_pred))

The mean absolute error for the model is  1.2808837577448053
The mean squared error for the model is  2.2149588626169314
The pcc error for testing is (array([0.17070022454399011], dtype=object), 0.008595485403044879)


In [15]:
y_pred_train = model.predict([X1,X2])
print('The pcc error for train is',pearsonr(y_train,y_pred_train))
print('The mean squared error for the model is ', mean_squared_error(y_train,y_pred_train))

The pcc error for train is (array([0.26253852202953637], dtype=object), 9.907676350948795e-22)
The mean squared error for the model is  1.1286391739899646


#Model 2 Bidirectional LSTM

In [16]:
def model2():
    input_1 = Input(shape=(max_length, ))
    input_2 = Input(shape=(max_length, ))
    e = Embedding(vocab_size, wordVecLength, weights=[embedding_matrix], input_length=max_length, trainable=False)

    embedded_input_1 = e(input_1)
    embedded_input_1 = e(input_2)


    #model.add(Bidirectional(LSTM(50, dropout=0.2, return_sequences=True)))
    lstm_1 = Bidirectional(LSTM(50, ))(embedded_input_1)
    lstm_2 = Bidirectional(LSTM(50, ))(embedded_input_1)
    merge = concatenate([lstm_1+lstm_2, lstm_1-lstm_2])
    dense = Dense(1)(merge)
    
    model = Model([input_1, input_2], dense)
    model.compile(optimizer='adam', loss='MSE', metrics=['accuracy'])
    print(model.summary())

    return(model)

In [17]:
model_ = model2()

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_4 (InputLayer)           [(None, 500)]        0           []                               
                                                                                                  
 embedding_1 (Embedding)        (None, 500, 50)      3127500     ['input_4[0][0]']                
                                                                                                  
 bidirectional (Bidirectional)  (None, 100)          40400       ['embedding_1[1][0]']            
                                                                                                  
 bidirectional_1 (Bidirectional  (None, 100)         40400       ['embedding_1[1][0]']            
 )                                                                                          

In [18]:
history2 = model_.fit([X1,X2], y_train, epochs=10, batch_size=32, validation_split=0.1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [19]:
y_pred2 = model_.predict([X3,X4])



In [20]:
from sklearn.metrics import mean_absolute_error,mean_squared_error
from sklearn.feature_selection import r_regression
from scipy.stats import pearsonr
print('The mean absolute error for the model is ', mean_absolute_error(y_test,y_pred2))
print('The mean squared error for the model is ', mean_squared_error(y_test,y_pred2))
print('The pcc for testing is',pearsonr(y_test,y_pred2))

The mean absolute error for the model is  1.1416363882816445
The mean squared error for the model is  1.8314647218432494
The pcc for testing is (array([0.24719828496463683], dtype=object), 0.00012450150781126814)


In [21]:
y_pred2_train = model_.predict([X1,X2])
print('The pcc error for train is',pearsonr(y_train,y_pred2_train))

The pcc error for train is (array([0.600131519959217], dtype=object), 9.340444951306048e-127)
