In [275]:
import tensorflow as tf
import numpy as np
import pandas as pd
import csv
from sklearn.svm import SVR

In [276]:
glove_vectors_file = "/Users/chriswoo/Desktop/640/project 2/glove.6B/glove.6B.50d.txt"

we can load glove_vectors into memory, deserializing the space separated format into a Python dictionary

In [277]:
glove_wordmap = {}
with open(glove_vectors_file, "r", errors='ignore') as glove:
    for line in glove:
        name, vector = tuple(line.split(" ", 1))
        glove_wordmap[name] = np.fromstring(vector, sep=" ")

In [278]:
from nltk import word_tokenize, pos_tag
from nltk.corpus import wordnet as wn
import pandas as pd
from nltk.wsd import lesk

 
def penn_to_wn(tag):
    """ Convert between a Penn Treebank tag to a simplified Wordnet tag """
    if tag.startswith('N'):
        return 'n'
 
    if tag.startswith('V'):
        return 'v'
 
    if tag.startswith('J'):
        return 'a'
 
    if tag.startswith('R'):
        return 'r'
 
    return None
 
def tagged_to_synset(word, tag):
    wn_tag = penn_to_wn(tag)
    if wn_tag is None:
        return None
 
    try:
        return wn.synsets(word, wn_tag)[0]
    except:
        return None

def find_contradiction(sentence1,sentence2):
    c = 0
    for s1 in sentence1:
        for s2 in sentence2: 
            if (((s1 == "not") or (s1=="no") or (s1 == "none")) or ((s2 == "not") or (s2=="no") or (s2 == "none"))):
                #print("i have conjunction",s1)
                c = c+1
                #print(c)
                return c
 
def sentence_similarity(sentence1, sentence2):
    """ compute the sentence similarity using Wordnet """
    contradiction=find_contradiction(word_tokenize(sentence1),word_tokenize(sentence2))
    #print("contradiction is", contradiction)
    # Tokenize and tag
    sentence1 = pos_tag(word_tokenize(sentence1))
    sentence2 = pos_tag(word_tokenize(sentence2))
    #print(sentence1)
    #print(sentence2)
   
    # Get the synsets for the tagged words
    synsets1 = [tagged_to_synset(*tagged_word) for tagged_word in sentence1]
    #print(synsets1)
    synsets2 = [tagged_to_synset(*tagged_word) for tagged_word in sentence2]
    #print(synsets2)
 
    # Filter out the Nones
    synsets1 = [ss for ss in synsets1 if ss]
    #print(synsets1)
    synsets2 = [ss for ss in synsets2 if ss]
    #print(synsets2)
    score, count = 0.0, 0
    s_list = []
    count_none = 0
    for synset in synsets1:
         #print(synset)
         best = [synset.wup_similarity(ss) for ss in synsets2]
         #print(best)
         b = pd.Series(best).max()
         s_list.append(b)
    #print("similarity score is", s_list)
    scorelist = []
    for s in s_list:
       #print(s)
       if s <= 1.0:
           count_none = count_none + 1
           scorelist.append(count_none)
           #print("number of non none's are:", count_none)
           #print("number of nons are:", (len(s_list)-count_none))
           
    #print("Total number of matches with less than or equal to 1 similarity:", max(scorelist))
    #print("Total number of nones:", (len(s_list)-max(scorelist)))
    #print(sum_list(s_list))
    if contradiction == 1:
        score = sum_list(s_list)/max(scorelist) - 1
        #print("score for contradction is",score)
    else:
        score = sum_list(s_list)/max(scorelist)
        #print("score for neutral/entailment",score)
    return score

def sum_list(l):
    sum = 0
    for x in l:
        if x<= 1.0:
             sum += x
    return sum


 
#for sentence in sentences:
 #   print ("Similarity(\"%s\", \"%s\") = %s" % (focus_sentence, sentence, sentence_similarity(focus_sentence, sentence)))
    #print ("Similarity(\"%s\", \"%s\") = %s" % (sentence, focus_sentence, sentence_similarity(sentence, focus_sentence)))
    #print 



#is_anagram("The kids are playing outdoors near a man with a smile", "The young boys are playing outdoors and the man is smiling nearby")


In [279]:
#Constants setup
max_hypothesis_length, max_evidence_length = 30, 30
batch_size, vector_size, hidden_size = 128, 50, 200 #INCREASED HIDDEN_SIZE FROM 64 TO 128

lstm_size = hidden_size

weight_decay = 0.001 # CHANGED FROM 0.0001

learning_rate = 1.5

input_p, output_p = 1.0, 1.0

training_iterations_count = 120000

display_step = 10

In [280]:
def fit_to_size(matrix, shape):
    res = np.zeros(shape)
    slices = [slice(0,min(dim,shape[e])) for e, dim in enumerate(matrix.shape)]
    res[slices] = matrix[slices]
    return res

In [281]:
def sentence2sequence(sentence):
    tokens = sentence.lower().split(" ")
    rows = []
    words = []
    #Greedy search for tokens
    for token in tokens:
        i = len(token)
        while len(token) > 0 and i > 0:
            word = token[:i]
            #print("hello")
            if word in glove_wordmap:
                rows.append(glove_wordmap[word])
                words.append(word)
                token = token[i:]
                i = len(token)
            else:
                i = i-1
    return rows, words


In [282]:
def split_data_into_scores():

    with open("/Users/chriswoo/Desktop/640/project 2/train.txt","r") as data:
        train = csv.DictReader(data, delimiter='\t')
        evi_sentences = []
        hyp_sentences = []
        labels = []
        scores = []
        for row in train:
            #print(row["sentence1"])
            focus_sentence = (row["sentence_A"].lower())
            sentences = (row["sentence_B"].lower())
            sc=sentence_similarity(focus_sentence,sentences)
            #print(sc)
            scores.append(sc)
            hyp_sentences.append(np.vstack(
                    sentence2sequence(row["sentence_A"].lower())[0]))
            evi_sentences.append(np.vstack(
                    sentence2sequence(row["sentence_B"].lower())[0]))
            labels.append(row["relatedness_score"])
            #scores.append(score_setup(row,labels))
            #print(labels)
        hyp_sentences = np.stack([fit_to_size(x, (max_hypothesis_length, vector_size))
                          for x in hyp_sentences])
        evi_sentences = np.stack([fit_to_size(x, (max_evidence_length, vector_size))
                      for x in evi_sentences])
                             
    return (hyp_sentences, evi_sentences), labels, scores #, np.array(scores)
 
data_feature_list, correct_labels, correct_score = split_data_into_scores()

In [283]:
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import SGD
import numpy as np

#x = np.arange(-2,3.0,0.01)
#y = x**2 - 2*x + 1

correct_score1 = np.array(correct_score,dtype=float)
correct_labels1 = np.array(correct_labels,dtype=float)

model = Sequential()
model.add(Dense(50, activation='sigmoid', 
                input_dim=1, init='uniform'))
model.add(Dense(1, activation='linear'))
sgd = SGD(lr=0.05, decay=1e-6, momentum=0.9, nesterov=False)
model.compile(loss='mean_squared_error', 
              optimizer='sgd',
              metrics=['accuracy'])
model.fit(correct_score1,correct_labels1,nb_epoch=300, batch_size = 5,verbose = 0)

#print(correct_score)

  


<keras.callbacks.History at 0x1356d32b0>

In [284]:
def split_data_into_scores():

    with open("/Users/chriswoo/Desktop/640/project 2/trial.txt","r") as data:
        train = csv.DictReader(data, delimiter='\t')
        evi_sentences = []
        hyp_sentences = []
        labels = []
        scores = []
        for row in train:
            #print(row["sentence1"])
            focus_sentence = (row["sentence_A"].lower())
            sentences = (row["sentence_B"].lower())
            sc=sentence_similarity(focus_sentence,sentences)
            #print(sc)
            scores.append(sc)
            hyp_sentences.append(np.vstack(
                    sentence2sequence(row["sentence_A"].lower())[0]))
            evi_sentences.append(np.vstack(
                    sentence2sequence(row["sentence_B"].lower())[0]))
            labels.append(row["relatedness_score"])
            #scores.append(score_setup(row,labels))
            #print(labels)
        hyp_sentences = np.stack([fit_to_size(x, (max_hypothesis_length, vector_size))
                          for x in hyp_sentences])
        evi_sentences = np.stack([fit_to_size(x, (max_evidence_length, vector_size))
                      for x in evi_sentences])
                             
    return (hyp_sentences, evi_sentences), labels, scores #, np.array(scores)
 
data_feature_list1, correct_labels1, correct_score1 = split_data_into_scores()

In [285]:
correct_score1=np.array(correct_score1)
predict_relatedness1 = []
for i in range(0,500):
    predict_relatedness1.append(model.predict(np.asarray(correct_score1[i]).reshape(1,1)))

In [286]:
trial_label=[]
for i in correct_labels1:
    trial_label.append(float(i))

In [287]:
trial_prediction=[]
for i in predict_relatedness1:
    trial_prediction.append(float(i))

In [288]:
pearsonr(trial_label,trial_prediction)

(0.5710534252297933, 1.3019645023657048e-44)

In [289]:
correct_score= np.array(correct_score).reshape(-1, 1)
correct_labels = np.array(correct_labels).reshape(-1, 1)

# LINEAR REGRESSION

In [290]:
from sklearn import linear_model

reg = linear_model.Lasso(alpha = 0.1)
reg.fit(correct_score,correct_labels)

Lasso(alpha=0.1, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)

In [294]:
correct_score1= np.array(correct_score1).reshape(-1, 1)

In [295]:
predict_relatedness2=reg.predict(correct_score1)

In [296]:
pearsonr(predict_relatedness2,trial_label)

(0.27415953030123663, 4.5204843739750397e-10)

In [298]:
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import SGD
import numpy as np

#x = np.arange(-2,3.0,0.01)
#y = x**2 - 2*x + 1

correct_score1 = np.array(correct_score,dtype=float)
correct_labels1 = np.array(correct_labels,dtype=float)

model = Sequential()
model.add(Dense(50, activation='sigmoid', 
                input_dim=1, init='uniform'))
model.add(Dense(1, activation='linear'))
sgd = SGD(lr=0.05, decay=1e-6, momentum=0.9, nesterov=False)
model.compile(loss='mean_squared_error', 
              optimizer='sgd',
              metrics=['accuracy'])
model.fit(correct_score1,correct_labels1,nb_epoch=300, batch_size = 5,verbose = 0)

#print(correct_score)

'''def predict_relatedness(score, labels):
   
    svr_len = SVR(kernel= 'linear', C=1e3)
    svr_len.fit(score,labels)
    return svr_len.predict'''

  


"def predict_relatedness(score, labels):\n   \n    svr_len = SVR(kernel= 'linear', C=1e3)\n    svr_len.fit(score,labels)\n    return svr_len.predict"

In [299]:
def split_data_into_scores1():

    with open("/Users/chriswoo/Desktop/640/project 2/test.txt","r") as data:
        train = csv.DictReader(data, delimiter='\t')
        evi_sentences = []
        hyp_sentences = []
        labels = []
        scores = []
        pair_id = []
        for row in train:
            #print(row["sentence1"])
            focus_sentence = (row["sentence_A"].lower())
            sentences = (row["sentence_B"].lower())
            sc=sentence_similarity(focus_sentence,sentences)
            #print(sc)
            scores.append(sc)
            #print(scores)
            hyp_sentences.append(np.vstack(
                    sentence2sequence(row["sentence_A"].lower())[0]))
            evi_sentences.append(np.vstack(
                    sentence2sequence(row["sentence_B"].lower())[0]))
            pair_id.append(row["pair_ID"])
            #labels.append(row["relatedness_score"])
            #scores.append(score_setup(row,labels))
            #print(labels)
        hyp_sentences = np.stack([fit_to_size(x, (max_hypothesis_length, vector_size))
                          for x in hyp_sentences])
        evi_sentences = np.stack([fit_to_size(x, (max_evidence_length, vector_size))
                      for x in evi_sentences])
                             
    return (hyp_sentences, evi_sentences), scores, pair_id #, np.array(scores)

In [300]:
data_feature_listt, correct_scoret,paid_idt = split_data_into_scores1()

In [301]:
correct_scoret=np.array(correct_scoret)
predict_relatedness = []
for i in range(0,4927):
    predict_relatedness.append(model.predict(np.asarray(correct_scoret[i]).reshape(1,1)))

In [302]:
output_DT = pd.DataFrame(data={"pair_ID":paid_idt,"Relatedness_score":predict_relatedness})
output_DT.to_csv("/Users/chriswoo/Desktop/640/project 2/relatedness.csv",index=False,quoting=3)

In [303]:
judgement=pd.read_csv('/Users/chriswoo/Desktop/640/project 2/judgment.csv')
relatedness=pd.read_csv('/Users/chriswoo/Desktop/640/project 2/relatedness.csv')

In [304]:
final=judgement

In [305]:
final['Relatedness_score']=relatedness['Relatedness_score']

In [306]:
final= final[['pair_ID','entailment_judgment','Relatedness_score']]

In [307]:
final.to_csv('/Users/chriswoo/Desktop/640/project 2/result.txt',index=False)