In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd
import csv
import dd
from sklearn.svm import SVR
from sklearn.metrics import confusion_matrix

In [2]:
from nltk import word_tokenize, pos_tag
from nltk.corpus import wordnet as wn
import pandas as pd
from nltk.wsd import lesk

 
def penn_to_wn(tag):
    """Map a Penn Treebank POS tag onto WordNet's one-letter POS code.

    Only the first character of ``tag`` is inspected:
    N -> 'n', V -> 'v', J -> 'a', R -> 'r'.  Any other tag yields None.
    """
    first_letter_to_wn = {'N': 'n', 'V': 'v', 'J': 'a', 'R': 'r'}
    # tag[:1] is '' for an empty tag, which is not a key, so None comes back.
    return first_letter_to_wn.get(tag[:1])
 
def tagged_to_synset(word, tag):
    """Return the first WordNet synset for ``word`` under the POS implied by ``tag``.

    Args:
        word: the token to look up.
        tag: its Penn Treebank POS tag (converted with ``penn_to_wn``).

    Returns:
        The first synset returned by ``wn.synsets(word, pos)``, or None when
        the tag has no WordNet equivalent or the lookup yields no synsets.
    """
    wn_tag = penn_to_wn(tag)
    if wn_tag is None:
        return None

    # Test for an empty result explicitly instead of catching every exception,
    # so genuine failures inside the lookup surface rather than silently
    # turning into None.
    synsets = wn.synsets(word, wn_tag)
    return synsets[0] if synsets else None

def find_contradiction(sentence1,sentence2):
    """Flag whether either tokenised sentence contains a negation word.

    Args:
        sentence1: iterable of tokens of the first sentence.
        sentence2: iterable of tokens of the second sentence.

    Returns:
        1 if "not", "no" or "none" occurs in either sentence, otherwise 0.
        (An explicit 0 is returned instead of falling off the end with None.)
    """
    negations = ("not", "no", "none")
    # Scan each sentence on its own so a negation is still detected when the
    # other sentence happens to be empty.
    for tokens in (sentence1, sentence2):
        for token in tokens:
            if token in negations:
                return 1
    return 0
 
def sentence_similarity(sentence1, sentence2):
    """Score the similarity of two sentences with WordNet Wu-Palmer similarity.

    Both sentences are tokenised and POS-tagged, every tagged word is mapped to
    a synset with ``tagged_to_synset``, and for each synset of ``sentence1`` the
    highest ``wup_similarity`` against the synsets of ``sentence2`` is kept.
    The score is the mean of those best values; 1 is subtracted when
    ``find_contradiction`` reports a negation word.

    Args:
        sentence1: first sentence as a raw string.
        sentence2: second sentence as a raw string.

    Returns:
        The mean best similarity (minus 1 when a negation word is present).
        When no synset of ``sentence1`` can be compared with any synset of
        ``sentence2`` the mean is taken to be 0.0 instead of raising.
    """
    # Tokenise once and reuse the tokens for both the negation check and tagging.
    tokens1 = word_tokenize(sentence1)
    tokens2 = word_tokenize(sentence2)
    contradiction = find_contradiction(tokens1, tokens2)

    # Map tagged words to synsets, dropping words without one.
    synsets1 = [ss for ss in (tagged_to_synset(*tw) for tw in pos_tag(tokens1)) if ss]
    synsets2 = [ss for ss in (tagged_to_synset(*tw) for tw in pos_tag(tokens2)) if ss]

    best_scores = []
    for synset in synsets1:
        # wup_similarity may return None for a pair it cannot compare; such
        # pairs are ignored, and a synset with no comparable partner adds nothing.
        candidates = [
            sim
            for sim in (synset.wup_similarity(other) for other in synsets2)
            if sim is not None
        ]
        if candidates:
            best_scores.append(max(candidates))

    # Same "<= 1.0" filter that sum_list applies, so numerator and denominator
    # are computed over the same values.
    valid_scores = [s for s in best_scores if s <= 1.0]
    if valid_scores:
        score = sum_list(valid_scores) / len(valid_scores)
    else:
        # Nothing comparable: avoid max()/division on an empty collection.
        score = 0.0

    if contradiction == 1:
        score = score - 1
    return score

def sum_list(l):
    """Add up the entries of ``l`` that satisfy ``x <= 1.0``.

    Entries for which that comparison is false (values above 1.0, and NaN)
    are left out.  An empty or fully filtered list gives 0.
    """
    total = 0
    for value in l:
        # Keep the "<=" form: NaN fails it and must stay excluded.
        if not (value <= 1.0):
            continue
        total += value
    return total


 
#for sentence in sentences:
 #   print ("Similarity(\"%s\", \"%s\") = %s" % (focus_sentence, sentence, sentence_similarity(focus_sentence, sentence)))
    #print ("Similarity(\"%s\", \"%s\") = %s" % (sentence, focus_sentence, sentence_similarity(sentence, focus_sentence)))
    #print 



#is_anagram("The kids are playing outdoors near a man with a smile", "The young boys are playing outdoors and the man is smiling nearby")


In [3]:
glove_vectors_file = "/Users/zoec/Documents/cus 640/PROJECT2/glove.6B.50d.txt"

# Lookup table: first field of each line -> the remaining fields parsed as a
# NumPy vector of space-separated numbers.
glove_wordmap = {}
with open(glove_vectors_file, "r", errors='ignore') as glove:
    glove_wordmap.update(
        (name, np.fromstring(vector, sep=" "))
        # Split only on the first space: key on the left, numbers on the right.
        for name, vector in (line.split(" ", 1) for line in glove)
    )

In [4]:
# Constants: every sentence matrix is cropped/padded to 30 rows, each row being
# one 50-dimensional vector.
max_hypothesis_length = 30
max_evidence_length = 30
vector_size = 50  # width of one embedding row (the GloVe file name above ends in "50d")

def fit_to_size(matrix, shape):
    """Crop or zero-pad ``matrix`` so the result has exactly ``shape``.

    Args:
        matrix: NumPy array to fit.
        shape: target shape, one entry per axis of ``matrix``.

    Returns:
        A new array created with ``np.zeros(shape)`` whose leading corner holds
        the overlapping part of ``matrix``; everything else stays zero.
    """
    res = np.zeros(shape)
    # The index must be a *tuple* of slices.  A list of slices is treated as
    # fancy indexing by current NumPy releases and raises instead of slicing.
    region = tuple(
        slice(0, min(dim, target)) for dim, target in zip(matrix.shape, shape)
    )
    res[region] = matrix[region]
    return res

In [5]:
def sentence2sequence(sentence):
    """Turn a sentence into a list of GloVe vectors plus the matched words.

    The sentence is lower-cased and split on single spaces.  Each token is
    consumed greedily: the longest prefix found in ``glove_wordmap`` is taken,
    then matching restarts on the remainder.  When no prefix of the remainder
    is in the map, the rest of that token is dropped.

    Returns:
        (rows, words): the vectors and the corresponding matched strings,
        in order of appearance.
    """
    rows, words = [], []
    for remaining in sentence.lower().split(" "):
        end = len(remaining)
        while remaining and end > 0:
            prefix = remaining[:end]
            if prefix not in glove_wordmap:
                # Try a shorter prefix next time round.
                end -= 1
                continue
            rows.append(glove_wordmap[prefix])
            words.append(prefix)
            # Continue matching on whatever follows the prefix just consumed.
            remaining = remaining[end:]
            end = len(remaining)
    return rows, words


In [29]:
def split_data_into_scores(path="/Users/zoec/Documents/cus 640/PROJECT2/SICK_train.txt"):
    """Read a tab-separated SICK file and build features, labels and scores.

    Args:
        path: file to read.  Defaults to the training file, so existing
            zero-argument calls behave as before; pass another path to reuse
            the function for a different split.

    Returns:
        ((hyp_sentences, evi_sentences), labels, scores) where
        hyp_sentences / evi_sentences are arrays of shape
        (n_rows, max_*_length, vector_size) built from the ``sentence_A`` /
        ``sentence_B`` columns, labels is the list of ``entailment_judgment``
        values and scores is the list of ``sentence_similarity`` results.
    """
    def _embed(sentence):
        # np.vstack raises on an empty list, so a sentence with no GloVe match
        # becomes an empty (0, vector_size) matrix that fit_to_size pads with zeros.
        rows = sentence2sequence(sentence)[0]
        return np.vstack(rows) if rows else np.zeros((0, vector_size))

    evi_sentences = []
    hyp_sentences = []
    labels = []
    scores = []
    with open(path, "r") as data:
        for row in csv.DictReader(data, delimiter='\t'):
            # Lower-case each sentence once and reuse it for both features.
            sentence_a = row["sentence_A"].lower()
            sentence_b = row["sentence_B"].lower()
            scores.append(sentence_similarity(sentence_a, sentence_b))
            hyp_sentences.append(_embed(sentence_a))
            evi_sentences.append(_embed(sentence_b))
            labels.append(row["entailment_judgment"])

    # Bring every sentence matrix to a common shape so they can be stacked.
    hyp_sentences = np.stack([fit_to_size(x, (max_hypothesis_length, vector_size))
                      for x in hyp_sentences])
    evi_sentences = np.stack([fit_to_size(x, (max_evidence_length, vector_size))
                  for x in evi_sentences])

    return (hyp_sentences, evi_sentences), labels, scores

data_feature_list, correct_labels, correct_score = split_data_into_scores()


  


In [30]:
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier
from keras.utils import np_utils
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder

In [31]:
# Single feature per pair (the WordNet similarity score) and its entailment label.
X_train, y_train = correct_score, correct_labels

In [32]:
# Label strings -> integer ids (fit returns the encoder itself, so the two
# steps can be chained), then integer ids -> one-hot rows.
encoder = LabelEncoder()
encoded_y_train = encoder.fit(y_train).transform(y_train)
dummy_y = np_utils.to_categorical(encoded_y_train)

In [33]:
def baseline_model():
    """Build and compile the classifier: 1 input -> Dense(8, relu) -> Dense(3, softmax).

    Compiled with categorical cross-entropy, the Adam optimizer and accuracy
    as the reported metric.
    """
    layers = [
        Dense(8, input_dim=1, activation='relu'),
        Dense(3, activation='softmax'),
    ]
    model = Sequential(layers)
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [34]:
# Similarity scores as a float array for Keras / scikit-learn.
X1 = np.array(X_train,dtype=float)

estimator = KerasClassifier(build_fn=baseline_model, epochs=100, batch_size=5, verbose=0)
kfold = KFold(n_splits=10, shuffle=True)

# 10-fold cross-validated accuracy first, then a final fit on all of X1 so the
# estimator can be used for prediction in later cells.
results = cross_val_score(estimator, X1, dummy_y, cv=kfold)
estimator.fit(X1,dummy_y)

print("Accuracy: %.2f%% " % (results.mean()*100))

Accuracy: 74.07% 


In [35]:
def split_data_into_scores(path="/Users/zoec/Documents/cus 640/PROJECT2/SICK_trial.txt"):
    """Read a tab-separated SICK file and build features, labels and scores.

    Args:
        path: file to read.  Defaults to the trial file, so existing
            zero-argument calls behave as before; pass another path to reuse
            the function for a different split.

    Returns:
        ((hyp_sentences, evi_sentences), labels, scores) where
        hyp_sentences / evi_sentences are arrays of shape
        (n_rows, max_*_length, vector_size) built from the ``sentence_A`` /
        ``sentence_B`` columns, labels is the list of ``entailment_judgment``
        values and scores is the list of ``sentence_similarity`` results.
    """
    def _embed(sentence):
        # np.vstack raises on an empty list, so a sentence with no GloVe match
        # becomes an empty (0, vector_size) matrix that fit_to_size pads with zeros.
        rows = sentence2sequence(sentence)[0]
        return np.vstack(rows) if rows else np.zeros((0, vector_size))

    evi_sentences = []
    hyp_sentences = []
    labels = []
    scores = []
    with open(path, "r") as data:
        for row in csv.DictReader(data, delimiter='\t'):
            # Lower-case each sentence once and reuse it for both features.
            sentence_a = row["sentence_A"].lower()
            sentence_b = row["sentence_B"].lower()
            scores.append(sentence_similarity(sentence_a, sentence_b))
            hyp_sentences.append(_embed(sentence_a))
            evi_sentences.append(_embed(sentence_b))
            labels.append(row["entailment_judgment"])

    # Bring every sentence matrix to a common shape so they can be stacked.
    hyp_sentences = np.stack([fit_to_size(x, (max_hypothesis_length, vector_size))
                      for x in hyp_sentences])
    evi_sentences = np.stack([fit_to_size(x, (max_evidence_length, vector_size))
                  for x in evi_sentences])

    return (hyp_sentences, evi_sentences), labels, scores

data_feature_list1, correct_labels1, correct_score1 = split_data_into_scores()

  


In [36]:
# Evaluation split: similarity scores as the feature, label strings as the target.
X_test, y_test = correct_score1, correct_labels1

In [37]:
# Encode the evaluation labels with the mapping learned from the TRAINING
# labels.  Fitting a fresh encoder on y_test would assign different integer ids
# whenever the two label sets differ, silently misaligning them with the model.
encoder = LabelEncoder()
encoder.fit(y_train)
encoded_y_test = encoder.transform(y_test)
# convert integers to dummy variables (i.e. one hot encoded); the width is
# pinned to the number of known classes so it cannot shrink if a class is absent.
# NOTE: this rebinds dummy_y, which until now held the training targets.
dummy_y = np_utils.to_categorical(encoded_y_test, num_classes=len(encoder.classes_))

In [38]:
# Score the fitted Keras classifier on the evaluation scores and their integer labels.
estimator.score(X_test,encoded_y_test)

0.7220000103116035

In [39]:
# Predicted classes for the evaluation scores, as returned by the fitted estimator.
y_pred=estimator.predict(X_test)

In [41]:
# Use a dedicated encoder for the predictions.  Rebinding the shared ``encoder``
# name here would replace the label mapping that a later
# ``encoder.inverse_transform`` call needs to turn predictions back into labels.
pred_encoder = LabelEncoder()
encoded_Y1 = pred_encoder.fit(y_pred).transform(y_pred)
# convert integers to dummy variables (i.e. one hot encoded)
dummy_y_predict = np_utils.to_categorical(encoded_Y1)

# Random Forest

In [46]:
# scikit-learn expects X as a 2-D (n_samples, 1) matrix but y as a flat 1-D
# vector; handing fit() a column-vector y triggers a DataConversionWarning.
X_train= np.array(X_train).reshape(-1, 1)
encoded_y_train = np.array(encoded_y_train).ravel()

In [47]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with the library's default settings, fitted on the single
# similarity-score feature and the integer-encoded labels.
clf = RandomForestClassifier()

clf.fit(X=X_train, y=encoded_y_train)

  """


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [48]:
# Scored on the same data the forest was fitted on, i.e. a training-set score.
clf.score(X_train,encoded_y_train )

0.8406666666666667

In [49]:
# Reshape the evaluation feature and its integer labels into single-column arrays.
X_test, encoded_y_test = (
    np.array(X_test).reshape((-1, 1)),
    np.array(encoded_y_test).reshape((-1, 1)),
)

In [50]:
# Same single-column reshape of encoded_y_test as in the previous cell;
# applying reshape(-1, 1) again leaves the shape unchanged.
encoded_y_test= np.array(encoded_y_test).reshape(-1, 1)

In [51]:
# Score the random forest on the evaluation split (X_test / encoded_y_test).
clf.score(X_test,encoded_y_test)

0.72

# SVM

In [52]:
X_train= np.array(X_train).reshape(-1, 1)
# Labels are flattened to 1-D: a column-vector y makes scikit-learn emit a
# DataConversionWarning (the "column_or_1d" message) before converting it itself.
encoded_y_train = np.array(encoded_y_train).ravel()
from sklearn import svm

# RBF-kernel support vector classifier, fitted and scored on the training data.
svm_clf = svm.SVC (kernel= 'rbf',C = 1)
svm_clf.fit(X_train ,encoded_y_train)
result=svm_clf.score(X_train,encoded_y_train)
print("The accuracy of training set in SVM is %s " %(result))

  y = column_or_1d(y, warn=True)


The accuracy of training set in SVM is 0.7386666666666667 


In [53]:
#######################

glove_vectors_file1 = "/Users/zoec/Documents/cus 640/PROJECT2/glove.6B.50d.txt"

# Second lookup table, built the same way: first field of each line is the
# key, the remaining space-separated numbers become its vector.
glove_wordmap1 = {}
with open(glove_vectors_file1, "r", errors='ignore') as glove:
    for name, vector in (line.split(" ", 1) for line in glove):
        glove_wordmap1[name] = np.fromstring(vector, sep=" ")

In [54]:
def sentence2sequence1(sentence):
    """Turn a sentence into vectors from ``glove_wordmap1`` plus the matched words.

    The sentence is lower-cased and split on single spaces; each token is then
    matched greedily, longest known prefix first, restarting on the remainder
    after every hit.  A remainder with no known prefix is discarded.

    Returns:
        (rows, words): matched vectors and the strings they were looked up by.
    """
    rows, words = [], []
    for token in sentence.lower().split(" "):
        cut = len(token)
        while token and cut > 0:
            candidate = token[:cut]
            if candidate in glove_wordmap1:
                rows.append(glove_wordmap1[candidate])
                words.append(candidate)
                # Drop the matched prefix and start over on what is left.
                token = token[cut:]
                cut = len(token)
            else:
                cut -= 1
    return rows, words

In [55]:
def split_data_into_scores1(path="/Users/zoec/Documents/cus 640/PROJECT2/SICK_test.txt"):
    """Read a tab-separated SICK file without using labels and build features.

    Args:
        path: file to read.  Defaults to the test file, so existing
            zero-argument calls behave as before.

    Returns:
        ((hyp_sentences, evi_sentences), scores, pair_id) where
        hyp_sentences / evi_sentences are arrays of shape
        (n_rows, max_*_length, vector_size) built from the ``sentence_A`` /
        ``sentence_B`` columns, scores is the list of ``sentence_similarity``
        results and pair_id is the list of ``pair_ID`` values.
    """
    def _embed(sentence):
        # np.vstack raises on an empty list, so a sentence with no GloVe match
        # becomes an empty (0, vector_size) matrix that fit_to_size pads with zeros.
        rows = sentence2sequence(sentence)[0]
        return np.vstack(rows) if rows else np.zeros((0, vector_size))

    evi_sentences = []
    hyp_sentences = []
    scores = []
    pair_id = []
    with open(path, "r") as data:
        for row in csv.DictReader(data, delimiter='\t'):
            # Lower-case each sentence once and reuse it for both features.
            sentence_a = row["sentence_A"].lower()
            sentence_b = row["sentence_B"].lower()
            scores.append(sentence_similarity(sentence_a, sentence_b))
            hyp_sentences.append(_embed(sentence_a))
            evi_sentences.append(_embed(sentence_b))
            pair_id.append(row["pair_ID"])

    # Bring every sentence matrix to a common shape so they can be stacked.
    hyp_sentences = np.stack([fit_to_size(x, (max_hypothesis_length, vector_size))
                      for x in hyp_sentences])
    evi_sentences = np.stack([fit_to_size(x, (max_evidence_length, vector_size))
                  for x in evi_sentences])

    return (hyp_sentences, evi_sentences), scores, pair_id


In [56]:
# Load the test file: padded sentence matrices, similarity scores and pair IDs
# (no labels are read by split_data_into_scores1).
data_feature_listt, correct_scoret,paid_idt = split_data_into_scores1()

  


In [57]:
# Convert the list of similarity scores to a NumPy array before prediction.
correct_scoret=np.array(correct_scoret)

In [58]:
predictions = estimator.predict(correct_scoret)
# Decode with an encoder fitted on the training label strings, rather than on
# whatever data the shared ``encoder`` name was last fitted with, so the output
# always contains the label names the model was trained on.
# NOTE(review): assumes estimator.predict returns the integer class ids that
# LabelEncoder assigned to y_train - confirm against the Keras wrapper.
label_decoder = LabelEncoder().fit(y_train)
prediction_encoder = label_decoder.inverse_transform(predictions)

In [59]:
# One row per test pair: its ID and the decoded prediction.  Written without
# the index column and with quoting disabled (csv.QUOTE_NONE has the value 3).
output_DT = pd.DataFrame(
    data={"pair_ID":paid_idt,"entailment_judgment":prediction_encoder}
)
output_DT.to_csv(
    "/Users/zoec/Documents/cus 640/PROJECT2/judgment.csv",
    index=False,
    quoting=csv.QUOTE_NONE,
)