In [275]:
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
import pandas as pd
from gensim.sklearn_api import W2VTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
import ast
import numpy as np
import math
from collections import Counter

In [276]:
train_data = pd.read_csv('./data_release/train.csv', encoding='latin-1')

### POS Seq Transforming

In [125]:
pos_seqs_list_of_lists = [pos_seq.split() for pos_seq in train_data['pos_seq']]

pos_seqs = []
for pos_seq in train_data['pos_seq']:
    pos_seqs += pos_seq.split()

In [126]:
pos_model = W2VTransformer(size=1, min_count=1, seed=1)
# What is the vector representation of the word 'graph'?
posvecs = pos_model.fit(pos_seqs_list_of_lists).transform(pos_seqs)
posvecs = np.array(posvecs)
print(posvecs.shape)

(116622, 1)


### Word token transforming

In [127]:
word_token_list_of_lists = [sentence.lower().split() for sentence in train_data['sentence']]

word_tokens = []
for sentence in train_data['sentence']:
    word_tokens += sentence.lower().split()

In [128]:
word_model = W2VTransformer(size=10, min_count=1, seed=1)
# What is the vector representation of the word 'graph'?
wordvecs = word_model.fit(word_token_list_of_lists).transform(word_tokens)
wordvecs = np.array(wordvecs)
print(wordvecs.shape)

(116622, 10)


In [129]:
vecs = np.concatenate((wordvecs, posvecs), axis=1)
vecs.shape

(116622, 11)

In [130]:
label_seqs = []
for label_seq in train_data['label_seq']:
    label_seqs += ast.literal_eval(label_seq)

In [131]:
clf = LogisticRegression(random_state=0, solver='lbfgs', multi_class='ovr').fit(vecs, label_seqs)

In [132]:
val_data = pd.read_csv('./data_release/val.csv', encoding='latin-1')

In [133]:
x  = clf.predict_log_proba(vecs)
y = clf.predict(vecs)

In [134]:
vecs[i]

NameError: name 'i' is not defined

In [321]:
class MaxEnt_Metaphor_Tagger():
    def __init__(self, train_data, size):
        '''
        train_data: dataframe of word features where the first column is the sentence, second column
            is a string of a list of POS_Sequences, and third columnn is a string of a list of metaphor 
            label sequences (0: not metaphor, 1: metaphor)
        '''
        self.size = size
        
        # Transform POS_Seq
        self.pos_seqs_list_of_lists = [ast.literal_eval(pos_seq) for pos_seq in train_data['pos_seq']]
        self.pos_seqs = []
        for pos_seq in train_data['pos_seq']:
            self.pos_seqs += ast.literal_eval(pos_seq)
        self.pos_model = W2VTransformer(size=1, min_count=1, seed=1)
        self.posvecs = np.array(self.pos_model.fit(self.pos_seqs_list_of_lists).transform(self.pos_seqs))
        
#         print('pos_Seq', self.pos_seqs)
        
        # Transform Word Tokens
        self.word_token_list_of_lists = [sentence.lower().split() for sentence in train_data['sentence']]
        self.word_tokens = []
        self.sample_indices = []
        for i, sentence in enumerate(train_data['sentence']):
            self.word_tokens += sentence.lower().split()
            self.sample_indices += [i for x in range(len(sentence))]
        
        self.word_model = W2VTransformer(size=self.size, min_count=1, seed=1)
        self.wordvecs = np.array(self.word_model.fit(self.word_token_list_of_lists).transform(self.word_tokens))
        
        #TF-IDF 
#         self.tfidf_vectorizer = TfidfVectorizer()
#         self.tfidf_vecs = self.tfidf_vectorizer.fit_transform([sentence.lower() for sentence in train_data['sentence']])
#         print('tfidf shape:', self.tfidf_vecs.shape)
#         print(self.tfidf_vecs[0])
#         print('feature names:', self.tfidf_vectorizer.get_feature_names())
        
        # Concatenate Feature Vectors
        self.vecs = np.concatenate((self.wordvecs, self.posvecs), axis=1)
        
#         for i, word_vec in enumerate(self.vecs):
#             word = self.word_tokens[i]
#             sentence_index = self.sample_indices[i]
#             word_index = self.tfidf_vectorizer.get_feature_names().index(word)
#             print('sentence index:', sentence_index)
#             print('word index:', word_index)
#             print(type(self.tfidf_vecs))
#             tfidf = self.tfidf_vecs[sentence_index][word_index]
#             word_vec += tfidf
        
        
        # Create Target Vector
        self.label_seqs = []
        for label_seq in train_data['label_seq']:
            self.label_seqs += ast.literal_eval(label_seq)
        
        # Train MaxEnt classifier
        self.classifier = LogisticRegression(random_state=0, solver='lbfgs', multi_class='ovr').\
            fit(self.vecs, self.label_seqs)
        
        # Get tag bigrams for transition probabilities
        self.tag_counts = Counter()
        self.tag_bigrams = {}
        for row in train_data.iterrows():
            
            #preprocess: add start characters and labels for computing initial probabilities
            # and convert strings to lists and downcase sentences
            tags_string = row[1][2]
            tags = ast.literal_eval(tags_string)
            tags.insert(0, '<START>')
            
            #get label bigram counts -- (0,0), (0,1), (1,0), (1,1), ('<START>',0), ('<START>',1)
            for t in range(1, len(tags)):
                tag_bigram = (tags[t-1], tags[t])
                if tag_bigram not in self.tag_bigrams:
                    self.tag_bigrams[tag_bigram] = 1
                else:
                    self.tag_bigrams[tag_bigram] += 1
                    
            #get individual tag counts
            self.tag_counts.update(tags)
    
    def transform_sentence(self, sentence, pos_sequence):
        wordvecs = np.zeros(shape=(len(sentence.split()), self.size))
        posvecs = np.zeros(shape=(len(pos_sequence), 1))
        
        for i, word in enumerate(sentence.lower().split()):
            if word in self.word_tokens:
                wordvecs[i] = self.word_model.transform(word)
            else:
                wordvecs[i] = np.array([None for i in range(0, self.size)])
#                 print(wordvecs[i])
            posvecs[i] = self.pos_model.transform(pos_sequence[i])
        
        
        return np.concatenate((wordvecs, posvecs), axis=1)
    
    def predict_log_proba(self, vectors):
        return self.classifier.predict_log_proba(vectors)
    
    def predict(self, vectors):
        return self.classifier.predict(vectors)
    
    def viterbi(self, feature_vectors):
        previous_log_scores = []
        backpointers = []
        tags = list(self.tag_counts)

        #initialization
        for t in range(1, len(tags)):
            tag = tags[t]
            
            initial_transition_prob = self.tag_bigrams[('<START>', tag)] / self.tag_counts['<START>']
            
            if np.isnan(feature_vectors[0]).any():
                initial_maxent_log_prob = 0
            else:
                initial_maxent_log_prob = self.classifier.predict_log_proba([feature_vectors[0]])[0][t-1]
            
            previous_log_scores.append(math.log(initial_transition_prob) + initial_maxent_log_prob)
        
        #iteration
        #w is index of current word
        for w in range(1, feature_vectors.shape[0]):
            
            log_scores = [None, None]
            w_backpointers = []
            max_log_score_final = (float('-inf'), None)
            
            #t is index of current tag
            for t in range(1, len(tags)):
                
                t_backpointer = None
                max_log_score = (float('-inf'), None)

                #j is index of previous tag
                for j in range(1, len(tags)):
                    
                    transition_prob = self.tag_bigrams[(tags[j], tags[t])] / self.tag_counts[tags[j]]
                    if np.isnan(feature_vectors[w]).any():
                        maxent_log_prob = 0
                    else:
                        maxent_log_prob = self.classifier.predict_log_proba([feature_vectors[w]])[0][t-1]
                    
                    log_score = previous_log_scores[j-1] +  math.log(transition_prob) + maxent_log_prob
                    if log_score > max_log_score[0]:
                        max_log_score = (log_score, j)
                        t_backpointer = j
                        
                    if max_log_score[0] > max_log_score_final[0]:
                        max_log_score_final = max_log_score
                    
                
                log_scores[t-1] = max_log_score[0]
                w_backpointers.append(t_backpointer)
                
            previous_log_scores = log_scores
            backpointers.insert(0, w_backpointers)
        
        #backtracking
        max_index = previous_log_scores.index(max(previous_log_scores)) + 1
        output = [tags[max_index]]
    
        if feature_vectors.shape[0] == 1:
            return output
        
        max_index = max_log_score_final[1]
        for bptrs in backpointers:
            max_index = bptrs[max_index-1]
            output.insert(0, tags[max_index])
            
        return output
    
    

In [322]:
maxent = MaxEnt_Metaphor_Tagger(train_data, 1)

In [323]:
feature_vector = \
    maxent.transform_sentence('he continued , hackles rising .', ['PRON', 'VERB', 'PUNCT', 'NOUN', 'VERB', 'PUNCT'])
maxent.viterbi(feature_vector)

[0, 0, 0, 0, 0, 0]

In [324]:
val_data = pd.read_csv('./data_release/val.csv', encoding='latin-1')
def validate(model, val_data):
    labels = []
    for row in val_data.iterrows():
        sentence = row[1][0]
        feature_vector = model.transform_sentence(sentence, ast.literal_eval(row[1][1]))
        labels += model.viterbi(feature_vector)
    ids = [i for i in range(len(labels))]
    df = pd.DataFrame({'idx': ids, 'label': labels}, columns = ['idx', 'label'])
    return df

In [325]:
validate(maxent, val_data)

Unnamed: 0,idx,label
0,0,0
1,1,0
2,2,0
3,3,0
4,4,0
5,5,0
6,6,0
7,7,0
8,8,0
9,9,0
