In [14]:
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
import pandas as pd
from gensim.sklearn_api import W2VTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
import ast
import numpy as np

In [5]:
# Load the training split, decoded as Latin-1. The cells below read its
# 'sentence', 'pos_seq' and 'label_seq' columns.
train_data = pd.read_csv(
    './data_release/train.csv',
    encoding='latin-1',
)

### POS Seq Transforming

In [123]:
# One list of POS tags per sentence (the Word2Vec training corpus).
pos_seqs_list_of_lists = [tag_string.split() for tag_string in train_data['pos_seq']]

# The same tags flattened in corpus order: one entry per token occurrence.
pos_seqs = [tag for sentence_tags in pos_seqs_list_of_lists for tag in sentence_tags]

In [128]:
# Learn a 1-dimensional embedding for every POS tag, then look up the
# embedding of each tag occurrence in the flattened corpus.
pos_model = W2VTransformer(size=1, min_count=1, seed=1)
pos_model = pos_model.fit(pos_seqs_list_of_lists)
posvecs = np.array(pos_model.transform(pos_seqs))
print(posvecs.shape)

(116622, 1)


### Word token transforming

In [None]:
# One list of lowercased, whitespace-split tokens per sentence.
word_token_list_of_lists = [text.lower().split() for text in train_data['sentence']]

# The same tokens flattened in corpus order: one entry per token occurrence.
word_tokens = [token for sentence_tokens in word_token_list_of_lists for token in sentence_tokens]

In [125]:
# Learn a 10-dimensional embedding for every lowercased token, then look up
# the embedding of each token occurrence in the flattened corpus.
word_model = W2VTransformer(size=10, min_count=1, seed=1)
word_model = word_model.fit(word_token_list_of_lists)
wordvecs = np.array(word_model.transform(word_tokens))
print(wordvecs.shape)

(116622, 10)


In [132]:
# Join the features column-wise: word-embedding columns first, POS column last.
feature_parts = (wordvecs, posvecs)
vecs = np.concatenate(feature_parts, axis=1)
vecs.shape

(116622, 11)

In [100]:
# Each 'label_seq' cell holds the string form of a Python list; parse it and
# flatten all labels into one list in corpus order.
label_seqs = [
    label
    for label_string in train_data['label_seq']
    for label in ast.literal_eval(label_string)
]

In [133]:
# Fit the logistic-regression (MaxEnt) classifier on the per-token features.
clf = LogisticRegression(random_state=0, solver='lbfgs', multi_class='ovr')
clf = clf.fit(vecs, label_seqs)

In [134]:
# Load the validation split, decoded as Latin-1 like the training split.
val_data = pd.read_csv(
    './data_release/val.csv',
    encoding='latin-1',
)

In [149]:
# NOTE: both calls score the training feature matrix `vecs`, not val_data.
# x: per-class log-probabilities, y: predicted class labels.
x, y = clf.predict_log_proba(vecs), clf.predict(vecs)

In [140]:
# Inspect one combined feature vector. The original cell indexed with `i`,
# which no visible cell defines (NameError on a fresh kernel), so a concrete
# row is used instead.
vecs[0]

array([ 0.56743711,  0.04605469,  0.28873608, -0.58033472,  0.37423232,
       -0.36497143,  0.520953  , -0.00238229,  0.02582435,  0.09245986,
        0.57349592], dtype=float32)

In [121]:
class MaxEnt_Metaphor_Tagger():
    '''
    Token-level metaphor tagger: Word2Vec features fed into a logistic-regression
    (maximum-entropy) classifier.

    Every whitespace-separated token of every training sentence is one sample.
    Its feature vector is the 10-dimensional embedding of the lowercased token
    followed by the 1-dimensional embedding of the POS tag at the same position
    in the flattened corpus (tokens and tags are assumed to be aligned one-to-one).

    Attributes set by __init__:
        pos_seqs_list_of_lists, word_token_list_of_lists: per-sentence token lists.
        pos_seqs, word_tokens: the same tokens flattened in corpus order.
        sample_indices: for each entry of word_tokens, the positional index of
            the sentence it came from.
        pos_model, word_model: the fitted W2VTransformer instances.
        posvecs, wordvecs, vecs: POS, word and concatenated feature arrays.
        label_seqs: flattened target labels.
        classifier: the fitted LogisticRegression.
    '''

    def __init__(self, train_data):
        '''
        Build the feature matrix from train_data and fit the classifier.

        train_data: dataframe (or any mapping of column name -> iterable) with
            'sentence':  the sentence text, tokenised on whitespace,
            'pos_seq':   whitespace-separated POS tags,
            'label_seq': string form of a Python list of metaphor labels
                         (0: not metaphor, 1: metaphor), parsed with
                         ast.literal_eval.

        Raises ValueError if the total numbers of word tokens, POS tags and
        labels differ.
        '''
        # Transform POS_Seq: split once, then flatten the per-sentence lists.
        self.pos_seqs_list_of_lists = [pos_seq.split() for pos_seq in train_data['pos_seq']]
        self.pos_seqs = [tag for tags in self.pos_seqs_list_of_lists for tag in tags]

        # Transform Word Tokens
        self.word_token_list_of_lists = [sentence.lower().split() for sentence in train_data['sentence']]
        self.word_tokens = []
        self.sample_indices = []
        for i, tokens in enumerate(self.word_token_list_of_lists):
            self.word_tokens += tokens
            # One entry per *token* (not per character of the sentence), so
            # sample_indices[k] is the sentence of word_tokens[k].
            self.sample_indices += [i] * len(tokens)

        # Create Target Vector
        self.label_seqs = []
        for label_seq in train_data['label_seq']:
            self.label_seqs += ast.literal_eval(label_seq)

        # Fail early with a readable message; otherwise the mismatch only
        # surfaces later inside np.concatenate or LogisticRegression.fit.
        if not (len(self.word_tokens) == len(self.pos_seqs) == len(self.label_seqs)):
            raise ValueError(
                'train_data is misaligned: %d word tokens, %d POS tags, %d labels'
                % (len(self.word_tokens), len(self.pos_seqs), len(self.label_seqs)))

        # Embed POS tags (1 dimension) and word tokens (10 dimensions).
        self.pos_model = W2VTransformer(size=1, min_count=1, seed=1)
        self.posvecs = np.array(self.pos_model.fit(self.pos_seqs_list_of_lists).transform(self.pos_seqs))

        self.word_model = W2VTransformer(size=10, min_count=1, seed=1)
        self.wordvecs = np.array(self.word_model.fit(self.word_token_list_of_lists).transform(self.word_tokens))

        # NOTE: a per-token TF-IDF feature (TfidfVectorizer fitted on the
        # lowercased sentences, looked up through sample_indices) was tried
        # here and is currently disabled. If it is revived:
        #   * index the sparse matrix as tfidf_vecs[sentence_index, word_index];
        #     the chained form tfidf_vecs[sentence_index][word_index] raises
        #     IndexError on a csr_matrix;
        #   * tokens absent from the vectorizer's feature names make
        #     get_feature_names().index(word) raise ValueError and need handling.

        # Concatenate Feature Vectors: word columns first, POS column last.
        self.vecs = np.concatenate((self.wordvecs, self.posvecs), axis=1)

        # Train MaxEnt classifier
        self.classifier = LogisticRegression(random_state=0, solver='lbfgs', multi_class='ovr').\
            fit(self.vecs, self.label_seqs)

    def predict_log_proba(self, vectors):
        '''Return the classifier's per-class log-probabilities for feature rows laid out like self.vecs.'''
        return self.classifier.predict_log_proba(vectors)

    def predict(self, vectors):
        '''Return the classifier's predicted labels for feature rows laid out like self.vecs.'''
        return self.classifier.predict(vectors)
    
    

In [122]:
# Build the tagger; its constructor derives the features from train_data and
# fits the classifier.
maxent = MaxEnt_Metaphor_Tagger(train_data=train_data)

tfidf shape: (6323, 12113)
  (0, 1703)	0.398479411026
  (0, 4140)	0.603994705411
  (0, 11021)	0.147714454311
  (0, 1198)	0.234067700161
  (0, 3820)	0.632295268488
sentence index: 0
word index: 1703
<class 'scipy.sparse.csr.csr_matrix'>


IndexError: index (1703) out of range

In [None]:
# tfidf_vecs only exists while the TF-IDF experiment inside
# MaxEnt_Metaphor_Tagger.__init__ is enabled; show None instead of raising
# AttributeError when it is disabled.
getattr(maxent, 'tfidf_vecs', None)

In [82]:
# Number of cells a dense TF-IDF matrix of shape (6323, 12113) would hold.
6323 * 12113

76590499

In [87]:
# Display the raw sentence column (pandas truncates the middle rows).
train_data.loc[:, 'sentence']

0                        Ca n't fail to be entertaining .
1                     How much was he going to tell her ?
2       Up until that news hit the Committee , Don had...
3       Could go on to the rugby and go with them coul...
4       Finally , we went to the office and they gave ...
                              ...                        
6318    In a voice of soft persuasion , she said , Wil...
6319    It is a symptom of public anxiety about urban ...
6320                           I do n't like Miss Fitch .
6321    A fern-like plant , beautifully preserved in a...
6322    And there were never more than a few dozen rin...
Name: sentence, Length: 6323, dtype: object

In [92]:
# Scratch check: assigning through the loop variable mutates the inner list
# stored in `x`, because the loop variable refers to that same list object.
x = [list(range(10))]
for row in x:
    row[4] = 123123

x

[[0, 1, 2, 3, 123123, 5, 6, 7, 8, 9]]