In [189]:
import pandas as pd
import numpy as np
from vectorize import get_pos_tag, vectorize_word, vectorize_pos_n, get_freqs

train_alldata = pd.read_csv("./final_datasets/train-sents-final1.csv")
dev_alldata = pd.read_csv("./final_datasets/dev-sents-final1.csv")
test_alldata = pd.read_csv("./final_datasets/test-sents-final1.csv")

def data_stripper(arr):
    new_data = arr[:]
    new_data.columns = ['Bias Inducing Word','De-Biased Word','Sentence Pre-Edit','Sentence Post-Edit']
    return new_data[['Bias Inducing Word','Sentence Pre-Edit']]

train,dev,test = data_stripper(train_alldata),data_stripper(dev_alldata),data_stripper(test_alldata)
print(train[5:10])

  Bias Inducing Word                                  Sentence Pre-Edit
5           followed     relatively few parish clergy or laity followed
6       denomination  today the church of ireland is after the roman...
7        institution  the church of ireland came into existence as a...
8          civilians  controversy has arisen numerous times and from...
9             naming  the soviets formed a special commission which ...


In [190]:
def entailment_sorter(arr, length_entailing_predicate = 1, orderXY=True):
    '''
    Takes entailment dataset and distills it into usable information. Use params to get
    the output you want. X 'word' Y = True means first argument is X, second is Y. False
    means first argument is Y and second is X. 
    If orderXY = True it includes the last 2 headers:
    Entailing Predicate, Entailed Predicate, X.Y=T/F Entailing Pred., X.Y=T/F Entailed Pred.
    '''
    # TODO: what happens when we want a longer length_entailing_predicate?
    if orderXY:
        data = []
        for e in arr:
            x, y = e.split('\t')
            if len(x.split()) <= length_entailing_predicate:
                x_arg, y_arg = True, True
                if '@R@' in x: x_arg = False
                if '@R@' in y: y_arg = False
                data.append([x.replace('@R@',''), y.replace('@R@',''), x_arg, y_arg])
        df = pd.DataFrame(data, columns=['Entailing Predicate','Entailed Predicate',
                                         'X.Y=T/F Entailing Pred.','X.Y=T/F Entailed Pred.'])
        return df
    else:
        data = []
        for e in arr:
            x, y = e.split('\t')
            if len(x.split()) <= length_entailing_predicate:
                data.append([x.replace('@R@',''), y.replace('@R@','')])
        df = pd.DataFrame(data, columns=['Entailing Predicate','Entailed Predicate'])
        return df

In [191]:
bias_lexicon_file = open('./bias-lexicon/bias-lexicon.txt','r')
implicatives_file = open('./bias_related_lexicons/implicatives_karttunen1971.txt','r')
assertives_file = open('./bias_related_lexicons/assertives_hooper1975.txt','r')
factives_file = open('./bias_related_lexicons/factives_hooper1975.txt','r')
hedges_file = open('./bias_related_lexicons/hedges_hyland2005.txt','r')
other_file = open('./bias_related_lexicons/other_lexicons.txt','r')
report_verbs_file = open('./bias_related_lexicons/report_verbs.txt','r')
entailments_file = open('./entailments/reverb_global_clsf_all_tncf_lambda_0.1.txt','r')
strong_subjectives_file = open('./subjectivity_clues/strongsubj.csv','r')
weak_subjectives_file = open('./subjectivity_clues/weaksubj.csv','r')
bias_lexicon = bias_lexicon_file.read().strip().split('\n')
assertives = assertives_file.read().strip().split('\n')[7:]
factives = factives_file.read().strip().split('\n')[7:]
hedges = hedges_file.read().strip().split('\n')[7:]
other_lexicon = other_file.read().strip().split('\n')
report_verbs = report_verbs_file.read().strip().split('\n')[9:]
entailments_prestrip = entailments_file.read().strip().split('\n')

# Strong/weak subjectives
# TODO: Word, Priorpolarity (PP) headers
strong_subjectives = list(set(strong_subjectives_file.read().strip().split('\n')))
weak_subjectives = list(set(weak_subjectives_file.read().strip().split('\n')))
strong_subjectives_withPP = [strong_subjectives[i].split(',') for i in range(len(strong_subjectives))]
weak_subjectives_withPP = [weak_subjectives[i].split(',') for i in range(len(weak_subjectives))]
strong_subjectives_list, weak_subjectives_list = [], []
for ss_row, ws_row in zip(strong_subjectives_withPP, weak_subjectives_withPP):
    strong_subjectives_list.append(ss_row[0])
    weak_subjectives_list.append(ws_row[0])

# Using Entailments function
entailments = entailment_sorter(entailments_prestrip, length_entailing_predicate = 1, 
                                orderXY=True)

entailing_predicates = list(entailments['Entailing Predicate'])


In [192]:
def isInList(dictionaries, word, n_gram):
    '''
    Pass in array of dictionaries, word under instpection and n_gram of words - 
    either [3,4,5]-gram length.
    Returns True/False vector if word and if surrounding words are in the dictionary. 
    Vector length is 2 x (# of dictionaries), first T/F is if word is in dictionary, second
    T/F if any of the immediately surrounding word(s) is in dictionary.
    Make sure you input dictionaries in the correct order.
    '''
    tf_vector = []
    len_ngram, words_ngram = len(n_gram.split()), np.array(n_gram.split())
    surrounding_words = []
    if len_ngram == 3:
        if word == words_ngram[0]: surrounding_words.append(words_ngram[1])
        else: surrounding_words.append(words_ngram[-2])
    elif len_ngram == 4:
        # n_gram is 4 words long, target word is either in position 2 or 3 
        word_index = np.where(word == words_ngram)[0]
        if 1 in word_index: # target word is 2nd word
            surrounding_words.append(words_ngram[0])
            surrounding_words.append(words_ngram[2])
        elif 2 in word_index: # target word is 3rd word
            surrounding_words.append(words_ngram[1])
            surrounding_words.append(words_ngram[3])
        # only issue is if the target word repeats?
    else:
        # n_gram is 5 words long, target word is in the middle
        surrounding_words.append(words_ngram[1]) 
        surrounding_words.append(words_ngram[4])

    for dictionary in dictionaries:
        if word in dictionary: tf_vector.append(True)
        else: tf_vector.append(False)
        for surrounding_word in surrounding_words:
            if surrounding_word in dictionary:
                tf_vector.append(True)
                break
            else:
                # If last word in surrounding_words list, then neither word is in dictionary
                if surrounding_word == surrounding_words[-1]:
                    tf_vector.append(False)
    return tf_vector

def isInBiasLexicon(word,dictionary=bias_lexicon):
    if word in dictionary: return True
    return False
    

In [194]:
print(isInList([assertives,factives,hedges,report_verbs,
          entailing_predicates,strong_subjectives_list,weak_subjectives_list],
         'think','a large think was committed'))
print(isInBiasLexicon('murder'))

[True, False, False, False, False, False, True, False, True, False, True, False, True, False]
True


In [195]:
def one_hot_encode(tag, cat_vars=
                    np.array(['CC','CD','DT','EX','FW','IN','JJ','JJR','JJS','LS','MD',
                              'NN','NNS','NNP','NNPS','PDT','POS','PRP','PRP$','RB','RBR',
                              'RBS','RP','TO','UH','VB','VBD','VBG','VBN','VBP','VBZ','WDT',
                              'WP','WP$','WRB'])):
    vector = np.zeros(len(cat_vars))
    vector[np.where(tag==cat_vars)] += 1
    return vector
one_hot_encode('CD')

array([0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0.])

In [196]:
data = [[1,2],[3,4]]
new = []
for row in data:
    new.append(row[0])
new

[1, 3]

In [197]:
a = [1,2,3,4]
a[1],a[3]

(2, 4)

In [198]:
def get_context(pair, tagged_sent):
    index, max_index = tagged_sent.index(pair), len(tagged_sent)-1
    context = ''
    for n in range(-2, 3):
        if (index + n < 0) or (index + n > max_index):
            continue
        else:
            context += tagged_sent[index+n][0] + ' '
    return context

In [242]:
def vectorize_df(df, POS_FREQ_DIST, WORD_FREQ_DIST, corpus_list, bias_lexicon_list):
    df['tagged_sent'] = df.apply(get_pos_tag, axis = 1)
    attiribute_matrix = []
    result_vector = []
    for row in df.index:
        tagged_sent = df['tagged_sent'].iloc[row]
        # FIXME: this definition of word may not match definition in 'Bias Inducing Word'
        for pair in tagged_sent:
            current_vector = []
            current_vector.append(vectorize_word(pair[0], WORD_FREQ_DIST))
            for n in range(-1,2):
                current_vector.append(vectorize_pos_n(pair[1], n, POS_FREQ_DIST))
            corpus_tf = isInList(corpus_list, pair[0], get_context(pair, tagged_sent))
            for tf in corpus_tf: current_vector.append(tf)
            current_vector.append(isInBiasLexicon(pair[0]))
            is_bias = pair[0] == df['Bias Inducing Word'].iloc[row]
            result_vector.append(is_bias)
            attiribute_matrix.append(current_vector)
    return attiribute_matrix, result_vector

In [243]:
POS_FREQ_DIST, WORD_FREQ_DIST = get_freqs("./final_datasets/train-sents-final1.csv")
corpus_list = [assertives,factives,hedges,report_verbs,
          entailing_predicates,strong_subjectives_list,weak_subjectives_list]
bias_lexicon_list = bias_lexicon

In [244]:
a, b = vectorize_df(train[:20], POS_FREQ_DIST, WORD_FREQ_DIST, corpus_list, bias_lexicon_list)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [245]:
a

[[0,
  0.47216675510375067,
  0.46680784276610515,
  0.3572268555010297,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False],
 [0.07692307692307693,
  0.5715204114432657,
  0.5883886064713951,
  0.7656423763535264,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  True],
 [0,
  0.8102791315906069,
  0.8689522856189523,
  0.8068777732956837,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False],
 [0,
  0.4829563366960908,
  0.4847978988603988,
  0.49666331802525826,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False],
 [0,
  0.4829563366960908,
  0.4847978988603988,
  0.49666331802525826,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,

In [237]:
x = []


SyntaxError: invalid syntax (<ipython-input-237-c97f7cecfd4b>, line 2)

In [236]:
x[0]

[1, 2, 3, 4]