# Collection of all features

In [62]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import f1_score
#from vectorize import get_pos_tag, vectorize_word, vectorize_pos_n, get_freqs
from sklearn.inspection import permutation_importance
from sklearn.linear_model import LogisticRegression
from matplotlib import pyplot

## Subjectivity and Polarity using Pattern.en

Subjectivity ranges from 0.0 (objective) to +1.0 (subjective). Polarity ranges from -1.0 to +1.0.

In [25]:
from pattern.en import sentiment
def subjectivity(sentence):
    """Return the Pattern.en subjectivity score of `sentence`.

    `sentiment(sentence)` yields a (polarity, subjectivity) pair; this picks
    the second element.
    """
    scores = sentiment(sentence)
    return scores[1]

def polarity(sentence):
    """Return the Pattern.en polarity score of `sentence`.

    `sentiment(sentence)` yields a (polarity, subjectivity) pair; this picks
    the first element.
    """
    scores = sentiment(sentence)
    return scores[0]

In [26]:
# Example call: subjectivity score for a single sentence.
subjectivity("in my opinion the cat is the best creature")

0.3

## Sentiment using VADER

Continuous range between -1.0 (extremely negative) and +1.0 (extremely positive)

In [7]:
import vaderSentiment
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
def sentiment_analyzer_scores(sentence):
    """Return VADER's compound sentiment score for `sentence`.

    The analyser is created on first use and cached on the function object,
    so calling this once per sentence of a corpus does not re-instantiate
    SentimentIntensityAnalyzer every time.
    """
    analyser = getattr(sentiment_analyzer_scores, '_analyser', None)
    if analyser is None:
        analyser = SentimentIntensityAnalyzer()
        sentiment_analyzer_scores._analyser = analyser
    score = analyser.polarity_scores(sentence)
    return score['compound']

In [8]:
# Example call: compound score for a short negative sentence.
sentiment_analyzer_scores("phone is bad")

-0.5423

In [11]:
# Show the full score dictionary (not just 'compound') for the same sentence.
analyser = SentimentIntensityAnalyzer()
analyser.polarity_scores("phone is bad")

{'compound': -0.5423, 'neg': 0.636, 'neu': 0.364, 'pos': 0.0}

## Modality Pattern.en

The mood() function returns either INDICATIVE, IMPERATIVE, CONDITIONAL or SUBJUNCTIVE for a given parsed Sentence. See the table below for an overview of moods.

The modality() function returns the degree of certainty as a value between -1.0 and +1.0, where values > +0.5 represent facts. For example, "I wish it would stop raining" scores -0.35, whereas "It will stop raining" scores +0.75. Accuracy is about 68% for Wikipedia texts.

<img src="../pictures/modality_table.png" width="650" height="350" />

In [32]:
from pattern.en import parse, Sentence, parse
from pattern.en import modality, mood
def modal(s):
    """Return the (mood, modality) pair that Pattern.en assigns to the raw string `s`."""
    # mood() and modality() take a parsed Sentence, so parse (with lemmata) and wrap first.
    parsed_sentence = Sentence(parse(s, lemmata=True))
    return mood(parsed_sentence), modality(parsed_sentence)
# Example call: returns the (mood, modality) pair for one sentence.
modal("Some amino acids tend to be acidic while others may be basic.")

('indicative', 0.1111111111111111)

## Readability using Flesch-Kincaid Grade Level Formula

$$\text{FKGL} = 0.39 \cdot \frac{\text{total words}}{\text{total sentences}} + 11.8 \cdot \frac{\text{total syllables}}{\text{total words}} - 15.59$$

The result is the US grade level readability.

In [44]:
from nltk.corpus import cmudict
# Pronunciation lookup table used by nsyl(); built once here so it is not rebuilt per call.
d = cmudict.dict()

def nsyl(word):
    """Count the syllables in `word`.

    Returns a list with one count per pronunciation stored in `d` for the
    lower-cased word; each count is the number of phonemes whose last
    character is a digit. If the word is missing from `d`, the result of the
    heuristic `syllables(word)` is returned instead -- note that this is a
    single int, not a list.
    """
    try:
        pronunciations = d[word.lower()]
    except KeyError:
        # word not found in cmudict: fall back to the vowel-group heuristic
        return syllables(word)
    return [sum(1 for phoneme in phones if phoneme[-1].isdigit())
            for phones in pronunciations]
def syllables(word):
    """Estimate the number of syllables in `word` with a vowel-group heuristic.

    Counts each run of vowels ('aeiouy') as one syllable, subtracts one for a
    trailing 'e', adds one back for a trailing 'le', and never returns less
    than 1 for a non-empty word. An empty string yields 0.
    """
    #referred from stackoverflow.com/questions/14541303/count-the-number-of-syllables-in-a-word
    vowels = 'aeiouy'
    word = word.lower()
    if not word:
        # Nothing to count; also avoids an IndexError on word[0] below.
        return 0
    count = 0
    if word[0] in vowels:
        count += 1
    for index in range(1, len(word)):
        # A vowel that follows a non-vowel starts a new vowel group.
        if word[index] in vowels and word[index-1] not in vowels:
            count += 1
    if word.endswith('e'):
        count -= 1
    if word.endswith('le'):
        count += 1
    if count == 0:
        count += 1
    return count

def FKGL(s, total_sents=1):
    """Flesch-Kincaid Grade Level of the text `s`.

    Computes 0.39 * words/sentences + 11.8 * syllables/words - 15.59, where
    words are the whitespace-separated tokens of `s` and `total_sents` is the
    number of sentences `s` contains (default 1).

    Syllables are counted per word via nsyl(), after stripping leading and
    trailing punctuation from each token so that e.g. "basic." is looked up
    as "basic". Passing the whole text to nsyl() at once would miss the
    dictionary for anything longer than one word.

    Raises ZeroDivisionError if `s` contains no words or `total_sents` is 0.
    """
    import string  # local import keeps this cell self-contained

    words = s.split()
    total_words = len(words)
    num_syllables = 0
    for raw_word in words:
        token = raw_word.strip(string.punctuation)
        if not token:
            # Token was pure punctuation: counts as a word but has no syllables.
            continue
        counts = nsyl(token)
        # nsyl() returns a list (one count per pronunciation) on a dictionary
        # hit and a bare int on the heuristic fallback; use the first pronunciation.
        num_syllables += counts[0] if isinstance(counts, list) else counts
    return 0.39*total_words/total_sents + 11.8*num_syllables/total_words - 15.59

## Opening All Files for Corpus Features

Open all files for assertives, entailments, hedges etc.

In [69]:
# NOTE: chrispath is a machine-specific absolute path; set path_to_use to the
# location of the LinguisticModels_bias_detection data on your machine.
chrispath = "/Users/chrisdwyer/Bias_Detection/2013_Linguistic_Models_Paper/npov-edits/LinguisticModels_bias_detection"
darcypath = 'asdf'
path_to_use = chrispath

bias_lexicon_file = open(path_to_use+'/bias-lexicon/bias-lexicon.txt','r')
implicatives_file = open(path_to_use+'/bias_related_lexicons/implicatives_karttunen1971.txt','r')
assertives_file = open(path_to_use+'/bias_related_lexicons/assertives_hooper1975.txt','r')
factives_file = open(path_to_use+'/bias_related_lexicons/factives_hooper1975.txt','r')
hedges_file = open(path_to_use+'/bias_related_lexicons/hedges_hyland2005.txt','r')
other_file = open(path_to_use+'/bias_related_lexicons/other_lexicons.txt','r')
report_verbs_file = open(path_to_use+'/bias_related_lexicons/report_verbs.txt','r')
entailments_file = open(path_to_use+'/entailments/reverb_global_clsf_all_tncf_lambda_0.1.txt','r')
strong_subjectives_file = open(path_to_use+'/subjectivity_clues/strongsubj.csv','r')
weak_subjectives_file = open(path_to_use+'/subjectivity_clues/weaksubj.csv','r')

# One entry per line; the slices drop the leading lines of each file
# (presumably header/comment lines -- verify against the files).
bias_lexicon = bias_lexicon_file.read().strip().split('\n')
assertives = assertives_file.read().strip().split('\n')[7:]
factives = factives_file.read().strip().split('\n')[7:]
hedges = hedges_file.read().strip().split('\n')[7:]
other_lexicon = other_file.read().strip().split('\n')
report_verbs = report_verbs_file.read().strip().split('\n')[9:]
entailments_prestrip = entailments_file.read().strip().split('\n')

# Close the handles that have been fully read above so they are not leaked.
# implicatives_file and the strong/weak subjectives files are not read in
# this span, so they are deliberately left open.
for _consumed_file in (bias_lexicon_file, assertives_file, factives_file, hedges_file,
                       other_file, report_verbs_file, entailments_file):
    _consumed_file.close()

# Strong/weak subjectives
# TODO: Word, Priorpolarity (PP) headers
# Rows are de-duplicated through a set, so their order is not preserved.
strong_subjectives = list(set(strong_subjectives_file.read().strip().split('\n')))
weak_subjectives = list(set(weak_subjectives_file.read().strip().split('\n')))
# Both files are fully read at this point; release the handles.
strong_subjectives_file.close()
weak_subjectives_file.close()
strong_subjectives_withPP = [row.split(',') for row in strong_subjectives]
weak_subjectives_withPP = [row.split(',') for row in weak_subjectives]
# Build each word list on its own: iterating the two lexicons together with
# zip() would stop at the shorter one and silently drop the tail of the longer.
strong_subjectives_list = [row[0] for row in strong_subjectives_withPP]
weak_subjectives_list = [row[0] for row in weak_subjectives_withPP]

## Entailment Data

Use the `entailment_sorter` function to clean the entailment data and load it into a DataFrame; the entailing predicates are then extracted as a list.

In [68]:
def entailment_sorter(arr, length_entailing_predicate = 1, orderXY=True):
    '''
    Turn raw entailment lines into a DataFrame of predicate pairs.

    Each element of arr is split on a tab into an entailing predicate and an entailed
    predicate. Pairs whose entailing predicate has more than length_entailing_predicate
    whitespace-separated words are dropped. The '@R@' marker is removed from both
    predicates in the output.

    Columns are always 'Entailing Predicate' and 'Entailed Predicate'. With orderXY=True
    two boolean columns are added, one per predicate: True means the first argument is X
    and the second is Y (no '@R@' marker), False means the first argument is Y and the
    second is X ('@R@' present).
    '''
    # TODO: what happens when we want a longer length_entailing_predicate?
    marker = '@R@'
    columns = ['Entailing Predicate', 'Entailed Predicate']
    if orderXY:
        columns += ['X.Y=T/F Entailing Pred.', 'X.Y=T/F Entailed Pred.']

    rows = []
    for line in arr:
        entailing, entailed = line.split('\t')
        if len(entailing.split()) > length_entailing_predicate:
            continue
        row = [entailing.replace(marker, ''), entailed.replace(marker, '')]
        if orderXY:
            row += [marker not in entailing, marker not in entailed]
        rows.append(row)
    return pd.DataFrame(rows, columns=columns)
    
# Keep only single-word entailing predicates, with the argument-order flags included.
entailments = entailment_sorter(
    entailments_prestrip,
    length_entailing_predicate=1,
    orderXY=True,
)
# Plain Python list of the entailing predicates.
entailing_predicates = entailments['Entailing Predicate'].tolist()

## Vectorize Corpus Features: isInList()

In [70]:
def isInList(dictionaries, word, n_gram):
    '''
    Pass in array of dictionaries, word under inspection and n_gram of words -
    either [3,4,5]-gram length.
    Returns a 1/0 vector saying whether the word, and whether any of its immediately
    surrounding words, are in each dictionary.
    Vector length is always 2 x (# of dictionaries): for each dictionary, the first flag
    is whether word is in it, the second is whether any of the immediately surrounding
    word(s) is in it. If no surrounding words can be determined (n_gram is not 3, 4 or 5
    words long, or the word is not in position 2 or 3 of a 4-gram), the second flag is 0.
    Make sure you input dictionaries in the correct order.
    True = 1, False = 0
    '''
    words_ngram = n_gram.split()
    len_ngram = len(words_ngram)
    surrounding_words = []
    if len_ngram == 3:
        # Whether the target is the first or the last word of a 3-gram, its only
        # immediate neighbour is the middle word (index 1, which is also index -2).
        surrounding_words = [words_ngram[1]]
    elif len_ngram == 4:
        # n_gram is 4 words long, target word is either in position 2 or 3.
        # If the target word repeats, position 2 takes precedence.
        if words_ngram[1] == word:  # target word is 2nd word
            surrounding_words = [words_ngram[0], words_ngram[2]]
        elif words_ngram[2] == word:  # target word is 3rd word
            surrounding_words = [words_ngram[1], words_ngram[3]]
    elif len_ngram == 5:
        # n_gram is 5 words long, target word is in the middle
        surrounding_words = [words_ngram[1], words_ngram[3]]

    tf_vector = []
    for dictionary in dictionaries:
        tf_vector.append(1 if word in dictionary else 0)
        # Always emit the context flag so every vector has the same length;
        # any() over an empty neighbour list is False, i.e. 0.
        tf_vector.append(1 if any(neighbour in dictionary for neighbour in surrounding_words) else 0)
    return tf_vector

def isInBiasLexicon(word,dictionary=bias_lexicon):
    """Return 1 if `word` is in `dictionary` (defaults to `bias_lexicon`), otherwise 0."""
    return 1 if word in dictionary else 0