In [1]:
from __future__ import print_function

import os
import numpy as np
import csv
import cPickle
from nltk.tag import StanfordPOSTagger
from nltk.parse.stanford import StanfordParser
from nltk.parse.stanford import StanfordDependencyParser
from nltk.stem.porter import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer

In [2]:
# Locations of the Stanford POS tagger model/jar and of the Stanford parser jars,
# relative to the notebook's working directory.
_POS_TAGGER_MODEL = '../Tools/stanford-postagger-full-2017-06-09/models/english-bidirectional-distsim.tagger'
_POS_TAGGER_JAR = '../Tools/stanford-postagger-full-2017-06-09/stanford-postagger-3.8.0.jar'
_PARSER_JARS = dict(
    path_to_jar='../Tools/stanford-parser-full-2017-06-09/stanford-parser.jar',
    path_to_models_jar='../Tools/stanford-parser-full-2017-06-09/stanford-parser-3.8.0-models.jar',
)

# POS tagger, constituency parser and dependency parser (the two parsers share the same jars).
st = StanfordPOSTagger(_POS_TAGGER_MODEL, _POS_TAGGER_JAR)
sp = StanfordParser(**_PARSER_JARS)
sdp = StanfordDependencyParser(**_PARSER_JARS)

# Word normalisers used when matching tokens against the lexicons.
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

In [3]:
# Word lists filled from the subjectivity-clues lexicon: the first pair is keyed on the
# line's subjectivity strength, the second pair on its polarity.
strong_polar_words = []
weak_polar_words = []
positive_polar_words = []
negative_polar_words = []
with open('../Datasets/subjectivity_clues_hltemnlp05/subjclueslen1-HLTEMNLP05.tff') as f:
    # Iterate the file lazily instead of materialising every line with readlines().
    for line in f:
        # Split the line ONCE and keep only the value of each key=value token.
        # Tokens without '=' (stray fragments) are dropped so they cannot shift the
        # positions used below; on well-formed lines this is exactly the original
        # positional lookup.
        values = [token.split('=')[1] for token in line.split() if '=' in token]
        if len(values) < 6:
            # Blank or truncated line: nothing usable, and indexing it would raise.
            continue

        # Positions 0 / 2 / 5 hold the strength, the word and the polarity
        # (presumably the type=, word1= and priorpolarity= fields -- verify against the file).
        strength, word, polarity = values[0], values[2], values[5]

        if strength == 'weaksubj':
            weak_polar_words.append(word)
        elif strength == 'strongsubj':
            strong_polar_words.append(word)

        # Any other polarity value is not recorded in either polarity list.
        if polarity == 'positive':
            positive_polar_words.append(word)
        elif polarity == 'negative':
            negative_polar_words.append(word)

In [4]:
liwc_lexicons = []

# Columns of the LIWC dictionary CSV whose entries are collected, listed in the order
# they are appended for each row. Category labels are taken from the original
# comments -- verify against the CSV header.
_LIWC_COLUMNS = (
    68,      # Feel
    26,      # Swear
    60,      # Certain
    64, 65,  # Percept
    87, 88,  # Time
)

with open('../Datasets/LIWC2007dictionary.csv') as f:
    reader = csv.reader(f)
    # Skip the three leading rows. The builtin next() with a default works on both
    # Python 2 and 3 and does not raise if the file is shorter than expected.
    for _ in range(3):
        next(reader, None)
    for row in reader:
        for column in _LIWC_COLUMNS:
            # Guard against rows shorter than the widest column; skip empty cells.
            if column < len(row) and row[column] != '':
                liwc_lexicons.append(row[column])

# Entries containing '*' lose their final character (presumably a trailing wildcard
# marker), so they can be compared to words with a plain equality test.
liwc_lexicons = [i if '*' not in i else i[:-1] for i in liwc_lexicons]

In [5]:
FEATURES = ["strong_polar_words","weak_polar_words","root_verb","acomp","xcomp","advmod","modals","pronouns","LIWC"]

# POS tags whose presence switches on the "pronouns" feature.
_PRONOUN_TAGS = ("PRP", "PRP$", "WP", "WP$")

# Dependency relations that each get a presence flag of the same name.
_DEPENDENCY_FEATURES = ("acomp", "xcomp", "advmod")


def _in_lexicon(forms, lexicon):
    """Return True if any of the word forms in `forms` occurs in `lexicon`."""
    return any(form in lexicon for form in forms)


def extract_features(text):
    """Build the hand-crafted feature vector for one piece of text.

    The text is whitespace-tokenised, POS-tagged with `st` and dependency-parsed
    with `sdp`. Each token is matched against the lexicons in its surface form,
    its Porter stem and its WordNet lemma.

    Parameters
    ----------
    text : str
        Raw text to featurise.

    Returns
    -------
    numpy.ndarray
        Float array of length ``len(FEATURES)``, indexed in ``FEATURES`` order:
        counts of strong / weak polar tokens (a token counted as strong is not
        also counted as weak), polarity of the parse root (1, -1 or 0),
        presence flags for the acomp / xcomp / advmod relations, a modal-verb
        flag, a pronoun flag, and the count of tokens found in the LIWC list.

    Notes
    -----
    Matching is exact: tokens are neither lower-cased nor stripped of
    punctuation before lookup (NOTE(review): confirm this is intended).
    """
    features = np.zeros(len(FEATURES))
    slot = dict((name, position) for position, name in enumerate(FEATURES))

    words = text.split()
    # (surface form, stem, lemma) for every token.
    word_forms = [(w, stemmer.stem(w), lemmatizer.lemmatize(w)) for w in words]

    pos_tags = [tagged[1] for tagged in st.tag(words)]

    # Parse ONCE and reuse the first dependency graph for both the relation
    # labels and the root word. The original ran sdp.raw_parse(text) twice.
    graph = next(sdp.raw_parse(text))
    deps = set(triple[1] for triple in graph.triples())
    root_verb = graph.tree().label()

    ## strong and weak polar words
    for forms in word_forms:
        if _in_lexicon(forms, strong_polar_words):
            features[slot["strong_polar_words"]] += 1
        elif _in_lexicon(forms, weak_polar_words):
            features[slot["weak_polar_words"]] += 1

    ## polarity of root verb (positive is checked before negative)
    root_forms = (root_verb, stemmer.stem(root_verb), lemmatizer.lemmatize(root_verb))
    if _in_lexicon(root_forms, positive_polar_words):
        features[slot["root_verb"]] = 1
    elif _in_lexicon(root_forms, negative_polar_words):
        features[slot["root_verb"]] = -1

    ## presence of aComp, xComp and advMod dependencies
    # NOTE(review): parser releases that emit Universal Dependencies by default may
    # never produce an `acomp` relation -- confirm this flag is ever non-zero.
    for relation in _DEPENDENCY_FEATURES:
        features[slot[relation]] = 1 if relation in deps else 0

    ## opinionated n-grams (not implemented)

    ## presence of modal verbs
    features[slot["modals"]] = 1 if 'MD' in pos_tags else 0

    ## presence of pronouns
    features[slot["pronouns"]] = 1 if any(tag in pos_tags for tag in _PRONOUN_TAGS) else 0

    ## LIWC features: number of tokens matching the LIWC word list
    features[slot["LIWC"]] = sum(1 for forms in word_forms if _in_lexicon(forms, liwc_lexicons))

    return features

In [6]:
# Feature vectors (despite the `_texts` name) and binary labels for every row of
# every CSV file found under the MPQA directory.
mpqa_texts = []
mpqa_labels = []
for root, dirs, files in os.walk('../Datasets/mpqa535/'):
    # os.walk yields entries in arbitrary order; sorting makes the row order of the
    # resulting dataset reproducible across runs and machines.
    dirs.sort()
    for filename in sorted(files):
        with open(os.path.join(root, filename)) as csvfile:
            reader = csv.reader(csvfile)
            # Skip the header row; the default keeps an empty file from raising.
            next(reader, None)
            for row in reader:
                if len(row) < 2:
                    # Blank or incomplete row: no text/label pair to use.
                    continue
                # NOTE(review): an exception raised by extract_features aborts the
                # whole load and leaves both lists partially filled.
                mpqa_texts.append(extract_features(row[0]))
                # Label is 0 when the second column is 'f', otherwise 1.
                mpqa_labels.append(0 if row[1] == 'f' else 1)

AssertionError: 

In [9]:
# Feature vectors (despite the `_texts` name) and binary labels for every row of
# every CSV file found under the Yahoo directory.
yahoo_texts = []
yahoo_labels = []
for root, dirs, files in os.walk('../Datasets/120/'):
    # os.walk yields entries in arbitrary order; sorting makes the row order of the
    # resulting dataset reproducible across runs and machines.
    dirs.sort()
    for filename in sorted(files):
        with open(os.path.join(root, filename)) as csvfile:
            reader = csv.reader(csvfile)
            # Skip the header row; the default keeps an empty file from raising.
            next(reader, None)
            for row in reader:
                if len(row) < 2:
                    # Blank or incomplete row: no text/label pair to use.
                    continue
                # NOTE(review): an exception raised by extract_features aborts the
                # whole load and leaves both lists partially filled.
                yahoo_texts.append(extract_features(row[0]))
                # Label is 0 when the second column is 'f', otherwise 1.
                yahoo_labels.append(0 if row[1] == 'f' else 1)

In [12]:
# Turn the accumulated Python lists into NumPy arrays, one (features, labels)
# pair per dataset.
np_mpqa_texts, np_mpqa_labels = (
    np.asarray(sequence) for sequence in (mpqa_texts, mpqa_labels)
)

np_yahoo_texts, np_yahoo_labels = (
    np.asarray(sequence) for sequence in (yahoo_texts, yahoo_labels)
)

In [17]:
# Persist each array to a .npy file in the current working directory.
for _npy_path, _array in (
    ('np_mpqa_texts.npy', np_mpqa_texts),
    ('np_mpqa_labels.npy', np_mpqa_labels),
    ('np_yahoo_texts.npy', np_yahoo_texts),
    ('np_yahoo_labels.npy', np_yahoo_labels),
):
    np.save(_npy_path, _array)