In [1]:
from __future__ import print_function

import os
import sys, csv
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

Using TensorFlow backend.


In [2]:
def load_labelled_csvs(directory):
    """Collect text/label pairs from every file found under ``directory``.

    Every file reached by ``os.walk(directory)`` is parsed as CSV. The first
    row of each file is treated as a header and skipped. For each remaining
    row, column 0 is taken as the text and column 1 as the class marker,
    which is mapped to 0 when it equals ``'f'`` and to 1 otherwise.

    Files are visited in whatever order ``os.walk`` yields them; the order is
    deliberately left untouched so downstream vocabulary building sees the
    same sequence of texts as before.

    Returns:
        A ``(texts, labels)`` tuple of two parallel lists. A directory that
        does not exist or contains no files yields two empty lists.
    """
    found_texts = []
    found_labels = []
    for root, _dirs, files in os.walk(directory):
        for name in files:
            with open(os.path.join(root, name)) as csvfile:
                reader = csv.reader(csvfile)
                # The built-in next() works on Python 2.6+ and Python 3,
                # whereas reader.next() only exists on Python 2. The default
                # keeps a completely empty file from raising StopIteration.
                next(reader, None)
                for row in reader:
                    if not row:
                        # csv.reader yields [] for blank lines; nothing to label.
                        continue
                    found_texts.append(row[0])
                    found_labels.append(0 if row[1] == 'f' else 1)
    return found_texts, found_labels


# Training corpus: sentences and their 0/1 labels, gathered from both
# dataset directories in this fixed order.
texts = []
labels = []
for dataset_dir in ('../Datasets/mpqa535/', '../Datasets/120/'):
    dir_texts, dir_labels = load_labelled_csvs(dataset_dir)
    texts.extend(dir_texts)
    labels.extend(dir_labels)

# Fit a Keras Tokenizer on every loaded text. The fitted `tokenizer` is the
# object later used to turn new sentences into integer sequences.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts)

In [3]:
from keras import backend as K

def precision(y_true, y_pred):
    """Batch-wise precision metric built from Keras backend ops.

    Both the element-wise product ``y_true * y_pred`` and ``y_pred`` itself
    are clipped to [0, 1] and rounded before being summed, giving the count
    of true positives and of predicted positives respectively. The result is
    their ratio, with ``K.epsilon()`` added to the denominator to avoid a
    division by zero.
    """
    hit_mask = K.round(K.clip(y_true * y_pred, 0, 1))
    predicted_mask = K.round(K.clip(y_pred, 0, 1))
    return K.sum(hit_mask) / (K.sum(predicted_mask) + K.epsilon())

def recall(y_true, y_pred):
    """Batch-wise recall metric built from Keras backend ops.

    The element-wise product ``y_true * y_pred`` and ``y_true`` itself are
    clipped to [0, 1] and rounded before being summed, giving the count of
    true positives and of actual positives respectively. The result is their
    ratio, with ``K.epsilon()`` added to the denominator to avoid a division
    by zero.
    """
    hit_mask = K.round(K.clip(y_true * y_pred, 0, 1))
    actual_mask = K.round(K.clip(y_true, 0, 1))
    return K.sum(hit_mask) / (K.sum(actual_mask) + K.epsilon())

def fbeta_score(y_true, y_pred, beta=1):
    """F-beta score computed from ``precision`` and ``recall``.

    Args:
        y_true: ground-truth labels (backend tensor).
        y_pred: predicted scores (backend tensor).
        beta: weight of recall relative to precision; ``beta=0`` reduces the
            score to precision alone.

    Returns:
        ``(1 + beta**2) * p * r / (beta**2 * p + r + K.epsilon())``.

    Raises:
        ValueError: if ``beta`` is negative.
    """
    if beta < 0:
        raise ValueError('The lowest choosable beta is zero (only precision).')

    p = precision(y_true, y_pred)
    r = recall(y_true, y_pred)
    bb = beta ** 2
    # No explicit "no positives -> 0" branch: testing a backend tensor with a
    # Python `== 0` / `if` does not evaluate the tensor's value (it is either
    # never true or an error, depending on the backend version). The formula
    # already yields 0 in that case, because both p and r have a zero
    # numerator and the epsilon keeps the denominator non-zero.
    return (1 + bb) * (p * r) / (bb * p + r + K.epsilon())

def fmeasure(y_true, y_pred):
    """F1 score: ``fbeta_score`` evaluated with ``beta`` fixed to 1."""
    f1 = fbeta_score(y_true, y_pred, 1)
    return f1

from keras.models import load_model

# Load the saved network from disk. The three metric functions are passed
# under their own names via custom_objects so the loader can look them up.
model = load_model(
    "../Projects/OpinionDetection/Notebooks/lstm_dep_all.h5",
    custom_objects={
        'precision': precision,
        'recall': recall,
        'fmeasure': fmeasure,
    },
)

In [5]:
import nltk.data
import os
from nltk.tokenize import sent_tokenize
# tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

# Score every file under ../Datasets/GuardianNews/ and write one CSV row per
# article: file name, mean of the model's outputs over its sentences, then up
# to five sentences with the highest outputs (highest first).
# NOTE(review): on Python 3 the csv docs recommend opening the output with
# newline=''; plain 'w' is kept because Python 2's built-in open() has no
# such argument.
with open('result_guardian.csv','w') as csvfile:
    writer = csv.writer(csvfile)
    for root, dirs, files in os.walk('../Datasets/GuardianNews/'):
        for f in files:
            article_path = os.path.join(root, f)
            # Read and split first so the file handle is released before the
            # (slower) model inference runs.
            with open(article_path) as article:
                sents = sent_tokenize(article.read())
            if not sents:
                # An empty article produces no sentences; feeding a zero-row
                # batch to the model and averaging an empty score array is
                # not meaningful, so report it and move on.
                print('Skipping article with no sentences: %s' % article_path)
                continue
            sequences = tokenizer.texts_to_sequences(sents)
            # NOTE(review): 141 is presumably the input length the loaded
            # model expects — confirm against the training notebook.
            X = pad_sequences(sequences, maxlen=141)
            # Flattened to 1-D; assumes the model emits one value per
            # sentence — TODO confirm.
            scores = model.predict(X).reshape(-1)
            # argsort is ascending: take the last five, then reverse.
            top5Index = scores.argsort()[-5:][::-1]
            out = [f, np.mean(scores)]
            out.extend(sents[i] for i in top5Index)
            writer.writerow(out)