In [1]:
import numpy as np
from sklearn.metrics import confusion_matrix
from sklearn.utils.multiclass import unique_labels
from keras.models import load_model
import matplotlib.pyplot as plt
import pickle
% matplotlib inline

import sys
sys.path.append("..")
from scripts.gen import DataGenerator
from scripts.rnn import singleRNN, dualRNN

Using TensorFlow backend.


In [2]:
def plot_confusion_matrix(y_true, y_pred, classes,
                          normalize=False,
                          title=None,
                          cmap=plt.cm.Blues):
    """
    Print and plot the confusion matrix of `y_pred` against `y_true`.

    Parameters
    ----------
    y_true, y_pred : array-like
        True and predicted labels; passed unchanged to
        `sklearn.metrics.confusion_matrix`.
    classes : numpy.ndarray
        Display names indexed by label value. Only the entries whose labels
        occur in `y_true` or `y_pred` are used as tick labels.
    normalize : bool, default False
        If True, every row is divided by its row sum, so each cell is the
        share of that true class. Rows without any true samples are shown
        as zeros instead of NaN.
    title : str, optional
        Figure title; a default depending on `normalize` is used when empty.
    cmap : matplotlib colormap, default `plt.cm.Blues`
        Colormap handed to `imshow`.

    Returns
    -------
    matplotlib.axes.Axes
        The axes the matrix was drawn on.
    """
    if not title:
        if normalize:
            title = 'Normalized confusion matrix'
        else:
            title = 'Confusion matrix, without normalization'

    # Compute confusion matrix
    cm = confusion_matrix(y_true, y_pred)
    # Only use the labels that appear in the data
    classes = classes[unique_labels(y_true, y_pred)]
    if normalize:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        # A label that is only ever predicted (never true) has a zero row
        # sum; dividing by it would yield NaN and poison cm.max() below.
        cm = np.divide(cm.astype('float'), row_totals,
                       out=np.zeros(cm.shape, dtype=float),
                       where=row_totals != 0)
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    fig, ax = plt.subplots(figsize=(10,6))
    im = ax.imshow(cm, interpolation='nearest', cmap=cmap)
    ax.figure.colorbar(im, ax=ax)
    # We want to show all ticks...
    ax.set(xticks=np.arange(cm.shape[1]),
           yticks=np.arange(cm.shape[0]),
           # ... and label them with the respective list entries
           xticklabels=classes, yticklabels=classes,
           title=title,
           ylabel='True label',
           xlabel='Predicted label')

    # Rotate the tick labels and set their alignment.
    plt.setp(ax.get_xticklabels(), rotation=0, ha="right",
             rotation_mode="anchor")

    # Loop over data dimensions and create text annotations.
    fmt = '.2f' if normalize else 'd'
    # Cells darker than half the maximum get white text so they stay legible.
    thresh = cm.max() / 2.
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            ax.text(j, i, format(cm[i, j], fmt),
                    ha="center", va="center",
                    color="white" if cm[i, j] > thresh else "black")
    fig.tight_layout()
    return ax

In [3]:
# Load the two pickled vocabulary lookups from ../data.
# token2word is indexed by token id further down (see printExample);
# word2token is presumably the inverse mapping — TODO confirm.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open('../data/word2token.pickle', 'rb') as f:
    word2token = pickle.load(f)
    
with open('../data/token2word.pickle', 'rb') as f:
    token2word = pickle.load(f)

In [4]:
# Load the three pickled embedding matrices from ../data.
# Only `embedding_matrix` is used in the cells visible here (it is passed to
# singleRNN); the `_oov` and `_inv` variants are presumably for other model
# configurations — TODO confirm.
with open('../data/embedding_matrix.pickle', 'rb') as f:
    embedding_matrix = pickle.load(f)

with open('../data/embedding_matrix_oov.pickle', 'rb') as f:
    embedding_matrix_oov = pickle.load(f)
    
with open('../data/embedding_matrix_inv.pickle', 'rb') as f:
    embedding_matrix_inv = pickle.load(f)

In [5]:
# Load the pickled data partition and wrap each split in a DataGenerator.
with open('../data/partition.pickle', 'rb') as f:
    partition = pickle.load(f)

# Settings shared by every generator in this notebook section.
gen_kwargs = dict(all_text=True, train_oov=False, batch_size=1)

gen_train = DataGenerator(partition, mode='train', **gen_kwargs)
gen_valid = DataGenerator(partition, mode='valid', **gen_kwargs)
gen_test = DataGenerator(partition, mode='test', **gen_kwargs)
# Same test split, but each item also carries its ID (used to print examples).
gen_test_id = DataGenerator(partition, mode='test', **gen_kwargs, return_id=True)

In [7]:
# Build the single-input RNN on the base embedding matrix and restore trained
# weights from a saved checkpoint (per its filename: epoch 4, val loss 0.11,
# val accuracy 0.97).
rnn_single = singleRNN(embedding_matrix)
rnn_single.load_weights("../output/rc_tall/weights/epoch_004-valloss_0.11-valacc_0.97.hdf5")

In [8]:
def preds(model, generator):
    """
    Collect true labels and rounded model predictions for a generator.

    The model scores the whole generator via `predict_generator`; the scores
    are rounded to the nearest integer. True labels are read item by item
    from element [1] of each generator item, one label per item.

    Returns
    -------
    (y_true, y_pred) : tuple of numpy.ndarray
        `y_true` has one integer entry per generator item; `y_pred` is the
        rounded integer version of whatever `predict_generator` returned.
    """
    raw_scores = model.predict_generator(generator)
    y_pred = raw_scores.round().astype(int)

    n_items = len(generator)
    y_true = np.empty(shape=(n_items), dtype=int)
    for idx in range(n_items):
        y_true[idx] = generator[idx][1]

    return y_true, y_pred

def confmat(y_true, y_pred, normalize=False, savefile=None):
    """
    Draw the Fact/Fake confusion matrix and optionally save it to disk.

    Label 0 is displayed as "Fact" and label 1 as "Fake". When `savefile`
    is given, the figure is written there before it is shown.
    """
    class_names = np.array(["Fact","Fake"])
    plot_confusion_matrix(y_true, y_pred,
                          classes=class_names,
                          normalize=normalize)
    if savefile:
        plt.savefig(savefile, bbox_inches='tight')
    plt.show()

In [9]:
# True labels and rounded predictions for the whole test split; these are the
# inputs of the (currently commented-out) confmat calls below.
y_true, y_pred = preds(model=rnn_single, generator=gen_test)

In [11]:
# confmat(y_true, y_pred, normalize=True, savefile='./cmat_tall.png')

In [12]:
# confmat(y_true, y_pred, normalize=False, savefile='./cmat_tall_abs.png')

Find examples of wrong predictions for the progress presentation:

In [13]:
def printExample(ID):
    """
    Print the claim and rating of one test-split entry as readable text.

    Both fields are stored as token sequences in `partition['test'][ID]`;
    each token is mapped back through `token2word` and joined with spaces.
    """
    entry = partition['test'][ID]
    claim_tokens = entry['claim']
    rating_tokens = entry['rating']

    def decode(tokens):
        # Token sequence -> space-separated words.
        return ' '.join(token2word[t] for t in tokens)

    print("Claim:\n", decode(claim_tokens))
    print("Rating:\n", decode(rating_tokens))

In [13]:
# Scan the test split for confident mistakes: items labelled 0 ("Fact" in
# confmat's class names) that the model scores above 0.9.
# The per-item values use their own names (`label`, `score`) so this loop does
# not overwrite the `y_true` / `y_pred` arrays produced by preds() earlier.
for i in range(len(gen_test_id)):
    ID, data = gen_test_id[i]
    label = float(data[1])
    score = float(rnn_single.predict(data[0]))
    # label == 0 with score > 0.9 is always a misclassification (the score
    # rounds to at least 1), so no separate round() comparison is needed.
    if label == 0 and score > 0.9:
        print("Predicted:", score)
        print("Actual:", label)
        print("ID:", ID)
        printExample(ID[0])
        print()

Predicted: 0.9906553626060486
Actual: 0.0
ID: ['8907eae5-7ef2-4c1a-7507-16da7d70358b']
Claim:
 says department of justice does not disclose how many americans are on terrorist watch lists
Rating:
 false

Predicted: 0.9393472075462341
Actual: 0.0
ID: ['8907eae5-7ef2-4c1a-7507-16da7d70064f']
Claim:
 abigail spanberger would vote with pelosi for a 32 trillion government takeover of health care nearly doubling the debt
Rating:
 false

Predicted: 0.9880837798118591
Actual: 0.0
ID: ['8907eae5-7ef2-4c1a-7507-16da7d6f88ce']
Claim:
 chuck schumer democrat senate minority leader accused of raping his daughter s 16 year old friend friend then 'committed suicide '
Rating:
 false

Predicted: 0.9999980926513672
Actual: 0.0
ID: ['8907eae5-7ef2-4c1a-7507-16da7d6fdd0f']
Claim:
 on navigator those that according dimaio must accompany the unemployed to look for work had first to be 10 000 then 6000 now it is 3000
Rating:
 5

Predicted: 0.9999880790710449
Actual: 0.0
ID: ['8907eae5-7ef2-4c1a-7507-16da7d70

---

# Use the model to predict unrated reviews

In [14]:
from asterixdb.asterixdb import AsterixConnection
import json

# Connection to a local AsterixDB instance; every query below goes through it.
con = AsterixConnection(server='http://localhost', port=19002)    

Gather all pre-rated reviews:

In [15]:
# Reload the partition so this section starts from the pickled state.
with open('../data/partition.pickle', 'rb') as f:
    partition = pickle.load(f)

# Merge every split except 'predict' into one ID -> item lookup.
# The merged dict is built separately and attached afterwards, and 'rated'
# itself is skipped, so the loop never merges the result into itself.
rated = dict()
for split_name, split_items in partition.items():
    if split_name not in ('predict', 'rated'):
        rated.update(split_items)
partition['rated'] = rated
len(partition['rated'])

25477

Create generator for unrated reviews to be predicted:

In [16]:
# Generator over the 'predict' split with the same settings as the test
# generators; return_id=True makes each item carry its review ID, which the
# export loop needs to look the review up in AsterixDB.
gen_unrated = DataGenerator(partition,
                           mode='predict',
                           all_text=True,
                           train_oov=False,
                           batch_size=1,
                           return_id=True)
len(gen_unrated)

34597

Helper function to predict item in generator using model, and format into json-friendly dict:

In [17]:
def predict(model, generator, item):
    """
    Score one generator item and return a JSON-friendly result.

    The model input is taken from `generator[item][1][0]`, and the score is
    element [0][0] of `model.predict` on that input.

    Returns
    -------
    dict
        {"isFake": bool, "score": float}, where `isFake` is the score rounded
        to the nearest integer and cast to bool.
    """
    model_input = generator[item][1][0]
    score = model.predict(model_input)[0][0]

    return {
        "isFake": bool(round(score)),
        "score": float(score),
    }

Gather all reviews with boolean isFake variable, whether rated or unrated:

In [18]:
# Accumulator for all exported review records; both loops below append to it.
# NOTE(review): re-running either loop without re-running this cell first
# appends duplicate records.
reviews = []

In [22]:
# NOTE(review): this displays the entire merged dict (thousands of entries);
# consider inspecting a single entry instead to keep the notebook output small.
partition['rated']

{'8907eae5-7ef2-4c1a-7507-16da7d6f68a0': {'claim': array([   28,     8,  4816,  1758,     6,     1,  3260,    66,   215,
            32,  6072,  1568,   220,     5, 10395,  8775,    79,    39,
           257,     4,   458,   176,  1985,     6,     1,  3221,   106,
            24,  1293,     9,   947,     1,   228,  2133,     4,  2694,
           176,   172,     6,   176,   285,    39,   130,  3251,     5,
           166,   112, 13014,    18,   656,   106,    24,  1293,    94,
             2,   311,     3,  1089,  4127,   105,    41,    19,   111,
           277,    41,    19,   111,   106,    24,  9741]),
  'claim_inv': array([   28,     8,  4816,  1758,     6,     1,  3260,    66,   215,
            32,  6072,  1568,   220,     5, 10395,  8775,    79,    39,
           257,     4,   458,   176,  1985,     6,     1,  3221,   106,
            24,  1293,     9,   947,     1,   228,  2133,     4,  2694,
           176,   172,     6,   176,   285,    39,   130,  3251,     5,
           166

In [23]:
# Fetch each already-rated review from AsterixDB by its uid, attach the known
# isFake label from the partition, and queue the record for export.
for uid, rated_item in partition['rated'].items():
    lookup = '''
        USE FactMap;

        SELECT r.*
        FROM reviews r
        WHERE r.uid = uuid('{0}');
        '''.format(uid)
    review = con.query(lookup).results[0]
    review['reviewRating']['isFake'] = rated_item['isFake']

    reviews.append(review)

In [208]:
# Fetch each unrated review from AsterixDB by its uid, add the model's
# isFake / score prediction to its reviewRating, and queue it for export.
for idx in range(len(gen_unrated)):
    uid = gen_unrated[idx][0][0]
    lookup = '''
        USE FactMap;

        SELECT r.*
        FROM reviews r
        WHERE r.uid = uuid('{}');
        '''.format(uid)
    review = con.query(lookup).results[0]

    prediction = predict(model=rnn_single, generator=gen_unrated, item=idx)
    review['reviewRating'].update(prediction)

    reviews.append(review)

Output all reviews to json:

In [212]:
# NOTE(review): absolute, machine-specific path; the LOAD DATASET statement in
# the next cell reads from the same location, so the two must stay in sync.
path = "/Users/anders1991/Github/FactMap/Data/claimreviews/claims_rated.json"

# Write one JSON document per line, without a trailing newline.
with open(path, 'w') as out_file:
    json_lines = map(json.dumps, reviews)
    out_file.write('\n'.join(json_lines))

Replace the reviews dataset in AsterixDB with the rated reviews by dropping and bulk-reloading it. This looks non-optimal, but updating each rating individually is actually slower due to I/O overhead:

In [229]:
# Rebuild the `reviews` dataset from the exported JSON file.
# The three join datasets and PostReviewType are dropped before `reviews` and
# ReviewType (presumably because they depend on them — TODO confirm); the
# join cells below recreate them.
# NOTE(review): the LOAD path is absolute and must match the file written by
# the export cell.
response = con.query('''
    USE FactMap;
    
    SET `compiler.joinmemory` "128MB";
    
    DROP DATASET urljoin IF EXISTS;
    DROP DATASET fuzzyurljoin IF EXISTS;
    DROP DATASET facturljoin IF EXISTS;
    DROP TYPE PostReviewType IF EXISTS;
    
    DROP DATASET reviews IF EXISTS;
    DROP TYPE ReviewType IF EXISTS;
    
    CREATE TYPE ReviewType as {
        uid: string
    };
    CREATE DATASET reviews(ReviewType)
        PRIMARY KEY uid;
                
    LOAD DATASET reviews
    USING localfs (("path"="localhost:///Users/anders1991/Github/FactMap/Data/claimreviews/claims_rated.json"),("format"="json"));
    ''')

...and rejoin datasets on URL (I warned you it would be painful! :-) ):

In [230]:
# Recreate PostReviewType (a review `r` paired with a post `p`) and fill
# `urljoin` with every post/review pair whose post URL equals the review's
# claimAuthor.claimURL.
# NOTE(review): there is no DROP DATASET for urljoin here, so this cell relies
# on the preceding reload cell having dropped it.
response = con.query('''
    USE FactMap;
    SET `compiler.joinmemory` "128MB";
    
    DROP TYPE PostReviewType IF EXISTS;
    CREATE TYPE PostReviewType as {
        r: ReviewType,
        p: SubmissionType
    };

    CREATE DATASET urljoin(PostReviewType)
        PRIMARY KEY r.uid, p.id;

    INSERT INTO urljoin
    SELECT *
    FROM posts p, reviews r
    WHERE r.claimAuthor.claimURL = p.url;
''')

Check results:

In [237]:
# Sanity check on urljoin: total joined rows and number of distinct reviews.
# The row count is computed server-side with COUNT(*), like the other check
# cells, instead of fetching every joined record just to take len().
response = con.query('''
    USE FactMap;
    
    SELECT COUNT(*) as total
    FROM urljoin u;
    ''')

print('Number of matches:', response.results[0]['total'])

response = con.query('''
    USE FactMap;

    SELECT count(distinct r.uid) as c
    FROM urljoin u
    LIMIT 1;
    ''')
print('Number of unique claims:', response.results[0]['c'])

Number of matches: 14325
Number of unique claims: 1652


Expected values are 14325 and 1652, respectively.

...and rejoin datasets with additional fuzzy matching on Twitter/Wikipedia:

In [238]:
# Build `fuzzyurljoin` as a filtered copy of `urljoin`, using three INSERTs:
#   1. wikipedia/twitter posts whose title and claim lengths differ by at most
#      20% of the shorter one: keep if word-level Jaccard similarity with the
#      claim (original or `_en` variant) exceeds 0.20;
#   2. wikipedia/twitter posts whose lengths differ by more than that: keep if
#      one text approximately contains the other (edit distance threshold of
#      half the contained text's length);
#   3. every post from any other domain, unfiltered.
# INSERTs 1 and 2 additionally require the shorter text to exceed 15 characters.
response = con.query('''
    USE FactMap;

    SET `compiler.joinmemory` "128MB";

    DROP DATASET fuzzyurljoin IF EXISTS;
    
    CREATE DATASET fuzzyurljoin(PostReviewType)
        PRIMARY KEY r.uid, p.id;
        
    INSERT INTO fuzzyurljoin
        SELECT u.*
        FROM urljoin u
        WHERE
            (similarity_jaccard(word_tokens(lower(p.title)), word_tokens(lower(r.claimReviewed))) > 0.20
            OR similarity_jaccard(word_tokens(lower(p.title)), word_tokens(lower(r.claimReviewed_en))) > 0.20)
            AND (abs(length(r.claimReviewed) - length(p.title)) <= 
                (array_min([length(r.claimReviewed), length(p.title)])) * 0.2)
            AND (array_min([length(r.claimReviewed), length(p.title)]) > 15)
            AND (contains(p.domain, "wikipedia") OR contains(p.domain, "twitter"));
    
    INSERT INTO fuzzyurljoin
    SELECT u.*
    FROM urljoin u
    WHERE
            (
                (edit_distance_contains(lower(p.title), lower(r.claimReviewed), length(r.claimReviewed) * 0.5)[0] 
                    OR edit_distance_contains(lower(r.claimReviewed), lower(p.title), length(p.title) * 0.5)[0])
                OR
                (edit_distance_contains(lower(p.title), lower(r.claimReviewed_en), length(r.claimReviewed_en) * 0.5)[0] 
                    OR edit_distance_contains(lower(r.claimReviewed_en), lower(p.title), length(p.title) * 0.5)[0])
            )
            AND (abs(length(r.claimReviewed) - length(p.title)) > 
                (array_min([length(r.claimReviewed), length(p.title)])) * 0.2)
            AND (array_min([length(r.claimReviewed), length(p.title)]) > 15)
            AND (contains(p.domain, "wikipedia") OR contains(p.domain, "twitter"));
    
    INSERT INTO fuzzyurljoin
    SELECT u.*
    FROM urljoin u
    WHERE NOT (contains(p.domain, "wikipedia") OR contains(p.domain, "twitter"));
    ''')

Check results:

In [239]:
# Sanity check on fuzzyurljoin: total rows, then number of distinct review uids.
response = con.query('''
    USE FactMap;
    
    SELECT COUNT(*) as total
    FROM fuzzyurljoin f;
    ''')

print('Number of matches:', response.results[0]['total'])

response = con.query('''
    USE FactMap;

    SELECT COUNT(DISTINCT r.uid) unique_claims
    FROM fuzzyurljoin f;
    ''')

print('Number of unique claims', response.results[0]['unique_claims'])

Number of matches: 8202
Number of unique claims 1378


Expected values are 8202 and 1378, respectively.

Finally, we rejoin on the review URLs:

In [240]:
# Fill `facturljoin` with every post/review pair whose post URL equals the
# review's own URL (reviewUrl).
# The dataset is dropped first, as in the fuzzyurljoin cell, so this cell can
# be re-run on its own without failing on an already existing dataset.
response = con.query('''
    USE FactMap;
    SET `compiler.joinmemory` "128MB";
    
    DROP DATASET facturljoin IF EXISTS;
    CREATE DATASET facturljoin(PostReviewType)
        PRIMARY KEY r.uid, p.id;

    INSERT INTO facturljoin
    SELECT *
    FROM posts p, reviews r
    WHERE r.reviewUrl = p.url;
''')

Check results:

In [241]:
# Sanity check on facturljoin: total rows, then number of distinct review uids.
response = con.query('''
    USE FactMap;
    
    SELECT COUNT(*) as total
    FROM facturljoin f;
    ''')

print('Number of matches:', response.results[0]['total'])

response = con.query('''
    USE FactMap;

    SELECT COUNT(DISTINCT r.uid) unique_claims
    FROM facturljoin f;
    ''')

print('Number of unique reviews', response.results[0]['unique_claims'])

Number of matches: 19224
Number of unique reviews 8636


Expected values are 19224 and 8636, respectively.

---