# 1. Fake News Challenge

http://www.fakenewschallenge.org/

https://github.com/FakeNewsChallenge/fnc-1-baseline

# 2. Technical Links

## NLTK

http://textminingonline.com/dive-into-nltk-part-iv-stemming-and-lemmatization

https://www.dataquest.io/blog/natural-language-processing-with-python/

http://www.nltk.org/book/ch03.html


## TensorFlow

https://www.tensorflow.org/tutorials/recurrent

https://www.tensorflow.org/programmers_guide/reading_data

## scikit-learn

http://scikit-learn.org/stable/modules/classes.html#module-sklearn.preprocessing

## Markdown

https://github.com/adam-p/markdown-here/wiki/Markdown-Cheatsheet

# 3. Papers

https://www.ijcai.org/Proceedings/16/Papers/408.pdf

https://www.overleaf.com/5276203cwvkhf#/16617343/

In [10]:
import sys
# Append the local checkout of the FNC-1 baseline utilities to the module
# search path so that they can be imported and reused from this notebook.
sys.path.append('fnc-1-baseline/utils/')

In [11]:
import pandas as pd

def read_data(path='fnc-1-baseline/fnc-1'):
    """Load the FNC-1 training stances and bodies and join them on ``Body ID``.

    Args:
        path: Directory containing ``train_stances.csv`` and
            ``train_bodies.csv``.

    Returns:
        A DataFrame indexed by ``Body ID`` holding the columns of the bodies
        file followed by the columns of the stances file. Only stances whose
        body is present (and vice versa) are kept, because the join is inner.
    """
    stances = pd.read_csv(path + '/train_stances.csv').set_index('Body ID')
    bodies = pd.read_csv(path + '/train_bodies.csv').set_index('Body ID')

    # Both frames are indexed by 'Body ID', so an index-on-index inner join
    # attaches the article body to every stance row that refers to it.
    return pd.merge(bodies, stances, how='inner',
                    left_index=True, right_index=True)

In [12]:
from sklearn.model_selection import train_test_split

def get_data_split(ds, test_size=0.2, random_state=None):
    """Randomly split ``ds`` into a training part and a validation part.

    Args:
        ds: The dataset to split (anything ``train_test_split`` accepts).
        test_size: Share of the data placed in the validation part.
        random_state: Optional seed forwarded to ``train_test_split``. Pass a
            fixed value to get the same split on every run; the default
            ``None`` keeps the split non-deterministic, as before.

    Returns:
        A ``(train, validation)`` tuple.
    """
    train, validation = train_test_split(
        ds, test_size=test_size, random_state=random_state)
    return train, validation

In [13]:
# Load the merged FNC-1 training data and hold part of it out for validation.
ds = read_data()
train, validation = get_data_split(ds)
# A parenthesised single-argument print produces the same output whether
# `print` is the Python 2 statement or the Python 3 function.
print("Train examples: %d" % len(train))
print("Test examples: %d" % len(validation))

train.head()

Train examples: 39977
Test examples: 9995


Unnamed: 0_level_0,articleBody,Headline,Stance
Body ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1951,An executive engineer at the Central Public Wo...,Government fires employee who skipped work for...,agree
1808,We're just two months away from the Apple Watc...,Video Messaging App Says Audio Recording Of Mi...,unrelated
2509,BEIRUT — Islamic State group fighters seized a...,Turkish president says American weapons drop f...,discuss
1210,Call it Newton’s third law of Apple analysts: ...,Web of confusion as scientists cast doubt on m...,unrelated
320,"Over the weekend, NBC anchor Lester Holt cut t...",Boko Haram 'to release abducted schoolgirls' a...,unrelated


In [21]:
import nltk
import re
from sklearn import feature_extraction
from sklearn import preprocessing
import numpy
    
# Label encoder; fitted further down on train['Stance'] to get integer ids.
le = preprocessing.LabelEncoder()
# WordNet lemmatizer shared by normalize_word().
wnl = nltk.WordNetLemmatizer()
# Label binarizer; only referenced from a commented-out line in this cell.
lb = preprocessing.LabelBinarizer()

def dense_to_one_hot(labels_dense, num_classes):
    """Convert integer class labels into one-hot row vectors.

    Args:
        labels_dense: Array-like of integer class ids in
            ``[0, num_classes)``. It is flattened before use.
        num_classes: Total number of classes (width of the output).

    Returns:
        A float ``numpy`` array of shape ``(num_labels, num_classes)`` with a
        single 1 per row, at the column given by that row's label.

    Raises:
        ValueError: If any label lies outside ``[0, num_classes)``.
    """
    labels = numpy.asarray(labels_dense).ravel()
    num_labels = labels.shape[0]
    labels_one_hot = numpy.zeros((num_labels, num_classes))
    if num_labels == 0:
        return labels_one_hot

    # Reject bad labels explicitly: with flat-offset indexing an out-of-range
    # label would silently set a bit in a neighbouring row instead.
    if labels.min() < 0 or labels.max() >= num_classes:
        raise ValueError(
            "labels must be in [0, %d), got values in [%s, %s]"
            % (num_classes, labels.min(), labels.max()))

    labels_one_hot[numpy.arange(num_labels), labels] = 1
    return labels_one_hot

def normalize_word(w):
    """Return the lower-cased WordNet lemma of the lower-cased word ``w``."""
    lowered = w.lower()
    lemma = wnl.lemmatize(lowered)
    return lemma.lower()

def tokenize_sentenses(sentences):
    """Tokenize every entry of a Series of sentences into word tokens.

    Args:
        sentences: Series of text, given either as UTF-8 encoded byte strings
            or as already-decoded text.

    Returns:
        The result of applying ``nltk.word_tokenize`` to every entry.
    """
    def _tokenize(sentence):
        # Only byte strings need decoding. Calling .decode() on text that is
        # already decoded fails on Python 3 (str has no decode) and on
        # non-ASCII unicode objects under Python 2.
        if isinstance(sentence, bytes):
            sentence = sentence.decode('utf-8')
        return nltk.word_tokenize(sentence)

    return sentences.apply(_tokenize)

def lemmatize_tokens(series):
    """Apply ``normalize_word`` to every token of every token list in ``series``."""
    def _normalize_all(tokens):
        # Build a list explicitly so the result is a list on any Python version.
        return list(map(normalize_word, tokens))

    return series.apply(_normalize_all)

def remove_stopwords(words):
    """Drop English stop words from every token list in ``words``.

    Stop words are those in ``feature_extraction.text.ENGLISH_STOP_WORDS``;
    the order of the remaining tokens is kept.
    """
    stop_words = feature_extraction.text.ENGLISH_STOP_WORDS

    def _without_stop_words(tokens):
        return [token for token in tokens if token not in stop_words]

    return words.apply(_without_stop_words)

def get_matrix(train):
    """Build a bag-of-words count matrix from the ``Headline`` column.

    Args:
        train: DataFrame with a ``Headline`` column of text.

    Returns:
        The document-term matrix produced by
        ``CountVectorizer.fit_transform`` on the headlines (lower-cased,
        English stop words removed). It is returned as produced by the
        vectorizer; callers that need a dense matrix call ``.todense()`` on
        it themselves.
    """
    from sklearn.feature_extraction.text import CountVectorizer

    vectorizer = CountVectorizer(lowercase=True, stop_words="english")

    # Previously the matrix was built and then dropped, so callers got None.
    return vectorizer.fit_transform(train['Headline'])

def prepare_features(dataset):
    """Add token-based feature columns derived from ``dataset['Headline']``.

    The following columns are written into ``dataset`` itself:

    * ``Tokens``      -- word tokens of the headline
    * ``Lemmas``      -- normalized (lower-cased, lemmatized) tokens
    * ``StopRemoved`` -- lemmas with English stop words removed
    * ``Bigrams``     -- list of consecutive lemma pairs of that headline

    Args:
        dataset: DataFrame with a ``Headline`` column.

    Returns:
        The same ``dataset`` object, with the columns above added.
    """
    # Useful link: https://www.dataquest.io/blog/natural-language-processing-with-python/
    # Tokenize the headlines of the dataset that was passed in, not of the
    # module-level `train` frame.
    dataset.loc[:, 'Tokens'] = tokenize_sentenses(dataset['Headline'])
    dataset.loc[:, 'Lemmas'] = lemmatize_tokens(dataset['Tokens'])
    dataset.loc[:, 'StopRemoved'] = remove_stopwords(dataset['Lemmas'])
    # Bigrams are built per headline; calling nltk.ngrams on the whole column
    # would pair up neighbouring rows and yield a generator, not a column.
    dataset.loc[:, 'Bigrams'] = dataset['Lemmas'].apply(
        lambda lemmas: list(nltk.ngrams(lemmas, 2)))

    return dataset

# Add the token-based feature columns to the training split.
train = prepare_features(train)
# Encode the 'Stance' strings as integer ids, then as one-hot rows over 4 classes.
train_labels = dense_to_one_hot(le.fit_transform(train['Stance']), 4)
#matrix = lb.fit_transform(train['StopRemoved'])

# Temporary solution: take the feature matrix from get_matrix() until we use
# something that maps an index to a vector (numpy.array); dense_to_one_hot
# could be reused for that.
matrix = get_matrix(train)

In [None]:
def next_batch(images, labels, batch_size):
    """Return the next ``batch_size`` examples and labels, wrapping at the end.

    The read position is kept in the module-level ``index_in_epoch``; each
    time a batch runs past the last example, ``epochs_completed`` is
    incremented and the batch is completed with examples from the start.

    Args:
        images: Indexable sequence of examples (sliced along axis 0).
        labels: Labels aligned with ``images``.
        batch_size: Number of examples requested.

    Returns:
        A ``(examples, labels)`` pair for the batch.
    """
    global index_in_epoch
    global epochs_completed

    total = len(images)
    begin = index_in_epoch

    if begin + batch_size <= total:
        # The whole batch fits into what is left of the current epoch.
        index_in_epoch = begin + batch_size
        return images[begin:index_in_epoch], labels[begin:index_in_epoch]

    # The batch crosses the epoch boundary: take what is left of this epoch...
    epochs_completed += 1
    leftover = total - begin
    tail_images = images[begin:total]
    tail_labels = labels[begin:total]

    # ...and fill the remainder from the beginning of the next one.
    index_in_epoch = batch_size - leftover
    head_images = images[0:index_in_epoch]
    head_labels = labels[0:index_in_epoch]

    batch_images = numpy.concatenate((tail_images, head_images), axis=0)
    batch_labels = numpy.concatenate((tail_labels, head_labels), axis=0)
    return batch_images, batch_labels

In [None]:
import tensorflow as tf
    
# Batch cursor state that next_batch() reads and updates through `global`:
# the current read position and the number of wrap-arounds so far.
index_in_epoch = 0
epochs_completed = 0
    
    
def train_algo(train_news, num_steps=20000, batch_size=500, learning_rate=1e-10):
    """Train a single-layer softmax classifier with gradient descent.

    Mini-batches are drawn with ``next_batch`` from ``train_news`` and the
    module-level ``train_labels``. Every 100 steps the accuracy on the
    current mini-batch is printed.

    Args:
        train_news: Dense 2-D feature matrix with one row per example; its
            second dimension defines the input width of the model.
        num_steps: Number of gradient-descent steps to run.
        batch_size: Number of examples per mini-batch.
        learning_rate: Step size of the optimizer.
            NOTE(review): the default keeps the original 1e-10, which moves
            the weights very little per step -- confirm whether a larger
            value is intended now that the loss is numerically stable.

    Returns:
        None. Progress is reported on standard output only.
    """
    # Take the model dimensions from the data actually being trained on
    # rather than from the module-level `matrix` or a hard-coded class count.
    input_size = train_news.shape[1]
    output_size = train_labels.shape[1]

    x = tf.placeholder(tf.float32, shape=[None, input_size])
    y_ = tf.placeholder(tf.float32, shape=[None, output_size])

    W = tf.Variable(tf.zeros([input_size, output_size]))
    b = tf.Variable(tf.zeros([output_size]))

    # Unnormalized class scores; these are logits, not probabilities.
    logits = tf.matmul(x, W) + b

    # Taking tf.log() of raw logits is undefined for scores <= 0 (and the
    # zero-initialized weights make every score 0). The fused op applies the
    # softmax and the log together in a numerically stable way.
    cross_entropy = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(labels=y_, logits=logits))
    # TODO: optionally add L2 regularization of the weights to the loss.

    train_step = tf.train.GradientDescentOptimizer(
        learning_rate).minimize(cross_entropy)

    correct_prediction = tf.equal(tf.argmax(logits, 1), tf.argmax(y_, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

    init = tf.global_variables_initializer()

    with tf.Session() as sess:
        sess.run(init)
        for i in range(num_steps):
            batch_xs, batch_ys = next_batch(train_news, train_labels, batch_size)
            feed = {x: batch_xs, y_: batch_ys}

            sess.run(train_step, feed_dict=feed)
            if i % 100 == 0:
                train_accuracy = sess.run(accuracy, feed_dict=feed)
                print("step %d, training accuracy %g" % (i, train_accuracy))
# Train on the dense form of the headline matrix built by get_matrix().
train_algo(matrix.todense())
