# 1. Fake News Challenge

http://www.fakenewschallenge.org/

https://github.com/FakeNewsChallenge/fnc-1-baseline

# 2. Technical Links

## NLTK

http://textminingonline.com/dive-into-nltk-part-iv-stemming-and-lemmatization

https://www.dataquest.io/blog/natural-language-processing-with-python/

http://www.nltk.org/book/ch03.html


## TensorFlow

https://www.tensorflow.org/tutorials/recurrent

https://www.tensorflow.org/programmers_guide/reading_data

## Sklearn

http://scikit-learn.org/stable/modules/classes.html#module-sklearn.preprocessing

## Markdown

https://github.com/adam-p/markdown-here/wiki/Markdown-Cheatsheet

## Keras

https://keras.io/getting-started/sequential-model-guide/

# 3. Papers

https://www.ijcai.org/Proceedings/16/Papers/408.pdf

https://www.overleaf.com/5276203cwvkhf#/16617343/

In [None]:
import sys
# Location of the FNC-1 baseline checkout, relative to the working directory.
FNC_PATH = "fnc-1-baseline"

# Put the baseline's utils directory on the import path so its helper
# modules can be imported and reused from this notebook.
sys.path.append(''.join([FNC_PATH, '/utils/']))

In [None]:
import pandas as pd

def read_data(path=FNC_PATH + '/fnc-1'):
    """Load the training stances and bodies and join them on ``Body ID``.

    Args:
        path: Directory containing ``train_stances.csv`` and
            ``train_bodies.csv``.

    Returns:
        A DataFrame indexed by ``Body ID``: the inner join of the bodies
        table (left) with the stances table (right).
    """
    def load_indexed(filename):
        # Read one CSV from `path` and key it by its 'Body ID' column.
        frame = pd.read_csv(path + filename)
        return frame.set_index('Body ID')

    headline_stances = load_indexed('/train_stances.csv')
    article_bodies = load_indexed('/train_bodies.csv')

    return article_bodies.merge(
        headline_stances,
        how='inner',
        left_index=True,
        right_index=True,
    )

In [None]:
from sklearn.model_selection import train_test_split

def get_data_split(ds, test_size=0.2, random_state=None):
    """Randomly split *ds* into a training part and a validation part.

    Args:
        ds: The dataset to split (anything ``train_test_split`` accepts).
        test_size: Share of *ds* placed in the validation part.
        random_state: Optional seed forwarded to ``train_test_split``. Pass
            an int to get the same split on every run; the default ``None``
            keeps the previous unseeded behaviour.

    Returns:
        A ``(train, validation)`` tuple.
    """
    train, validation = train_test_split(
        ds, test_size=test_size, random_state=random_state)
    return train, validation

In [None]:
# Load the merged dataset and hold part of it out for validation.
ds = read_data()
train, validation = get_data_split(ds)

# A parenthesised single argument is valid both as the Python 2 print
# statement and as the Python 3 print function.
print("Train examples: %d" % len(train))
# NOTE: the "Test" count reported here is the size of the validation split.
print("Test examples: %d" % len(validation))

# Last expression of the cell, so the notebook shows a preview of the rows.
train.head()

In [None]:
import nltk
import re
from sklearn import feature_extraction
from sklearn import preprocessing
import numpy
from sklearn.feature_extraction.text import TfidfVectorizer
    
# Module-level preprocessing objects shared by the helper functions in this
# cell: a label encoder, a WordNet lemmatizer and a TF-IDF vectorizer over
# lower-cased 1- to 3-grams with English stop words removed.
le = preprocessing.LabelEncoder()
wnl = nltk.WordNetLemmatizer()
vectorizer = TfidfVectorizer(
    ngram_range=(1, 3),
    lowercase=True,
    stop_words="english",
)

# Fetch the NLTK resources, in the same order as before.
for _nltk_resource in ('punkt', 'wordnet', 'averaged_perceptron_tagger'):
    nltk.download(_nltk_resource)

def dense_to_one_hot(labels_dense, num_classes):
    """Convert integer class labels to one-hot row vectors.

    Args:
        labels_dense: Array-like of integer class ids, one per example
            (for instance shape ``(n,)`` or ``(n, 1)``).
        num_classes: Number of columns in each one-hot row.

    Returns:
        A float array of shape ``(n, num_classes)`` holding a single 1 per
        row, in the column given by that row's label.

    Raises:
        ValueError: If a label lies outside ``[0, num_classes)``. Flat-offset
            indexing would otherwise set a bit in a neighbouring row without
            any error.
    """
    labels = numpy.asarray(labels_dense)
    num_labels = labels.shape[0]
    class_ids = labels.ravel()
    labels_one_hot = numpy.zeros((num_labels, num_classes))

    # Nothing to mark for an empty input; also avoids min()/max() on an
    # empty array below.
    if class_ids.size == 0:
        return labels_one_hot

    if class_ids.min() < 0 or class_ids.max() >= num_classes:
        raise ValueError(
            "labels must be in the range [0, %d)" % num_classes)

    # Row i, column class_ids[i]; explicit 2-D indexing keeps every write
    # inside its own row.
    labels_one_hot[numpy.arange(num_labels), class_ids] = 1
    return labels_one_hot

def normalize_word(w):
    """Return the lower-cased lemma of *w* from the shared WordNet lemmatizer."""
    lemma = wnl.lemmatize(w.lower())
    return lemma.lower()

def tokenize_sentenses(sentences):
    """Tokenize every sentence of a Series into a list of word tokens.

    Args:
        sentences: Series of sentences, either text strings or UTF-8
            encoded byte strings.

    Returns:
        A Series with one list of tokens per sentence.
    """
    def to_tokens(sentence):
        # Only byte strings need decoding. Text strings (Python 3 `str`,
        # Python 2 `unicode`) have no usable decode() and are passed on
        # unchanged.
        if isinstance(sentence, bytes):
            sentence = sentence.decode('utf-8')
        return nltk.word_tokenize(sentence)

    return sentences.apply(to_tokens)

def lemmatize_tokens(series):
    """Apply ``normalize_word`` to every token of every token list in *series*."""
    def normalize_all(tokens):
        return list(map(normalize_word, tokens))

    return series.apply(normalize_all)

def remove_stopwords(words):
    """Drop scikit-learn's English stop words from each token list in *words*."""
    stop_words = feature_extraction.text.ENGLISH_STOP_WORDS

    def keep_content_words(tokens):
        return [token for token in tokens if token not in stop_words]

    return words.apply(keep_content_words)

def prepare_features(dataset, fit=True):
    """Build TF-IDF headline features and one-hot stance labels for *dataset*.

    Args:
        dataset: DataFrame with ``'Headline'`` and ``'Stance'`` columns.
        fit: When True (the default) the shared ``vectorizer`` and ``le``
            are (re)fitted on *dataset*. Pass False for held-out data so the
            vocabulary and label mapping learned from the training data are
            reused instead of being replaced.

    Returns:
        A ``(dataset, features, labels)`` tuple: the unchanged input, the
        sparse TF-IDF matrix of the headlines, and the one-hot stance labels.
    """
    # Useful link: https://www.dataquest.io/blog/natural-language-processing-with-python/
    # NOTE: token, lemma, stop-word-removed and POS-tag columns were tried
    # here earlier and are currently disabled; only the TF-IDF features of
    # the headline are produced.

    # Width of the one-hot labels; presumably the four FNC-1 stance classes
    # (train_model hard-codes the same output size) - confirm if that changes.
    num_classes = 4

    # Labels must come from the dataset being processed, not from the global
    # `train` frame, otherwise any other dataset gets the wrong labels.
    stances = dataset['Stance']
    headlines = dataset['Headline']

    if fit:
        encoded = le.fit_transform(stances)
        features = vectorizer.fit_transform(headlines)
    else:
        encoded = le.transform(stances)
        features = vectorizer.transform(headlines)

    y = dense_to_one_hot(encoded, num_classes)

    return dataset, features, y

# Build the model inputs for the training split: `matrix` receives the TF-IDF
# matrix of the headlines and `train_labels` the one-hot encoded stances.
train, matrix, train_labels = prepare_features(train)

In [None]:
def train_model(x_train, y_train):
    """Train a small feed-forward classifier and return it.

    The network is two 64-unit ReLU layers, each followed by 50% dropout,
    and a 4-way softmax output. It is trained with SGD (Nesterov momentum)
    on categorical cross-entropy for 20 epochs in batches of 128.

    Args:
        x_train: Feature matrix with one row per example, either a sparse
            matrix exposing ``toarray()`` or an already dense array.
        y_train: One-hot labels, one row per example, 4 columns wide.

    Returns:
        The fitted Keras ``Sequential`` model, so the caller can evaluate it
        or predict with it.
    """
    from keras.models import Sequential
    from keras.layers import Dense, Dropout
    from keras.optimizers import SGD

    # Keras is fed a dense array here; convert only when the input is sparse
    # so dense callers work too.
    if hasattr(x_train, 'toarray'):
        x_train = x_train.toarray()
    input_size = x_train.shape[1]
    output_size = 4

    model = Sequential()
    # Dense(64) is a fully-connected layer with 64 hidden units.
    # The first layer must be told the input width, which here is the
    # number of feature columns in x_train.
    model.add(Dense(64, activation='relu', input_dim=input_size))
    model.add(Dropout(0.5))
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(output_size, activation='softmax'))

    sgd = SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True)
    model.compile(loss='categorical_crossentropy',
                  optimizer=sgd,
                  metrics=['accuracy'])

    model.fit(x_train, y_train,
              epochs=20,
              batch_size=128)

    # Return the model instead of discarding it, so it can be scored later,
    # e.g. model.evaluate(x_test, y_test, batch_size=16).
    return model
    
# Train the classifier on the training-split features and one-hot labels.
train_model(matrix, train_labels)