# Putting It All Together

Finally, we're going to fit all of our code together! Most of this will be code that you wrote. We're going to glue it all together and provide you with a few final pieces --- a function to train a classifier, a scorer. Ask lots of questions, and be really proud of how much you've done!

In [96]:
import nltk
import numpy as np
from nltk.corpus import stopwords
from nltk.corpus import wordnet
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
import csv
import os
import re
from nltk.stem import WordNetLemmatizer
from math import log
from gensim.models.keyedvectors import KeyedVectors
import xgboost as xgb
from sklearn.linear_model import LogisticRegression

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

In [97]:
train_body_path = "train_bodies.csv"
if not os.path.exists(train_body_path):
    print("Check location for train_bodies")
test_body_path = "competition_test_bodies.csv"
if not os.path.exists(test_body_path):
    print("Check location for test_bodies")
train_stance_path = "train_stances.csv"
if not os.path.exists(train_stance_path):
    print("Check location for train_stances")
test_headline_path = "competition_test_stances.csv"
if not os.path.exists(test_headline_path):
    print("Check location for test_stances_unlabeled")

In [None]:
word2vec = {}

def initialize():
    global word2vec
    if len(word2vec) == 0:
        print('loading word2vec...')
        word_vectors = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)
        for word in word_vectors.vocab:
            word2vec[lemmatizer.lemmatize(clean(word))] = word_vectors[word]
    print('word2vec loaded')

In [None]:
global word2vec
initialize()

## What we had before

Here's our cleaning and loading code! The only change we've made is adding a function for putting test data into a list of tuples, much like our training stances. You don't have to do anything anywhere, but read through it! Be impressed by what you've done :)

In [98]:
# Here's the code we've written so far!
# You don't have to do anything, but read it over and make sure you remember what each function is doing

def clean(s):
    return " ".join(re.findall(r'\w+', s, flags=re.UNICODE)).lower()

def w_tokenize(s):
    return nltk.word_tokenize(s)

def s_tokenize(p):
    return nltk.sent_tokenize(p)

def lemmatize(word_tokens):
    return [lemmatizer.lemmatize(t) for t in word_tokens]

def remove_stopwords(word_tokens):
    return [w for w in word_tokens if not w in stop_words]

def w_super_clean(s):
    return remove_stopwords(lemmatize(w_tokenize(clean(s))))

def s_super_clean(p):
    sentences = s_tokenize(p)
    clean_sentences = []
    for s in sentences:
        clean_sentences.append(" ".join(remove_stopwords(lemmatize(w_tokenize(clean(s))))))
    return clean_sentences

In [None]:
# Here's our load body function from before
# Again, you don't need to do anything, but read through and ask if any lines confuse you
def load_body(filename):
    id2body = {} 
    id2body_sentences = {} 
    
    # These lines open the file and read in each row
    with open(filename, encoding='utf-8', errors='ignore') as fh:
        reader = csv.DictReader(fh)
        data = list(reader)
        for row in data:
            
            # This line gets the Body ID for this row
            id = row['Body ID']
            # This line gets the article body
            body = str(row['articleBody'])
            # This line strips leading and trailing spaces from the body
            body = body.strip()
            
            # Cleaning words and sentences
            body_words = w_super_clean(body) 
            body_sentences = s_super_clean(body)
            
            # Adding to the two dictionaries
            id2body[id] = body_words
            id2body_sentences[id] = body_sentences
    
    return id2body, id2body_sentences

# Load in headlines - body pairs and their stances for training
def load_stance(filename):
    stances = []
    
    with open(filename, encoding='utf-8', errors='ignore') as fh:
        reader = csv.DictReader(fh)
        data = list(reader)
        for row in data:
            clean_title = w_super_clean(row['Headline'])
            stances.append((clean_title, row['Body ID'], row['Stance'].strip()))
    return stances

# Load in headlines - body pairs without stances for testing
def load_test(filename):
    test = []
    
    with open(filename, encoding='utf-8', errors='ignore') as fh:
        reader = csv.DictReader(fh)
        data = list(reader)
        for row in data:
            clean_title = w_super_clean(row['Headline'])
            test.append((clean_title, row['Body ID']))
    return test

In [99]:
# Our counting dictionary function
def dictionary_count(count_items, count_dictionary):
    for item in count_items:        
        if item in count_dictionary:
            count_dictionary[item] += 1
        else: 
            count_dictionary[item] = 1
    
    return count_dictionary

# Our duplicate-eliminating function
def elim_dupes(items):
    new_list = []
    for item in items:
        if item not in new_list:
            new_list.append(item)
    return new_list

# Our function for making the idf!
def prepare_idf(corpus):
    docs_containing = {}
    idf = {}
    for (body_id, body) in corpus.items():
        no_dupes = elim_dupes(body)
        docs_containing = dictionary_count(no_dupes, docs_containing)
    for word in docs_containing:
        idf[word] = log(len(corpus) / docs_containing[word])
    return idf

In [100]:
# We have to run this once so we can test our word overlap function
id2body, id2body_sentences = load_body(train_body_path) 
idf = prepare_idf(id2body)
train_stances = load_stance(train_stance_path)

## Calculating overlap

Here's the function we wrote Monday to calculate word overlap. 

In [105]:
# This function returns an array representing the overlap between title and body
def get_word_overlaps(title, body, trim_len=None):
    words_in_body = {}
    words_in_title = {}
    
    #TODO: use dictionary_count to count words in body
    words_in_body = dictionary_count(body[:trim_len], words_in_body)
    #TODO: use dictionary_count to count words in title
    words_in_title = dictionary_count(title, words_in_title)

    maximum_scaled = 0.0
    # TODO: get the maximum word overlap count by taking the length of the title
    maximum_count = len(title)
    
    for (word, title_word_count) in words_in_title.items():
        # TODO: calculate maximum possible scaled overlap by multiplying the count of each word times its idf
        if word in idf:
            maximum_scaled += title_word_count * idf[word]
        else: 
            maximum_scaled += title_word_count

    overlap_scaled = 0
    overlap_count = 0
    
    for (word, title_word_count) in words_in_title.items():
        # TODO: check if this word is in the body
        if word in words_in_body:
            # TODO: get the number of overlaps by finding the minumum between how many times it appears in the title and the body
            tf = min(title_word_count, words_in_body[word])
            
            # TODO: add the number of overlaps to the overlap count
            overlap_count += tf
            # TODO: scale the number of overlaps by the idf of the word and add it to the scaled overlap count
            if word in idf:
                overlap_scaled += tf * idf[word]
            else:
                overlap_scaled += tf

    # TODO: divide the scaled overlap by the maximum possible scaled overlap
    scaled_over_max = overlap_scaled / maximum_scaled
    # TODO: divide the overlap count by the maximum possible overlap count
    count_over_max = overlap_count / maximum_count
    
    # TODO: return a vector of overlap_count, count_over_max, overlap_scaled, and scaled_over_max
    return [overlap_count, count_over_max, overlap_scaled, scaled_over_max]

## Semantic Similarity

And here's a function to calculate semantic similarity. Do you remember this function from yesterday? Do you remember what each line does? What does the output look like?

In [106]:
def sentence2vector(sentence, word2vec):
    vector = np.array([0.0] * 300)
    count = 0
    for word in sentence:
        if word in word2vec:
            vector += word2vec[word]
            count += 1
    if count > 0:
        vector /= count
        vector /= np.linalg.norm(vector)
    return vector

def get_semantic_similarities(title, body_sentences):
    title_vector = sentence2vector(title, word2vec)
    max_sim = -1
    best_vector = np.array([0.0] * 300)

    supports = []
    for sub_body in body_sentences:
        sub_body_vector = sentence2vector(sub_body, word2vec)
        similarity = 0
        for i in range(300):
            similarity += title_vector[i] * sub_body_vector[i]
        if similarity > max_sim:
            max_sim = similarity
            best_vector = sub_body_vector

        supports.append(similarity)

    features = [max(supports), min(supports)]

    for v in best_vector:
        features.append(v)
    for v in title_vector:
        features.append(v)
    return features

Here's an updated version of extract_features. We have get_word_overlap like from before, but we're calling it on a trimmed article version as well (like in the challenge part from Monday) and incorporating semantic similarity. Take a peak at the output of the test code beneath.

In [107]:
def extract_features(title, body, body_sentences):
    # TODO: make an empty list to hold the features
    features = []
    
    # TODO: get the word overlap and append it to the feature list
    features += get_word_overlaps(title, body)
    features += get_word_overlaps(title, body, len(title)*4)
    
    features += get_semantic_similarities(title, body_sentences)
    
    # TODO: return the feature vector
    return features

Let's test out extract features! We'll pair headlines and bodies from the first examples. Does the feature vector look right to you? Why is it so long? Take a look at get_semantic_similarities and try to figure it out.

In [108]:
mini_stances = train_stances[:1]
for (title, body_id, stance) in mini_stances:
        body = id2body[body_id]
        features = extract_features(title, body, idf)
        print(title)
        print(body[:25])
        print(features)
        print("------------------------------------------------------")

['police', 'find', 'mass', 'graf', 'least', '15', 'body', 'near', 'mexico', 'town', '43', 'student', 'disappeared', 'police', 'clash']
['danny', 'boyle', 'directing', 'untitled', 'film', 'seth', 'rogen', 'eyed', 'play', 'apple', 'co', 'founder', 'steve', 'wozniak', 'sony', 'steve', 'job', 'biopic', 'danny', 'boyle', 'directing', 'untitled', 'film', 'based', 'walter']
[0, 0.0, 0, 0.0, 0, 0.0, 0, 0.0, 0.65824749307095565, 0.0, 0.021088778935321763, -0.013503400192135808, 0.011792165074367176, 0.0083971184785377427, 0.0051117664415396315, -0.0048224470426220184, -0.079734232450517598, 0.02734136878870715, -0.035052896274549127, 0.054551104106816711, -0.046932816995981362, 0.025246202715028889, -0.073890254828778881, 0.047495001609991892, 0.02221314816334282, 0.005418911206267335, -0.0024461886619384933, 0.0010914608603716594, -0.083609192741234073, -0.065591861453189335, -0.076544863152496881, -0.046455645665065104, 0.049812299165303581, -0.006296467676917915, -0.062317478872074353, -0.08

## Training a classifier

Here, we've written a function to train a classifier for you. We're going to use the xgboost algorithm, which combines different decision trees using a technique known as gradient boosting. This algorithm is very powerful and flexible, which is why we've chosen it! Take a look at some of the parameters.

In [109]:
def train_relatedness_classifier(trainX, trainY):
    xg_train = xgb.DMatrix(trainX, label=trainY)
    # setup parameters for xgboost
    param = {}
    # use softmax multi-class classification
    param['objective'] = 'binary:logistic'
    # scale weight of positive examples
    param['eta'] = 0.1
    param['max_depth'] = 6
    param['silent'] = 1
    param['nthread'] = 20

    num_round = 1000
    relatedness_classifier = xgb.train(param, xg_train, num_round);

    return relatedness_classifier


## Writing the final function

Look familiar? Here's the final form of the function we had you fill in the skeleton for before. There are a few changes, a few tasks we didn't anticipate. Notice how we break the training up into several parts, actually developing a new classifier for each. We're going to train on the first 1000 examples of each, in the interest of time. If we trained on everything, how might that impact our accuracy? 

There's nothing here for you to do, but you should understand what each line is doing. Read through the code and stick your hand up whenever you're confused.

In [110]:
def make_predictions():
    
    # Load and clean the training body using your function
    id2body, id2body_sentences = load_body(train_body_path)
    # Load and clean the test body using your function
    test_id2body, test_id2body_sentences = load_body(test_body_path)
    # We're going to merge them together --- can you figure out why?
    id2body.update(test_id2body)
    id2body_sentences.update(test_id2body_sentences)
    
    # Prepare the idf
    idf = prepare_idf(id2body)
    
    # Load and clean the headline-article-stance sets for training using the load_stances function
    train_stances = load_stance(train_stance_path)
    
    # Load and clean the headline-article-stance sets for testing using the load_test function
    test = load_test(test_headline_path)

    # Here, we're going to create the training set for agree / disagree
    # First step, extract features from each example in training set and add that vector to train_x
    train_x = []
    for (title, body_id, stance) in train_stances[:1000]:
        body = id2body[body_id]
        body_sentences = id2body_sentences[body_id]
        train_x.append(extract_features(title, body, body_sentences))
    
    # Next step: make train_y a binary label vector that describes whether an example is unrelated
    train_y = []
    for (clean_title, body_id, stance) in train_stances[:1000]:
        train_y.append(int(stance == 'unrelated'))
        
    # And now we train our unrelated-related classifier!
    relatedness_classifier = train_relatedness_classifier(train_x, train_y)

    # Now, we're creating a training set from all of the related data for training our other three categories
    related_train_x = []
    for i in range(len(train_x)): 
        # Check if the label in train_y was "related"
        if train_y[i] == 0:
            related_train_x.append(train_x[i])
    
    # Now, we're going to create a label vector for the discuss label
    related_train_y = []
    for i in range(len(train_x)):
        if train_y[i] == 0:
            related_train_y.append(int(train_stances[i][2] == 'discuss'))
            
    # Aaaand train our classifier
    discuss_classifier = train_relatedness_classifier(related_train_x, related_train_y)

    # Last one! Here, we're going to create a dataset of agree/disagree pairs
    agree_train_x = []
    for i in range(len(train_x)):
        # Check if stance is agree or disagree
        if train_stances[i][2] == 'agree' or train_stances[i][2] == 'disagree':
            agree_train_x.append(train_x[i]) 
    
    # And here, we make a label vector for agree, from the agree/disagree pairs we've seen
    agree_train_y = []
    for i in range(len(train_x)):
        if train_stances[i][2] == 'agree' or train_stances[i][2] == 'disagree':
            agree_train_y.append(int(train_stances[i][2] == 'agree'))
    
    # And train our agree/disagree classifier
    agree_classifier = train_relatedness_classifier(agree_train_x, agree_train_y)

    # Now, let's create our test set, just like we created our training set
    test_x = []
    for (title, body_id) in test[:1000]:
        body = id2body[body_id]
        body_sentences = id2body_sentences[body_id]
        test_x.append(extract_features(title, body, body_sentences))

    # Don't worry about this --- it's creating a matrix for the classifier    
    xg_test = xgb.DMatrix(test_x)
    # Now, we run each classifier on the full test set
    relatedness_pred = relatedness_classifier.predict(xg_test);
    discuss_pred = discuss_classifier.predict(xg_test)
    agree_pred = agree_classifier.predict(xg_test)

    ret, scores = [], []
    # We're going to loop through the three predictions for each example together
    for (pred_relate, pred_discuss, pred_agree) in zip(relatedness_pred, discuss_pred, agree_pred):
        scores.append((pred_relate, pred_discuss, pred_agree))
        # Now we pick out a prediction! For each one, we're going to choose it as the prediction if the classifier
        # Predicted over .5 probability that this is the correct label. If none of them are, we predict disagree.
        # Do you have an idea for a better method? How might we improve this solution?
        if pred_relate >= 0.5:
            ret.append('unrelated')
        elif pred_discuss >= 0.5:
            ret.append('discuss')
        elif pred_agree >= 0.5:
            ret.append('agree')
        else:
            ret.append('disagree')
    return ret, scores
    

In [111]:
# And we're done! Let's test it out. This should run without causing any errors --- then there's a good chance it's all working.
pred, scores = make_predictions()

## Scoring

And finally, we're going to score our attempt. We've imported a scoring function for you, since it would be pretty time-consuming to write it yourself. Definitely take a look, but this code isn't as important to understand. Let's see how we did!

In [112]:
from __future__ import division
import csv
import sys
import pandas as pd

FIELDNAMES = ['Headline', 'Body ID', 'Stance']
LABELS = ['agree', 'disagree', 'discuss', 'unrelated']
RELATED = LABELS[0:3]

USAGE = """
FakeNewsChallenge FNC-1 scorer - version 1.0
Usage: python scorer.py gold_labels test_labels

  gold_labels - CSV file with reference GOLD stance labels
  test_labels - CSV file with predicted stance labels

The scorer will provide three scores: MAX, NULL, and TEST
  MAX  - the best possible score (100% accuracy)
  NULL - score as if all predicted stances were unrelated
  TEST - score based on the provided predictions
"""

ERROR_MISMATCH = """
ERROR: Entry mismatch at line {}
 [expected] Headline: {} // Body ID: {}
 [got] Headline: {} // Body ID: {}
"""

SCORE_REPORT = """
MAX  - the best possible score (100% accuracy)
NULL - score as if all predicted stances were unrelated
TEST - score based on the provided predictions

||    MAX    ||    NULL   ||    TEST   ||\n||{:^11}||{:^11}||{:^11}||
"""

def score_submission(gold_labels, test_labels):
    score = 0.0
    cm = [[0, 0, 0, 0],
          [0, 0, 0, 0],
          [0, 0, 0, 0],
          [0, 0, 0, 0]]

    for i, (g, t) in enumerate(zip(gold_labels, test_labels)):
        g_stance, t_stance = g['Stance'], t
        if g_stance == t_stance:
            score += 0.25
            if g_stance != 'unrelated':
                score += 0.50
        if g_stance in RELATED and t_stance in RELATED:
            score += 0.25

        cm[LABELS.index(g_stance)][LABELS.index(t_stance)] += 1

    return score, cm


def score_defaults(gold_labels):
    """
    Compute the "all false" baseline (all labels as unrelated) and the max
    possible score
    :param gold_labels: list containing the true labels
    :return: (null_score, best_score)
    """
    unrelated = [g for g in gold_labels if g['Stance'] == 'unrelated']
    null_score = 0.25 * len(unrelated)
    max_score = null_score + (len(gold_labels) - len(unrelated))
    return null_score, max_score


def load_dataset(filename):
    data = None
    with open(filename, encoding="utf-8", errors="ignore") as fh:
        reader = csv.DictReader(fh)
        print(reader.fieldnames)
        if reader.fieldnames != FIELDNAMES:
            error = 'ERROR: Incorrect headers in: {}'.format(filename)
            raise FNCException(error)
        else:
            data = list(reader)
    return data

def print_confusion_matrix(cm):
    lines = ['CONFUSION MATRIX:']
    header = "|{:^11}|{:^11}|{:^11}|{:^11}|{:^11}|".format('', *LABELS)
    line_len = len(header)
    lines.append("-"*line_len)
    lines.append(header)
    lines.append("-"*line_len)

    hit = 0
    total = 0
    for i, row in enumerate(cm):
        hit += row[i]
        total += sum(row)
        lines.append("|{:^11}|{:^11}|{:^11}|{:^11}|{:^11}|".format(LABELS[i],
                                                                   *row))
        lines.append("-"*line_len)
    lines.append("ACCURACY: {:.3f}".format(hit / total))
    print('\n'.join(lines))



gold_labels = load_dataset(test_headline_path)
#test_labels = load_dataset(test_filename)

test_score, cm = score_submission(gold_labels, pred)
null_score, max_score = score_defaults(gold_labels)
print_confusion_matrix(cm)
print(SCORE_REPORT.format(max_score, null_score, test_score))

['Headline', 'Body ID', 'Stance']
CONFUSION MATRIX:
-------------------------------------------------------------
|           |   agree   | disagree  |  discuss  | unrelated |
-------------------------------------------------------------
|   agree   |    21     |     0     |    36     |     4     |
-------------------------------------------------------------
| disagree  |     5     |     0     |    15     |     8     |
-------------------------------------------------------------
|  discuss  |    40     |     0     |    118    |    14     |
-------------------------------------------------------------
| unrelated |     5     |     0     |     5     |    729    |
-------------------------------------------------------------
ACCURACY: 0.868

MAX  - the best possible score (100% accuracy)
NULL - score as if all predicted stances were unrelated
TEST - score based on the provided predictions

||    MAX    ||    NULL   ||    TEST   ||
|| 11651.25  ||  4587.25  ||  345.25   ||



## THE END

Great work, you guys. We appreciate your hard work and patience so much. It's not easy to tackle a challenge of this size as a beginning coder, and you guys did so much great work. We couldn't be prouder :)