In [None]:
import csv
import matplotlib.pyplot as plt
import numpy as np
import random
from scipy import sparse
from scipy.spatial.distance import cosine
from scipy.stats import entropy
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm

In [None]:
# Load data from CSV
# `bodies` is indexed by Body ID. IDs need not be contiguous, so the list is grown on
# demand and any ID that never appears in the file is left as None.
bodies = []
# `with` guarantees the file is closed even if a row fails to parse
with open('data/train_bodies.csv', 'r', encoding='utf-8') as f_bodies:
    csv_bodies = csv.DictReader(f_bodies)
    for row in csv_bodies:
        body_id = int(row['Body ID'])
        if body_id >= len(bodies):
            bodies.extend([None] * (body_id + 1 - len(bodies)))
        bodies[body_id] = row['articleBody']

all_unrelated, all_discuss, all_agree, all_disagree = [], [], [], []  # each article = (headline, body, stance)

# Map each stance label to the list collecting its articles.
# Rows with any other stance value are skipped, as before.
_articles_by_stance = {
    'unrelated': all_unrelated,
    'discuss': all_discuss,
    'agree': all_agree,
    'disagree': all_disagree,
}

with open('data/train_stances.csv', 'r', encoding='utf-8') as f_stances:
    csv_stances = csv.DictReader(f_stances)
    for row in csv_stances:
        body = bodies[int(row['Body ID'])]
        bucket = _articles_by_stance.get(row['Stance'])
        if bucket is not None:
            bucket.append((row['Headline'], body, row['Stance']))

In [None]:
# Split to train and validation
# Each stance class is split 90/10 separately (first 90% -> train, rest -> validation),
# so both sets keep the same class proportions as the full data.
def _split_90_10(articles):
    """Return (first 90%, remaining 10%) of `articles`, preserving order."""
    cut = len(articles) * 9 // 10
    return articles[:cut], articles[cut:]


print('\tUnrltd\tDiscuss\t Agree\tDisagree')
print('All\t', len(all_unrelated), '\t', len(all_discuss), '\t', len(all_agree), '\t', len(all_disagree))

train_unrelated, val_unrelated = _split_90_10(all_unrelated)
train_discuss, val_discuss = _split_90_10(all_discuss)
train_agree, val_agree = _split_90_10(all_agree)
train_disagree, val_disagree = _split_90_10(all_disagree)

print('Train\t', len(train_unrelated), '\t', len(train_discuss), '\t', len(train_agree), '\t', len(train_disagree))
print('Valid.\t', len(val_unrelated), '\t', len(val_discuss), '\t', len(val_agree), '\t', len(val_disagree))

# Shuffle the Python lists BEFORE converting to NumPy arrays.
# random.shuffle swaps elements with `x[i], x[j] = x[j], x[i]`; on a 2-D array x[i] and
# x[j] are views, so the swap overwrites one row with the other, leaving duplicated rows
# and silently dropping others.
_train_articles = train_unrelated + train_discuss + train_agree + train_disagree
random.Random(0).shuffle(_train_articles)
train_all = np.array(_train_articles)  # each article = (headline, body, stance)

_val_articles = val_unrelated + val_discuss + val_agree + val_disagree
random.Random(0).shuffle(_val_articles)
val_all = np.array(_val_articles)

print('Train (Total)', train_all.shape, '\tValidation (Total)', val_all.shape)

In [None]:
# Load GloVe word vectors
# download from https://nlp.stanford.edu/projects/glove/
# also try if higher-dimension/higher-vocabulary GloVe vectors work better
glove_vectors = {}  # word -> 1-D float array
# Read as UTF-8 text so that tokens are decoded properly. The previous approach parsed
# the repr() of a bytes object, which mangles tokens containing quotes or non-ASCII
# characters. `with` also makes sure the file is closed.
with open("data/glove.6B.50d.txt", "r", encoding="utf-8") as f_glove:
    for line in tqdm(f_glove):
        # Split on single spaces only, so unusual whitespace inside a token is not
        # treated as a separator.
        parts = line.rstrip().split(' ')
        if len(parts) < 2:
            continue  # blank or malformed line: no vector to store
        glove_vectors[parts[0]] = np.array(list(map(float, parts[1:])))

In [None]:
# Build the tf-idf representation of every training document.
# Bodies are stacked first and headlines second, so row i is the body of article i and
# row i + train_all.shape[0] is the headline of the same article.
corpus = np.concatenate((train_all[:, 1], train_all[:, 0]))

# Raw term counts (English stop words removed) ...
vectoriser = CountVectorizer(stop_words='english')
bag_of_words = vectoriser.fit_transform(corpus)

# ... re-weighted to tf-idf, without idf smoothing
transformer = TfidfTransformer(smooth_idf=False)
tf_idf = transformer.fit_transform(bag_of_words)

print(tf_idf.shape)  # (2 x no of docs, no of words)

In [None]:
# Convert documents to GloVe vectors, by computing tf-idf of each word * GloVe of word / total tf-idf for document
# Only words that have a GloVe vector contribute to the weighted average.

# get_feature_names() is gone from recent scikit-learn releases; use its replacement
# when available and fall back for older installs.
if hasattr(vectoriser, 'get_feature_names_out'):
    col_to_word = vectoriser.get_feature_names_out()
else:
    col_to_word = vectoriser.get_feature_names()

# Take the embedding width from the loaded vectors instead of hard-coding 50,
# so that other GloVe files work unchanged.
glove_dim = len(next(iter(glove_vectors.values())))

doc_vectors = []
for row in tqdm(range(tf_idf.shape[0])):
    doc_row = tf_idf[row]  # 1 x vocabulary sparse row
    doc_vector = np.zeros(glove_dim)
    sum_tf_idf = 0.0
    # Walk the stored (column, weight) pairs directly; this avoids one sparse
    # element lookup per word.
    for col, weight in zip(doc_row.indices, doc_row.data):
        word = col_to_word[col]
        if word in glove_vectors:
            doc_vector += glove_vectors[word] * weight
            sum_tf_idf += weight
    # A document with no word in GloVe keeps the zero vector instead of becoming
    # NaN through 0/0.
    if sum_tf_idf > 0:
        doc_vector /= sum_tf_idf
    doc_vectors.append(doc_vector)
doc_vectors = np.array(doc_vectors)
print(doc_vectors.shape)
print(doc_vectors[0])

In [None]:
# Compute cosine similarity of GloVe vectors for all headline-body pairs
def _is_usable(vec):
    """True if `vec` has only finite entries and is not the zero vector."""
    return bool(np.isfinite(vec).all() and np.any(vec))


n_train = train_all.shape[0]
features = []  # indices correspond to train_all
for i in tqdm(range(n_train)):
    body_vec = doc_vectors[i]
    headline_vec = doc_vectors[i + n_train]  # headlines are stored after all bodies
    if _is_usable(body_vec) and _is_usable(headline_vec):
        similarity = 1.0 - cosine(body_vec, headline_vec)
    else:
        # Cosine is undefined for zero/NaN vectors (a document with no GloVe words).
        # Use 0.0 so that NaN does not propagate into the feature matrix.
        similarity = 0.0
    features.append({'headline_body_similarity': similarity})

# Preview the first few pairs (fewer if the dataset is smaller)
for i in range(min(20, n_train)):
    print(features[i]['headline_body_similarity'], train_all[i][2])  # unrelated should have lower than rest

In [None]:
# Compute the KL-Divergence of language model (LM) representations of the headline and the body
# Adds a 'kl_divergence' entry to every dict in `features` (which must already hold one
# dict per training article).

eps = 0.1  # add a small value for every common word in the LM, as KL-divergence won't work if there are 0-values
n_train = train_all.shape[0]
for i in tqdm(range(n_train)):
    # Find 1-grams (columns) that exist in either body or headline
    _, cols_body = bag_of_words[i].nonzero()
    _, cols_headline = bag_of_words[i + n_train].nonzero()
    cols_merged = np.union1d(cols_body, cols_headline)

    if cols_merged.size == 0:
        # Neither document has any vocabulary term, so there is no distribution to
        # compare; treat the pair as identical.
        features[i]['kl_divergence'] = 0.0
        continue

    # Remove all other columns. ravel() (rather than squeeze) keeps the vectors 1-D
    # even when only a single column is left.
    vec_body = bag_of_words[i, cols_merged].toarray().ravel()
    vec_headline = bag_of_words[i + n_train, cols_merged].toarray().ravel()

    # Compute a simple smoothed unigram LM. Normalising by the smoothed total gives the
    # same distribution entropy() would derive itself, but avoids dividing by zero
    # when one of the documents has no vocabulary terms at all.
    lm_body = vec_body + eps
    lm_body = lm_body / lm_body.sum()
    lm_headline = vec_headline + eps
    lm_headline = lm_headline / lm_headline.sum()

    features[i]['kl_divergence'] = entropy(lm_body, lm_headline)  # KL(body || headline)

for i in range(min(20, n_train)):
    print(features[i]['kl_divergence'], train_all[i][2])  # unrelated should have higher than rest

In [None]:
# TODO other features (use fnc-baseline's n-grams + entities (maybe polarity of sentences containing entities in body also))

In [None]:
# Initialise x and y for train dataset
# y: stance labels encoded as integers (le.classes_ maps the integers back to names)
le = LabelEncoder()
y_train = le.fit_transform(list(train_all[:, 2]))
print(y_train)

# x: one row per article, one column per feature name (DictVectorizer orders the
# columns by sorted feature name).
# toarray() yields a plain ndarray; todense() would yield np.matrix, which recent
# scikit-learn estimators refuse to accept.
vectorizer = DictVectorizer()
x_train = vectorizer.fit_transform(features).toarray()
print(x_train[:5])

In [None]:
# Plot GloVe cosine similarity vs KL-Divergence on a coloured scatter plot, one colour per label
colours = np.array(['b', 'g', 'r', 'y'])

# Look the columns up by feature name rather than assuming positions 0 and 1, so the
# plot stays correct if more features are added later.
similarity_col = vectorizer.vocabulary_['headline_body_similarity']
kl_col = vectorizer.vocabulary_['kl_divergence']

# np.asarray makes column slices plain 1-D arrays even if x_train is an np.matrix
x_dense = np.asarray(x_train)
plt.scatter(x_dense[:, similarity_col], x_dense[:, kl_col], c=colours[y_train])
plt.xlabel('Cosine Similarity of GLoVe vectors')
plt.ylabel('KL Divergence of Unigram LMs')
# Legend substitute: which colour stands for which stance
print([(colours[i], le.classes_[i]) for i in range(len(le.classes_))])
plt.show()

In [None]:
# Train a gradient boosting classifier on these features.
# TODO also try a linear/logistic regression classifier. Optimise params on validation set
clf = GradientBoostingClassifier(n_estimators=200, random_state=14128, verbose=True)
# np.asarray is a no-op for an ndarray and converts an np.matrix (which recent
# scikit-learn versions reject) into one.
clf.fit(np.asarray(x_train), y_train)

In [None]:
# TODO test performance of classifier on test set (confusion matrix, precision, recall, F-score) - use scorer
# TODO have to make all of preprocessing a function... from converting to tf-idf form onwards
# and for tf-idf, need a way to calculate tf-idf of each new document without recalculating idf... 
# use existing idf weights from the training set: stackoverflow.com/questions/45232671/obtain-tf-idf-weights-of-words-with-sklearn.


In [None]:
# TODO analyse importance of each feature

In [None]:
# Unused code, could be useful again later

# Generate language model representations of headlines and bodies (bag of words / no. of words in doc for a 1-gram LM)
# Sparse matrix division, because regular '/' would convert it to dense which will run out of memory
# Equivalent to lang_models = bag_of_words/bag_of_words.sum(axis=1)[:, 0]
# (Credit to https://stackoverflow.com/questions/42225269/scipy-sparse-matrix-division)
b = sparse.bsr_matrix(bag_of_words)
row_totals = np.asarray(b.sum(axis=1)).ravel()  # number of counted words per document
# Reciprocal of each row total; documents with no counted words get 0 instead of
# triggering a division by zero (inf on the diagonal).
inverse_totals = np.divide(
    1.0,
    row_totals,
    out=np.zeros(row_totals.shape, dtype=float),
    where=row_totals != 0,
)
c = sparse.diags(inverse_totals)
lang_models = c @ b  # scales row i of b by inverse_totals[i]

print(lang_models.sum(axis=1))  # every element should be 1 (0 for documents with no counted words)
print(lang_models.shape, bag_of_words.shape)