In [None]:
import csv
import matplotlib.pyplot as plt
import numpy as np
import random
from data.scorer import score_submission, print_confusion_matrix, score_defaults, SCORE_REPORT
from scipy import sparse
from scipy.spatial.distance import cosine  # TODO may have to implement own maths formulas
from scipy.stats import entropy  # (kl-divergence) TODO may have to implement own maths formulas
from sklearn.linear_model import LogisticRegression  # TODO implement own classifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer  # TODO implement own tokenizer + count + tf-idf
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import LabelEncoder  # TODO implement my own mapping
from tqdm import tqdm

In [None]:
# Load data from CSV
# bodies[i] holds the article body whose 'Body ID' is i; IDs missing from the file stay None.
bodies = []
# `with` guarantees the handle is closed even if a row fails to parse.
with open('data/train_bodies.csv', 'r', encoding='utf-8') as f_bodies:
    csv_bodies = csv.DictReader(f_bodies)
    for row in csv_bodies:
        body_id = int(row['Body ID'])
        if (body_id + 1) > len(bodies):
            # Grow the list so that it can be indexed directly by body_id.
            bodies += [None] * (body_id + 1 - len(bodies))
        bodies[body_id] = row['articleBody']

# Reverse lookup: body text -> Body ID. If several IDs share identical text, the highest ID wins.
body_inverse_index = {bodies[i]: i for i in range(len(bodies))}

all_unrelated, all_discuss, all_agree, all_disagree = [], [], [], []  # each article = (headline, body, stance)
# Stance label -> list collecting the articles carrying that label.
stance_buckets = {
    'unrelated': all_unrelated,
    'discuss': all_discuss,
    'agree': all_agree,
    'disagree': all_disagree,
}

with open('data/train_stances.csv', 'r', encoding='utf-8') as f_stances:
    csv_stances = csv.DictReader(f_stances)
    for row in csv_stances:
        body = bodies[int(row['Body ID'])]
        bucket = stance_buckets.get(row['Stance'])
        # Rows with any other stance label are ignored.
        if bucket is not None:
            bucket.append((row['Headline'], body, row['Stance']))

In [None]:
# Split to train and validation: per stance, the first 9/10 of the articles (in file order)
# become training data and the remainder becomes validation data.
print('\tUnrltd\tDiscuss\t Agree\tDisagree')
print('All\t', len(all_unrelated), '\t', len(all_discuss), '\t', len(all_agree), '\t', len(all_disagree))

# Cut-off index for each stance list.
cut_unrelated = len(all_unrelated) * 9 // 10
cut_discuss = len(all_discuss) * 9 // 10
cut_agree = len(all_agree) * 9 // 10
cut_disagree = len(all_disagree) * 9 // 10

train_unrelated, val_unrelated = all_unrelated[:cut_unrelated], all_unrelated[cut_unrelated:]
train_discuss, val_discuss = all_discuss[:cut_discuss], all_discuss[cut_discuss:]
train_agree, val_agree = all_agree[:cut_agree], all_agree[cut_agree:]
train_disagree, val_disagree = all_disagree[:cut_disagree], all_disagree[cut_disagree:]

print('Train\t', len(train_unrelated), '\t', len(train_discuss), '\t', len(train_agree), '\t', len(train_disagree))
print('Valid.\t', len(val_unrelated), '\t', len(val_discuss), '\t', len(val_agree), '\t', len(val_disagree))

In [None]:
# Merge the per-stance lists, shuffle them with a fixed seed, and convert them to 2-D arrays
# whose columns are (headline, body, stance).
# dtype=object stores references to the original Python strings; without it NumPy builds a
# fixed-width unicode array in which every cell is padded to the length of the longest body.
train_all = train_unrelated + train_discuss + train_agree + train_disagree  # each article = (headline, body, stance)
random.Random(0).shuffle(train_all)
train_all = np.array(train_all, dtype=object)

val_all = val_unrelated + val_discuss + val_agree + val_disagree
random.Random(0).shuffle(val_all)
val_all = np.array(val_all, dtype=object)

print('Train (Total)', train_all.shape, '\tValidation (Total)', val_all.shape)
# Per-stance counts after shuffling (train first, then validation).
for split in (train_all, val_all):
    print(np.count_nonzero(split[:, 2] == 'unrelated'), '\t',
          np.count_nonzero(split[:, 2] == 'discuss'), '\t',
          np.count_nonzero(split[:, 2] == 'agree'), '\t',
          np.count_nonzero(split[:, 2] == 'disagree'))

In [None]:
# Load GLoVe word vectors into a dict: token -> 1-D float array
# download from https://nlp.stanford.edu/projects/glove/
# also try if higher-dimension/higher-vocabulary GLoVe vectors work better
glove_vectors = {}
# Read as UTF-8 text so that tokens are real strings. Parsing the repr of a bytes token
# (b'...') mangles tokens containing an apostrophe or non-ASCII characters.
# newline="\n" keeps line splitting on '\n' only, as when iterating the file in binary mode.
with open("data/glove.6B.50d.txt", "r", encoding="utf-8", newline="\n") as f_glove:
    for line in tqdm(f_glove):
        # Each line is expected to be "<token> <v1> <v2> ..."; fields are single-space separated.
        fields = line.rstrip("\r\n").split(" ")
        if len(fields) < 2:
            continue  # blank or malformed line: nothing to store
        glove_vectors[fields[0]] = np.array([float(value) for value in fields[1:]])
# for key, value in glove_vectors.items():
#    print(key, value)
#    break

In [None]:
# Spot-check a single lookup in the loaded table (raises KeyError if the token is absent).
glove_vectors['glove']

In [None]:
# Corpus of article bodies and headlines in training dataset
# Layout: the first len(train_all) entries are the bodies (column 1), followed by
# len(train_all) headlines (column 0), in the same row order.
corpus = np.r_[train_all[:, 1], train_all[:, 0]]

In [None]:
# Convert documents in training set to tf-idf form, to learn idf weights
# The fitted vectorizer (tf_idf_vec) is reused later to transform individual documents.
tf_idf_vec = TfidfVectorizer(stop_words='english')
tf_idf = tf_idf_vec.fit_transform(corpus)
# print(tf_idf[0])
print(tf_idf.shape)  # (2 x no of docs, no of words)

In [None]:
# Invert the vectorizer vocabulary (word -> column) into a lookup of tf_idf column -> word
col_to_word = {column: word for word, column in tf_idf_vec.vocabulary_.items()}
print(list(col_to_word.items())[:5])
# idf = {word: tf_idf_vec.idf_[vocab[word]] for word in tf_idf_vec.vocabulary_}
# print(list(idf.items())[:5])

In [None]:
# Function to convert a document to GloVe vectors, by computing tf-idf of each word * GLoVe of word / total tf-idf for document
def doc_to_glove(doc):
    """Embed a document as the tf-idf-weighted mean of its words' GloVe vectors.

    Only words that have a non-zero tf-idf weight under ``tf_idf_vec`` and an
    entry in ``glove_vectors`` contribute to the mean.

    Args:
        doc: The document text (a single string).

    Returns:
        A 1-D float array with the same length as the loaded GloVe vectors
        (50 if the table is empty). If no word of the document contributes,
        the zero vector is returned instead of the NaNs a 0/0 division gives.
    """
    doc_tf_idf = tf_idf_vec.transform([doc])
    _, cols = doc_tf_idf[0].nonzero()

    # Take the embedding size from the loaded table rather than hard-coding it,
    # so that other GloVe files work without editing this function.
    sample_vector = next(iter(glove_vectors.values()), None)
    dim = 50 if sample_vector is None else len(sample_vector)

    doc_vector = np.zeros(dim)
    sum_tf_idf = 0.0
    for col in cols:
        word = col_to_word[col]
        if word in glove_vectors:
            weight = doc_tf_idf[0, col]
            doc_vector += glove_vectors[word] * weight
            sum_tf_idf += weight

    # Normalise only when something was accumulated; otherwise keep the zero vector.
    if sum_tf_idf > 0:
        doc_vector /= sum_tf_idf
    return doc_vector
# Smoke test: print the first training headline and the document vector computed for it.
print(train_all[0,0])
print(doc_to_glove(train_all[0,0]))

In [None]:
# Compute cosine similarity of GLoVe vectors for all headline-body pairs
def cosine_similarity(doc):
    """Cosine similarity between the GloVe vectors of a headline and its body.

    Args:
        doc: A sequence whose element 0 is the headline text and element 1 is
            the body text.

    Returns:
        ``1 - cosine_distance`` of the two document vectors. When either
        vector is all zeros or contains non-finite values the similarity is
        undefined, and 0.0 is returned so that no NaN reaches the classifier.
    """
    headline_vec = doc_to_glove(doc[0])
    body_vec = doc_to_glove(doc[1])

    norm_product = np.linalg.norm(headline_vec) * np.linalg.norm(body_vec)
    if not np.isfinite(norm_product) or norm_product == 0.0:
        return 0.0
    return 1.0 - cosine(headline_vec, body_vec)

# Print the similarity next to the gold stance for the first 10 training rows;
# unrelated pairs are expected to score lower than the rest.
for row_idx in range(10):
    sample = train_all[row_idx]
    print(cosine_similarity(sample), sample[2])

In [None]:
# Compute the KL-Divergence of language model (LM) representations of the headline and the body
eps = 0.1  # add a small value for every common word in the LM, as KL-divergence won't work if there are 0 values
def kl_divergence(doc):
    """KL divergence between smoothed unigram LMs of the headline and the body.

    Both texts are counted over their joint vocabulary (English stop words
    removed), ``eps`` is added to every count, and each count vector is
    normalised to sum to 1 before the divergence is taken.

    Args:
        doc: A sequence whose element 0 is the headline text and element 1 is
            the body text.

    Returns:
        ``entropy(lm_headline, lm_body)`` as a float. Returns 0.0 when neither
        text has any token left after stop-word removal, since the two empty
        models are then identical.
    """
    # Convert body and headline to bag of words representations
    vec = CountVectorizer(stop_words='english')
    try:
        counts = vec.fit_transform([doc[0], doc[1]]).toarray().astype(float)
    except ValueError:
        # CountVectorizer raises ValueError when the vocabulary is empty.
        return 0.0

    # Smooth, then normalise by the smoothed total. Dividing by the raw token count
    # is a division by zero when one side has no tokens of its own.
    smoothed = counts + eps
    lms = smoothed / smoothed.sum(axis=1, keepdims=True)
    lm_headline, lm_body = lms[0], lms[1]

    # Return KL-divergence of lm
    return entropy(lm_headline, lm_body)

# Print the KL divergence next to the gold stance for the first 20 training rows.
for i in range(20):
    # unrelated pairs are expected to have a higher divergence than the rest
    print(kl_divergence(train_all[i]), train_all[i, 2])

In [None]:
# TODO other features (use shared n-grams, shared entities, polarity of sentences containing entities in body)
# Features should return a float value between 0 and 1

def unigram_match(doc):
    """Score how often the headline's unigrams occur in the body.

    The headline's unigrams (English stop words removed) are counted in the
    body, the total is divided by the body length in characters, and the
    ratio is raised to the power ``1/e`` to spread out small values.

    Args:
        doc: A sequence whose element 0 is the headline text and element 1 is
            the body text.

    Returns:
        ``(matches / len(body)) ** (1 / e)`` as a float. Returns 0.0 when the
        body is empty or the headline has no unigram left after stop-word
        removal, since nothing can match in either case.
    """
    body = doc[1]
    if len(body) == 0:
        return 0.0  # avoids a division by zero below

    vec = CountVectorizer(stop_words='english', ngram_range=(1, 1))
    try:
        vec.fit([doc[0]])
    except ValueError:
        # CountVectorizer raises ValueError when the vocabulary is empty.
        return 0.0

    matches = vec.transform([body]).sum()
    # Power-scale the ratio so that differences between few matches weigh more
    # than the same differences between many matches.
    return np.power(matches / len(body), 1 / np.e)
    
# Print the unigram-match score next to the gold stance for the first 10 training rows;
# unrelated pairs are expected to score lower than the rest.
for row_idx in range(10):
    sample = train_all[row_idx]
    print(unigram_match(sample), sample[2])

In [None]:
def polarity_match(doc):
    """Placeholder for a planned polarity-based feature (not implemented yet).

    The function is not part of the feature list; it exists only as a stub so
    that the cell parses (a ``def`` whose body is just a comment is a syntax
    error).

    Args:
        doc: A sequence whose element 0 is the headline text and element 1 is
            the body text.

    Raises:
        NotImplementedError: Always, until the feature is written.
    """
    # TODO implement (see the TODO on polarity of sentences containing entities in body)
    raise NotImplementedError('polarity_match is not implemented yet')

In [None]:
# Define function to convert (headline, body) to feature vectors for each document
# ftrs lists the feature functions; its order fixes the column order of the feature arrays.
ftrs = [cosine_similarity, kl_divergence, unigram_match]
def to_feature_array(doc):
    """Evaluate every feature in ``ftrs`` on ``doc``.

    Args:
        doc: The (headline, body, ...) record handed unchanged to each feature.

    Returns:
        A 1-D float array holding one value per feature, in ``ftrs`` order.
    """
    return np.array([feature(doc) for feature in ftrs], dtype=float)

# Initialise x and y for train dataset
# x_train: one row per training document, one column per feature in ftrs (in that order).
x_train = np.array([to_feature_array(doc) for doc in tqdm(train_all)])
print(x_train[:5])

# Encode the stance strings (column 2 of train_all) as integer class ids; `le` is kept so
# that predictions can be mapped back to stance names with inverse_transform.
le = LabelEncoder()
y_train = le.fit_transform(list(train_all[:, 2]))
print(y_train[:5])

In [None]:
# Scatter the first two features (GLoVe cosine similarity vs KL divergence),
# coloured by encoded stance label
colours = np.array(['g', 'r', 'b', 'y'])
fig, ax = plt.subplots()
ax.scatter(x_train[:, 0], x_train[:, 1], c=colours[y_train])
ax.set_xlabel('Cosine Similarity of GLoVe vectors')
ax.set_ylabel('KL Divergence of Unigram LMs')
# Text legend: which colour stands for which stance
print([(colours[i], le.classes_[i]) for i in range(len(le.classes_))])
plt.show()

In [None]:
# Initialise x for validation dataset (same feature columns as x_train)
# The gold validation labels are read straight from val_all[:, 2] when scoring.
x_val = np.array([to_feature_array(doc) for doc in tqdm(val_all)])
print(x_val[:5])

In [None]:
# TODO implement linear/logistic regression classifier using these features. Optimise params on validation set
# Multinomial logistic regression fitted on the training features; random_state is fixed for repeatable fits.
clf = LogisticRegression(C=1e2, random_state=0, multi_class='multinomial', solver='saga')
# clf = GradientBoostingClassifier(n_estimators=200, random_state=14128, verbose=True)  # TODO temporary classifier
clf.fit(x_train, y_train)

In [None]:
# Predict y for validation set
# y_pred holds encoded class ids (same encoding as y_train), not stance strings.
y_pred = clf.predict(x_val)
print(y_pred[:5])

In [None]:
# Build the record lists expected by score_submission in scorer.py:
# one dict per validation row with 'Headline', 'Body ID' and 'Stance'.
predicted = le.inverse_transform(y_pred)
# Recover each row's Body ID from its body text through the reverse index.
body_ids = [str(body_inverse_index[body]) for body in val_all[:, 1]]
pred_for_cm = np.array([
    {'Headline': headline, 'Body ID': body_id, 'Stance': stance}
    for headline, body_id, stance in zip(val_all[:, 0], body_ids, predicted)
])
gold_for_cm = np.array([
    {'Headline': headline, 'Body ID': body_id, 'Stance': stance}
    for headline, body_id, stance in zip(val_all[:, 0], body_ids, val_all[:, 2])
])

In [None]:
# Score using scorer.py:
# score_submission yields the score of the predictions plus a confusion matrix;
# score_defaults yields the two reference scores reported next to it.
test_score, cm = score_submission(gold_for_cm, pred_for_cm)
null_score, max_score = score_defaults(gold_for_cm)
print_confusion_matrix(cm)
print(SCORE_REPORT.format(max_score, null_score, test_score))

In [None]:
# save to .csv in the correct format (see data/scorer.py and data/train_stances.csv)
pred_tosave = np.array([('Headline','Body ID','Stance')] + [(val_all[i, 0], body_ids[i], predicted[i]) for i in range(len(val_all))])
gold_tosave = np.array([('Headline','Body ID','Stance')] + [(val_all[i, 0], body_ids[i], val_all[i, 2]) for i in range(len(val_all))])

# Write through the csv module so that fields containing commas, quotes or newlines are quoted;
# writing the raw strings joined by ',' would split such headlines into extra columns.
# TODO verify with 'python scorer.py val_gold.csv val_predicted.csv' in data folder
# (an earlier export failed there with UnicodeDecodeError)
for csv_path, csv_rows in (('data/val_predicted.csv', pred_tosave), ('data/val_gold.csv', gold_tosave)):
    # newline='' lets the csv writer control line endings itself; '\n' matches the previous output.
    with open(csv_path, 'w', encoding='utf-8', newline='') as f_out:
        csv.writer(f_out, lineterminator='\n').writerows(csv_rows.tolist())

In [None]:
# TODO analyse importance of each feature

In [None]:
GradientBoostingClassifier(n_estimators=200)
ACCURACY: 0.834  ||  1604.25  ||  No Added Features (just Cosine and KL-Divergence)
ACCURACY: 0.861  ||  1743.5   ||  'Raw' Unigram Match
ACCURACY: 0.861  ||  1743.5   ||  1+log(x) Unigram Match
ACCURACY: 0.860  ||  1743.5   ||  1-1/e^(x/2) Unigram Match
ACCURACY: 0.884  ||  1834.5   ||  x/len(body) Unigram Match
ACCURACY: 0.873  ||  1790.5   ||  log(x)/log(len(body)) Unigram Match
ACCURACY: 0.884  ||  1834.25  ||  (x/len(body))^(1/e) Unigram Match
    
LogisticRegression(C=1e5)
ACCURACY: 0.880  ||  1795.5   ||  (x/len(body))^(1/e) Unigram Match

LogisticRegression(C=1e2, multi_class='multinomial', solver='saga')
ACCURACY: 0.881  ||  1818.5   ||  (x/len(body))^(1/e) Unigram Match