In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import random
import gensim
import math
import collections 
import os
import re

import nltk
from nltk.util import ngrams
from nltk.tokenize.stanford import StanfordTokenizer
from nltk.tokenize import casual_tokenize, word_tokenize, sent_tokenize

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk import WordNetLemmatizer

import seaborn as sns
from gensim.models.doc2vec import LabeledSentence
from collections import Counter

import sklearn
from sklearn.decomposition import LatentDirichletAllocation, NMF, TruncatedSVD
from sklearn.metrics import confusion_matrix as CFM

# Functions

## preprocess

In [None]:
def body_stance_join(body, stance):
    data = pd.merge(body, stance, on='Body ID')
    return data

In [None]:
#split the merged training data to training subset and validation set
def train_val_split(data, val_ratio=0.1, seed=3693):
    
    #get the list of stances label
    labels = list(data.Stance.value_counts().to_dict().keys())
    
    #get training subset and validation subset with different Stance label
    train = []
    val = []
    for label in labels:
        index = data[data['Stance'] == label].index.tolist()
        random.seed(3693)
        random.shuffle(index)
        val_num = len(index) // 10
        val_index = index[0:val_num]
        train_index = index[val_num:]
    
        validation_set = data.iloc[val_index]
        training_set = data.iloc[train_index]

        train.append(training_set)
        val.append(validation_set)  
    
    #concat the four subsets of training set and validation set
    train_set = pd.concat(train, axis=0)
    val_set = pd.concat(val, axis=0)
    
    return train_set, val_set

In [None]:
#funstion to merge the train and test set to better preprocess
def merge(train, test):
    data = [train, test]
    data = pd.concat(data, axis=0)
    return data

In [None]:
# function to split the train and test set
def train_test_split(merged, train, test, dtype='body'):
    m_train = train.shape[0]
    train_set = merged[:m_train]
    test_set = merged[m_train:]
    
    if dtype == 'stance':
        test_set = test_set.drop(['Stance'], axis=1)
    return train_set, test_set

In [None]:
# function to lower the bodies and the headlines
def lower(body_data, stance_data):
    body_data.articleBody = body_data.articleBody.str.lower()
    stance_data.Headline = stance_data.Headline.str.lower()
    
    return body_data, stance_data

In [None]:
def tokenize(input):
    token = re.compile("[\w]+(?=n't)|n't|'s|\'m|\'ll|[\w]+|[.?!;,:]")
    tokens = token.findall(input)
    return tokens

In [None]:
#tokenize the bodies and headlines and then remove the stop words
def tokenization_and_remove(body_data, stance_data):
    sents_body = body_data.articleBody.tolist()
    sents_headline = stance_data.Headline.tolist()
    
    result_body = []
    result_headline = []
    stop_word = set(stopwords.words('english'))
    
    for sent in sents_body:
        token_list = [w for w in tokenize(sent) if w not in stop_word]
        result_body.append(token_list)
        
    for sent in sents_headline:
        token_list = [w for w in tokenize(sent) if w not in stop_word]
        result_headline.append(token_list)
        
    body_data.articleBody = result_body
    stance_data.Headline = result_headline
    
    return body_data, stance_data

In [None]:
# stemming 
def lemmatization(body_data, stance_data):
    result_body = []
    result_headline = []
    
    stemmer = PorterStemmer()
    
    for word_list in body_data.articleBody:
        sents = []
        for word in word_list:
            word_stem = WordNetLemmatizer().lemmatize(word, pos='v')
            #word_stem = stemmer.stem(word)
            sents.append(word_stem)
        result_body.append(sents)
        
    for word_list in stance_data.Headline:
        sents = []
        for word in word_list:
            word_stem = WordNetLemmatizer().lemmatize(word, pos='v')
            #word_stem = stemmer.stem(word)
            sents.append(word_stem)
        result_headline.append(sents)
        
    body_data.articleBody = result_body
    stance_data.Headline = result_headline
    
    return body_data, stance_data

In [None]:
#combine all the process
def preprocess(train_body, test_body, train_stance, test_stance, lemma=True):
    merged_bodies = merge(train_bodies, test_bodies)
    merged_stances = merge(train_stances, test_stances)
    merged_bodies, merged_stances = lower(merged_bodies, merged_stances)
    merged_bodies, merged_stances = tokenization_and_remove(merged_bodies, merged_stances)
    if lemma == True:  
        merged_bodies, merged_stances = lemmatization(merged_bodies, merged_stances)
    
    return merged_bodies, merged_stances

In [None]:
#get the word corpus of the body and headline
def get_corpus(body_data, stance_data):
    corpus_body = []
    corpus_headline = []
    
    for word_list in body_data.articleBody:
        corpus_body += word_list
        
    for word_list in stance_data.Headline:
        corpus_headline += word_list
        
    corpus = set(corpus_body + corpus_headline)
    #add OOV to deal with unseen word in test set
    corpus.add('OOV')
    
    word = corpus
    index = range(len(word))
    
    word_to_index_dict = dict(zip(word, index))
    index_to_word_dict = dict(zip(index, word))
    corpus = list(corpus)
    
    return corpus, word_to_index_dict, index_to_word_dict

In [None]:
#transfor data (string->integer or interger->string) according to the corpus dict
def transform(corpus, data):
    result = []
    for word_list in data:
        num_list = []
        for word in word_list:
            if word not in corpus:
                #use oov token to represent the unseen word in test set
                num_list.append(corpus['OOV'])
            num_list.append(corpus[word])
        result.append(num_list)
        
    return result

## utils

In [None]:
def body_headline_match(vector_body, vector_headline, bodies, stances, distance='cos'):
    #construct Dataframe for body and headline to merge the data on Body ID
    body = pd.DataFrame(list(range(vector_body.shape[0])), columns=['body_idx'])
    body['Body ID'] = bodies['Body ID']
    headline = pd.DataFrame(list(range(vector_headline.shape[0])), columns=['headline_idx'])
    headline['Body ID'] = stances['Body ID']
    headline['Stance'] = stances['Stance']

    #merge the body and the headline
    data = body_stance_join(body, headline)

    #get body index and headline index from the vector matrix
    body_idx = data['body_idx'].tolist()
    headline_idx = data['headline_idx'].tolist()

    #get the vector
    x_body = vector_body[body_idx]
    x_headline = vector_headline[headline_idx]

    #initial the matrix 
    distance_matrix = np.zeros(x_body.shape[0])
    
    if distance == 'cos':
        #compute the cosine similarity
        for i in range(distance_matrix.shape[0]):
            distance_matrix[i] = cosine_similarity(x_body[i], x_headline[i])

        #final data that contain index of body and headline and also the pair-wise cosine similarity
        data['distance_or_similarity'] = distance_matrix
    elif distance == 'eu':
        #compute the Euclidean distance
        for i in range(distance_matrix.shape[0]):
            distance_matrix[i] = eu_distance(x_body[i], x_headline[i])

        #final data that contain index of body and headline and also the pair-wise cosine similarity
        data['distance_or_similarity'] = distance_matrix
    
    elif distance == 'KL':
        for i in range(distance_matrix.shape[0]):
            distance_matrix[i] = KL_divergance(x_body[i], x_headline[i])
        
        #final data that contain index of body and headline and also the pair-wise KL divergance
        data['distance_or_similarity'] = distance_matrix
    
    return data

## TF-IDF

In [None]:
#function to get tf_idf matrix and the idf matrix(for test set)
#when count is False, we use the frequency as the value. Otherwise, we use the count as the value
def get_tf_idf_matrix(corpus, data, count=False):
    
    m = len(data)
    n = len(corpus)
    
    count_matrix = np.zeros([m, n])
    freq_matrix = np.zeros([m, n])
    
    for i in range(m):
        document = data.iloc[i]
        count_dict = Counter(document)
        index = list(count_dict.keys())
        value = np.array(list(count_dict.values()))
        count_matrix[i][index] = value
        freq_matrix[i][index] = value / len(document)
    
    frequency =  (freq_matrix > 0).sum(axis=0)
    idf = np.log10(np.divide(m, (1+frequency)))
    idf = np.log10(m / (1 + frequency))
    
    tf_idf = np.multiply(freq_matrix, idf)

    if count == True:  
        return count_matrix, idf, tf_idf
    elif count == False:
        return freq_matrix, idf, tf_idf

## distance/similarity measurement

In [None]:
def cosine_similarity(x1, x2):
    numerator = np.dot(x1.T, x2)
    dominator = np.linalg.norm(x1) * np.linalg.norm(x2)
    
    cos_similarity = numerator / dominator
    
    return cos_similarity

In [None]:
def eu_distance(x1, x2):
    return np.linalg.norm(x1 - x2)

In [None]:
def KL_divergance(document_model, query_model):
    KL = np.sum((query_model * np.log(query_model / document_model)))
    
    return KL

## word2vec

In [None]:
#function to get weighted(idf value)average word2vec based representation of document
def weighted_average_word2vec(data, embeddings, lookup_dict, idf_matrix, vec_len=512):
    
    vec = np.zeros([data.shape[0],vec_len])

    #word that not appear in embed or not english
    OOV = voc[1]
    
    for idx,document in enumerate(data):
        m = len(document)
        
        #element in the document are all numbers
        for num in document:
            #transform number to string according to num_to_word_dict
            word = lookup_dict[num]
            idf = idf_matrix[num]
            
            if word not in embed_dict:
                vec[idx] += embeddings[OOV] * idf
            else:
                vec[idx] += embeddings[word] * idf
    
        vec[idx] /= m
    
    return vec

In [None]:
#use tf-idf value to select the most n important word in a document
def tf_idf_based_search(tf_idf_matrix, n=3):
    m = tf_idf_matrix.shape[0]
    
    result = np.zeros([m,n])
    
    for i in range(m):
        #get n index with the largest tf-idf value in descending order(the first one is the largest one)
        sort_list = tf_idf_matrix[i].argsort()[-n:][::-1]
        result[i] = sort_list
        
    return result

In [None]:
def word_mover_distance(vector_body, vector_headline, bodies, stances, index_to_word_dict, n=3):
    body = pd.DataFrame(list(range(vector_body.shape[0])), columns=['body_idx'])
    body['Body ID'] = bodies['Body ID']
    headline = pd.DataFrame(list(range(vector_headline.shape[0])), columns=['headline_idx'])
    headline['Body ID'] = stances['Body ID']
    headline['Stance'] = stances['Stance']

    #merge the body and the headline
    data = body_stance_join(body, headline)
    
    distance = np.zeros(data.shape[0])
    
    #for words that not exist in embed_dict, treat it as OOV token(non-english vector in the embedding)
    OOV = voc[1]
    
    for i in range(data.shape[0]):
        
        #initialse the accumlate_similarity 
        accumulate_distance = 0
        
        #get the index of words in a body and a headline
        body_idx = data.body_idx.iloc[i]
        headline_idx = data.headline_idx.iloc[i]
        
        #get the list of word(integer) of the specify body and headline
        body_words = vector_body[body_idx]
        headline_words = vector_headline[headline_idx]
        
        
        
        #iterate n important words in the body
        for num1 in body_words:
            word1 = index_to_word_dict[num1]
            if word1 not in embed_dict:
                vec1 = embed_dict[OOV]
            else:
                vec1 = embed_dict[word1]
            
            record = np.zeros(n)
            
            #interate n important words in the headline
            for index,num2 in enumerate(headline_words):
                word2 = index_to_word_dict[num2]
                if word2 not in embed_dict:
                    vec2 = embed_dict[OOV]
                else:
                    vec2 = embed_dict[word2]  
                
                record[index] = eu_distance(vec1, vec2)
            
            # compute pair-wise cosine similarity, and select the largest one
            accumulate_distance += record.min()
        
        #avarage
        distance[i] = accumulate_distance
        
    data['distance_or_similarity'] = distance
        
    return data

## doc2vec

In [None]:
def get_doc2vec_document(train_body, train_headline, test_body, test_headline):
    #get the whole documents required for the genism doc2vec model
    document = []

    for i in range(train_body.shape[0]):
        document.append(LabeledSentence(train_body.articleBody.iloc[i], ['train_body_' + str(i)]))

    for i in range(train_stance.shape[0]):
        document.append(LabeledSentence(train_stance.Headline.iloc[i], ['train_headline_' + str(i)]))

    for i in range(test_body.shape[0]):
            document.append(LabeledSentence(test_body.articleBody.iloc[i], ['test_body_' + str(i)]))

    for i in range(test_stance.shape[0]):
        document.append(LabeledSentence(test_stance.Headline.iloc[i], ['test_headline_' + str(i)]))
        
    return document

## language model

In [None]:
#get background probability for words in corpus
def get_background(data, tf_matrix):
    
    m_document = tf_matrix.shape[0]
    background = np.zeros_like(tf_matrix)
    
    for i in range(m_document):
        background[i] = tf_matrix[i]
    
    background = background.sum(axis=0)
    
    return background

In [None]:
def get_language_model(data, background, tf_matrix, smoothing='dirchelet', lambd=None):
    
    background_prob = background / background.sum()
    m_document = len(data)
    m_corpus = len(background)
    
    model = np.zeros([m_document, m_corpus])
    
    if smoothing == 'dirchelet':
        u = background.sum() / len(data)
        for i in range(m_document):
            N = len(data.iloc[1])
            m_word = tf_matrix[i].sum()
            model[i] = N / (u + N) * tf_matrix[i] / m_word + u / (u + N) * background_prob
    
    if smoothing == 'jelinek':
        for i in range(m_document):
            m_word = tf_matrix[i].sum()
            model[i] = lambd * tf_matrix[i] / m_word + (1-lambd) * background_prob
        
    return model

# Load and merge the data

In [None]:
#load the data
train_bodies = pd.read_csv('fnc-1-master/train_bodies.csv')
train_stances = pd.read_csv('fnc-1-master/train_stances.csv')

test_bodies = pd.read_csv('fnc-1-master/competition_test_bodies.csv')
test_stances = pd.read_csv('fnc-1-master/competition_test_stances.csv')

In [None]:
train_data = body_stance_join(train_bodies, train_stances)
test_data = body_stance_join(test_bodies, test_stances)

# Load word2vec embedding

In [None]:
%%time
#load the pre-trained word embedding (might takes few minutes)
voc = [line.rstrip('\n') for line in open('word2vec_embedding/embd_voc')]
vec = np.loadtxt('word2vec_embedding/embd_vec')

#create the embedding dictionary
embed_dict = dict(zip(voc, vec))

# Task1: Split the training set

In [None]:
train, valid = train_val_split(train_data)

In [None]:
#plot the bar to show ratio of four different classes in training set and validation set.
plt.show()
valid.Stance.value_counts().plot.bar()
plt.show()
train.Stance.value_counts().plot.bar()
plt.show()

# Task2: Extract vector representation

## Bag-of-word and tf-idf based representation(Salton's vector space)

In [None]:
%%time
#merge the train and test set for body and stance data set to better pre-process, to reduce the dimension of
#bag of word representation. I do the stemming to the text
merged_bodies, merged_stances = preprocess(train_bodies, train_stances, test_bodies, test_stances, lemma=True)

#split the train the test set so that we can construct feature from the training set
train_body, test_body = train_test_split(merged_bodies, train_bodies, test_bodies, dtype='body')
train_stance, test_stance = train_test_split(merged_stances, train_stances, test_stances, dtype='stance')

# get the corpus(mapping between index and word) for all the data set
corpus, word_to_index_dict, index_to_word_dict = get_corpus(train_body, train_stance)

#transform the document from string to integer
train_body['articleBody'] = transform(word_to_index_dict, train_body.articleBody)
train_stance['Headline'] = transform(word_to_index_dict, train_stance.Headline)

#get tf matrix, idf matrix and tf_idf matrix for body and headline
tf_body, idf_body, tf_idf_body = get_tf_idf_matrix(corpus, train_body.articleBody, count=False)
tf_headline, idf_headline, tf_idf_headline = get_tf_idf_matrix(corpus, train_stance.Headline, count=False)

In [None]:
%%time
#match the body and headline according the Body ID and compute pair-wise cosine similarity
data_method1 = body_headline_match(tf_idf_body, tf_idf_headline, train_bodies, train_stances)

In [None]:
#pair-wise cosine similarity distritbution of different stances
plt.figure(figsize=(15,10))
sns.kdeplot(data_method1[data_method1['Stance'] == 'unrelated'].distance_or_similarity, label='unrelated')
sns.kdeplot(data_method1[data_method1['Stance'] == 'discuss'].distance_or_similarity, label='discuss')
sns.kdeplot(data_method1[data_method1['Stance'] == 'agree'].distance_or_similarity, label='agree')
sns.kdeplot(data_method1[data_method1['Stance'] == 'disagree'].distance_or_similarity, label='disagree')
plt.show()

## word2vec and idf based representation

In [None]:
%%time
#merge the train and test set for body and stance data set to better pre-process
#for word2vec method, we don't have to do the stemming beacuse a words with different forms still have the likely vector representation
merged_bodies, merged_stances = preprocess(train_bodies, train_stances, test_bodies, test_stances, lemma=True)

#split the train the test set so that we can construct feature from the training set
train_body, test_body = train_test_split(merged_bodies, train_bodies, test_bodies, dtype='body')
train_stance, test_stance = train_test_split(merged_stances, train_stances, test_stances, dtype='stance')

# get the corpus(mapping between string and integer) for all the data set
corpus, word_to_index_dict, index_to_word_dict = get_corpus(train_body, train_stance)

#transform the document from string to integer
train_body['articleBody'] = transform(word_to_index_dict, train_body.articleBody)
train_stance['Headline'] = transform(word_to_index_dict, train_stance.Headline)

#get tf matrix and idf matrix for body and headline
tf_body, idf_body, tf_idf_body = get_tf_idf_matrix(corpus, train_body.articleBody, count=False)
tf_headline, idf_headline, tf_idf_headline = get_tf_idf_matrix(corpus, train_stance.Headline, count=False)

In [None]:
%%time
vec_body = weighted_average_word2vec(train_body.articleBody, embed_dict, index_to_word_dict, idf_body)
vec_headline = weighted_average_word2vec(train_stance.Headline, embed_dict, index_to_word_dict, idf_headline)

In [None]:
%%time
#match the body and headline according the Body ID and compute pair-wise cosine similarity
data_method2 = body_headline_match(vec_body, vec_headline, train_bodies, train_stances)

In [None]:
#pair-wise cosine similarity distritbution of different stances
plt.figure(figsize=(15,10))
sns.kdeplot(data_method2[data_method2['Stance'] == 'discuss'].distance_or_similarity, label='discuss')
sns.kdeplot(data_method2[data_method2['Stance'] == 'agree'].distance_or_similarity, label='agree')
sns.kdeplot(data_method2[data_method2['Stance'] == 'disagree'].distance_or_similarity, label='disagree')
sns.kdeplot(data_method2[data_method2['Stance'] == 'unrelated'].distance_or_similarity, label='unrelated')
plt.show()

## word2vec and tf-idf based representation and word mover's distance

In [None]:
%%time
#merge the train and test set for body and stance data set to better pre-process
#for word2vec method, we don't have to do the stemming beacuse a words with different forms still have the likely vector representation
merged_bodies, merged_stances = preprocess(train_bodies, train_stances, test_bodies, test_stances, lemma=True)

#split the train the test set so that we can construct feature from the training set
train_body, test_body = train_test_split(merged_bodies, train_bodies, test_bodies, dtype='body')
train_stance, test_stance = train_test_split(merged_stances, train_stances, test_stances, dtype='stance')

# get the corpus(mapping between string and integer) for all the data set
corpus, word_to_index_dict, index_to_word_dict = get_corpus(train_body, train_stance)

#transform the document from string to integer
train_body['articleBody'] = transform(word_to_index_dict, train_body.articleBody)
train_stance['Headline'] = transform(word_to_index_dict, train_stance.Headline)

#get tf matrix and idf matrix for body and headline
tf_body, idf_body, tf_idf_body = get_tf_idf_matrix(corpus, train_body.articleBody, count=False)
tf_headline, idf_headline, tf_idf_headline = get_tf_idf_matrix(corpus, train_stance.Headline, count=False)

In [None]:
%%time
#extract n most important words from body and headline according to the tf-idf value
important_words_body = tf_idf_based_search(tf_idf_body, n=7)
important_words_headline = tf_idf_based_search(tf_idf_headline, n=7)

In [None]:
%%time
data_method3 = word_mover_distance(important_words_body, important_words_headline, train_bodies, train_stances, index_to_word_dict, n=7)

In [None]:
#pair-wise word mover's distance distribution of different stances
plt.figure(figsize=(15,10))
sns.kdeplot(data_method3[data_method3['Stance'] == 'unrelated'].distance_or_similarity, label='unrelated')
sns.kdeplot(data_method3[data_method3['Stance'] == 'discuss'].distance_or_similarity, label='discuss')
sns.kdeplot(data_method3[data_method3['Stance'] == 'agree'].distance_or_similarity, label='agree')
sns.kdeplot(data_method3[data_method3['Stance'] == 'disagree'].distance_or_similarity, label='disagree')

plt.show()

## doc2vec based representation

In [None]:
%%time
#merge the train and test set for body and stance data set to better pre-process
#for word2vec method, we don't have to do the stemming beacuse a words with different forms still have the likely vector representation
merged_bodies, merged_stances = preprocess(train_bodies, train_stances, test_bodies, test_stances, lemma=True)

#split the train the test set so that we can construct feature from the training set
train_body, test_body = train_test_split(merged_bodies, train_bodies, test_bodies, dtype='body')
train_stance, test_stance = train_test_split(merged_stances, train_stances, test_stances, dtype='stance')

# get the corpus(mapping between string and integer) for all the data set
corpus, word_to_num_dict, num_to_word_dict = get_corpus(train_body, train_stance)

In [None]:
%%time
#get document of the doc2vec model
document = get_doc2vec_document(train_body, train_stance, test_body, test_stance)

#different parameters lead to different performance

#define the doc2vecm model
model = gensim.models.doc2vec.Doc2Vec(vector_size=128, alpha=.025, min_alpha=.01, dm=0, worker=8, dbow_words=1)
model.build_vocab(document)

#train the model
for epoch in range(20):
    model.train(document, total_examples=model.corpus_count, epochs=1)
    model.alpha -= 0.002  # decrease the learning rate`
    model.min_alpha = model.alpha  # fix the learning rate, no decay

In [None]:
#construct Dataframe for body and headline to merge the data on Body ID
body = pd.DataFrame(list(range(train_body.shape[0])), columns=['body_idx'])
body['Body ID'] = train_bodies['Body ID']
headline = pd.DataFrame(list(range(train_stance.shape[0])), columns=['headline_idx'])
headline['Body ID'] = train_stances['Body ID']
headline['Stance'] = train_stances['Stance']

#merge the body and the headline
data_method4 = body_stance_join(body, headline)

#get body index and headline index from the vector matrix
body_idx = data_method4['body_idx'].tolist()
headline_idx = data_method4['headline_idx'].tolist()

similarity = np.zeros(data_method4.shape[0])

for i in range(data_method4.shape[0]):
    similarity[i] = model.docvecs.similarity(d1='train_body_'+str(body_idx[i]), d2='train_headline_'+str(headline_idx[i]))

data_method4['distance_or_similarity'] = similarity

In [None]:
#pair-wise similarity distribution of different stances
plt.figure(figsize=(15,10))
sns.kdeplot(data_method4[data_method4['Stance'] == 'agree'].distance_or_similarity, label='agree')
sns.kdeplot(data_method4[data_method4['Stance'] == 'unrelated'].distance_or_similarity, label='unrelated')
sns.kdeplot(data_method4[data_method4['Stance'] == 'discuss'].distance_or_similarity, label='discuss')
sns.kdeplot(data_method4[data_method4['Stance'] == 'disagree'].distance_or_similarity, label='disagree')

plt.show()

# Task3: Language Model

In [None]:
%%time
#merge the train and test set for body and stance data set to better pre-process
merged_bodies, merged_stances = preprocess(train_bodies, train_stances, test_bodies, test_stances, lemma=False)

#split the train the test set so that we can construct feature from the training set
train_body, test_body = train_test_split(merged_bodies, train_bodies, test_bodies, dtype='body')
train_stance, test_stance = train_test_split(merged_stances, train_stances, test_stances, dtype='stance')

# get the corpus(mapping between string and integer) for all the data set
corpus, word_to_index_dict, index_to_word_dict = get_corpus(train_body, train_stance)

#transform the document from string to integer
train_body['articleBody'] = transform(word_to_index_dict, train_body.articleBody)
train_stance['Headline'] = transform(word_to_index_dict, train_stance.Headline)

#get tf matrix for body and headline
tf_body, _, _ = get_tf_idf_matrix(corpus, train_body.articleBody, count=True)
tf_headline, _, _ = get_tf_idf_matrix(corpus, train_stance.Headline, count=True)

In [None]:
%%time
#get the background of body and headline
#As the corpus is obtained by combining the body and stance, in background_body, there are some wrods that not appear
#in the body article and their probability will be zeros
background_body = get_background(train_body['articleBody'], tf_body) + 1e-05
background_headline = get_background(train_stance['Headline'], tf_headline) + 1e-05

#get the language model of each individual body and headline
language_model_body = get_language_model(train_body['articleBody'], background_body, tf_body, smoothing='dirchelet')
language_model_headline = get_language_model(train_stance['Headline'], background_headline, tf_headline, smoothing='dirchelet')

In [None]:
%%time
#get the pair-wise KL divergance(might takes about 5 minutes to compute)
data_KL = body_headline_match(language_model_body, language_model_headline, train_bodies, train_stances, distance='KL')

In [None]:
#pair-wise similarity distribution of different stances
plt.figure(figsize=(15,10))
sns.kdeplot(data_KL[data_KL['Stance'] == 'discuss'].distance_or_similarity, label='discuss')
sns.kdeplot(data_KL[data_KL['Stance'] == 'agree'].distance_or_similarity, label='agree')
sns.kdeplot(data_KL[data_KL['Stance'] == 'unrelated'].distance_or_similarity, label='unrelated')
sns.kdeplot(data_KL[data_KL['Stance'] == 'disagree'].distance_or_similarity, label='disagree')

plt.show()

# Task4：Alternative Feature

## Topice Model

### LSA

In [None]:
%%time
#merge the train and test set for body and stance data set to better pre-process
#for word2vec method, we don't have to do the stemming beacuse a words with different forms still have the likely vector representation
merged_bodies, merged_stances = preprocess(train_bodies, train_stances, test_bodies, test_stances, lemma=True)

#split the train the test set so that we can construct feature from the training set
train_body, test_body = train_test_split(merged_bodies, train_bodies, test_bodies, dtype='body')
train_stance, test_stance = train_test_split(merged_stances, train_stances, test_stances, dtype='stance')

# get the corpus(mapping between string and integer) for all the data set
corpus, word_to_index_dict, index_to_word_dict = get_corpus(train_body, train_stance)

#transform the document from string to integer
train_body['articleBody'] = transform(word_to_index_dict, train_body.articleBody)
train_stance['Headline'] = transform(word_to_index_dict, train_stance.Headline)

#get tf matrix for body and headline
tf_body, _, tf_idf_body = get_tf_idf_matrix(corpus, train_body.articleBody, count=True)
tf_headline, _, tf_idf_headline = get_tf_idf_matrix(corpus, train_stance.Headline, count=True)

In [None]:
#concat body and headline data
all_data = np.vstack([tf_body, tf_headline])

In [None]:
#implement svd
svd = TruncatedSVD(n_components=25, n_iter=10, random_state=0)
svd.fit(all_data)

In [None]:
#transform the body and headline
svd_body = svd.transform(tf_body)
svd_headline = svd.transform(tf_headline)

In [None]:
data_lda = body_headline_match(vector_body=svd_body, vector_headline=svd_headline, bodies=train_bodies, stances=train_stances, distance='cos')

In [None]:
#pair-wise cosine similarity distritbution of different stances
plt.figure(figsize=(15,10))
sns.kdeplot(data_lda[data_lda['Stance'] == 'unrelated'].distance_or_similarity, label='unrelated')
sns.kdeplot(data_lda[data_lda['Stance'] == 'discuss'].distance_or_similarity, label='discuss')
sns.kdeplot(data_lda[data_lda['Stance'] == 'agree'].distance_or_similarity, label='agree')
sns.kdeplot(data_lda[data_lda['Stance'] == 'disagree'].distance_or_similarity, label='disagree')
plt.show()

### LDA

In [None]:
%%time
#merge the train and test set for body and stance data set to better pre-process
#for word2vec method, we don't have to do the stemming beacuse a words with different forms still have the likely vector representation
merged_bodies, merged_stances = preprocess(train_bodies, train_stances, test_bodies, test_stances, lemma=False)

#split the train the test set so that we can construct feature from the training set
train_body, test_body = train_test_split(merged_bodies, train_bodies, test_bodies, dtype='body')
train_stance, test_stance = train_test_split(merged_stances, train_stances, test_stances, dtype='stance')

# get the corpus(mapping between string and integer) for all the data set
corpus, word_to_index_dict, index_to_word_dict = get_corpus(train_body, train_stance)

#transform the document from string to integer
train_body['articleBody'] = transform(word_to_index_dict, train_body.articleBody)
train_stance['Headline'] = transform(word_to_index_dict, train_stance.Headline)

#get tf matrix for body and headline
tf_body, _, tf_idf_body = get_tf_idf_matrix(corpus, train_body.articleBody, count=True)
tf_headline, _, tf_idf_headline = get_tf_idf_matrix(corpus, train_stance.Headline, count=True)

In [None]:
%%time
#LDA model
lda = LatentDirichletAllocation(n_topics=25, learning_method='batch', n_jobs=3, random_state=0)
lda_body = lda.fit_transform(tf_body)
lda_headline = lda.transform(tf_headline)

In [None]:
data_lda = body_headline_match(vector_body=lda_body, vector_headline=lda_headline, bodies=train_bodies, stances=train_stances, distance='cos')

In [None]:
#pair-wise cosine similarity distritbution of different stances
plt.figure(figsize=(15,10))
sns.kdeplot(data_lda[data_lda['Stance'] == 'unrelated'].distance_or_similarity, label='unrelated')
sns.kdeplot(data_lda[data_lda['Stance'] == 'discuss'].distance_or_similarity, label='discuss')
sns.kdeplot(data_lda[data_lda['Stance'] == 'agree'].distance_or_similarity, label='agree')
sns.kdeplot(data_lda[data_lda['Stance'] == 'disagree'].distance_or_similarity, label='disagree')
plt.show()

# Task5: Linear Model

## Linear Regression Model

In [None]:
class LinearRegression():
    
    def __init__(self, C=1, random_state=None, learning_rate=0.01, n_iteration=100, batch_size=100):
        self.C = C
        self.random_state = random_state
        self.learning_rate = learning_rate
        self.n_iteration = n_iteration
        self.batch_size = batch_size
        self.n_batch = 0
        self.w = 0
        self.b = 0
        
    def train(self, X, y):
        
        m = X.shape[0]
        n = X.shape[1]
        
        seed = self.random_state
        np.random.seed(seed)
        self.w = np.random.random([n,1])
        self.b = 0
        
        #construct the batch size
        self.n_batch = math.ceil(m / self.batch_size)
        batch = []
        for i in range(self.n_batch - 1):
            start = i * self.batch_size
            end = (i + 1) * self.batch_size
            batch.append(range(start, end))
            
        #last batch
        start = (self.n_batch - 1) * self.batch_size
        end = m
        batch.append(range(start, end))
        
        for i in range(self.n_iteration):
            current_loss = 0
            for j in range(self.n_batch):
                #get the batch
                x_train = X[batch[j]]
                y_train = y[batch[j]]
                size = x_train.shape[0]
                #forward
                fx = np.dot(x_train, self.w) + self.b
                #compute the loss
                delta = fx - y_train
                loss = 0.5 * np.dot(delta.T, delta) + 0.5 * self.C * np.sum(np.square(self.w))
                #compute the gradient
                dz = np.average(fx - y_train)
                gradient_w = 1. / size *(np.dot(x_train.T, (fx - y_train))) + self.C * self.w
                #gradient_w = np.average((x_train * (fx - y_train)), axis=0).reshape(-1,1) + self.C * self.w
                gradient_b = dz
                #update parameters
                self.w -= self.learning_rate * gradient_w
                self.b -= self.learning_rate * gradient_b
                current_loss += loss
            current_loss /= m
            print('n_iteration:' + str(i))
            print('current loss:' + str(current_loss))
            print('------------')
        
        return
    
    def fit(self, X, y):
        self.train(X, y)
        
    def predict(self, X):
        fx = np.dot(X, self.w) + self.b
        return fx

## Logistic Regression Model

In [None]:
class LogisticRegression():
    
    def __init__(self, C=1, random_state=None, learning_rate=0.01, n_iteration=100, batch_size=100, multi_class=False):
        self.C = C
        self.random_state = random_state
        self.learning_rate = learning_rate
        self.n_iteration = n_iteration
        self.batch_size = batch_size
        self.multi_class = multi_class
        self.n_batch = 0
        self.w = []
        self.b = []
        
    def sigmoid(self, x):
        return 1. / (1 + np.exp(-x))
    
    def softmax(self,x):
        orig_shape = x.shape

        if len(x.shape) > 1:
            # Matrix
            exp_minmax = lambda x: np.exp(x - np.max(x))
            denom = lambda x: 1.0 / np.sum(x)
            x = np.apply_along_axis(exp_minmax, 1, x)
            denominator = np.apply_along_axis(denom, 1, x)

            if len(denominator.shape) == 1:
                denominator = denominator.reshape((denominator.shape[0], 1))

            x = x * denominator
        else:
            # Vector
            x_max = np.max(x)
            x = x - x_max
            numerator = np.exp(x)
            denominator = 1.0 / np.sum(numerator)
            x = numerator.dot(denominator)

        assert x.shape == orig_shape
        return x
    
    def train(self, X, y):
        
        m = X.shape[0]
        n = X.shape[1]
        
        #initialize the weight and bias
        W = np.zeros([n, 1])
        B = 0
        
        #construct the batch size
        self.n_batch = math.ceil(m / self.batch_size)
        batch = []
        for i in range(self.n_batch - 1):
            start = i * self.batch_size
            end = (i + 1) * self.batch_size
            batch.append(range(start, end))
            
        #last batch
        start = (self.n_batch - 1) * self.batch_size
        end = m
        batch.append(range(start, end))
            
        for i in range(self.n_iteration):
            current_loss = 0
            for j in range(self.n_batch):
                #get the batch
                x_train = X[batch[j]]
                y_train = y[batch[j]]
                size = x_train.shape[0]
                #forward
                fx = np.dot(x_train, W) + B
                z = self.sigmoid(fx)
                #compute loss
                loss = 0.5 * ((-y_train) * np.log(z) - (1-y_train) * np.log(1-z)).sum() + 0.5 * self.C * np.sum(np.square(W))
                #compute the gradient
                dz = np.average(z - y_train)
                #gradient_W = 1. / size * (np.dot(x_train.T, (z - y_train))) + self.C * W
                gradient_W = (np.average((x_train * (z - y_train)), axis=0)).reshape(-1,1) + self.C * W
                gradient_B = dz
                #update parameters
                W -= self.learning_rate * gradient_W
                B -= self.learning_rate * gradient_B
                #update loss
                current_loss += loss
            current_loss /= m
            print('n_iteration:' + str(i))
            print('current loss:' + str(current_loss))
            print('------------')
            
        return W, B
            
    def fit(self, X, y):
        #reset the learning parameter
        self.w = []
        self.b = []
        if self.multi_class == False:
            y = y.reshape(-1,1)
            W, B = self.train(X,y)
            self.w.append(W)
            self.b.append(B)
        #for multiclass classification
        elif self.multi_class == True:
            n_class = y.shape[1]
            #run one-vs-all algorithm and record the parameters of each classifier
            for i in range(n_class):
                W, B = self.train(X, y[:,i].reshape(-1,1))
                self.w.append(W)
                self.b.append(B)
    
    def predict_prob(self, X):
        if self.multi_class == False:
            weight = self.w[0]
            bias = self.b[0]
            fx = np.dot(X, weight) + bias
            hx = self.sigmoid(fx)
            pred = hx
        
        #get prediction of different classifiers
        elif self.multi_class == True:
            n_class = len(self.w)
            pred = []
            for i in range(n_class):
                weight = self.w[i]
                bias = self.b[i]
                fx = np.dot(X, weight) + bias
                hx = self.sigmoid(fx)
                pred.append(hx)
            
            #apply softmax to get the final prediction
            pred = np.hstack(pred)
            pred = self.softmax(pred)
            
        return pred
    
    def predict(self, X):
        pred_prob = self.predict_prob(X)
        if self.multi_class == False:
            pred = (pred_prob > 0.5) * 1.0
        elif self.multi_class == True:
            pred = np.zeros_like(pred_prob)
            max_index = pred_prob.argmax(axis=1)
            for i in range(X.shape[0]):
                pred[i][max_index[i]] = 1
            
        return pred
    
    def get_parameters(self):
        return self.w, self.b

## Get final representation

In [None]:
%%time
#might take 8-10 minu
#-----------------preprocess-------------------------
#merge the train and test set for body and stance data set to better pre-process, to reduce the dimension of
#bag of word representation. I do the stemming to the text
merged_bodies, merged_stances = preprocess(train_bodies, train_stances, test_bodies, test_stances, lemma=True)

#split the train the test set so that we can construct feature from the training set
train_body, test_body = train_test_split(merged_bodies, train_bodies, test_bodies, dtype='body')
train_stance, test_stance = train_test_split(merged_stances, train_stances, test_stances, dtype='stance')

# get the corpus(mapping between index and word) for all the data set
corpus, word_to_index_dict, index_to_word_dict = get_corpus(merged_bodies, merged_stances)

#transform the document from string to integer
train_body['articleBody'] = transform(word_to_index_dict, train_body.articleBody)
train_stance['Headline'] = transform(word_to_index_dict, train_stance.Headline)

test_body['articleBody'] = transform(word_to_index_dict, test_body.articleBody)
test_stance['Headline'] = transform(word_to_index_dict, test_stance.Headline)

#get tf matrix, idf matrix and tf_idf matrix for body and headline
tf_body_train, idf_body_train, tf_idf_body_train = get_tf_idf_matrix(corpus, train_body.articleBody, count=False)
tf_headline_train, idf_headline_train, tf_idf_headline_train = get_tf_idf_matrix(corpus, train_stance.Headline, count=False)

tf_body_test, idf_body_test, tf_idf_body_test = get_tf_idf_matrix(corpus, test_body.articleBody, count=False)
tf_headline_test, idf_headline_test, tf_idf_headline_test = get_tf_idf_matrix(corpus, test_stance.Headline, count=False)

print('preprocess finished')


#-----------------bag-of-word-feature-------------------------
data_method1_train = body_headline_match(tf_idf_body_train, tf_idf_headline_train, train_bodies, train_stances)
data_method1_test = body_headline_match(tf_idf_body_test, tf_idf_headline_test, test_bodies, test_stances)

print('bag of word feature finished')


#-----------------word2vec-representation1-------------------------
word2vec_1_body_train = weighted_average_word2vec(train_body.articleBody, embed_dict, index_to_word_dict, idf_body_train)
word2vec_1_headline_train = weighted_average_word2vec(train_stance.Headline, embed_dict, index_to_word_dict, idf_headline_train)
word2vec_1_body_test = weighted_average_word2vec(test_body.articleBody, embed_dict, index_to_word_dict, idf_body_test)
word2vec_1_headline_test = weighted_average_word2vec(test_stance.Headline, embed_dict, index_to_word_dict, idf_headline_test)

#-----------------word2vec-feature1-------------------------
data_method2_train = body_headline_match(word2vec_1_body_train, word2vec_1_headline_train, train_bodies, train_stances)
data_method2_test = body_headline_match(word2vec_1_body_test, word2vec_1_headline_test, test_bodies, test_stances)
# feature_word2vec_1_cosine_similarity = data_method2.distance_or_similarity.values

print('word2vec feature1 finished')

#-----------------word2vec-representation2-------------------------
#extract n most important words from body and headline according to the tf-idf value
important_words_body_train = tf_idf_based_search(tf_idf_body_train, n=5)
important_words_headline_train = tf_idf_based_search(tf_idf_headline_train, n=5)
important_words_body_test = tf_idf_based_search(tf_idf_body_test, n=5)
important_words_headline_test = tf_idf_based_search(tf_idf_headline_test, n=5)


#-----------------word2vec-feature2-------------------------
data_method3_train = word_mover_distance(important_words_body_train, 
                                         important_words_headline_train, 
                                         train_bodies, 
                                         train_stances, 
                                         index_to_word_dict, 
                                         n=5)

data_method3_test = word_mover_distance(important_words_body_test, 
                                        important_words_headline_test, 
                                        test_bodies, 
                                        test_stances, 
                                        index_to_word_dict, 
                                        n=5)

print('word2vec feature2 finished')

#-----------------doc2vec--------------------------

train_body['articleBody'] = transform(index_to_word_dict, train_body.articleBody)
train_stance['Headline'] = transform(index_to_word_dict, train_stance.Headline)
test_body['articleBody'] = transform(index_to_word_dict, test_body.articleBody)
test_stance['Headline'] = transform(index_to_word_dict, test_stance.Headline)
#doc2vec representation and cosine similarity feature
#get document of the doc2vec model
document = get_doc2vec_document(train_body, train_stance, test_body, test_stance)

#different parameters lead to different performance

#define the doc2vecm model
model = gensim.models.doc2vec.Doc2Vec(vector_size=128, alpha=.025, min_alpha=.01, dm=0, worker=8, dbow_words=1)
model.build_vocab(document)

#train the model
for epoch in range(20):
    model.train(document, total_examples=model.corpus_count, epochs=1)
    model.alpha -= 0.002  # decrease the learning rate`
    model.min_alpha = model.alpha  # fix the learning rate, no decay

#training data
#construct Dataframe for body and headline to merge the data on Body ID
body_train = pd.DataFrame(list(range(train_body.shape[0])), columns=['body_idx'])
body_train['Body ID'] = train_bodies['Body ID']
headline_train = pd.DataFrame(list(range(train_stance.shape[0])), columns=['headline_idx'])
headline_train['Body ID'] = train_stances['Body ID']
headline_train['Stance'] = train_stances['Stance']

#merge the body and the headline
data_method4_train = body_stance_join(body_train, headline_train)

#get body index and headline index from the vector matrix
body_idx = data_method4_train['body_idx'].tolist()
headline_idx = data_method4_train['headline_idx'].tolist()

similarity = np.zeros(data_method4_train.shape[0])

for i in range(data_method4_train.shape[0]):
    similarity[i] = model.docvecs.similarity(d1='train_body_'+str(body_idx[i]), 
                                             d2='train_headline_'+str(headline_idx[i]))
data_method4_train['distance_or_similarity'] = similarity


#test set
body_test = pd.DataFrame(list(range(test_body.shape[0])), columns=['body_idx'])
body_test['Body ID'] = test_bodies['Body ID']
headline_test = pd.DataFrame(list(range(test_stance.shape[0])), columns=['headline_idx'])
headline_test['Body ID'] = test_stances['Body ID']
headline_test['Stance'] = test_stances['Stance']

#merge the body and the headline
data_method4_test = body_stance_join(body_test, headline_test)

#get body index and headline index from the vector matrix
body_idx = data_method4_test['body_idx'].tolist()
headline_idx = data_method4_test['headline_idx'].tolist()

similarity = np.zeros(data_method4_test.shape[0])

for i in range(data_method4_test.shape[0]):
    similarity[i] = model.docvecs.similarity(d1='test_body_'+str(body_idx[i]), 
                                             d2='test_headline_'+str(headline_idx[i]))
data_method4_test['distance_or_similarity'] = similarity



train_body['articleBody'] = transform(word_to_index_dict, train_body.articleBody)
train_stance['Headline'] = transform(word_to_index_dict, train_stance.Headline)
test_body['articleBody'] = transform(word_to_index_dict, test_body.articleBody)
test_stance['Headline'] = transform(word_to_index_dict, test_stance.Headline)

print('doc2vec finished')


# #-----------------language-model-------------------------
# #language model representation and KL divergance feature

# #get the background of body and headline
# #As the corpus is obtained by combining the body and stance, in background_body, there are some wrods that not appear
# #in the body article and their probability will be zeros
# background_body_train = get_background(train_body['articleBody'], tf_body_train) + 1e-05
# background_headline_train = get_background(train_stance['Headline'], tf_headline_train) + 1e-05
# background_body_test = get_background(test_body['articleBody'], tf_body_test) + 1e-05
# background_headline_test = get_background(test_stance['Headline'], tf_headline_test) + 1e-05

# #get the language model of each individual body and headline
# language_model_body_train = get_language_model(train_body['articleBody'], background_body_train, tf_body_train, smoothing='dirchelet')
# language_model_headline_train = get_language_model(train_stance['Headline'], background_headline_train, tf_headline_train, smoothing='dirchelet')
# language_model_body_test = get_language_model(test_body['articleBody'], background_body_test, tf_body_test, smoothing='dirchelet')
# language_model_headline_test = get_language_model(test_stance['Headline'], background_headline_test, tf_headline_test, smoothing='dirchelet')

# #get the pair-wise KL divergance(might takes about 5 minutes to compute)
# data_KL_train = body_headline_match(language_model_body_train, language_model_headline_train, train_bodies, train_stances, distance='KL')
# data_KL_test = body_headline_match(language_model_body_test, language_model_headline_test, test_bodies, test_stances, distance='KL')

# print('language model finished')


#-----------------LDA-model-------------------------

#-----------------different-preprocess-------------------------
merged_bodies, merged_stances = preprocess(train_bodies, train_stances, test_bodies, test_stances, lemma=False)

#split the train the test set so that we can construct feature from the training set
train_body, test_body = train_test_split(merged_bodies, train_bodies, test_bodies, dtype='body')
train_stance, test_stance = train_test_split(merged_stances, train_stances, test_stances, dtype='stance')

# get the corpus(mapping between index and word) for all the data set
corpus, word_to_index_dict, index_to_word_dict = get_corpus(merged_bodies, merged_stances)

#transform the document from string to integer
train_body['articleBody'] = transform(word_to_index_dict, train_body.articleBody)
train_stance['Headline'] = transform(word_to_index_dict, train_stance.Headline)

test_body['articleBody'] = transform(word_to_index_dict, test_body.articleBody)
test_stance['Headline'] = transform(word_to_index_dict, test_stance.Headline)

#get tf matrix, idf matrix and tf_idf matrix for body and headline
tf_body_train, idf_body_train, tf_idf_body_train = get_tf_idf_matrix(corpus, train_body.articleBody, count=True)
tf_headline_train, idf_headline_train, tf_idf_headline_train = get_tf_idf_matrix(corpus, train_stance.Headline, count=True)

tf_body_test, idf_body_test, tf_idf_body_test = get_tf_idf_matrix(corpus, test_body.articleBody, count=True)
tf_headline_test, idf_headline_test, tf_idf_headline_test = get_tf_idf_matrix(corpus, test_stance.Headline, count=True)




#topic model LDA feature
lda = LatentDirichletAllocation(n_topics=25, learning_method='batch', n_jobs=3, random_state=0)
lda_body_train = lda.fit_transform(tf_body_train)
lda_body_test = lda.transform(tf_body_test)
lda_headline_train = lda.transform(tf_headline_train)
lda_headline_test = lda.transform(tf_headline_test)

data_lda_train = body_headline_match(vector_body=lda_body_train, vector_headline=lda_headline_train, bodies=train_bodies, stances=train_stances, distance='cos')
data_lda_test = body_headline_match(vector_body=lda_body_test, vector_headline=lda_headline_test, bodies=test_bodies, stances=test_stances, distance='cos')

print('LDA finished')

In [None]:
def data_match(body, headline, bodies, stances):
    data_body = pd.DataFrame(body)
    data_headline = pd.DataFrame(headline)
    
    data_body['Body ID'] = bodies['Body ID']
    data_headline['Body ID'] = stances['Body ID']
    data_headline['Stance'] = stances['Stance']
    data_headline['headline_idx'] = data_headline.index
    
    merged = body_stance_join(data_body, data_headline)
    merged = merged.drop(['Body ID'], axis=1)
    
    return merged

In [None]:
def feature_match(data, feature):
    feature = feature.drop(['body_idx', 'Body ID', 'Stance'], axis=1)
    merged = pd.merge(data, feature, on='headline_idx')
    
    return merged

In [None]:
#select n most frequent word(highest average tf-idf value)and use it as the vector representation
def select_n_words(vec_body, vec_headline, n=5000):
    data = np.vstack([vec_body, vec_headline])
    data = np.sum(data, axis=0)
    idx_list = data.argsort()[::-1][:n]
    
    vec_body = vec_body.T
    vec_headline = vec_headline.T
    
    vec_body = vec_body[idx_list]
    vec_headline = vec_headline[idx_list]
    
    return vec_body.T, vec_headline.T

In [6]:
def x_y_split(data):
    x = data.drop(['Stance'], axis=1)
    y = data.Stance
    
    y_agree = y.map(lambda x: x == 'agree').values.reshape(-1,1)
    y_disagree = y.map(lambda x: x == 'disagree').values.reshape(-1,1)
    y_discuss = y.map(lambda x: x == 'discuss').values.reshape(-1,1)
    y_unrelated = y.map(lambda x: x == 'unrelated').values.reshape(-1,1)
    
    y = np.hstack([y_agree, y_disagree, y_discuss, y_unrelated])
    
    return x.values, y

In [None]:
BOW_body_train = tf_idf_body_train
BOW_headline_train = tf_idf_headline_train

BOW_body_test = tf_idf_body_test
BOW_headline_test = tf_idf_headline_test

BOW_body_train, BOW_headline_train = select_n_words(BOW_body_train, BOW_headline_train, 5000)
BOW_body_test, BOW_headline_test = select_n_words(BOW_body_test, BOW_headline_test, 5000)

In [None]:
vector_body_train = lda_body_train
vector_headline_train = lda_headline_train

vector_body_test = lda_body_test
vector_headline_test = lda_headline_train


data_train = data_match(vector_body_train, vector_headline_train, train_bodies, train_stances)
data_train = feature_match(data_train, data_method1_train)
data_train = feature_match(data_train, data_method2_train)
data_train = feature_match(data_train, data_method3_train)
data_train = feature_match(data_train, data_method4_train)
data_train = feature_match(data_train, data_lda_train)


data_test = a = data_match(vector_body_test, vector_headline_test, test_bodies, test_stances)
data_test = feature_match(data_test, data_method1_test)
data_test = feature_match(data_test, data_method1_test)
data_test = feature_match(data_test, data_method1_test)
data_test = feature_match(data_test, data_method1_test)
data_test = feature_match(data_test, data_lda_test)


#drop the headline_idx columns
data_train = data_train.drop(['headline_idx'], axis=1)
data_test = data_test.drop(['headline_idx'], axis=1)

In [None]:
# data_train = pd.read_csv('lda_train.csv', index_col=0)
# data_test = pd.read_csv('lda_test.csv', index_col=0)

In [None]:
train, val = train_val_split(data_train)

x_train, y_train = x_y_split(train)
x_val, y_val = x_y_split(val)
x_test, y_test = x_y_split(data_test)

## Evaluate Function

In [33]:
def accuracy(y_pred, y_true):
    
    m_sample = y_true.shape[0]
    n_class = y_true.shape[1]
    
    temp = (y_pred == y_true).sum(axis=1)
    acc = (temp == n_class).sum() / m_sample
    
    return acc

In [34]:
def confusion_matrix(y_pred, y_true):
    def change(y):
        m = y.shape[0]
        y_label = []
        for i in range(m):
            idx = y[i].argmax()
            if idx == 0:
                y_label.append('agree')
            elif idx == 1:
                y_label.append('disagree') 
            elif idx == 2:
                y_label.append('discuss')
            elif idx == 3:
                y_label.append('unrelated')

        return y_label
    
    y_pred_label = change(y_pred)
    y_true_label = change(y_true)
    
    cfm = CFM(y_pred=y_pred_label, y_true=y_true_label)
    
    return cfm

## Linear Regression train and evaluate

In [None]:
def fit_predict_linear(x, y, x_val):
    model = LinearRegression()
    model.fit(x,y)
    return model.predict(x_val)

In [30]:
def label_predict(y):
    y_pred = np.zeros_like(y)
    for i in range(y.shape[0]):
        y_pred[i][y[i].argmax()] = 1
    
    return y_pred

In [None]:
#get prediction of each stance class
y_agree = fit_predict_linear(x_train, y_train[:,0], x_test).reshape(-1,1)
y_disagree = fit_predict_linear(x_train, y_train[:,1], x_test).reshape(-1,1)
y_discuss = fit_predict_linear(x_train, y_train[:,2], x_test).reshape(-1,1)
y_unrelated = fit_predict_linear(x_train, y_train[:,3], x_test).reshape(-1,1)

#stack the prediction
y_pred = np.hstack([y_agree1, y_disagree, y_discuss, y_unrelated])

#change the prediction to predicted label
y_pred = label_predict(y_pred)

In [None]:
#get confusion matrix and accuracy
cfm_linear = confusion_matrix(y_pred=y_pred, y_true=y_test)
accuracy_linear = accuracy(y_pred, y_test)
auc_linear = sklearn.metrics.roc_auc_score(y_score=y_pred, y_true=y_test)

## Logistic Regression train and evaluate

In [None]:
def softmax(x):
    orig_shape = x.shape

    if len(x.shape) > 1:
        # Matrix
        exp_minmax = lambda x: np.exp(x - np.max(x))
        denom = lambda x: 1.0 / np.sum(x)
        x = np.apply_along_axis(exp_minmax, 1, x)
        denominator = np.apply_along_axis(denom, 1, x)

        if len(denominator.shape) == 1:
            denominator = denominator.reshape((denominator.shape[0], 1))

        x = x * denominator
    else:
        # Vector
        x_max = np.max(x)
        x = x - x_max
        numerator = np.exp(x)
        denominator = 1.0 / np.sum(numerator)
        x = numerator.dot(denominator)

    assert x.shape == orig_shape
    return x

In [None]:
def fit_predict_logistic(x, y, x_val):
    model = LogisticRegression(C=0.1)
    model.fit(x,y)
    return model.predict_proba(x_val)[:,1]

In [None]:
#get prediction of each stance
y_agree = predict_label(x_train, y_train[:,0], x_test).reshape(-1,1)
y_disagree = predict_label(x_train, y_train[:,1], x_test).reshape(-1,1)
y_discuss = predict_label(x_train, y_train[:,2], x_test).reshape(-1,1)
y_unrelated = predict_label(x_train, y_train[:,3], x_test).reshape(-1,1)

#stack the prediction
y_pred = np.hstack([y_agree, y_disagree, y_discuss, y_unrelated])

#apply softmax funcition
y_pred = softmax(y_pred)

#get the predicted label
y_pred = label_predict(y_pred)

In [None]:
#get the final result
cfm_logstic = confusion_matrix(y_pred=y_pred, y_true=y_test)
accuracy_logistic = accuracy(y_pred, y_test)
auc_logistic = sklearn.metrics.roc_auc_score(y_score=y_pred, y_true=y_test)

# Deep Model

In [474]:
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.layers import LeakyReLU
import keras

In [139]:
input_dim = x_train.shape[1]
n_class = y_train.shape[1]

In [578]:
# keras MLP mode
model = Sequential()

#layers
model.add(Dense(128, input_dim=input_dim))
model.add(Activation('relu'))
model.add(Dropout(0.6))
model.add(Dense(64))
model.add(Activation('relu'))
model.add(Dropout(0.5))
model.add(Dense(n_class))
model.add(Activation('softmax'))

#complie
model.compile(loss='categorical_crossentropy', optimizer='Nadam')

#fit model
model.fit(x_train, y_train, nb_epoch=6, batch_size=128, validation_data=(x_test, y_test))



Train on 49972 samples, validate on 25413 samples
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


<keras.callbacks.History at 0x162decf60>

In [719]:
pred_mlp = model.predict(x_test)
y_pred = label_predict(pred_mlp)

#get confusion matrix and accuracy
cfm = confusion_matrix(y_pred=y_pred, y_true=y_test)
accuracy = sklearn.metrics.accuracy_score(y_pred=y_pred, y_true=y_test)
auc = sklearn.metrics.roc_auc_score(y_score=y_pred, y_true=y_test)

In [720]:
#accuract
accuracy

0.8697517018848621

In [721]:
#confusion matrix
#1:Agree
#2:disagree
#3:discuss
#4:unrelated
cfm

array([[  141,     0,  1626,   136],
       [   71,     0,   490,   136],
       [  189,     0,  3980,   295],
       [   20,     0,   347, 17982]])