In [163]:
import corenlp
import pandas as pd
import numpy as np
from os import listdir
from os.path import isfile, join
import sys
from gensim.models import KeyedVectors
import os
from cleantext import clean
import pprint
import sys
import itertools
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.stem import WordNetLemmatizer
from string import punctuation
from autocorrect import spell
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

from nltk.tokenize import sent_tokenize


In [164]:
train_data_body = pd.read_csv('data/train_bodies.csv')
train_data_stance = pd.read_csv('data/train_stances.csv')

train_article_id = train_data_body['Body ID']
train_stance_id = train_data_stance['Body ID']

train_article_body = train_data_body['articleBody']
train_labels = train_data_stance['Stance']
train_headlines = train_data_stance['Headline']

test_data_body = pd.read_csv('data/competition_test_bodies.csv')
test_data_stance = pd.read_csv('data/competition_test_stances.csv')

test_article_id = test_data_body['Body ID']
test_stance_id = test_data_stance['Body ID']

test_article_body = test_data_body['articleBody']
test_labels = test_data_stance['Stance']
test_headlines = test_data_stance['Headline']

In [6]:
def mean_vectorizor(model, tokenized_sent, dim):
    
    return np.array([
            np.mean([model[w] for w in words if w in model.vocab]
                    or [np.zeros(dim)], axis=0)
            for words in tokenized_sent
        ])
model = KeyedVectors.load_word2vec_format('../word_embedings/GoogleNews-vectors-negative300.bin', binary=True)

In [165]:
def get_tokenized_body_para1(text, dim=5):
    
    tokenized_sent = sent_tokenize(text)
    body_text = tokenized_sent[0:dim]
    bt = []
    [bt.extend(b.split('.')[:-1]) for b in body_text]
    return bt

def get_ner_tags(news_articles):
    
    #ner_tags = dict()
    ner_tags = []
    os.environ['CORENLP_HOME'] = '/home/ahaque2/project/virtual_environment_1/stanfordNLP'
    with corenlp.CoreNLPClient(annotators="tokenize ssplit pos lemma ner depparse".split()) as client:

        for sent in news_articles:
            ann = client.annotate(sent)
            sentence = ann.sentence
            tokens = sentence[0].token
            for tok in tokens:
                if(tok.ner != 'O'):
                    ner_tags.append(tok.word)
                    '''
                    if tok.ner in ner_tags:
                        ner_tags[tok.ner].append(tok.word)
                    else:
                        ner_tags[tok.ner] = []
                        ner_tags[tok.ner].append(tok.word)
                    '''
        #print(i, end=" ")
    #print(ner_tags)
    return ner_tags
   

In [166]:
#Code for Text preprocessing
def autospell(text):
    """
    correct the spelling of the word.
    """
    spells = [spell(w) for w in (nltk.word_tokenize(text))]
    return " ".join(spells)

def to_lower(text):
    """
    :param text:
    :return:
        Converted text to lower case as in, converting "Hello" to "hello" or "HELLO" to "hello".
    """
    return text.lower()

def remove_numbers(text):
    """
    take string input and return a clean text without numbers.
    Use regex to discard the numbers.
    """
    output = ''.join(c for c in text if not c.isdigit())
    return output

def remove_punct(text):
    """
    take string input and clean string without punctuations.
    use regex to remove the punctuations.
    """
    return ''.join(c for c in text if c not in punctuation)

def remove_Tags(text):
    """
    take string input and clean string without tags.
    use regex to remove the html tags.
    """
    cleaned_text = re.sub('<[^<]+?>', '', text)
    return cleaned_text

def sentence_tokenize(text):
    """
    take string input and return list of sentences.
    use nltk.sent_tokenize() to split the sentences.
    """
    sent_list = []
    for w in nltk.sent_tokenize(text):
        sent_list.append(w)
    return sent_list

def word_tokenize(text):
    """
    :param text:
    :return: list of words
    """
    return [w for sent in nltk.sent_tokenize(text) for w in nltk.word_tokenize(sent)]

def remove_stopwords(sentence):
    """
    removes all the stop words like "is,the,a, etc."
    """
    stop_words = stopwords.words('english')
    return ' '.join([w for w in nltk.word_tokenize(sentence) if not w in stop_words])

def stem(text):
    """
    :param word_tokens:
    :return: list of words
    """
    
    snowball_stemmer = SnowballStemmer('english')
    stemmed_word = [snowball_stemmer.stem(word) for sent in nltk.sent_tokenize(text)for word in nltk.word_tokenize(sent)]
    return " ".join(stemmed_word)

def lemmatize(text):
    
    wordnet_lemmatizer = WordNetLemmatizer()
    lemmatized_word = [wordnet_lemmatizer.lemmatize(word)for sent in nltk.sent_tokenize(text)for word in nltk.word_tokenize(sent)]
    return " ".join(lemmatized_word)


def preprocess(text):

    lower_text = to_lower(text)
    sentence_tokens = sentence_tokenize(lower_text)
    word_list = []
    for each_sent in sentence_tokens:
        lemmatizzed_sent = lemmatize(each_sent)
        clean_text = remove_numbers(lemmatizzed_sent)
        clean_text = remove_punct(clean_text)
        clean_text = remove_Tags(clean_text)
        clean_text = remove_stopwords(clean_text)
        word_tokens = word_tokenize(clean_text)
        for i in word_tokens:
            word_list.append(i)
    return word_list

In [167]:
def get_features(article_id, article_body, stance_id):
    df = pd.DataFrame(columns = ['stance_id', 'similarity', 'label'])
    for bid, txt in zip(article_id, article_body):
        index = np.where(stance_id == bid)[0]
        article = get_tokenized_body_para1(txt)
        lab = labels.iloc[index]
        heads = headlines.iloc[index]
        heads_tokenized = [preprocess(h) for h in heads]

        article = list(itertools.chain.from_iterable([preprocess(a) for a in article]))
        article_vec = mean_vectorizor(model, [article] , 300)

        head_vec = mean_vectorizor(model, heads_tokenized, 300)
    
        similarity = []
        from scipy import spatial
        for h in head_vec:
            similarity.append(1 - spatial.distance.cosine(h, article_vec))
        
        df2 = pd.DataFrame(columns = ['stance_id', 'similarity', 'label'])
        df2['stance_id'] = index
        df2['similarity'] = similarity
        df2['label'] = np.array(lab)

        #print(df2.shape)
        df = df.append(df2, ignore_index = True)
        #print(df.shape)
        #sys.exit()

    return df

In [168]:
df_train = get_features(train_article_id, train_article_body, train_stance_id)
df_test = get_features(test_article_id, test_article_body, test_stance_id)
print(df_train.shape, df_test.shape)

  dist = 1.0 - uv / np.sqrt(uu * vv)


(49972, 3) (25413, 3)


In [186]:
def get_Xy(df):
    
    X = df['similarity']
    y = df['label']
    #print(X.shape, y.shape)
    i = np.where(X.isna() == True)[0]
    df = df.drop(i)
    X = df['similarity']
    y = df['label']
    
    '''
    y = y.replace('unrelated', 1)
    y = y.replace('agree', 0)
    y = y.replace('discuss', 0)
    y = y.replace('disagree', 0)
    '''
    
    y = np.array(y).reshape(-1,1)
    X = np.array(X).reshape(-1,1)
    return X, y


In [204]:
X_train, y_train = get_Xy(df_train)
X_test, y_test = get_Xy(df_test)
unique, counts = np.unique(y_test, return_counts=True)
print(dict(zip(unique, counts)))

from sklearn import neighbors
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn import tree
clf = tree.DecisionTreeClassifier()
clf = MultinomialNB()
clf = neighbors.KNeighborsClassifier(9)

#clf = LinearSVC(random_state=0, tol=1e-5)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

{'disagree': 446, 'agree': 1840, 'discuss': 4528, 'unrelated': 18537}


  


In [205]:
unique, counts = np.unique(y_test, return_counts=True)
print(dict(zip(unique, counts)))
print(classification_report(y_pred, y_test))
print(confusion_matrix(y_pred, y_test))

{'disagree': 446, 'agree': 1840, 'discuss': 4528, 'unrelated': 18537}
              precision    recall  f1-score   support

       agree       0.00      0.04      0.01       165
    disagree       0.00      0.00      0.00         5
     discuss       0.04      0.21      0.06       765
   unrelated       0.96      0.73      0.83     24416

    accuracy                           0.71     25351
   macro avg       0.25      0.25      0.22     25351
weighted avg       0.93      0.71      0.80     25351

[[    7     2    40   116]
 [    0     0     1     4]
 [   51     9   160   545]
 [ 1782   435  4327 17872]]
