# ATTN

"ATTN is designed to analyze news statement simply from semantic perspective. To build an explainable model for semantic analysis with good performance, we employ several techniques, including pre-trained word embedding, convolutional neural network ..., and self-attention mechanism .... Self-attention is used because it can capture global relationships between dif- ferent words efficiently. In addition, the weight matrix generated in attention mechanism is input dependent, which helps provide instance-level explanation." (Yang, Pentyala, Mohseni, Du, Yuan, Linder, Ragan, Ji & Hu, 2019)

Yang, F., Pentyala, S. K., Mohseni, S., Du, M., Yuan, H., Linder, R., Ragan, E. D., Ji, S. & Hu, X. (2019). XFake: explainable fake news detector with visualizations. In The World Wide Web Conference (pp. 3600-3604).

## Import packages

In [1]:
import re
import os
import spacy
import pandas as pd
import numpy as np
from unidecode import unidecode
import keras
import tensorflow as tf
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import layers

In [2]:
# import English language model
nlp = spacy.load("/opt/anaconda3/lib/python3.8/site-packages/en_core_web_sm/en_core_web_sm-2.3.1")

## Import data

In [3]:
# import data
df = pd.read_table("data.csv", delimiter = ",",)

In [4]:
# select only what we need
df = df[['label', 'statement']]

In [5]:
# show dataframe
df.head()

Unnamed: 0,label,statement
0,false,Says the Annies List political group supports ...
1,half-true,When did the decline of coal start? It started...
2,mostly-true,"Hillary Clinton agrees with John McCain ""by vo..."
3,false,Health care reform legislation is likely to ma...
4,half-true,The economic turnaround started at the end of ...


In [6]:
# labels to binary (int)
df['label'] = df['label'].replace(['mostly-true','half-true', 'barely-true', 'no-flip', 'half-flip', 'true'], 1)
df['label'] = df['label'].replace(['mostly-false','pants-fire', 'full-flop', 'false'], 0)
df['label'] = df['label'].astype('int')

In [7]:
# drop nan
df = df.dropna()

In [8]:
# show dataframe
df.head()

Unnamed: 0,label,statement
0,0,Says the Annies List political group supports ...
1,1,When did the decline of coal start? It started...
2,1,"Hillary Clinton agrees with John McCain ""by vo..."
3,0,Health care reform legislation is likely to ma...
4,1,The economic turnaround started at the end of ...


## Pre-processing statement and context

* Remove extra whitespaces
* Convert accented characters to ASCII characters
* Expand contractions
* Expand abbreviations
* Remove special characters
* Remove numbers
* Lowercase all texts
* Remove stopwords
* Lemmatization

In [9]:
# Remove extra whitespaces
df['statement'] = [' '.join(statement.split()) for statement in df['statement']]

In [10]:
# Convert accented characters to ASCII characters
df['statement'] = [unidecode(statement) for statement in df['statement']]

In [11]:
# Expand contractions

def expand_contractions(text):
    
    flags = re.IGNORECASE | re.MULTILINE
    
    text = re.sub(r'`', "'", text, flags = flags)
    text = re.sub(r"(\s|^)'(aight|cause)(\s|$)", '\g<1>\g<2>\g<3>', text, flags = flags)
    text = re.sub(r"(\s|^)'t(was|is)(\s|$)", r'\g<1>it \g<2>\g<3>', text, flags = flags)
    text = re.sub(r"(\s|^)ol'(\s|$)", '\g<1>old\g<2>', text, flags = flags)
    
    ## expand words without
    text = re.sub(r"\b(aight)\b", 'alright', text, flags = flags)
    text = re.sub(r'\bcause\b', 'because', text, flags = flags)
    text = re.sub(r'\b(finna|gonna)\b', 'going to', text, flags = flags)
    text = re.sub(r'\bgimme\b', 'give me', text, flags = flags)
    text = re.sub(r"\bgive'n\b", 'given', text, flags = flags)
    text = re.sub(r"\bhowdy\b", 'how do you do', text, flags = flags)
    text = re.sub(r"\bgotta\b", 'got to', text, flags = flags)
    text = re.sub(r"\binnit\b", 'is it not', text, flags = flags)
    text = re.sub(r"\b(can)(not)\b", r'\g<1> \g<2>', text, flags = flags)
    text = re.sub(r"\bwanna\b", 'want to', text, flags = flags)
    text = re.sub(r"\bmethinks\b", 'me thinks', text, flags = flags)
    
    ## one offs
    text = re.sub(r"\bo'er\b", r'over', text, flags = flags)
    text = re.sub(r"\bne'er\b", r'never', text, flags = flags)
    text = re.sub(r"\bo'?clock\b", 'of the clock', text, flags = flags)
    text = re.sub(r"\bma'am\b", 'madam', text, flags = flags)
    text = re.sub(r"\bgiv'n\b", 'given', text, flags = flags)
    text = re.sub(r"\be'er\b", 'ever', text, flags = flags)
    text = re.sub(r"\bd'ye\b", 'do you', text, flags = flags)
    text = re.sub(r"\be'er\b", 'ever', text, flags = flags)
    text = re.sub(r"\bd'ye\b", 'do you', text, flags = flags)
    text = re.sub(r"\bg'?day\b", 'good day', text, flags = flags)
    text = re.sub(r"\b(ain|amn)'?t\b", 'am not', text, flags = flags)
    text = re.sub(r"\b(are|can)'?t\b", r'\g<1> not', text, flags = flags)
    text = re.sub(r"\b(let)'?s\b", r'\g<1> us', text, flags = flags)
    
    ## major expansions involving smaller
    text = re.sub(r"\by'all'dn't've'd\b", 'you all would not have had', text, flags = flags)
    text = re.sub(r"\by'all're\b", 'you all are', text, flags = flags)
    text = re.sub(r"\by'all'd've\b", 'you all would have', text, flags = flags)
    text = re.sub(r"(\s)y'all(\s)", r'\g<1>you all\g<2>', text, flags = flags)
    
    ## minor
    text = re.sub(r"\b(won)'?t\b", 'will not', text, flags = flags)
    text = re.sub(r"\bhe'd\b", 'he had', text, flags = flags)

    ## major
    text = re.sub(r"\b(I|we|who)'?d'?ve\b", r'\g<1> would have', text, flags = flags)
    text = re.sub(r"\b(could|would|must|should|would)n'?t'?ve\b", r'\g<1> not have', text, flags = flags)
    text = re.sub(r"\b(he)'?dn'?t'?ve'?d\b", r'\g<1> would not have had', text, flags = flags)
    text = re.sub(r"\b(daren|daresn|dasn)'?t", 'dare not', text, flags = flags)
    text = re.sub(r"\b(he|how|i|it|she|that|there|these|they|we|what|where|which|who|you)'?ll\b", r'\g<1> will', text, flags = flags)
    text = re.sub(r"\b(everybody|everyone|he|how|it|she|somebody|someone|something|that|there|this|what|when|where|which|who|why)'?s\b", r'\g<1> is', text, flags = flags)
    text = re.sub(r"\b(I)'?m'a\b", r'\g<1> am about to', text, flags = flags)
    text = re.sub(r"\b(I)'?m'o\b", r'\g<1> am going to', text, flags = flags)
    text = re.sub(r"\b(I)'?m\b", r'\g<1> am', text, flags = flags)
    text = re.sub(r"\bshan't\b", 'shall not', text, flags = flags)
    text = re.sub(r"\b(are|could|did|does|do|go|had|has|have|is|may|might|must|need|ought|shall|should|was|were|would)n'?t\b", r'\g<1> not', text, flags = flags)
    text = re.sub(r"\b(could|had|he|i|may|might|must|should|these|they|those|to|we|what|where|which|who|would|you)'?ve\b", r'\g<1> have', text, flags = flags)
    text = re.sub(r"\b(how|so|that|there|these|they|those|we|what|where|which|who|why|you)'?re\b", r'\g<1> are', text, flags = flags)
    text = re.sub(r"\b(I|it|she|that|there|they|we|which|you)'?d\b", r'\g<1> had', text, flags = flags)
    text = re.sub(r"\b(how|what|where|who|why)'?d\b", r'\g<1> did', text, flags = flags)
    
    return text

df['statement'] = [expand_contractions(statement) for statement in df['statement']]

In [12]:
# Expand abbreviations

abbr = {'u.n.': 'united nations',
        'a.m.': 'before midday',
        'n.y.': 'new york',
        'e.u.': 'european union',
        'u.s.': 'united states',
        'u.k.': 'united kingdom',
        'd.c.': 'district columbia',
        'a.k.a.': 'also known as',
        'r.i.p.': 'rest in peace',
        'n.h.': 'new hampshire',
        'r.i.': 'rhode island',
        's.c.': 'south carolina',
        
        'gop': 'the republican party',
        'usa': 'united states of america',
        'nato': 'north atlantic treaty organization',
        'epa': 'environmental protection agency',
                 
        'rep.': 'representative',
        'reps.': 'representatives',
        'dem.': 'democrat',
        'tenn.': 'tennessee',
        'capt': 'captain',
        'gov.': 'government',
        'sen.': 'senator',
        'mr.': 'mister',
        'ok': 'okay',
        'gen.': 'general',
        'jr.': 'junior',
       }

def expand_abbreviations(text):

    for key, value in abbr.items():
        if key in text:
            text = text.replace(key, value)
    
    return text

df['statement'] = [expand_abbreviations(statement) for statement in df['statement']]

In [13]:
# Remove special characters
df['statement'] = [re.sub(r"[^a-zA-Z0-9]+", ' ', statement) for statement in df['statement']]

In [14]:
# Remove numbers
df['statement'] = [re.sub(" \d+", " ", statement) for statement in df['statement']]

In [15]:
# Lowercase all texts
df['statement'] = [statement.lower() for statement in df['statement']]

In [16]:
# lemmatization
df['statement'] = [" ".join([token.lemma_ for token in nlp(statement)]) for statement in df['statement']]

In [17]:
df.head()

Unnamed: 0,label,statement
0,0,say the annie list political group support thi...
1,1,when do the decline of coal start -PRON- start...
2,1,hillary clinton agree with john mccain by vote...
3,0,health care reform legislation be likely to ma...
4,1,the economic turnaround start at the end of -P...


## Tokenize

In [18]:
output_dim = 100
maxlen = 100

In [19]:
embeddings_index = {}

f = open(os.path.join('glove.6B.100d.txt'))

for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
    
f.close()

In [20]:
# statement

statement_text = ''

for i in df['statement']:
    statement_text += i
    
max_words = len(set(statement_text.split()))

statement_texts = df['statement']

tokenizer = Tokenizer(num_words = max_words)
tokenizer.fit_on_texts(statement_texts)
statement_word_index = tokenizer.word_index

sequences = tokenizer.texts_to_sequences(statement_texts)
statement_data = pad_sequences(sequences, maxlen = maxlen)

statement_embedding_matrix = np.zeros((len(statement_word_index) + 1, output_dim))

for word, i in statement_word_index.items():
    statement_embedding_vector = embeddings_index.get(word)
    if statement_embedding_vector is not None:
        statement_embedding_matrix[i] = statement_embedding_vector

## Model parameters

In [21]:
# convolution
filters = 4
kernel_size = 4

In [23]:
# dense
units_dense = 10

In [24]:
# model
epochs = 2
batch_size = 256

## Build the model

In [25]:
# input
statement_input = keras.Input(shape = (None,), name = "statement_input")

# embedding
statement_features = layers.Embedding(len(statement_word_index) + 1, output_dim = output_dim, weights = [statement_embedding_matrix], trainable = False,name = "statement_embedding")(statement_input)

# 1d conv
statement_features = layers.Conv1D(filters, kernel_size, strides = 1, padding = 'same', activation='relu', name = "statement_convolution")(statement_features)

# self attention
statement_features = layers.MultiHeadAttention()

# maxpooling

# output

AttributeError: module 'tensorflow.keras.layers' has no attribute 'MultiHeadAttention'