# Detect claims to fact check in political debates

In this project you will implement various classifiers using both neural and feature based technqiues to detect which sentences in political debates should be fact checked.
Dataset from ClaimBuster: https://zenodo.org/record/3609356 
Evaluate your classifiers using the same metrics as http://ranger.uta.edu/~cli/pubs/2017/claimbuster-kdd17-hassan.pdf (Table 2)

Classification report from sklearn provides everything

In [23]:
# TODO:  Create advanced model(s) (suggestions are given below)
#           -- Generate more features that a model can use. For example the context around the sentence, sentiment, named entities etc.
#           -- Rule based classifier. For example, if sentence contains certain words, tags, statistics etc.
#           -- Deep learning (word embeddings, transformer models etc.)
#           -- Sub-sentence classifier. Long sentences may include several claims, so the goal is to mark the span of claim(s) within a sentence

In [24]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import svm

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
import collections
import string

from sklearn.cluster import KMeans
from sklearn.metrics import *
from sklearn.linear_model import Perceptron
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import ComplementNB
from sklearn.tree import DecisionTreeClassifier

import json
import glob
import re
import time 

# Loading and merging the data

In [25]:
file1 = pd.read_csv("data/crowdsourced.csv", encoding='utf-8')
file2 = pd.read_csv("data/groundtruth.csv", encoding='utf-8')
df = pd.concat([file1, file2])


df["date"] = df["File_id"].str.strip(to_strip=".txt")

df["date"] = pd.to_datetime(df["date"])
df.sort_values("date", inplace= True)
df["mos_before_election"] = 11 - df["date"].dt.month

df['index'] = pd.RangeIndex(len(df))
df.set_index('index', inplace=True)
df


Unnamed: 0_level_0,Sentence_id,Text,Speaker,Speaker_title,Speaker_party,File_id,Length,Line_number,Sentiment,Verdict,date,mos_before_election
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,8211,"Now, this is not standing still.",Richard M. Nixon,Vice President,REPUBLICAN,1960-09-26.txt,6,114,-0.417840,-1,1960-09-26,2
1,8515,So these are three programs which are quite mo...,John F. Kennedy,Senator,DEMOCRAT,1960-09-26.txt,9,418,0.249581,-1,1960-09-26,2
2,8514,The proposal advanced by you and by Mr. Javits...,John F. Kennedy,Senator,DEMOCRAT,1960-09-26.txt,42,417,-0.626563,1,1960-09-26,2
3,8513,It does not put a deficit on the Treasury.,John F. Kennedy,Senator,DEMOCRAT,1960-09-26.txt,9,416,-0.629486,1,1960-09-26,2
4,8512,The third is medical care for the aged which i...,John F. Kennedy,Senator,DEMOCRAT,1960-09-26.txt,22,415,0.000000,-1,1960-09-26,2
...,...,...,...,...,...,...,...,...,...,...,...,...
23528,34028,"First of all, the media is so dishonest and so...",Donald Trump,Businessman,REPUBLICAN,2016-10-19.txt,17,907,0.032300,-1,2016-10-19,1
23529,34027,What I've seen -- what I've seen is so bad.,Donald Trump,Businessman,REPUBLICAN,2016-10-19.txt,9,906,-0.669600,-1,2016-10-19,1
23530,34026,I'll look at it at the time.,Donald Trump,Businessman,REPUBLICAN,2016-10-19.txt,7,905,0.000000,-1,2016-10-19,1
23531,34039,So I talk about the corrupt media.,Donald Trump,Businessman,REPUBLICAN,2016-10-19.txt,7,918,0.000000,-1,2016-10-19,1


# Data preprocessing

In [26]:
def remove_punctuation(text):
    tokens = re.sub('[^a-zA-Z]', ' ', text).lower()
    return tokens

In [27]:
def remove_stop_words(text):
    stop_words = stopwords.words('english')
    word_list = [word for word in text.split() if word not in stop_words]
    return word_list

In [28]:
def get_word_stemm(word_list):
    """Stemmers remove morphological affixes from words, leaving only the word stem."""
    stemmer = SnowballStemmer('english')
    singles = [stemmer.stem(word) for word in word_list] 
    return singles

In [29]:
def preprocess_data(docs, stemming=False):

    text_list = [] 
    for doc in docs:  
        # 1. Remove punctuation and set as lower case
        text = remove_punctuation(doc)

        # 2. Remove stop words and extra spaces
        word_list = remove_stop_words(text)

        if stemming:
            # 3. Stemming
            word_list = get_word_stemm(word_list)

        joined_text = " ".join(word_list)
        text_list.append(joined_text)
        
    return text_list



In [30]:
data = preprocess_data(df.Text.values, stemming=True)

In [31]:
# add clean text to dataframe
df["Clean_text"] = pd.Series(data)

In [32]:
df.head()

Unnamed: 0_level_0,Sentence_id,Text,Speaker,Speaker_title,Speaker_party,File_id,Length,Line_number,Sentiment,Verdict,date,mos_before_election,Clean_text
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0,8211,"Now, this is not standing still.",Richard M. Nixon,Vice President,REPUBLICAN,1960-09-26.txt,6,114,-0.41784,-1,1960-09-26,2,stand still
1,8515,So these are three programs which are quite mo...,John F. Kennedy,Senator,DEMOCRAT,1960-09-26.txt,9,418,0.249581,-1,1960-09-26,2,three program quit moder
2,8514,The proposal advanced by you and by Mr. Javits...,John F. Kennedy,Senator,DEMOCRAT,1960-09-26.txt,42,417,-0.626563,1,1960-09-26,2,propos advanc mr javit would cost six hundr mi...
3,8513,It does not put a deficit on the Treasury.,John F. Kennedy,Senator,DEMOCRAT,1960-09-26.txt,9,416,-0.629486,1,1960-09-26,2,put deficit treasuri
4,8512,The third is medical care for the aged which i...,John F. Kennedy,Senator,DEMOCRAT,1960-09-26.txt,22,415,0.0,-1,1960-09-26,2,third medic care age tie social secur financ s...


In [33]:
mask = df["date"].dt.year < 2012

X_train = df.loc[mask, "Clean_text"].values
y_train = df.loc[mask, "Verdict"].values

X_test = df.loc[~mask, "Clean_text"].values
y_test = df.loc[~mask, "Verdict"].values

# Word Embedding using keras - NOT WORKING YET :D 

## Extract Word Embeddings from Glove

In [34]:
# embeddings_index = dict()
# f = open('glove.6B/glove.6B.100d.txt')
# for line in f:
#     values = line.split()
#     word = values[0]
#     coefs = np.asarray(values[1:], dtype='float32')
#     embeddings_index[word] = coefs
# f.close()

FileNotFoundError: [Errno 2] No such file or directory: 'glove.6B/glove.6B.100d.txt'

In [None]:
from keras.preprocessing.text import Tokenizer

embedding_matrix = np.zeros((vocabulary_size, 100))
for word, index in tokenizer.word_index.items():
    if index > vocabulary_size - 1:
        break
    else:
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[index] = embedding_vector

In [38]:
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential

In [39]:
# defining vocabulary
vocabulary = {}
sentences_len = []
for sentence in df["Clean_text"]:
    for term in sentence.split():
        vocabulary.setdefault(term, len(vocabulary))

In [40]:
# Defining vocabulary size
vocabulary_size = list(vocabulary.values())[-1] + 1

print(f"vocabulary is composed of {vocabulary_size} unique words")

vocabulary is composed of 6916 unique words


## One hot encoding representation

In [41]:
encoded_vocab = [one_hot(words, vocabulary_size) for words in df["Clean_text"].values]

In [42]:
encoded_vocab

[[2644, 5474],
 [6759, 1742, 2383, 1009],
 [4222,
  5408,
  5070,
  1975,
  3016,
  4077,
  5716,
  682,
  4028,
  3933,
  5070,
  4205,
  1164,
  5862,
  557,
  583,
  5419,
  2347,
  583,
  3134,
  4474,
  6645],
 [1287, 6507, 4435],
 [776, 6178, 170, 2730, 6215, 4474, 6645, 2347, 4474, 6645, 3627],
 [6074, 376, 2132, 329],
 [6074, 376],
 [6074, 2722, 3774, 5611, 1742],
 [4655,
  1352,
  6806,
  925,
  5572,
  2219,
  2132,
  973,
  6722,
  5911,
  3267,
  3419,
  2722,
  993,
  2276,
  3362,
  2297,
  2219,
  2132,
  973,
  2722,
  329],
 [948, 2253, 1352, 4115, 6121, 5070, 3562, 1513, 4222],
 [6822, 3910, 6323, 5982, 1352, 6822, 6763, 2326, 818],
 [6518, 2746, 1934, 4559, 1352],
 [5911, 4655, 6806, 3243, 6553, 776, 6759, 768, 2132, 2297, 1518, 993, 4222],
 [2219,
  1352,
  3933,
  1905,
  4662,
  4898,
  4450,
  87,
  5056,
  6174,
  4711,
  4028,
  3933,
  3091,
  5253],
 [6912, 1868, 1033, 1352, 4293, 2795, 3199, 4655],
 [5073,
  2795,
  6636,
  4106,
  5928,
  5073,
  935,
  227

## Padding sequences

In [43]:
# finding max sentence length

vec_lengths = []
for i in encoded_vocab:
    vec_lengths.append(len(i))


max_length = np.unique(vec_lengths)[-1]
max_length

65

In [44]:
embedded_docs=pad_sequences(encoded_vocab,padding='post',maxlen=max_length)
print(embedded_docs)

[[2644 5474    0 ...    0    0    0]
 [6759 1742 2383 ...    0    0    0]
 [4222 5408 5070 ...    0    0    0]
 ...
 [1033  912    0 ...    0    0    0]
 [3918 1048 1415 ...    0    0    0]
 [5056   22  340 ...    0    0    0]]


## Creating the model

In [45]:
from tensorflow.keras.models import Sequential

from tensorflow.keras.layers import LSTM, Dense, Bidirectional, Dropout, Flatten, GlobalMaxPool1D, Embedding

from tensorflow.keras.backend import clear_session

In [56]:
clear_session()

In [57]:
model=Sequential()
model.add(Embedding(vocabulary_size,200,input_length=max_length))
#model.add(Bidirectional(LSTM(100)))
model.add(LSTM(10, return_sequences=True))
model.add(LSTM(10))
model.add(Dropout(0.1))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam',metrics=['accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 65, 200)           1383200   
                                                                 
 lstm (LSTM)                 (None, 65, 10)            8440      
                                                                 
 lstm_1 (LSTM)               (None, 10)                840       
                                                                 
 dropout (Dropout)           (None, 10)                0         
                                                                 
 dense (Dense)               (None, 1)                 11        
                                                                 
Total params: 1,392,491
Trainable params: 1,392,491
Non-trainable params: 0
_________________________________________________________________


In [48]:
labels = df['Verdict'].copy()

In [50]:
labels = labels.apply(lambda x: 1 if x==1 else 0)

In [51]:
labels

index
0        0
1        0
2        1
3        1
4        0
        ..
23528    0
23529    0
23530    0
23531    0
23532    0
Name: Verdict, Length: 23533, dtype: int64

In [52]:
# Preparing for training the model
X_embedded = np.array(embedded_docs)
# y = labels
y = df['Verdict'].values

# check shapes
print(f"X shape: {X_embedded.shape}. y shape: {y.shape}")

X shape: (23533, 65). y shape: (23533,)


In [53]:
# Train test split
X_embedded_train = X_embedded[:sum(mask)]
y_embedded_train = y[:sum(mask)]

X_embedded_test = X_embedded[sum(mask):]
y_embedded_test = y[sum(mask):]

print('Train shape: ', (X_embedded_train.shape, y_embedded_train.shape))
print('Test shape: ', (X_embedded_test.shape, y_embedded_test.shape))


Train shape:  ((18170, 65), (18170,))
Test shape:  ((5363, 65), (5363,))


In [61]:
model.fit(X_embedded_train,y_embedded_train,validation_data=(X_embedded_test,y_embedded_test),epochs=10,batch_size=790)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x29042d32f70>

In [None]:
validation_data=(X_embedded_test,y_embedded_test)

In [62]:
model1 = Sequential()
model1.add(Embedding(input_dim=vocabulary_size, 
                           output_dim=100, 
                           input_length=max_length))
model1.add(GlobalMaxPool1D())
model1.add(Dense(100, activation='relu'))

model1.add(Dense(1, activation='sigmoid'))
model1.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])
model1.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 65, 100)           691600    
                                                                 
 global_max_pooling1d (Globa  (None, 100)              0         
 lMaxPooling1D)                                                  
                                                                 
 dense_1 (Dense)             (None, 100)               10100     
                                                                 
 dense_2 (Dense)             (None, 1)                 101       
                                                                 
Total params: 701,801
Trainable params: 701,801
Non-trainable params: 0
_________________________________________________________________


In [64]:
model1.fit(X_embedded_train,y_embedded_train,validation_data=(X_embedded_test,y_embedded_test),epochs=10,batch_size=395)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x2902fb9c190>

In [65]:
from tensorflow.keras.layers import GlobalMaxPooling1D
from tensorflow.keras.layers import Conv1D


clear_session()

In [66]:
embedding_dim = 100

model = Sequential()
model.add(Embedding(vocabulary_size, embedding_dim, input_length=max_length))
model.add(Conv1D(128, 5, activation='relu'))
model.add(GlobalMaxPooling1D())
model.add(Dense(10, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 65, 100)           691600    
                                                                 
 conv1d (Conv1D)             (None, 61, 128)           64128     
                                                                 
 global_max_pooling1d (Globa  (None, 128)              0         
 lMaxPooling1D)                                                  
                                                                 
 dense (Dense)               (None, 10)                1290      
                                                                 
 dense_1 (Dense)             (None, 1)                 11        
                                                                 
Total params: 757,029
Trainable params: 757,029
Non-trainable params: 0
__________________________________________________

In [67]:
model.fit(X_embedded_train,y_embedded_train,validation_data=(X_embedded_test,y_embedded_test),epochs=10,batch_size=46)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x2902fbb3190>