# Detect claims to fact check in political debates

In this project you will implement various classifiers, using both neural and feature-based techniques, to detect which sentences in political debates should be fact-checked.
Dataset from ClaimBuster: https://zenodo.org/record/3609356 
Evaluate your classifiers using the same metrics as http://ranger.uta.edu/~cli/pubs/2017/claimbuster-kdd17-hassan.pdf (Table 2)

The `classification_report` function from scikit-learn provides all of these metrics (per-class precision, recall and F1, plus their averages).

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.metrics import *

# Loading the data

In [2]:
# Read the preprocessed debate sentences, parse the date column, then drop
# incomplete rows and renumber what is left (the old row labels are kept in
# an `index` column, as reset_index() is called without drop=True).
df = pd.read_csv("../data_preprocessing/data.csv")
df = df.assign(date=pd.to_datetime(df["date"]))
df = df.dropna().reset_index()
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23462 entries, 0 to 23461
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   index       23462 non-null  int64         
 1   date        23462 non-null  datetime64[ns]
 2   Text        23462 non-null  object        
 3   Clean_text  23462 non-null  object        
 4   Verdict     23462 non-null  int64         
dtypes: datetime64[ns](1), int64(2), object(2)
memory usage: 916.6+ KB


# Train-test split

In [3]:
# Chronological split: sentences dated before 2012 form the training set,
# everything from 2012 onwards is held out for testing.
mask = df["date"].dt.year.lt(2012)

X_train, y_train = (df.loc[mask, column].values for column in ("Clean_text", "Verdict"))
X_test, y_test = (df.loc[~mask, column].values for column in ("Clean_text", "Verdict"))

# Data Preprocessing

In [4]:
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential

In [5]:
# Build a term -> integer id lookup from the whitespace-separated tokens of
# the training sentences; ids are handed out in order of first appearance.
vocabulary = {}
sentences_len = []
for sentence in X_train:
    for term in sentence.split():
        if term not in vocabulary:
            vocabulary[term] = len(vocabulary)

In [6]:
# Number of distinct training terms. Ids in `vocabulary` are assigned as
# 0, 1, 2, ... in insertion order, so the entry count equals "last id + 1";
# len() states that directly and also works for an empty vocabulary.
vocabulary_size = len(vocabulary)

print(f"vocabulary is composed of {vocabulary_size} unique words")

vocabulary is composed of 10205 unique words


## Integer (word-index) encoding of the sentences

Encoding train data

In [7]:
# Fit the Keras tokenizer on the training sentences only, then turn every
# training sentence into its sequence of integer word ids.
# NOTE(review): no oov_token is configured — presumably words unseen during
# fitting are dropped when other text is encoded; confirm this is intended.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)
X_train_encoded = tokenizer.texts_to_sequences(X_train)

Encoding test data

In [8]:
# Encode the test sentences with the tokenizer that was fitted on the training data.
X_test_encoded = tokenizer.texts_to_sequences(X_test)

## Padding sequences

In [9]:
# Length (in tokens) of every encoded training sentence; the longest one
# fixes the width that all sequences are padded to.
vec_lengths = [len(encoded_sentence) for encoded_sentence in X_train_encoded]

# max() finds the largest length directly, without sorting and
# de-duplicating the whole list first, and yields a plain Python int.
max_length = max(vec_lengths)


In [10]:
# Right-pad the encoded training sentences with zeros up to max_length.
X_train_embedded = pad_sequences(
    X_train_encoded,
    maxlen=max_length,
    padding='post',
)
print(X_train_embedded)

[[  783   148     0 ...     0     0     0]
 [  130   110   771 ...     0     0     0]
 [  462  2841    30 ...     0     0     0]
 ...
 [    2  6525    43 ...     0     0     0]
 [ 1245    49   566 ...     0     0     0]
 [10205   264     1 ...     0     0     0]]


### For training

In [11]:
# (number of training sentences, padded sequence length)
X_train_embedded.shape

(18118, 65)

### For testing

In [12]:
# Pad the encoded test sentences to the same width as the training matrix.
X_test_embedded = pad_sequences(
    X_test_encoded,
    maxlen=max_length,
    padding='post',
)
print(X_test_embedded.shape)

(5344, 65)


### For the labels

In [13]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the verdict labels so they can be trained against a
# categorical cross-entropy loss.
# scikit-learn 1.2 renamed the `sparse` keyword to `sparse_output` and 1.4
# removed the old spelling, so try the new name first and fall back to the
# old one on earlier releases. Either way the encoder returns dense arrays.
try:
    one_hot_encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    one_hot_encoder = OneHotEncoder(sparse=False)

# The encoder expects a 2-D column, hence the reshape.
y_encoded = one_hot_encoder.fit_transform(y_train.reshape(-1, 1))

y_encoded.shape

(18118, 3)

In [14]:
# Encode the test labels with the encoder that was fitted on the training labels.
y_encoded_test = one_hot_encoder.transform(y_test.reshape(-1,1))
y_encoded_test.shape

(5344, 3)

# Creating the model

In [115]:
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Bidirectional
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import GlobalMaxPool1D
from tensorflow.keras.layers import MaxPooling1D
from tensorflow.keras.layers import GlobalMaxPooling1D
from tensorflow.keras.layers import Conv1D
from tensorflow.keras.layers import SpatialDropout1D
from tensorflow.keras.backend import clear_session

In [116]:
# Clear the Keras backend session before any model is built.
# NOTE(review): presumably meant to reset layer-name counters and release
# graphs left over from earlier runs — confirm.
clear_session()


### Defining plotting function

In [117]:
def plot_model_history(model_history=None):
    """Plot the loss and accuracy curves recorded during a training run.

    Parameters
    ----------
    model_history : object, optional
        Object whose ``history`` attribute is a dict of per-epoch metric
        lists with the keys ``'loss'`` and ``'accuracy'`` and, when
        validation data was used, ``'val_loss'`` and ``'val_accuracy'``.
        When omitted, the ``history`` attribute of the notebook-level
        ``model`` variable is used.

    Raises
    ------
    NameError
        If ``model_history`` is omitted and no global ``model`` exists yet.
    KeyError
        If ``'loss'`` or ``'accuracy'`` is missing from the history dict.
    """
    if model_history is None:
        # Resolved at call time on purpose: a default of ``model.history`` in
        # the signature is evaluated when the function is *defined*, which
        # raises NameError on a fresh kernel because ``model`` is only
        # created in a later cell.
        model_history = model.history

    metrics = model_history.history
    panels = ((211, 'Loss', 'loss'), (212, 'Accuracy', 'accuracy'))
    for position, title, key in panels:
        plt.subplot(position)
        plt.title(title)
        plt.plot(metrics[key], label='train')
        # The val_* series come from the validation split of the training
        # data, not from the held-out test set, so they are labelled as such.
        # They are skipped when training ran without validation data.
        validation_key = 'val_' + key
        if validation_key in metrics:
            plt.plot(metrics[validation_key], label='validation')
        plt.legend()
    # Extra vertical room so the lower title does not overlap the upper axes.
    plt.subplots_adjust(hspace=0.5)
    plt.show()

## LSTM

In [118]:
# Baseline recurrent classifier: embedding -> single LSTM -> dense head
# with one softmax unit per verdict class.
model_lstm = Sequential([
    # vocabulary_size + 1 rows: presumably one extra slot for the padding
    # id 0 on top of the word ids — confirm against the tokenizer.
    Embedding(vocabulary_size + 1, 81, input_length=max_length),
    LSTM(100),
    Dense(81, activation="relu"),
    Dense(3, activation='softmax'),
])
model_lstm.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
model_lstm.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 65, 81)            826686    
                                                                 
 lstm (LSTM)                 (None, 100)               72800     
                                                                 
 dense (Dense)               (None, 81)                8181      
                                                                 
 dense_1 (Dense)             (None, 3)                 246       
                                                                 
Total params: 907,913
Trainable params: 907,913
Non-trainable params: 0
_________________________________________________________________
None


In [59]:
# Train the LSTM with 20% of the training data set aside for validation.
# NOTE(review): the saved output under this cell shows 5 epochs, so it was
# produced with a different `epochs` value than the one written here.
model_lstm.fit(
    X_train_embedded,
    y_encoded,
    epochs=40,
    validation_split=0.2,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x2a8b3669dc0>

In [None]:
# Score the LSTM on the held-out debates and print the per-class metrics.
predictions = model_lstm.predict(X_test_embedded)
# Decode the class scores back to verdict labels and flatten them to 1-D.
preds = one_hot_encoder.inverse_transform(predictions).ravel()
print(
    classification_report(
        y_test,
        preds,
        target_names=["NFS", "UFS", "CFS"],
    )
)


## Bidirectional LSTM 

In [53]:
# Bidirectional variant: the LSTM reads each sentence in both directions,
# followed by dropout and a dense head over the three verdict classes.
model_bi = Sequential([
    Embedding(vocabulary_size + 1, 97, input_length=max_length),
    Bidirectional(LSTM(100)),
    Dropout(0.5),
    Dense(97, activation="relu"),
    Dense(3, activation='softmax'),
])
model_bi.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None, so it is called
# directly instead of being wrapped in print().
model_bi.summary()

Model: "sequential_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_6 (Embedding)     (None, 65, 97)            989982    
                                                                 
 bidirectional_5 (Bidirectio  (None, 220)              183040    
 nal)                                                            
                                                                 
 dropout_1 (Dropout)         (None, 220)               0         
                                                                 
 dense_12 (Dense)            (None, 97)                21437     
                                                                 
 dense_13 (Dense)            (None, 3)                 294       
                                                                 
Total params: 1,194,753
Trainable params: 1,194,753
Non-trainable params: 0
____________________________________________

In [54]:
# Train the Bi-LSTM for 3 epochs with 20% of the training data set aside for validation.
model_bi.fit(
    X_train_embedded,
    y_encoded,
    epochs=3,
    validation_split=0.2,
)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x2a8a9a1c8e0>

In [55]:
# Score the Bi-LSTM on the held-out debates and print the per-class metrics.
predictions = model_bi.predict(X_test_embedded)
# Decode the class scores back to verdict labels and flatten them to 1-D.
preds = one_hot_encoder.inverse_transform(predictions).ravel()
print(
    classification_report(
        y_test,
        preds,
        target_names=["NFS", "UFS", "CFS"],
    )
)


              precision    recall  f1-score   support

         NFS       0.75      0.89      0.81      3296
         UFS       0.39      0.25      0.30       623
         CFS       0.64      0.46      0.54      1425

    accuracy                           0.70      5344
   macro avg       0.59      0.53      0.55      5344
weighted avg       0.68      0.70      0.68      5344



## Stacked Bi-LSTM

In [103]:
# Stacked variant: two bidirectional LSTM layers on top of a 200-dim embedding.
# NOTE(review): this rebinds `model_bi`, so the single-layer Bi-LSTM built
# earlier in the notebook is no longer reachable under that name.
model_bi = Sequential([
    Embedding(vocabulary_size + 1, 200, input_length=max_length),
    Dropout(0.2),
    # return_sequences=True so the second recurrent layer receives the
    # whole sequence rather than only the final state.
    Bidirectional(LSTM(100, return_sequences=True)),
    Bidirectional(LSTM(100)),
    Dropout(0.2),
    Dense(97, activation="relu"),
    Dense(3, activation='softmax'),
])
model_bi.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None, so it is called
# directly instead of being wrapped in print().
model_bi.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 65, 200)           2041200   
                                                                 
 dropout (Dropout)           (None, 65, 200)           0         
                                                                 
 bidirectional (Bidirectiona  (None, 65, 200)          240800    
 l)                                                              
                                                                 
 bidirectional_1 (Bidirectio  (None, 200)              240800    
 nal)                                                            
                                                                 
 dropout_1 (Dropout)         (None, 200)               0         
                                                                 
 dense (Dense)               (None, 97)                1

In [104]:
# Train the stacked Bi-LSTM for 20 epochs with 20% of the training data set aside for validation.
model_bi.fit(
    X_train_embedded,
    y_encoded,
    epochs=20,
    validation_split=0.2,
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x2a8cc1399a0>

In [101]:
# Score the stacked Bi-LSTM on the held-out debates and print the per-class metrics.
predictions = model_bi.predict(X_test_embedded)
# Decode the class scores back to verdict labels and flatten them to 1-D.
preds = one_hot_encoder.inverse_transform(predictions).ravel()
print(
    classification_report(
        y_test,
        preds,
        target_names=["NFS", "UFS", "CFS"],
    )
)


              precision    recall  f1-score   support

         NFS       0.75      0.91      0.82      3296
         UFS       0.36      0.28      0.32       623
         CFS       0.68      0.41      0.51      1425

    accuracy                           0.70      5344
   macro avg       0.60      0.53      0.55      5344
weighted avg       0.68      0.70      0.68      5344



## Convolutional Neural Network

In [85]:
from tensorflow.keras.layers import GlobalMaxPooling1D
from tensorflow.keras.layers import Conv1D
# Clear the Keras backend session before the convolutional models are built.
# NOTE(review): presumably meant to reset layer-name counters and release
# the recurrent models created above — confirm.
clear_session()


In [82]:
embedding_dim = 100

# 1-D CNN text classifier: embedding -> convolution over 10-token windows ->
# global max pooling -> dense head over the three verdict classes.
# NOTE(review): the saved summary under this cell lists 300 filters and a
# 64-unit dense layer, so it was produced with other hyper-parameters.
model = Sequential([
    Embedding(vocabulary_size + 1, embedding_dim, input_length=max_length),
    Conv1D(128, 10, activation='relu'),
    GlobalMaxPooling1D(),
    Dense(32, activation='relu'),
    Dense(3, activation='softmax'),
])
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 65, 100)           1020600   
                                                                 
 conv1d_1 (Conv1D)           (None, 56, 300)           300300    
                                                                 
 global_max_pooling1d_1 (Glo  (None, 300)              0         
 balMaxPooling1D)                                                
                                                                 
 dense_2 (Dense)             (None, 64)                19264     
                                                                 
 dense_3 (Dense)             (None, 3)                 195       
                                                                 
Total params: 1,340,359
Trainable params: 1,340,359
Non-trainable params: 0
____________________________________________

In [83]:
# Train the CNN for 5 epochs with 20% of the training data set aside for validation.
model.fit(
    X_train_embedded,
    y_encoded,
    epochs=5,
    validation_split=0.2,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x2a8b7e95220>

In [84]:
# Score the CNN on the held-out debates and print the per-class metrics.
predictions = model.predict(X_test_embedded)
# Decode the class scores back to verdict labels and flatten them to 1-D.
preds = one_hot_encoder.inverse_transform(predictions).ravel()
print(
    classification_report(
        y_test,
        preds,
        target_names=["NFS", "UFS", "CFS"],
    )
)

              precision    recall  f1-score   support

         NFS       0.75      0.83      0.79      3296
         UFS       0.35      0.16      0.22       623
         CFS       0.53      0.53      0.53      1425

    accuracy                           0.67      5344
   macro avg       0.54      0.51      0.51      5344
weighted avg       0.65      0.67      0.65      5344



## Convolutional Neural network + LSTM

In [86]:
# Hybrid model: a convolution + max-pooling front end shortens the sequence
# before an LSTM summarises it for the dense classification head.
model_conv = Sequential([
    Embedding(vocabulary_size + 1, 100, input_length=max_length),
    Dropout(0.2),
    Conv1D(100, 8, activation='relu'),
    MaxPooling1D(pool_size=10),
    LSTM(100),
    Dense(32, activation="relu"),
    Dense(3, activation='softmax'),
])
model_conv.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [87]:
# Print the layer-by-layer overview of the CNN + LSTM model.
model_conv.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 65, 100)           1020600   
                                                                 
 dropout (Dropout)           (None, 65, 100)           0         
                                                                 
 conv1d (Conv1D)             (None, 58, 100)           80100     
                                                                 
 max_pooling1d (MaxPooling1D  (None, 5, 100)           0         
 )                                                               
                                                                 
 lstm (LSTM)                 (None, 100)               80400     
                                                                 
 dense (Dense)               (None, 32)                3232      
                                                        

In [93]:
# Train the CNN + LSTM model for 5 epochs.
# NOTE(review): 40% of the training data is held out for validation here,
# whereas the other models in this notebook use 20% — confirm this is intended.
model_conv.fit(
    X_train_embedded,
    y_encoded,
    epochs=5,
    validation_split=0.4,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x2a881fbceb0>

In [92]:
# Score the CNN + LSTM model on the held-out debates and print the per-class metrics.
predictions = model_conv.predict(X_test_embedded)
# Decode the class scores back to verdict labels and flatten them to 1-D.
preds = one_hot_encoder.inverse_transform(predictions).ravel()
print(
    classification_report(
        y_test,
        preds,
        target_names=["NFS", "UFS", "CFS"],
    )
)


              precision    recall  f1-score   support

         NFS       0.74      0.89      0.81      3296
         UFS       0.26      0.24      0.25       623
         CFS       0.66      0.36      0.46      1425

    accuracy                           0.67      5344
   macro avg       0.55      0.50      0.51      5344
weighted avg       0.66      0.67      0.65      5344

