# Detect claims to fact check in political debates

In this project you will implement various classifiers, using both neural and feature-based techniques, to detect which sentences in political debates should be fact-checked.
Dataset from ClaimBuster: https://zenodo.org/record/3609356 
Evaluate your classifiers using the same metrics as http://ranger.uta.edu/~cli/pubs/2017/claimbuster-kdd17-hassan.pdf (Table 2)

The classification report from sklearn (`sklearn.metrics.classification_report`) provides all of these metrics.

In [1]:
# TODO:  Create advanced model(s) (suggestions are given below)
#           -- Generate more features that a model can use. For example the context around the sentence, sentiment, named entities etc.
#           -- Rule based classifier. For example, if sentence contains certain words, tags, statistics etc.
#           -- Deep learning (word embeddings, transformer models etc.)
#           -- Sub-sentence classifier. Long sentences may include several claims, so the goal is to mark the span of claim(s) within a sentence

In [2]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import svm

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
import collections
import string

from sklearn.cluster import KMeans
from sklearn.metrics import *
from sklearn.linear_model import Perceptron
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import ComplementNB
from sklearn.tree import DecisionTreeClassifier

import json
import glob
import re
import time 

# Loading and merging the data

In [3]:
# Load both CSV files and stack them into a single frame.
file1 = pd.read_csv("data/crowdsourced.csv", encoding='utf-8')
file2 = pd.read_csv("data/groundtruth.csv", encoding='utf-8')
df = pd.concat([file1, file2])

# File_id values look like "1960-09-26.txt". Remove the literal ".txt" suffix:
# str.strip(".txt") treats its argument as a *set of characters* ('.', 't', 'x')
# to strip from both ends, which only works by accident on these names.
df["date"] = pd.to_datetime(df["File_id"].str.replace(r"\.txt$", "", regex=True))

# Order the sentences chronologically by debate date.
df = df.sort_values("date")

# Months left until November (month 11) in the year of the debate.
df["mos_before_election"] = 11 - df["date"].dt.month

# concat + sort leave duplicated, shuffled labels; replace them with a clean
# 0..n-1 index named "index".
df.index = pd.RangeIndex(len(df), name="index")
df


Unnamed: 0_level_0,Sentence_id,Text,Speaker,Speaker_title,Speaker_party,File_id,Length,Line_number,Sentiment,Verdict,date,mos_before_election
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,8211,"Now, this is not standing still.",Richard M. Nixon,Vice President,REPUBLICAN,1960-09-26.txt,6,114,-0.417840,-1,1960-09-26,2
1,8515,So these are three programs which are quite mo...,John F. Kennedy,Senator,DEMOCRAT,1960-09-26.txt,9,418,0.249581,-1,1960-09-26,2
2,8514,The proposal advanced by you and by Mr. Javits...,John F. Kennedy,Senator,DEMOCRAT,1960-09-26.txt,42,417,-0.626563,1,1960-09-26,2
3,8513,It does not put a deficit on the Treasury.,John F. Kennedy,Senator,DEMOCRAT,1960-09-26.txt,9,416,-0.629486,1,1960-09-26,2
4,8512,The third is medical care for the aged which i...,John F. Kennedy,Senator,DEMOCRAT,1960-09-26.txt,22,415,0.000000,-1,1960-09-26,2
...,...,...,...,...,...,...,...,...,...,...,...,...
23528,34028,"First of all, the media is so dishonest and so...",Donald Trump,Businessman,REPUBLICAN,2016-10-19.txt,17,907,0.032300,-1,2016-10-19,1
23529,34027,What I've seen -- what I've seen is so bad.,Donald Trump,Businessman,REPUBLICAN,2016-10-19.txt,9,906,-0.669600,-1,2016-10-19,1
23530,34026,I'll look at it at the time.,Donald Trump,Businessman,REPUBLICAN,2016-10-19.txt,7,905,0.000000,-1,2016-10-19,1
23531,34039,So I talk about the corrupt media.,Donald Trump,Businessman,REPUBLICAN,2016-10-19.txt,7,918,0.000000,-1,2016-10-19,1


# Data preprocessing

In [4]:
def remove_punctuation(text):
    """Return `text` lower-cased, with every character that is not an ASCII letter
    replaced by a single space.

    Digits, punctuation and whitespace are each replaced one-for-one, so runs of
    such characters become runs of spaces (they are not collapsed here).
    """
    letters_only = re.sub('[^a-zA-Z]', ' ', text)
    return letters_only.lower()

In [5]:
def remove_stop_words(text):
    """Split `text` on whitespace and drop the NLTK English stop words.

    Parameters
    ----------
    text : str
        Text to filter. The comparison is case-sensitive, so the text should
        already be in the same case as the stop-word list.

    Returns
    -------
    list of str
        The remaining words, in their original order.
    """
    # stopwords.words() returns a list; a set makes each membership test a hash
    # lookup instead of a linear scan over the whole stop-word list.
    stop_words = frozenset(stopwords.words('english'))
    return [word for word in text.split() if word not in stop_words]

In [6]:
def get_word_stemm(word_list):
    """Return the stem of every word in `word_list`, preserving order.

    A fresh English `SnowballStemmer` is created on each call and its `stem`
    method is applied to each word in turn.
    """
    stem = SnowballStemmer('english').stem
    return list(map(stem, word_list))

In [7]:
def preprocess_data(docs):
    """Clean every document in `docs` and return the cleaned strings as a list.

    Each document is passed through `remove_punctuation` (non-letters to spaces,
    lower-cased) and then `remove_stop_words`; the surviving words are re-joined
    with single spaces. Stemming is not applied.
    """
    return [
        " ".join(remove_stop_words(remove_punctuation(doc)))
        for doc in docs
    ]



In [8]:
# Clean the raw sentence text; `data` holds one cleaned string per row of df.
data = preprocess_data(df["Text"].values)

In [9]:
# add clean text to dataframe
# Assign the list directly so values are matched to rows by position. Wrapping it
# in pd.Series first aligns on index labels, which silently yields NaN for any
# row whose label is not in 0..len(data)-1.
df["Clean_text"] = data

In [10]:
# Preview the first five rows, now including the Clean_text column.
df.head(n=5)

Unnamed: 0_level_0,Sentence_id,Text,Speaker,Speaker_title,Speaker_party,File_id,Length,Line_number,Sentiment,Verdict,date,mos_before_election,Clean_text
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0,8211,"Now, this is not standing still.",Richard M. Nixon,Vice President,REPUBLICAN,1960-09-26.txt,6,114,-0.41784,-1,1960-09-26,2,standing still
1,8515,So these are three programs which are quite mo...,John F. Kennedy,Senator,DEMOCRAT,1960-09-26.txt,9,418,0.249581,-1,1960-09-26,2,three programs quite moderate
2,8514,The proposal advanced by you and by Mr. Javits...,John F. Kennedy,Senator,DEMOCRAT,1960-09-26.txt,42,417,-0.626563,1,1960-09-26,2,proposal advanced mr javits would cost six hun...
3,8513,It does not put a deficit on the Treasury.,John F. Kennedy,Senator,DEMOCRAT,1960-09-26.txt,9,416,-0.629486,1,1960-09-26,2,put deficit treasury
4,8512,The third is medical care for the aged which i...,John F. Kennedy,Senator,DEMOCRAT,1960-09-26.txt,22,415,0.0,-1,1960-09-26,2,third medical care aged tied social security f...


# Word Embedding using keras - NOT WORKING YET :D 

In [None]:
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential

In [None]:
# defining vocabulary
# Each distinct term is mapped to the order in which it was first seen
# (0 for the first new term, 1 for the second, ...).
vocabulary = {}
sentences_len = []
for sentence in df["Clean_text"]:
    for term in sentence.split():
        if term not in vocabulary:
            vocabulary[term] = len(vocabulary)

In [None]:
# Defining vocabulary size
# Terms are numbered 0, 1, 2, ... in insertion order, so the number of entries is
# the vocabulary size. len() avoids copying every value into a list just to read
# the last one, and returns 0 instead of raising IndexError on an empty vocabulary.
vocabulary_size = len(vocabulary)

print(f"vocabulary is composed of {vocabulary_size} unique words")

## One hot encoding representation

In [None]:
# Encode every cleaned sentence as a list of integer word ids.
# NOTE(review): Keras documents one_hot as a hashing-based encoder, so distinct
# words may share an id and the `vocabulary` dict built earlier is not used here
# - confirm this is intended.
encoded_vocab = [
    one_hot(clean_sentence, vocabulary_size)
    for clean_sentence in df["Clean_text"].values
]

In [None]:
# Show only the first few encoded sentences; echoing the whole list would put
# one entry per sentence of the corpus into the cell output.
encoded_vocab[:5]

## Padding sequences

In [None]:
# finding max sentence length
# Number of word ids in each encoded sentence.
vec_lengths = [len(encoded_sentence) for encoded_sentence in encoded_vocab]

# Take the maximum directly; np.unique(...)[-1] sorts and de-duplicates the whole
# list only to read its largest element.
max_length = np.max(vec_lengths)
max_length

In [None]:
# Bring every encoded sentence to max_length, padding at the front of the
# sequence (padding='pre').
embedded_docs = pad_sequences(
    encoded_vocab,
    maxlen=max_length,
    padding='pre',
)
print(embedded_docs)

## Creating the model

In [None]:
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Bidirectional
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import GlobalMaxPool1D

In [None]:
# model=Sequential()
# model.add(Embedding(vocabulary_size,50,input_length=max_length))
# model.add(Bidirectional(LSTM(100)))
# model.add(Dropout(0.3))
# model.add(Dense(1, activation='sigmoid'))
# model.compile(loss='binary_crossentropy', optimizer='adam',metrics=['accuracy'])
# model.summary()

In [None]:
# Work on an independent copy of the Verdict column so df itself is not modified.
labels = df.loc[:, 'Verdict'].copy()

In [None]:
# Collapse Verdict into a binary target: 1 stays 1, every other value becomes 0.
# Series.apply returns a new Series, so the result must be assigned back -
# otherwise `labels` keeps its original values (including -1), which are not
# valid targets for the sigmoid / binary_crossentropy models below.
labels = labels.apply(lambda x: 1 if x == 1 else 0)
labels

In [None]:
# Preparing for training the model
# X is the padded id matrix as a NumPy array; y is the label Series.
X_embedded, y = np.array(embedded_docs), labels

# check shapes
print(f"X shape: {X_embedded.shape}. y shape: {y.shape}")

In [None]:
# Train test split
# The first sum(mask) rows form the training set and the remaining rows the test set.
# NOTE(review): `mask` is not defined in any cell above this one - confirm it is
# created before this cell runs, otherwise a fresh Restart & Run All stops here
# with a NameError.
n_train = sum(mask)

X_embedded_train, X_embedded_test = X_embedded[:n_train], X_embedded[n_train:]
y_embedded_train, y_embedded_test = y[:n_train], y[n_train:]

print('Train shape: ', (X_embedded_train.shape, y_embedded_train.shape))
print('Test shape: ', (X_embedded_test.shape, y_embedded_test.shape))


In [None]:
# model.fit(X_embedded_train,y_embedded_train,validation_data=(X_embedded_test,y_embedded_test),epochs=15,batch_size=115)

In [None]:
# Embedding -> global max pooling -> small dense head with a sigmoid output.
model1 = Sequential([
    Embedding(input_dim=vocabulary_size,
              output_dim=30,
              input_length=max_length),
    # model.add(Flatten())
    GlobalMaxPool1D(),
    Dense(10, activation='relu'),
    Dense(1, activation='sigmoid'),
])
model1.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
model1.summary()

In [None]:
# Train model1; the test split is passed as validation data for each epoch.
model1.fit(
    X_embedded_train,
    y_embedded_train,
    validation_data=(X_embedded_test, y_embedded_test),
    epochs=20,
    batch_size=46,
)

In [None]:
from tensorflow.keras.layers import GlobalMaxPooling1D
from tensorflow.keras.layers import Conv1D

In [None]:
# Embedding -> 1D convolution (128 filters, kernel size 5) -> global max pooling
# -> small dense head with a sigmoid output.
embedding_dim = 100

model = Sequential([
    Embedding(vocabulary_size, embedding_dim, input_length=max_length),
    Conv1D(128, 5, activation='relu'),
    GlobalMaxPooling1D(),
    Dense(10, activation='relu'),
    Dense(1, activation='sigmoid'),
])
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
model.summary()

In [None]:
# Train the convolutional model; the test split is passed as validation data.
model.fit(
    X_embedded_train,
    y_embedded_train,
    validation_data=(X_embedded_test, y_embedded_test),
    epochs=20,
    batch_size=46,
)