# Detect claims to fact check in political debates

In this project you will implement various classifiers, using both neural and feature-based techniques, to detect which sentences in political debates should be fact-checked.
Dataset from ClaimBuster: https://zenodo.org/record/3609356 
Evaluate your classifiers using the same metrics as http://ranger.uta.edu/~cli/pubs/2017/claimbuster-kdd17-hassan.pdf (Table 2)

The `classification_report` function from sklearn provides all of these metrics.

In [1]:
# TODO:  Create advanced model(s) (suggestions are given below)
#           -- Generate more features that a model can use. For example the context around the sentence, sentiment, named entities etc.
#           -- Rule based classifier. For example, if sentence contains certain words, tags, statistics etc.
#           -- Deep learning (word embeddings, transformer models etc.)
#           -- Sub-sentence classifier. Long sentences may include several claims, so the goal is to mark the span of claim(s) within a sentence

In [2]:
import collections
import glob
import json
import re
import string
from tracemalloc import stop  # NOTE(review): appears unused; possibly an accidental auto-import

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer

from sklearn import svm
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import adjusted_rand_score
from sklearn.metrics import classification_report

# Loading and merging the data

In [3]:
# Read both annotation files and stack them into a single frame.
file1 = pd.read_csv("data/crowdsourced.csv", encoding='utf-8')
file2 = pd.read_csv("data/groundtruth.csv", encoding='utf-8')
df = pd.concat([file1, file2])

# File_id looks like "1960-09-26.txt": drop the extension to obtain the debate date.
# A suffix-anchored replace is used instead of str.strip(".txt"), because strip
# removes any of the characters '.', 't', 'x' from both ends, not the literal suffix.
df["date"] = pd.to_datetime(df["File_id"].str.replace(r"\.txt$", "", regex=True))

# Order the sentences chronologically and give them a fresh 0..n-1 index named
# "index" (later cells address rows by this positional label).
df = df.sort_values("date").reset_index(drop=True).rename_axis("index")

# Months left until November (month 11) at the time of the debate.
df["mos_before_election"] = 11 - df["date"].dt.month
df


Unnamed: 0_level_0,Sentence_id,Text,Speaker,Speaker_title,Speaker_party,File_id,Length,Line_number,Sentiment,Verdict,date,mos_before_election
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,8211,"Now, this is not standing still.",Richard M. Nixon,Vice President,REPUBLICAN,1960-09-26.txt,6,114,-0.417840,-1,1960-09-26,2
1,8515,So these are three programs which are quite mo...,John F. Kennedy,Senator,DEMOCRAT,1960-09-26.txt,9,418,0.249581,-1,1960-09-26,2
2,8514,The proposal advanced by you and by Mr. Javits...,John F. Kennedy,Senator,DEMOCRAT,1960-09-26.txt,42,417,-0.626563,1,1960-09-26,2
3,8513,It does not put a deficit on the Treasury.,John F. Kennedy,Senator,DEMOCRAT,1960-09-26.txt,9,416,-0.629486,1,1960-09-26,2
4,8512,The third is medical care for the aged which i...,John F. Kennedy,Senator,DEMOCRAT,1960-09-26.txt,22,415,0.000000,-1,1960-09-26,2
...,...,...,...,...,...,...,...,...,...,...,...,...
23528,34028,"First of all, the media is so dishonest and so...",Donald Trump,Businessman,REPUBLICAN,2016-10-19.txt,17,907,0.032300,-1,2016-10-19,1
23529,34027,What I've seen -- what I've seen is so bad.,Donald Trump,Businessman,REPUBLICAN,2016-10-19.txt,9,906,-0.669600,-1,2016-10-19,1
23530,34026,I'll look at it at the time.,Donald Trump,Businessman,REPUBLICAN,2016-10-19.txt,7,905,0.000000,-1,2016-10-19,1
23531,34039,So I talk about the corrupt media.,Donald Trump,Businessman,REPUBLICAN,2016-10-19.txt,7,918,0.000000,-1,2016-10-19,1


# Data preprocessing

In [4]:
def remove_punctuation(text):
    """Blank out everything that is not an ASCII letter and lower-case the rest.

    Each character outside ``a-z`` / ``A-Z`` (punctuation, digits, whitespace,
    non-ASCII letters) is replaced by a single space, so the result has the
    same length as ``text``.

    Args:
        text: The raw sentence.

    Returns:
        The lower-cased string containing only ASCII letters and spaces.
    """
    letters_only = re.sub('[^a-zA-Z]', ' ', text)
    return letters_only.lower()

In [5]:
# Cache of stop-word sets keyed by language, so the NLTK corpus is read only once
# instead of on every call (preprocessing calls this function once per sentence).
_STOP_WORD_CACHE = {}


def remove_stop_words(text, stop_words=None):
    """Split ``text`` on whitespace and drop every stop word.

    Args:
        text: Already lower-cased text (the NLTK stop words are lower case).
        stop_words: Optional iterable of words to remove. When omitted, the
            NLTK English stop-word list is used.

    Returns:
        List of the remaining words, in their original order.
    """
    if stop_words is None:
        if 'english' not in _STOP_WORD_CACHE:
            _STOP_WORD_CACHE['english'] = frozenset(stopwords.words('english'))
        blocked = _STOP_WORD_CACHE['english']
    else:
        # Set membership is O(1); the original tested against a list per word.
        blocked = frozenset(stop_words)
    return [word for word in text.split() if word not in blocked]

In [6]:
def get_word_stemm(word_list):
    """Reduce every word to its stem with the English Snowball stemmer.

    Args:
        word_list: Iterable of words.

    Returns:
        List with the stem of each word, in the same order as the input.
    """
    english_stemmer = SnowballStemmer('english')
    return list(map(english_stemmer.stem, word_list))

In [7]:
def preprocess_data(docs, stem=False):
    """Clean raw sentences for vectorisation.

    For every document: replace non-letters with spaces and lower-case it,
    drop English stop words (which also collapses extra whitespace), and
    optionally stem the remaining words.

    Args:
        docs: Iterable of raw sentence strings.
        stem: When True, additionally stem every word with
            ``get_word_stemm``. Defaults to False, matching the previous
            behaviour where stemming was commented out.

    Returns:
        List of cleaned sentences (space-joined words), one per input document.
    """
    text_list = []
    for doc in docs:
        # 1. Remove punctuation and set as lower case
        text = remove_punctuation(doc)

        # 2. Remove stop words and extra spaces
        word_list = remove_stop_words(text)

        # 3. Optional stemming
        if stem:
            word_list = get_word_stemm(word_list)

        text_list.append(" ".join(word_list))

    return text_list



In [8]:
data = preprocess_data(df.Text.values)

# Keep the cleaned text on the frame as well: the train/test split and the
# Keras encoding cells further down read it from df["Text_clean"].
df["Text_clean"] = data

# TF-idf

In [19]:
# TF-IDF over uni-, bi- and tri-grams of the cleaned sentences; terms that occur
# in more than 80% of the documents (max_df=0.8) are ignored.
vectorizer = TfidfVectorizer(max_df=0.8, ngram_range=(1,3), stop_words='english')
vectors = vectorizer.fit_transform(data)

# Dense conversion left disabled — presumably because materialising the full
# document-term matrix is expensive (TODO confirm).
#dense = vectors.todense()
#denselist = dense.tolist()

# Column index -> n-gram lookup for the TF-IDF matrix.
feature_names = vectorizer.get_feature_names_out()



In [20]:
feature_names

array(['aah', 'aarp', 'aarp said', ..., 'zones pass dang', 'zones said',
       'zones said days'], dtype=object)

In [21]:
print(feature_names)

['aah' 'aarp' 'aarp said' ... 'zones pass dang' 'zones said'
 'zones said days']


In [22]:
feature_names[:20]

array(['aah', 'aarp', 'aarp said', 'aarp said plan', 'aarp thinks',
       'aarp thinks savings', 'aayuh', 'aayuh chairman',
       'aayuh chairman joint', 'abandon', 'abandon nuclear',
       'abandon nuclear ambitions', 'abandon peace',
       'abandon peace process', 'abandon quest', 'abandon quest nuclear',
       'abandon responsibilities', 'abandon trickle',
       'abandon trickle economics', 'abandoned'], dtype=object)

In [None]:
feature_names

In [None]:
# For every sentence, collect the n-grams that have a positive TF-IDF weight.
# The weights are read straight from the sparse matrix: `denselist` is only
# produced by commented-out code, and a dense copy is not needed for this.
sorted_vectors = vectors.tocsr().sorted_indices()  # column indices ascending per row

all_keywords = []
for row in range(sorted_vectors.shape[0]):
    start, end = sorted_vectors.indptr[row], sorted_vectors.indptr[row + 1]
    columns = sorted_vectors.indices[start:end]
    weights = sorted_vectors.data[start:end]
    all_keywords.append(
        [feature_names[col] for col, weight in zip(columns, weights) if weight > 0]
    )

In [None]:
all_keywords

In [None]:
# Number of clusters; the plotting cell indexes `colors` by cluster id, so this
# must not exceed len(colors).
true_k = 25

# random_state pins the k-means++ initialisation so the clusters (and the file
# written below) are reproducible across kernel restarts.
model = KMeans(n_clusters=true_k, init="k-means++", max_iter=100, n_init=1, random_state=42)

model.fit(vectors)

# For every cluster, feature indices ordered by descending centroid weight.
order_centroids = model.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names_out()

# Dump the ten highest-weighted terms of each cluster to a text file.
with open("data/trc_results.txt", "w", encoding="utf-8") as f:
    for i in range(true_k):
        f.write(f"Cluster {i}\n")
        for ind in order_centroids[i, :10]:
            f.write(f" {terms[ind]}\n")
        f.write("\n\n")



In [None]:
# Reuse the assignments of the already fitted model. Calling fit_predict here
# would re-fit KMeans, so the plotted labels could disagree with the clusters
# that were written to data/trc_results.txt.
indices_kmeans = model.labels_

# Project the TF-IDF vectors onto two principal components for plotting.
# NOTE(review): toarray() densifies the whole document-term matrix, which may
# need a lot of memory with a 1-3-gram vocabulary; a sparse-friendly
# decomposition (e.g. TruncatedSVD) would avoid that — confirm before switching.
pca = PCA(n_components=2, random_state=42)
scatter_plot_points = pca.fit_transform(vectors.toarray())



In [None]:
# One hex colour per KMeans cluster (25 entries); the scatter plot looks a
# point's colour up by its cluster id, so there must be at least `true_k` colours.
colors = [  '#4E6888',
            '#3515D5',
            '#CF8ED0',
            '#9075DC',
            '#10E664',
            '#0A717A',
            '#00277C',
            '#78862F',
            '#4D641A',
            '#E204BA',
            '#30601F',
            '#A14003',
            '#910B50',
            '#8F7175',
            '#0D055D',
            '#D2D5F9',
            '#C2501F',
            '#4B457E',
            '#4BD0EF',
            '#EA9A5B',
            '#E7FA3E',
            '#DE57EF',
            '#5C2DF0',
            '#2DBC02',
            '#02C101' ]

In [None]:
def _initials(name):
    """Return the upper-cased first letters of the whitespace-separated words in ``name``."""
    return "".join(part[0] for part in name.split()).upper()


# Derive short labels in one pass per column instead of a row-by-row .loc loop:
# "John F. Kennedy" -> "JFK", "REPUBLICAN" -> "R".
df["Speaker_initials"] = df["Speaker"].map(_initials)
df["party"] = df["Speaker_party"].map(_initials)
df

In [None]:
# Split the 2-D projection into its two coordinates.
x_axis = scatter_plot_points[:, 0]
y_axis = scatter_plot_points[:, 1]

fig, ax = plt.subplots(figsize = (50, 50))

# Colour every sentence by the cluster it was assigned to.
ax.scatter(x_axis, y_axis, c= [colors[i] for i in indices_kmeans])
ax.set(
    title="KMeans clusters of TF-IDF sentence vectors (2-D PCA projection)",
    xlabel="Principal component 1",
    ylabel="Principal component 2",
)

# Optional: label every point with the speaker's initials (left disabled).
# for i, name in enumerate(df.Speaker_initials):
#     ax.annotate(name, (x_axis[i], y_axis[i]))

fig.savefig('trc.png')


# Train test split

In [None]:
# Chronological split: debates held before 2012 are used for training,
# debates from 2012 onwards are held out for testing.
mask = df["date"].dt.year < 2012
train_rows = df.loc[mask]
test_rows = df.loc[~mask]

x_train, y_train = train_rows["Text_clean"].values, train_rows["Verdict"].values
x_test, y_test = test_rows["Text_clean"].values, test_rows["Verdict"].values

In [None]:
x_train

In [None]:
# Learn the vocabulary and IDF weights on the training sentences only.
vectorizer = TfidfVectorizer(max_features= 1000)
x_train = vectorizer.fit_transform(x_train)
# The test set must be projected with the vocabulary fitted on the training
# set. Calling fit_transform here would learn a different vocabulary, so the
# test columns would no longer mean the same thing as the training columns.
x_test = vectorizer.transform(x_test)


# Baseline models

1. SVM

In [None]:
# Linear-kernel SVM baseline on the TF-IDF features.
clf = svm.SVC(kernel='linear')
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)

# Build the report once and keep it for later comparison with other models.
# Labels are passed explicitly so each name is tied to its class value
# (assumes Verdict uses -1 / 0 / 1 for NFS / UFS / CFS — TODO confirm).
comparison_svm = classification_report(
    y_test, y_pred, labels=[-1, 0, 1], target_names=["NFS", "UFS", "CFS"]
)
print(comparison_svm)


In [None]:
# Random-forest baseline on the same TF-IDF features. random_state makes the
# bootstrap samples and feature sub-sampling reproducible across runs.
clf = RandomForestClassifier(min_samples_split=7, random_state=42)
clf.fit(x_train, y_train)
y_pred_rf = clf.predict(x_test)

In [None]:
# Per-class report for the random-forest predictions, kept in `comparison_rf`
# so it can be compared with the SVM report (`comparison_svm`).
comparison_rf = classification_report(y_test, y_pred_rf, target_names= ["NFS", "UFS", "CFS"])
print(comparison_rf)

# Word Embedding using keras 

In [None]:
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential

In [None]:
# Defining vocabulary size: the number of distinct words in the cleaned corpus.
# `unique_word_dict` is not defined in any earlier cell, so the size is derived
# directly from the preprocessed sentences in `data`.
vocabulary_size = len({word for sentence in data for word in sentence.split()})

## One hot encoding representation

In [None]:
# Encode every cleaned sentence as a list of integer word indices.
# NOTE(review): presumably `one_hot` hashes words into indices below
# vocabulary_size, so distinct words may collide — confirm against the Keras docs.
encoded_vocab = [one_hot(words, vocabulary_size) for words in df["Text_clean"].values]

## Padding sequences

In [None]:
# finding max sentence length (in tokens) over all encoded sentences

vec_lengths = [len(encoded_sentence) for encoded_sentence in encoded_vocab]

# max() states the intent directly; sorting and de-duplicating every length
# with np.unique just to read the last element is unnecessary.
max_length = max(vec_lengths)
max_length

In [None]:
# Pad every encoded sentence at the front (padding='pre') up to max_length so
# that all rows have the same length.
embedded_docs=pad_sequences(encoded_vocab,padding='pre',maxlen=max_length)
print(embedded_docs)

In [None]:
# Minimal Keras model consisting of a single Embedding layer with output size 30.
# NOTE: this rebinds `model`, which earlier in the notebook held the fitted
# KMeans estimator; re-running the clustering cells after this one will
# operate on the Keras model instead.
model=Sequential()
model.add(Embedding(vocabulary_size,30,input_length=max_length))
model.compile('adam','mse')

In [None]:
model.summary()