# Detect claims to fact check in political debates

In this project you will implement various classifiers, using both neural and feature-based techniques, to detect which sentences in political debates should be fact-checked.
Dataset from ClaimBuster: https://zenodo.org/record/3609356 
Evaluate your classifiers using the same metrics as http://ranger.uta.edu/~cli/pubs/2017/claimbuster-kdd17-hassan.pdf (Table 2)

The `classification_report` function from scikit-learn provides all of these metrics.

In [None]:
# TODO:  Create advanced model(s) (suggestions are given below)
#           -- Generate more features that a model can use. For example the context around the sentence, sentiment, named entities etc.
#           -- Rule based classifier. For example, if sentence contains certain words, tags, statistics etc.
#           -- Deep learning (word embeddings, transformer models etc.)
#           -- Sub-sentence classifier. Long sentences may include several claims, so the goal is to mark the span of claim(s) within a sentence

In [1]:
from tracemalloc import stop
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn import svm

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
import collections
import string

from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score
import json
import glob
import re

In [5]:
# Load the two annotation files and stack them into a single frame.
file1 = pd.read_csv("data/crowdsourced.csv", encoding='utf-8')
file2 = pd.read_csv("data/groundtruth.csv", encoding='utf-8')
df = pd.concat([file1, file2], ignore_index=True)

# File_id looks like "1960-09-26.txt". Remove the extension with an anchored
# pattern: str.strip(".txt") treats its argument as a *set of characters* to
# trim from both ends, which only worked because the names start and end with
# digits.
df["date"] = pd.to_datetime(df["File_id"].str.replace(r"\.txt$", "", regex=True))

# Chronological order, then months left until November (month 11) of that year.
df = df.sort_values("date")
df["mos_before_election"] = 11 - df["date"].dt.month

# Fresh 0..n-1 row labels (named "index") so label and position agree for the
# cells that follow.
df = df.reset_index(drop=True).rename_axis("index")
df


Unnamed: 0_level_0,Sentence_id,Text,Speaker,Speaker_title,Speaker_party,File_id,Length,Line_number,Sentiment,Verdict,date,mos_before_election
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,8211,"Now, this is not standing still.",Richard M. Nixon,Vice President,REPUBLICAN,1960-09-26.txt,6,114,-0.417840,-1,1960-09-26,2
1,8515,So these are three programs which are quite mo...,John F. Kennedy,Senator,DEMOCRAT,1960-09-26.txt,9,418,0.249581,-1,1960-09-26,2
2,8514,The proposal advanced by you and by Mr. Javits...,John F. Kennedy,Senator,DEMOCRAT,1960-09-26.txt,42,417,-0.626563,1,1960-09-26,2
3,8513,It does not put a deficit on the Treasury.,John F. Kennedy,Senator,DEMOCRAT,1960-09-26.txt,9,416,-0.629486,1,1960-09-26,2
4,8512,The third is medical care for the aged which i...,John F. Kennedy,Senator,DEMOCRAT,1960-09-26.txt,22,415,0.000000,-1,1960-09-26,2
...,...,...,...,...,...,...,...,...,...,...,...,...
23528,34028,"First of all, the media is so dishonest and so...",Donald Trump,Businessman,REPUBLICAN,2016-10-19.txt,17,907,0.032300,-1,2016-10-19,1
23529,34027,What I've seen -- what I've seen is so bad.,Donald Trump,Businessman,REPUBLICAN,2016-10-19.txt,9,906,-0.669600,-1,2016-10-19,1
23530,34026,I'll look at it at the time.,Donald Trump,Businessman,REPUBLICAN,2016-10-19.txt,7,905,0.000000,-1,2016-10-19,1
23531,34039,So I talk about the corrupt media.,Donald Trump,Businessman,REPUBLICAN,2016-10-19.txt,7,918,0.000000,-1,2016-10-19,1


# Data cleaning

In [73]:
# Text cleaning, first pass: drop stop words from the lower-cased whitespace
# tokens, then delete punctuation, digits and typographic quotes from what is
# left. The stop-word test runs before punctuation is removed, so a token with
# punctuation attached (e.g. "the,") is not recognised as a stop word here.
#punct = string.punctuation
characters = string.punctuation + '–0123456789‘’“”' 
translator = str.maketrans('', '', characters)
stop_words = stopwords.words('english')
stop_words.extend(["i'm", "i've", "i'll", "i'd", "  ", 'uh', "ah", "aah", 'weve', "we've", "we'd", "we'll",  "we're",
'aah', 'aarp', 'aayuh', 'ãƒâ', 'åwe', 'šâ'])

# Set copy for O(1) membership tests; `stop_words` itself stays a list.
stop_word_set = set(stop_words)


def _strip_stop_words_then_punctuation(text):
    """Return `text` lower-cased, minus stop-word tokens and minus every character in `characters`."""
    # split() already breaks on newlines, so no separate '\n' replacement is needed.
    kept = [token for token in text.lower().split() if token not in stop_word_set]
    return ' '.join(kept).translate(translator)


# One pass over the column instead of a per-row df.loc write.
df['Text_clean'] = [_strip_stop_words_then_punctuation(text) for text in df['Text']]

# Per-sentence token lists and the flattened corpus token stream.
word_list = [clean_text.split() for clean_text in df['Text_clean']]
flat_list = [word for words in word_list for word in words]

# Corpus word frequencies, most frequent first.
counter = collections.Counter(flat_list)
frequent_words = counter.most_common()

# Word -> id in order of first appearance (Counter keeps insertion order).
unique_word_dict = {word: position for position, word in enumerate(counter)}

In [89]:
# Text cleaning, second pass: lower-case, remove punctuation / digits /
# typographic quotes / mojibake residue, and drop stop words.
#punct = string.punctuation
characters = string.punctuation + '–0123456789‘’“”ãƒâšå' 
translator = str.maketrans('', '', characters)
stop_words = stopwords.words('english')
stop_words.extend(["i'm", "i've", "i'll", "i'd", "  ", 'uh', "ah", "aah", 'weve', "we've", "we'd", "we'll",  "we're",
'aah', 'aarp', 'aayuh'])

# Set copy for O(1) membership tests; `stop_words` itself stays a list.
stop_word_set = set(stop_words)
# Curly single quotes -> straight apostrophe, so "I’m" can match the stop word "i'm".
curly_to_straight = str.maketrans('‘’', "''")


def _clean_tokens(text):
    """Return the cleaned, stop-word-free tokens of one sentence.

    A token is dropped when it is a stop word either
      * after trimming `characters` from its edges only (this keeps the inner
        apostrophe, so contractions such as "i'm," or "we've" are caught), or
      * after deleting every character in `characters` (e.g. "it's" -> "its").
    Stripping punctuation before the stop-word test, as was done previously,
    turned "i'm" into "im" and let every contraction stop word through.
    """
    kept = []
    for token in text.lower().translate(curly_to_straight).split():
        if token.strip(characters) in stop_word_set:
            continue
        cleaned = token.translate(translator)
        # Tokens made up solely of removed characters collapse to "" and are skipped.
        if cleaned and cleaned not in stop_word_set:
            kept.append(cleaned)
    return kept


# Per-sentence token lists, computed in one pass over the column.
word_list = [_clean_tokens(text) for text in df['Text']]
# text_dict = {}
df['Text_clean'] = [' '.join(words) for words in word_list]

# Flattened corpus token stream.
flat_list = [word for words in word_list for word in words]

# Corpus word frequencies, most frequent first.
counter = collections.Counter(flat_list)
frequent_words = counter.most_common()

# Word -> id in order of first appearance (Counter keeps insertion order).
unique_word_dict = {word: position for position, word in enumerate(counter)}

# TF-idf

In [91]:
# Learn the vocabulary and IDF weights from the training period only (debates
# before 2012, the same cutoff the train/test split uses). Fitting on the whole
# corpus would let test-period sentences influence the features the model is
# trained on.
vectorizer = TfidfVectorizer(stop_words='english')
is_train_period = df["date"].dt.year < 2012
vectorizer.fit(df.loc[is_train_period, 'Text_clean'])

# X keeps one row per sentence of df, so it can still be sliced with a row mask.
X = vectorizer.transform(df['Text_clean'])

feature_names = vectorizer.get_feature_names_out()
feature_names



array(['abandon', 'abandoned', 'abandoning', ..., 'zippo', 'zone',
       'zones'], dtype=object)

# Train test split

In [85]:
# Chronological hold-out: sentences from debates before 2012 form the training
# set, everything from 2012 onwards is the test set.
mask = df["date"].dt.year < 2012

# Feature rows for each side of the cutoff.
x_train, x_test = X[mask], X[~mask]

# Matching Verdict labels as plain arrays.
y_train = df["Verdict"][mask].values
y_test = df["Verdict"][~mask].values

# Baseline models

1. SVM

In [None]:
# Baseline 1: linear-kernel SVM trained on the TF-IDF features.
clf = svm.SVC(kernel='linear') 
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)

# Build the report once and reuse it. `labels` pins which Verdict code each
# name belongs to instead of relying on the implicit sorted-label order.
# NOTE(review): assumes Verdict is coded -1 = NFS, 0 = UFS, 1 = CFS, which is
# the order the positional target_names already implied; confirm against the
# dataset description.
comparison_svm = classification_report(
    y_test, y_pred, labels=[-1, 0, 1], target_names=["NFS", "UFS", "CFS"]
)
print(comparison_svm)


In [None]:
# Baseline 2: random forest on the same TF-IDF features.
# random_state is fixed so the bootstrap samples and feature sub-sampling, and
# therefore the reported scores, are the same on every run.
clf = RandomForestClassifier(min_samples_split=5, random_state=42)
clf.fit(x_train, y_train)
y_pred_rf = clf.predict(x_test)

In [None]:
# Per-class precision / recall / F1 for the random forest predictions.
# `labels` pins which Verdict code each name belongs to instead of relying on
# the implicit sorted-label order.
# NOTE(review): assumes Verdict is coded -1 = NFS, 0 = UFS, 1 = CFS; confirm
# against the dataset description.
comparison_rf = classification_report(
    y_test, y_pred_rf, labels=[-1, 0, 1], target_names=["NFS", "UFS", "CFS"]
)
print(comparison_rf)