In [3]:
# for Data Manipulation
import pandas as pd
import numpy as np
import os
import csv 

# for EDA
from os import path
from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import matplotlib.pyplot as plt
%matplotlib inline   
from collections import Counter

# for NLP
from textatistic import Textatistic
import spacy
from spacy import displacy

# for Statistics
from scipy import stats

In [4]:
# Resolve the CSV location to an absolute path (relative to the current working directory).
# The result is stored as `data_path` rather than `path`, so that the `path` module
# brought in by `from os import path` is not overwritten by a string.
data_path = os.path.abspath('2022_03_13_politifact_cleaned_data.csv')
df = pd.read_csv(data_path, index_col=0)

# Widen the column display so each statement can be read in full
pd.set_option('display.max_colwidth', 200)

# Preview the first few rows
df.head(3)

Unnamed: 0,statement,source,link,veracity
0,"""Books banned in Texas include 1984, Maus, and The Handmaid's Tale, but not Mein Kampf.""",Tweets,/factchecks/2022/feb/14/tweets/those-titles-arent-banned-statewide-could-be-distr/,0
1,A video shows that “OnlyFans has guys on campus encouraging your daughters to do porn for ‘financial freedom.’”,Tweets,/factchecks/2022/feb/14/tweets/no-onlyfans-isnt-recruiting-adult-film-stars-colle/,0
2,“Nevada ranks 50th in election integrity ratings.”,Jesse Haw,/factchecks/2022/feb/14/jesse-haw/fact-checking-claim-nevada-ranks-close-last-electi/,1


# Statements that begin with "Says <entity> said" should be removed, because they can be false for two different reasons: the attribution to the speaker is false, or the quoted statement itself is false.

Filter for sentences that begin with a quotation

In [5]:
# Every 10th row among the first 100 statements that open with a straight double quote
opens_with_quote = df['statement'].str.match(r"\"")
df.loc[opens_with_quote == True].iloc[0:100:10]

Unnamed: 0,statement,source,link,veracity
0,"""Books banned in Texas include 1984, Maus, and The Handmaid's Tale, but not Mein Kampf.""",Tweets,/factchecks/2022/feb/14/tweets/those-titles-arent-banned-statewide-could-be-distr/,0
82,"""In Joe Biden’s America, your children are more likely to have access to a crack pipe than a mask-free education.""",Troy Nehls,/factchecks/2022/feb/10/troy-nehls/claim-kids-more-likely-have-access-crack-pipes-mas/,0
150,"""Books banned in Texas include 1984, Maus, and The Handmaid's Tale, but not Mein Kampf.""",Tweets,/factchecks/2022/feb/14/tweets/those-titles-arent-banned-statewide-could-be-distr/,0
232,"""In Joe Biden’s America, your children are more likely to have access to a crack pipe than a mask-free education.""",Troy Nehls,/factchecks/2022/feb/10/troy-nehls/claim-kids-more-likely-have-access-crack-pipes-mas/,0
300,"""Books banned in Texas include 1984, Maus, and The Handmaid's Tale, but not Mein Kampf.""",Tweets,/factchecks/2022/feb/14/tweets/those-titles-arent-banned-statewide-could-be-distr/,0
382,"""In Joe Biden’s America, your children are more likely to have access to a crack pipe than a mask-free education.""",Troy Nehls,/factchecks/2022/feb/10/troy-nehls/claim-kids-more-likely-have-access-crack-pipes-mas/,0
450,"""Books banned in Texas include 1984, Maus, and The Handmaid's Tale, but not Mein Kampf.""",Tweets,/factchecks/2022/feb/14/tweets/those-titles-arent-banned-statewide-could-be-distr/,0
532,"""In Joe Biden’s America, your children are more likely to have access to a crack pipe than a mask-free education.""",Troy Nehls,/factchecks/2022/feb/10/troy-nehls/claim-kids-more-likely-have-access-crack-pipes-mas/,0
600,"""Books banned in Texas include 1984, Maus, and The Handmaid's Tale, but not Mein Kampf.""",Tweets,/factchecks/2022/feb/14/tweets/those-titles-arent-banned-statewide-could-be-distr/,0
682,"""In Joe Biden’s America, your children are more likely to have access to a crack pipe than a mask-free education.""",Troy Nehls,/factchecks/2022/feb/10/troy-nehls/claim-kids-more-likely-have-access-crack-pipes-mas/,0


In [6]:
# Complement of the leading-quote filter: every 10th row among the first 100
# statements that do NOT open with a straight double quote
opens_with_quote = df['statement'].str.match(r"\"")
df.loc[opens_with_quote == False].iloc[0:100:10]

Unnamed: 0,statement,source,link,veracity
1,A video shows that “OnlyFans has guys on campus encouraging your daughters to do porn for ‘financial freedom.’”,Tweets,/factchecks/2022/feb/14/tweets/no-onlyfans-isnt-recruiting-adult-film-stars-colle/,0
11,"In England, “official data shows children are up to 52 times more likely to die following COVID-19 vaccination than unvaccinated children.”",Bloggers,/factchecks/2022/feb/11/blog-posting/no-covid-19-vaccine-not-increasing-child-mortality/,0
24,“MSNBC announced Trump won 2020 election.”,Facebook posts,/factchecks/2022/feb/10/facebook-posts/no-msnbc-didnt-announce-donald-trump-won-2020-elec/,0
37,"“New Jersey and Maryland produce more solar power than Florida, the Sunshine State!”",Charlie Crist,/factchecks/2022/feb/11/charlie-crist/florida-produces-more-solar-power-most-states-thou/,1
48,“Joe Biden uses the ATF to illegally track your gun transactions.”,JD Vance,/factchecks/2022/feb/10/jd-vance/jd-vance-wrongly-says-atf-illegally-tracking-gun-s/,0
63,“The second booster has eight strains of HIV.”,Facebook posts,/factchecks/2022/feb/14/facebook-posts/covid-19-vaccines-do-not-contain-hiv/,0
74,Says Bob Saget predicted his death,Facebook posts,/factchecks/2022/feb/10/facebook-posts/no-bob-saget-didnt-predict-his-death/,0
86,"Says C.S. Lewis wrote, “They believed blindly everything they heard or read in the papers. They gave up their freedoms.”",Facebook posts,/factchecks/2022/feb/10/facebook-posts/cs-lewis-didnt-write-pandemic-passage/,0
99,Arizona House bill to “decertify” 2020 results “could effectively recall the Biden electors.”,Bloggers,/factchecks/2022/feb/11/blog-posting/gop-lawmakers-proposal-decertify-arizona-election-/,0
110,Says remdesivir is responsible for killing patients hospitalized with COVID-19.,Nicole Sirotek,/factchecks/2022/feb/10/nicole-sirotek/no-hospitalized-covid-19-patients-werent-killed-re/,0


In [7]:
# Tally how many statements pass (True) versus fail (False) the leading-quote filter,
# to see how much data would remain
leading_quote_flags = df['statement'].str.match(r"\"")
Counter(leading_quote_flags)

Counter({True: 8591, False: 25921})

In [9]:
# Cross-check the tally above with a plain string-prefix test instead of a regex
prefix_flags = df['statement'].str.startswith('"')
Counter(prefix_flags)

Counter({True: 8591, False: 25921})

# Conclusion: 8591 rows survive

In [10]:
# Every 10th row among the first 100 statements that contain a "say"-type word anywhere
mentions_saying = df['statement'].str.contains(r"Says|Quotes|Say|Said|say|saying|said")
df.loc[mentions_saying == True].iloc[0:100:10]

Unnamed: 0,statement,source,link,veracity
10,Says a Washington Post headline and graphic about COVID-19 deaths are misleading.,Facebook posts,/factchecks/2022/feb/11/facebook-posts/facebook-post-cries-foul-washington-post-headline-/,0
80,Says remdesivir is responsible for killing patients hospitalized with COVID-19.,Nicole Sirotek,/factchecks/2022/feb/10/nicole-sirotek/no-hospitalized-covid-19-patients-werent-killed-re/,0
160,Says a Washington Post headline and graphic about COVID-19 deaths are misleading.,Facebook posts,/factchecks/2022/feb/11/facebook-posts/facebook-post-cries-foul-washington-post-headline-/,0
230,Says remdesivir is responsible for killing patients hospitalized with COVID-19.,Nicole Sirotek,/factchecks/2022/feb/10/nicole-sirotek/no-hospitalized-covid-19-patients-werent-killed-re/,0
310,Says a Washington Post headline and graphic about COVID-19 deaths are misleading.,Facebook posts,/factchecks/2022/feb/11/facebook-posts/facebook-post-cries-foul-washington-post-headline-/,0
380,Says remdesivir is responsible for killing patients hospitalized with COVID-19.,Nicole Sirotek,/factchecks/2022/feb/10/nicole-sirotek/no-hospitalized-covid-19-patients-werent-killed-re/,0
460,Says a Washington Post headline and graphic about COVID-19 deaths are misleading.,Facebook posts,/factchecks/2022/feb/11/facebook-posts/facebook-post-cries-foul-washington-post-headline-/,0
530,Says remdesivir is responsible for killing patients hospitalized with COVID-19.,Nicole Sirotek,/factchecks/2022/feb/10/nicole-sirotek/no-hospitalized-covid-19-patients-werent-killed-re/,0
610,Says a Washington Post headline and graphic about COVID-19 deaths are misleading.,Facebook posts,/factchecks/2022/feb/11/facebook-posts/facebook-post-cries-foul-washington-post-headline-/,0
680,Says remdesivir is responsible for killing patients hospitalized with COVID-19.,Nicole Sirotek,/factchecks/2022/feb/10/nicole-sirotek/no-hospitalized-covid-19-patients-werent-killed-re/,0


In [11]:
# Every 10th row among the first 100 statements that contain none of the "say"-type words
mentions_saying = df['statement'].str.contains(r"Says|Quotes|Say|Said|say|saying|said")
df.loc[mentions_saying == False].iloc[0:100:10]

Unnamed: 0,statement,source,link,veracity
0,"""Books banned in Texas include 1984, Maus, and The Handmaid's Tale, but not Mein Kampf.""",Tweets,/factchecks/2022/feb/14/tweets/those-titles-arent-banned-statewide-could-be-distr/,0
11,"In England, “official data shows children are up to 52 times more likely to die following COVID-19 vaccination than unvaccinated children.”",Bloggers,/factchecks/2022/feb/11/blog-posting/no-covid-19-vaccine-not-increasing-child-mortality/,0
24,“MSNBC announced Trump won 2020 election.”,Facebook posts,/factchecks/2022/feb/10/facebook-posts/no-msnbc-didnt-announce-donald-trump-won-2020-elec/,0
36,"COVID spelled backward is “divoc” which means ""possession of the evil spirit"" in Hebrew.",Facebook posts,/factchecks/2022/feb/11/facebook-posts/covid-spelled-backward-divoc-which-means-possessio/,0
48,“Joe Biden uses the ATF to illegally track your gun transactions.”,JD Vance,/factchecks/2022/feb/10/jd-vance/jd-vance-wrongly-says-atf-illegally-tracking-gun-s/,0
62,“Nevada ranks 50th in election integrity ratings.”,Jesse Haw,/factchecks/2022/feb/14/jesse-haw/fact-checking-claim-nevada-ranks-close-last-electi/,1
73,"""The use of ballot drop boxes and ballot harvesting is illegal.”",Kevin Nicholson,/factchecks/2022/feb/11/kevin-nicholson/wisconsin-gop-gubernatorial-candidate-kevin-nichol/,1
87,"""Biden and most Democrats want to cap insulin prices at $35 per month. All 50 Republicans in the Senate are opposed to it.”",Facebook posts,/factchecks/2022/feb/10/facebook-posts/republicans-opposed-build-back-better-whole-not-ne/,1
98,“We all know China created COVID.”,Dave McCormick,/factchecks/2022/feb/11/dave-mccormick/response-dr-oz-no-evidence-senate-hopeful-mccormic/,0
111,The Biden administration “is spending $30 million on crack pipes.”,Sean Spicer,/factchecks/2022/feb/10/sean-spicer/no-white-house-isnt-spending-30-million-crack-pipe/,1


In [12]:
# Tally statements with (True) and without (False) a "say"-type word
say_word_flags = df['statement'].str.contains(r"Says|Quotes|Say|Said|say|saying|said")
Counter(say_word_flags)

Counter({False: 28208, True: 6304})

# We'll use a begins-with-and-ends-with-quotations filter going forward


In [13]:
# Keep only the statements that open with a straight double quotation mark.
# NOTE(review): the pattern is the ASCII quote only; statements that open with a
# curly quote (“) are not matched and are therefore left out — confirm this is intended.
starts_quoted = df['statement'].str.match(r'\"')
quote_df = df.loc[starts_quoted == True]
quote_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8591 entries, 0 to 11187
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   statement  8591 non-null   object
 1   source     8591 non-null   object
 2   link       8591 non-null   object
 3   veracity   8591 non-null   int64 
dtypes: int64(1), object(3)
memory usage: 335.6+ KB


In [14]:
# Drop statements whose final character is not a straight double quotation mark
ends_quoted = quote_df['statement'].str.endswith('"')
quote_df = quote_df.loc[ends_quoted == True]
quote_df.info()
quote_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6555 entries, 0 to 11187
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   statement  6555 non-null   object
 1   source     6555 non-null   object
 2   link       6555 non-null   object
 3   veracity   6555 non-null   int64 
dtypes: int64(1), object(3)
memory usage: 256.1+ KB


In [15]:
# Drop statements that are several quotes strung together, i.e. those containing
# at least three quote marks with other text between them
multi_quote = quote_df['statement'].str.contains(r'\"{1}.+\"{1}.+\"{1}')
quote_df = quote_df.loc[multi_quote == False]
quote_df.info()
quote_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6395 entries, 0 to 11187
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   statement  6395 non-null   object
 1   source     6395 non-null   object
 2   link       6395 non-null   object
 3   veracity   6395 non-null   int64 
dtypes: int64(1), object(3)
memory usage: 249.8+ KB


In [16]:
# Class balance of the veracity label after the quote filters
quote_df['veracity'].value_counts()

0    4098
1    2297
Name: veracity, dtype: int64


# Summary statistics for every column, non-numeric ones included
quote_summary = quote_df.describe(include='all')
quote_summary

In [18]:
# The 10 most frequently quoted sources
quote_df['source'].value_counts().head(10)

Tweets             837
Troy Nehls         833
Barack Obama       229
Donald Trump       196
Hillary Clinton    122
Bloggers           119
John McCain         81
Mitt Romney         70
Rick Scott          62
Marco Rubio         58
Name: source, dtype: int64

# There is an imbalance in the sources (some appear far too often)

# By examining the source frequency's mean and median, we can determine how many times most sources are quoted to figure out a way to balance the data

In [19]:
# Average number of statements per source
statements_per_source = quote_df['source'].value_counts()
statements_per_source.mean()

3.724519510774607

In [20]:
# Median number of statements per source
statements_per_source = quote_df['source'].value_counts()
statements_per_source.median()

1.0

In [21]:
# Number of distinct sources that are quoted at most 4 times
(quote_df['source'].value_counts() <= 4).sum()

1568

In [22]:
# Attach, to every row, the number of statements attributed to that row's source.
# `quote_df` was produced by boolean-filtering another frame, so assigning a column
# in place can raise SettingWithCopyWarning; `assign` builds a new frame instead.
source_sizes = quote_df.groupby('source')['source'].transform('count')
quote_df = quote_df.assign(freq=source_sizes)

In [24]:
# Size and schema of the subset whose source is quoted at most 4 times
rarely_quoted = quote_df['freq'] <= 4
quote_df.loc[rarely_quoted].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2165 entries, 3 to 11183
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   statement  2165 non-null   object
 1   source     2165 non-null   object
 2   link       2165 non-null   object
 3   veracity   2165 non-null   int64 
 4   freq       2165 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 101.5+ KB


# The new dataframe keeps only statements whose source appears at most 4 times; sources that appear more often are dropped entirely

In [26]:
# Keep only the rows whose source has a per-source count (`freq`) of 4 or fewer
rarely_quoted = quote_df['freq'] <= 4
balanced_df = quote_df.loc[rarely_quoted]

In [27]:
# Summary of every column; the `unique` row under `source` gives the number of distinct sources
balanced_summary = balanced_df.describe(include='all')
balanced_summary

Unnamed: 0,statement,source,link,veracity,freq
count,2165,2165,2165,2165.0,2165.0
unique,2161,1568,2165,,
top,"""Tim Kaine doesn’t want a border at all. He wants to get rid of Immigration and Customs Enforcement and basically the border patrol as well.""",Republican Governors Association,/web/20180705082623/https://www.politifact.com/truth-o-meter/statements/2018/jul/03/corey-stewart/corey-stewart-falsely-claims-sen-tim-kaine-wants-d/,,
freq,2,4,1,,
mean,,,,0.494226,1.787067
std,,,,0.500082,1.010764
min,,,,0.0,1.0
25%,,,,0.0,1.0
50%,,,,0.0,1.0
75%,,,,1.0,2.0


In [28]:
# Class balance of the veracity label in the balanced subset
balanced_df['veracity'].value_counts()

0    1095
1    1070
Name: veracity, dtype: int64

In [29]:
# The 10 most frequently quoted sources in the balanced subset
balanced_df['source'].value_counts().head(10)

Republican Governors Association    4
Adam Hasner                         4
Joseph Kyrillos                     4
Betty Sutton                        4
Lamar Alexander                     4
Mark Warner                         4
Hank Johnson                        4
Adam Putnam                         4
Catherine Hanaway                   4
Kamala Harris                       4
Name: source, dtype: int64

# 2165 balanced rows survived

In [30]:
# Write the balanced subset to a CSV file (relative path, so it lands in the working directory)
balanced_df.to_csv(path_or_buf='politifact_balanced_data.csv')