In [1]:
import pandas as pd

cleaned_balanced_df = pd.read_csv("politifact_balanced_data.csv")

In [2]:
print(len(cleaned_balanced_df))

2165


# Study the lenght of the recorded variables

In [3]:
cleaned_balanced_df["length"] = cleaned_balanced_df.statement.str.len()

In [4]:
cleaned_balanced_df.describe()

Unnamed: 0.1,Unnamed: 0,veracity,freq,length
count,2165.0,2165.0,2165.0,2165.0
mean,5188.898845,0.494226,1.787067,107.79261
std,3195.945681,0.500082,1.010764,46.179039
min,3.0,0.0,1.0,20.0
25%,2288.0,0.0,1.0,75.0
50%,5032.0,0.0,1.0,100.0
75%,8082.0,1.0,2.0,133.0
max,11183.0,1.0,4.0,400.0


In [5]:
cleaned_balanced_df[cleaned_balanced_df.length > 250][:100]

Unnamed: 0.1,Unnamed: 0,statement,source,link,veracity,freq,length
68,199,"""Ninety percent of people born in the 1940s en...",Delaine Eastin,/web/20180119164739/http://www.politifact.com/...,1,1,292
77,223,"""In the eight years that I was mayor, (there w...",Antonio Villaraigosa,/web/20180119165429/http://www.politifact.com/...,1,4,308
99,317,"""Facts have come to light that indicate that a...",Kris Kobach,/web/20180119170716/http://www.politifact.com/...,0,1,257
194,689,"""When I became mayor of Los Angeles, we had a ...",Antonio Villaraigosa,/web/20180119171252/http://www.politifact.com/...,1,4,318
259,939,"""Local authorities in 43 states refused to hon...",Steve Adler,/web/20180119171447/http://www.politifact.com/...,1,4,290
278,1023,"""From the release of a child rapist from the D...",Joe DeFelice,/web/20180119171520/http://www.politifact.com/...,0,1,273
360,1374,"""If you give guns to gangs, you can get out of...",Loretta Sanchez,/web/20180119171703/http://www.politifact.com/...,0,2,256
363,1383,"""It’s bad enough that (Stephen) Silberkraus ap...",Lesley Cohen,/web/20180119171707/http://www.politifact.com/...,0,1,285
402,1601,"""Because there's no local option to allow comm...",Vote No On 2,/web/20180119171753/http://www.politifact.com/...,0,1,289
418,1653,"""Illinois suffered 1,652 overdose deaths in 20...",Richard Durbin,/web/20180119171804/http://www.politifact.com/...,1,3,291


# Analyze the data

In [6]:
import numpy as np
import os
import csv 

# for EDA
from os import path
from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import matplotlib.pyplot as plt
%matplotlib inline   
from collections import Counter

# for NLP
from textatistic import Textatistic
import spacy
from spacy import displacy

# for Statistics
from scipy import stats

In [7]:
# Obs.after preprocessing, the text is quite balanced out
cleaned_balanced_df.veracity.value_counts()

0    1095
1    1070
Name: veracity, dtype: int64

# Preprocessing Steps: Creating a condensed (lemmatized) text

In [8]:
nlp = spacy.load('en_core_web_md')

OSError: [E050] Can't find model 'en_core_web_md'. It doesn't seem to be a Python package or a valid path to a data directory.

In [9]:
import en_core_web_sm

nlp = en_core_web_sm.load()

In [10]:
# lemmatization function
def lemmatize(text):
    # create nlp object
    nobj = nlp(text)
    
    # stopword list from spacy
    stopwords = spacy.lang.en.stop_words.STOP_WORDS
    
    # simple lemmas
    lemmas = [token.lemma_ for token in nobj]
    
    # lowercase; remove non-alphabetic characters & overly used words, i.e., stop words
    a_lemmas = [lemma.lower() for lemma in lemmas
                if lemma.isalpha() 
                and lemma not in stopwords]
    
    # strings
    strings = " ".join(a_lemmas)
    
    return strings

In [11]:
# sample: removes whitespaces, puncuation, stopwords, special characters; converts to lowercase
lemmatize(" ! i don't, won't,   can't not use    NLP 27x maaaah?")

'i use nlp maaaah'

In [13]:
cleaned_balanced_df["lemmas"] = cleaned_balanced_df.statement.apply(lemmatize)

In [14]:
cleaned_balanced_df[500:1500:500]

Unnamed: 0.1,Unnamed: 0,statement,source,link,veracity,freq,length,lemmas
500,2088,"""The city of Charlotte passed a bathroom ordin...",Pat McCrory,/web/20180119171924/http://www.politifact.com/...,0,3,106,city charlotte pass bathroom ordinance mandate...
1000,4750,"""What do Rutgers University, Kean University, ...",Joseph Cryan,/web/20180119172713/http://www.politifact.com/...,1,1,254,rutgers university kean university montclair s...


# Tagging Text

# Part of Speech (POS) Tagging

## .pos_ (general pos tagging -- 16 tags)


In [15]:
# .pos_ example
nobj = nlp(cleaned_balanced_df.statement[21])
pos = [(token.text, token.pos_) for token in nobj]
print(pos)

[('"', 'PUNCT'), ('The', 'DET'), ('graduation', 'NOUN'), ('rate', 'NOUN'), ('in', 'ADP'), ('New', 'PROPN'), ('York', 'PROPN'), ('City', 'PROPN'), ('has', 'AUX'), ('increased', 'VERB'), ('by', 'ADP'), ('50', 'NUM'), ('percent', 'NOUN'), ('in', 'ADP'), ('13', 'NUM'), ('years', 'NOUN'), ('"', 'PUNCT')]


In [17]:
def dot_pos_tagger(text):
    # nlp object
    nobj = nlp(text)
    # POS tags
    tagged_text = [(token.text, token.pos_) for token in nobj]
    # tagged text
    return tagged_text

cleaned_balanced_df["general_pos_tags"] = cleaned_balanced_df.statement.apply(dot_pos_tagger)

In [18]:
cleaned_balanced_df[["general_pos_tags"]][0:2]

Unnamed: 0,general_pos_tags
0,"[("", PUNCT), (Tim, PROPN), (Kaine, PROPN), (do..."
1,"[("", PUNCT), (The, DET), (deficit, NOUN), (......"


In [19]:
# a more visual rendering of the above
displacy.render(nobj, 
                style = "dep", # styles are dep (dependency) or ent (entity)
                jupyter = True,
                options = {
                          "distance" : 125,
                          "arrow_stroke" : 2, # thickness of arrow line
                          "arrow_width" : 8, # thickness of arrow head
                          "bg" : "#EDEDED" # background color
                         })

## .tag_ (specific pos tagging -- 53 tags)

In [20]:
# .tag_ example (more specific pos tagging)
nobj = nlp(cleaned_balanced_df.statement[21])
pos = [(token.text, token.tag_) for token in nobj]
print(pos)

[('"', '``'), ('The', 'DT'), ('graduation', 'NN'), ('rate', 'NN'), ('in', 'IN'), ('New', 'NNP'), ('York', 'NNP'), ('City', 'NNP'), ('has', 'VBZ'), ('increased', 'VBN'), ('by', 'IN'), ('50', 'CD'), ('percent', 'NN'), ('in', 'IN'), ('13', 'CD'), ('years', 'NNS'), ('"', "''")]


In [22]:
def dot_tag_tagger(text):
    # nlp object
    nobj = nlp(text)
    # tags
    tagged_text = [(token.text, token.tag_) for token in nobj]
    # tagged text
    return tagged_text

cleaned_balanced_df["specific_pos_tags"] = cleaned_balanced_df.statement.apply(dot_tag_tagger)

In [24]:
cleaned_balanced_df[["specific_pos_tags"]][0:2]

Unnamed: 0,specific_pos_tags
0,"[("", ``), (Tim, NNP), (Kaine, NNP), (does, VBZ..."
1,"[("", ``), (The, DT), (deficit, NN), (..., :), ..."


## SD (Syntatic Dependency) tagging

In [26]:
# .tag_ example (more specific pos tagging)
nobj = nlp(cleaned_balanced_df.statement[21])
sd = [(token.text, token.dep_) for token in nobj]
print(sd)

[('"', 'punct'), ('The', 'det'), ('graduation', 'compound'), ('rate', 'nsubj'), ('in', 'prep'), ('New', 'compound'), ('York', 'compound'), ('City', 'pobj'), ('has', 'aux'), ('increased', 'ROOT'), ('by', 'prep'), ('50', 'nummod'), ('percent', 'pobj'), ('in', 'prep'), ('13', 'nummod'), ('years', 'pobj'), ('"', 'punct')]


In [28]:
# sd function
def sd_tagger(text):
    # nlp object
    nobj = nlp(text)
    # tags
    tagged_text = [(token.text, token.dep_) for token in nobj]
    # tagged text
    return tagged_text

cleaned_balanced_df["sd_tags"] = cleaned_balanced_df.statement.apply(sd_tagger)

In [30]:
cleaned_balanced_df[["sd_tags"]][0:2]

Unnamed: 0,sd_tags
0,"[("", punct), (Tim, compound), (Kaine, nsubj), ..."
1,"[("", punct), (The, det), (deficit, nsubj), (....."


In [31]:
cleaned_balanced_df.to_csv("ready_to_be_engineered.csv")