In [1]:
from bs4 import BeautifulSoup
import urllib.request,sys,time
import requests
import pandas as pd
import re
import string
import numpy as np
import nltk
from math import log10
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ahish\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ahish\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
# Main logic

#Total number of pages  to crawl
pagesToGet = 10

frame = []
upperframe = []

#The main loop where each page consists of the 30 latest articles
for pageNo in range(1,pagesToGet+1):
    
    url = 'https://www.politifact.com/factchecks/list/?page='+str(pageNo)+'&ruling=true'
    
    #  Checking whether the link is valid or not
    try:
        page=requests.get(url)      
    except Exception as e:
        error_type, error_obj, error_info = sys.exc_info()      

        print ('ERROR FOR LINK:',url)

        print (error_type, 'Line:', error_info.tb_lineno)
        continue
    
    # Getting the entire page's content in html
    soup = BeautifulSoup(page.text, "html.parser")
    # Retreiving only the links of articles from the page
    links=soup.find_all('li',attrs={'class':'o-listicle__item'})

    print("Processing page "+str(pageNo)+"...")

    #Navigating through each article link found in the current page
    for j in links:
#         Statement = j.find("div",attrs={'class':'m-statement__quote'}).text.strip()
        Link = "https://www.politifact.com"
        Link += j.find("div",attrs={'class':'m-statement__quote'}).find('a')['href'].strip()
        
        # For each article link, we crawl the entire article's page to get content
        try:
            page=requests.get(Link)      
        except Exception as e:
            print(e)
        
        soup = BeautifulSoup(page.text, "html.parser")
        Title=soup.find('h2',attrs={'class':'c-title c-title--subline'}).text.strip()
        ArticleTemp=soup.find('article',attrs={'class':'m-textblock'})
        Article = soup.find_all('p')
        fullArticle = []
        
        for i in range(2,len(Article)):
            fullArticle.append(Article[i].text.strip())
        fullArticle = " ".join(fullArticle)
        
        Date = j.find('div',attrs={'class':'m-statement__body'}).find('footer').text.split("•")[1]
        Date = Date.replace("\n", "")
        Source = j.find('div', attrs={'class':'m-statement__meta'}).find('a').text.strip()
#         Label = j.find('div', attrs ={'class':'m-statement__content'}).find('img',attrs={'class':'c-image__original'}).get('alt').strip()
        frame.append([Title, fullArticle, Link,Date,Source])
    
    print("Done with page "+str(pageNo)+"!")
    upperframe.extend(frame)

Processing page 1...
Done with page 1!
Processing page 2...
Done with page 2!
Processing page 3...
Done with page 3!
Processing page 4...
Done with page 4!
Processing page 5...
Done with page 5!
Processing page 6...
Done with page 6!
Processing page 7...
Done with page 7!
Processing page 8...
Done with page 8!
Processing page 9...
Done with page 9!
Processing page 10...
Done with page 10!


In [3]:
# Data containing Title, Article and other meta data
data=pd.DataFrame(upperframe, columns=['Title', 'Article', 'Link','Date','Source'])
data.head()

Unnamed: 0,Title,Article,Link,Date,Source
0,COVID-19 fatalities in the Rio Grande Valley d...,Health workers prepare to administer a COVID-1...,https://www.politifact.com/factchecks/2020/oct...,"October 16, 2020",Vicente Gonzalez
1,Roy Cooper compares Georgia's COVID-19 numbers...,North Carolina Gov. Roy Cooper speaks during a...,https://www.politifact.com/factchecks/2020/oct...,"October 15, 2020",Roy Cooper
2,Fact-checking Jim Justice on West Virginia’s v...,West Virginia Gov. Jim Justice prepares for a ...,https://www.politifact.com/factchecks/2020/oct...,"October 15, 2020",Jim Justice
3,"Yes, Wisconsin legislators haven’t passed a bi...",The Wisconsin Legislature last passed a bill i...,https://www.politifact.com/factchecks/2020/oct...,"October 7, 2020",Facebook posts
4,FBI director warned about white supremacist vi...,Democratic presidential candidate former Vice ...,https://www.politifact.com/factchecks/2020/oct...,"October 6, 2020",Joe Biden


In [4]:
len(data)

1650

In [5]:
#Stemming function
def stemming(token):
  if (token in set(nltk.corpus.stopwords.words('english'))):
    return token
  return nltk.stem.PorterStemmer().stem(token)

In [6]:
# Tokenize function
def text_tokenizer(text):
  tokens = nltk.word_tokenize(text)
  stemmed_tokens = (stemming(token) for token in tokens) #stemming and removing stop words
  return list([token for token in stemmed_tokens if token.isalnum()]) #removes tokens that are neither alphabetic characters nor digits

In [7]:
#Article is our main content. So indexing is done for them.
corpus = data["Article"].tolist()

In [8]:
count_vector = CountVectorizer(stop_words = nltk.word_tokenize(' '.join(nltk.corpus.stopwords.words('english'))), tokenizer = text_tokenizer)
tokenized_documents = count_vector.fit_transform(corpus)
term_document_matrix = pd.DataFrame(tokenized_documents.toarray(), columns = count_vector.get_feature_names())

In [9]:
#Indexing
cols = sorted(list(term_document_matrix.columns))
data = []
for col in cols:
  l = term_document_matrix[col].loc[term_document_matrix[col]>0].index.to_list()
  data.append([col, len(l), l])
inverted_index = pd.DataFrame(data ,columns=["Term","Doc_freq","Postings_List"])
print("Inverted index\n",inverted_index)

Inverted index
             Term  Doc_freq                                      Postings_List
0              0         9     [80, 140, 230, 350, 500, 680, 890, 1130, 1400]
1             07        10  [11, 41, 101, 191, 311, 461, 641, 851, 1091, 1...
2              1       605  [2, 3, 4, 5, 7, 9, 11, 15, 16, 18, 23, 24, 26,...
3             10       738  [2, 3, 6, 8, 9, 10, 12, 13, 15, 17, 18, 23, 24...
4            100       207  [5, 16, 18, 19, 28, 35, 46, 48, 49, 58, 80, 84...
5           1000         1                                             [1646]
6            101         2                                       [1341, 1611]
7           1029         4                            [815, 1025, 1265, 1535]
8            103         2                                       [1320, 1590]
9           1038        10   [3, 33, 93, 183, 303, 453, 633, 843, 1083, 1353]
10          1040         1                                             [1647]
11           108        15  [10, 40, 100, 190, 3