In [None]:
from datetime import datetime
import urllib3
import numpy as np
import seaborn
import matplotlib.pyplot as plt
import re
import pandas as pd
import json
from bs4 import BeautifulSoup
from collections import Counter

In [None]:
# Record when this crawl was started; the string has the same layout that
# str(datetime) produces (ISO date, a space, then the time).
datePosted = datetime.today().isoformat(sep=' ')
print('Time of Crawl: ', datePosted, sep='')

In [None]:
# FactCheck.org crawl: walk every page of the "fake-news" listing, collect each
# article's date, headline, first paragraph and link, open every article to read
# its verdict, and write the combined table to FactCheck.csv.

# To do in future: Certificate verification, streaming
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# FactCheck
http = urllib3.PoolManager()
url = 'https://www.factcheck.org/fake-news/'
response = http.request('GET', url)
# Name the parser explicitly so the parse tree does not depend on which optional
# parsers happen to be installed.
soup = BeautifulSoup(response.data, 'html.parser')

# Get Last Page: read the page numbers out of the pagination hrefs and keep the
# largest. Using the text of the last matching link breaks as soon as that link
# is a non-numeric "next"/"older" control.
page_numbers = []
for anchor in soup.find_all('a', href=True):
    page_match = re.search(r'fake-news/page/(\d+)', anchor['href'])
    if page_match:
        page_numbers.append(int(page_match.group(1)))
lastpage = max(page_numbers, default=1)  # no pagination links -> a single page

# Actual Scrape
adate = []        # one list of raw date strings per listing page
description = []  # one list of first-paragraph texts per listing page
short_desc = []   # one list of headline texts per listing page
dataurl = []      # flat list of article URLs
verdict = []      # one list of verdict strings per article URL

for cp in range(1, int(lastpage) + 1):
    url = 'https://www.factcheck.org/fake-news/page/' + str(cp)
    response = http.request('GET', url)
    soup = BeautifulSoup(response.data, 'html.parser')
    adate.append([i.text for i in soup.find_all('div', attrs={'class': 'entry-meta'})])
    short_desc.append([i.text for i in soup.find_all('h3', attrs={'class': 'entry-title'})])
    # An entry without a <p> yields '' instead of raising AttributeError, which
    # also keeps this column the same length as the others for that page.
    description.append([
        i.p.text if i.p is not None else ''
        for i in soup.find_all('div', attrs={'class': 'entry-content'})
    ])
    for div in soup.find_all('div', attrs={'class': 'col-12 col-sm-4'}):
        anchor = div.find('a', href=True)
        if anchor is not None:  # skip layout divs that carry no link
            dataurl.append(anchor['href'])
print('Initial Scrap Completed.')

# Get Verdict Within Each Article's Embedded URL
for link in dataurl:
    try:
        response = http.request('GET', link)
    except urllib3.exceptions.HTTPError as err:
        # A single unreachable article must not abort the whole crawl; record an
        # empty result so the verdict list stays aligned with dataurl.
        print('Could not fetch ' + str(link) + ': ' + str(err))
        verdict.append([])
        continue
    soup = BeautifulSoup(response.data, 'html.parser')
    verdict.append([row.text for row in soup.find_all('div', attrs={'style': 'font-weight: bold'})])
print('Verdicts Retrieved.')

# Flatten List [Output of Previous Section were Lists]
descriptionflat = [y for x in description for y in x]
short_descflat = [y for x in short_desc for y in x]
dateflat = [y for x in adate for y in x]
# Exactly one value per article: plain flattening would shift every following
# row whenever an article has zero or several bold blocks.
verdictflat = [' | '.join(found) if found else None for found in verdict]
formatdate = [datetime.strptime(re.sub(r'[\t\n]', '', a), '%B %d, %Y') for a in dateflat]

# CSV
# Wrapping each column in a Series lets columns of unequal length share a frame
# (shorter ones are padded with NaN). The output directory must already exist.
compiled_data = {'brief': short_descflat, 'description': descriptionflat, 'verdict': verdictflat, 'date': formatdate, 'url': dataurl}
df = pd.DataFrame({k: pd.Series(v) for k, v in compiled_data.items()})
df.to_csv('./Downloads/DI/FactCheck.csv', header=True, index=False)
print('Saved as CSV.')

In [None]:
# Truth or Fiction
# Crawl the politics fact-check listing, then open every article and read its
# rating from the embedded JSON-LD metadata. Reuses the `http` pool created in
# the FactCheck cell.

url = 'https://www.truthorfiction.com/category/fact-checks/politics/'
response = http.request('GET', url)
# Name the parser explicitly so the parse tree does not depend on which optional
# parsers happen to be installed.
soup = BeautifulSoup(response.data, 'html.parser')

# Get Last Page: take the largest page number found in the pagination hrefs
# instead of trusting the position of a particular link in the page.
page_numbers = []
for anchor in soup.find_all('a', href=True):
    page_match = re.search(r'fact-checks/politics/page/(\d+)', anchor['href'])
    if page_match:
        page_numbers.append(int(page_match.group(1)))
lastpage = max(page_numbers, default=1)  # no pagination links -> a single page

# Actual Scrape
adate = []        # one list of raw date strings per listing page
description = []  # one list of first-paragraph texts per listing page
short_desc = []   # one list of headline texts per listing page
dataurl = []      # one list of article URLs per listing page
verdict = []      # reviewRating.ratingValue per article (None when absent)
alt = []          # reviewRating.alternateName per article (None when absent)

for cp in range(1, int(lastpage) + 1):
    # Listing pages live under ".../page/<n>", the same pattern the pagination
    # links above are matched against.
    url = 'https://www.truthorfiction.com/category/fact-checks/politics/page/' + str(cp)
    response = http.request('GET', url)
    soup = BeautifulSoup(response.data, 'html.parser')
    adate.append([i.text for i in soup.find_all('span', class_='tt-post-date')])
    page_links = [i.get('href') for i in soup.find_all('a', {'class': 'tt-post-title c-h5', 'href': True})]
    # The last matching link of every page is discarded.
    # NOTE(review): presumably it is not one of the listed articles - confirm.
    dataurl.append(page_links[:-1])
    short_desc.append([i.text for i in soup.find_all('h3', attrs={'class': 'entry-title'})])
    # An entry without a <p> yields '' instead of raising AttributeError.
    description.append([
        i.p.text if i.p is not None else ''
        for i in soup.find_all('div', attrs={'class': 'simple-text'})
    ])
print('Initial Scrap Completed.')

# Flatten List [Output of Previous Section were Lists]
descriptionflat = [y for x in description for y in x]
short_descflat = [y for x in short_desc for y in x]
dateflat = [y for x in adate for y in x]
dataurlflat = [y for x in dataurl for y in x]
formatdate = [datetime.strptime(re.sub(r'[\t\n]', '', a), '%B %d, %Y') for a in dateflat]

# Get Verdict Within Each Article's Embedded URL
for link in dataurlflat:
    rating_value = None
    alternate_name = None
    try:
        response = http.request('GET', link)
    except urllib3.exceptions.HTTPError as err:
        # Keep going; this article simply gets empty rating cells.
        print('Could not fetch ' + str(link) + ': ' + str(err))
    else:
        soup = BeautifulSoup(response.data, 'html.parser')
        scripts = soup.find_all('script', type='application/ld+json')
        # Same preference as before (second block first, then the first one),
        # followed by any remaining blocks.
        for script in scripts[1:2] + scripts[:1] + scripts[2:]:
            try:
                payload = json.loads(script.string or script.text)
            except ValueError:
                continue  # not valid JSON, try the next block
            rating = payload.get('reviewRating') if isinstance(payload, dict) else None
            if isinstance(rating, dict):
                rating_value = rating.get('ratingValue')
                alternate_name = rating.get('alternateName')
                break
    # Both lists grow by exactly one entry per article, so they can never drift
    # apart (or away from dataurlflat) when one of the fields is missing.
    verdict.append(rating_value)
    alt.append(alternate_name)
print('Verdicts Retrieved.')


# CSV
# Wrapping each column in a Series lets columns of unequal length share a frame
# (shorter ones are padded with NaN). The output directory must already exist.
compiled_data = {'brief': short_descflat, 'description': descriptionflat, 'verdict': verdict, 'altName': alt, 'date': formatdate, 'url': dataurlflat}
df = pd.DataFrame({k: pd.Series(v) for k, v in compiled_data.items()})
df.to_csv('./Downloads/DI/TorF.csv', header=True, index=False)
print('Saved as CSV.')

In [None]:
# Do Exploratory Analyses

# Reload the FactCheck crawl from the CSV written by the scraper cell.
df1 = pd.read_csv(filepath_or_buffer='./Downloads/DI/FactCheck.csv')

In [None]:
# Preview the first five rows of the reloaded FactCheck table.
df1.head(n=5)

In [None]:
# Load the user and tweet tables and print the first rows of each.
df3 = pd.read_csv('./Downloads/DI/users.csv')
df4 = pd.read_csv('./Downloads/DI/tweets.csv')
for frame in (df3, df4):
    print(frame.head())

In [None]:
# Keep only the tweet rows that have a retweet_count value.
has_retweet_count = df4['retweet_count'].notna()
df4c = df4.loc[has_retweet_count]

In [None]:
# Attach each tweet to its author: users.id is matched against tweets.user_id
# (inner join, the merge default).
df_comb = df3.merge(df4c, left_on='id', right_on='user_id')
print(df_comb.columns)

In [None]:
# Tweets Per User

# Convert the creation timestamps from text to datetimes using an explicit
# layout (year/month/day followed by a 24-hour time).
# Alternative month-first layout (disabled):
# time_format ='%m/%d/%Y %H:%M:%S'
time_format = '%Y/%m/%d %H:%M:%S'
created_raw = df_comb['created_str']
df_comb['created_str'] = pd.to_datetime(created_raw, format=time_format)

In [None]:
# Build a time-indexed working frame with a per-row counter column.
# Work on a copy: plain assignment (`df_test = df_comb`) only aliases the merged
# frame, so adding the helper column would silently add it to `df_comb` as well.
df_test = df_comb.copy()
# One unit per tweet row, stored as a 1-D float column (the former (n, 1)
# array relied on pandas squeezing a 2-D value into a single column).
df_test['tweet_ct'] = np.ones(len(df_test))
df_test = df_test.set_index('created_str')
# Sketch of the intended follow-up (not executed):
# GB = df_test.groupby([(df_test.index.year),(df_test.index.month)]).sum()
# GB.plot('abc','xyz',kind='scatter')
# GB.index
# seaborn.heatmap(x)

In [None]:
# Relate how often each user tweets to how many followers they have.
ntweets = Counter(df_comb['id'])
# most_common() orders users by descending tweet count; the final (least
# frequent) entry is dropped.
# NOTE(review): the reason for dropping it is not evident here - confirm.
unpk = ntweets.most_common()[:-1]

followers = []
tweets = []
for user_id, tweet_total in unpk:
    # followers_count as recorded on the first merged row of this user
    user_rows = df_comb.loc[df_comb['id'] == user_id, 'followers_count']
    followers.append(user_rows.iloc[0])
    tweets.append(tweet_total)

df_work = pd.DataFrame({'Tweet Count': tweets, 'Follower Count': followers})
# dropna() returns a new frame; without the assignment the call had no effect
# and rows with a missing follower count stayed in df_work.
df_work = df_work.dropna()

fig, ax = plt.subplots()
ax.scatter(df_work['Tweet Count'], df_work['Follower Count'])
ax.set_xlabel('Tweet Count')
ax.set_ylabel('Follower Count')
ax.set_title('Follower Count vs. Tweet Count per user')
# Pearson correlation (pandas default) shown as the cell output.
df_work['Tweet Count'].corr(df_work['Follower Count'])