In [145]:
# psh270, jxs535, def456, hkp680

# Task 2
import numpy as np
import pandas as pd
import re
from cleantext.sklearn import CleanTransformer # likely required to ´pip install clean-text´
data = pd.read_csv("https://raw.githubusercontent.com/several27/FakeNewsCorpus/master/news_sample.csv")

In [146]:
initial_cleaner = CleanTransformer(fix_unicode=True,               # fix various unicode errors
                                    to_ascii=True,                  # transliterate to closest ASCII representation
                                    lower=True,                     # lowercase text
                                    no_line_breaks=True,           # fully strip line breaks as opposed to only normalizing them
                                    no_urls=True,                  # replace all URLs with a special token
                                    no_emails=True,                # replace all email addresses with a special token
                                    no_phone_numbers=True,         # replace all phone numbers with a special token
                                    no_numbers=False,               # replace all numbers with a special token
                                    no_digits=False,                # replace all digits with a special token
                                    no_currency_symbols=True,      # replace all currency symbols with a special token
                                    no_punct=True,                 # remove punctuations
                                    replace_with_punct=" ",          # instead of removing punctuations you may replace them
                                    replace_with_url="<url>",
                                    replace_with_email="<email>",
                                    replace_with_phone_number="<phone>",
                                    replace_with_currency_symbol="<cur>",
                                    lang="en"                       # set to 'de' for German special handling
                                    )

general_cleaner = CleanTransformer(fix_unicode=False,               # fix various unicode errors
                                    to_ascii=False,                  # transliterate to closest ASCII representation
                                    lower=False,                     # lowercase text
                                    no_line_breaks=False,           # fully strip line breaks as opposed to only normalizing them
                                    no_urls=False,                  # replace all URLs with a special token
                                    no_emails=False,                # replace all email addresses with a special token
                                    no_phone_numbers=False,         # replace all phone numbers with a special token
                                    no_numbers=True,               # replace all numbers with a special token
                                    no_digits=False,                # replace all digits with a special token
                                    no_currency_symbols=False,      # replace all currency symbols with a special token
                                    no_punct=False,                 # remove punctuations
                                    replace_with_number="<number>",
                                    lang="en"                       # set to 'de' for German special handling
                                    )



In [147]:
three_numb_date = r'(<number> <number> <number>)' #YYYY/MM/DD or DD/MM/YYYY or MM/DD/YYYY
literal_months_date= r'(jan|feb|mar|apr|may|jun|jul|aug|sep|nov|dec)\S* ((<number> ){1,2}|([0-9]{1,2}(st|nd|rd|th)))' #Eg. jun 2nd 2020, january 23. 2021
literal_months_reverse_date = r'((<number> {1,2})|[0-9]{1,2}(st|nd|rd|th)) *(jan|feb|mar|apr|may|jun|jul|aug|sep|nov|dec)\S*' #Eg. 10th february, 4th july
all_dates = (three_numb_date) +'|' + (literal_months_date) +'|'+ (literal_months_reverse_date)
multiple_chars = r'(.)\1{3,}'
special_symbols = r'([^<>a-z ])'#Matches special symbols such as © or ™

In [148]:
#string_test='In gold, the open interest SURPRISINGLY ROSE BY A CONSIDERABLE 9126 CONTRACTS UP TO582,421 WITH THE GOOD SIZED RISE IN PRICE OF GOLD WITH YESTERDAY’S TRADING ($5.55). IN ANOTHER HUGE DEVELOPMENT, WE RECEIVED THE TOTAL NUMBER OF GOLD EFP’S ISSUED FOR WEDNESDAY AND IT TOTALED A HUMONGOUS SIZED 12,223 CONTRACTS OF WHICH FEBRUARY SAW 11,023 CONTRACTS ISSUED AND APRIL SAW THE ISSUANCE OF 1200 CONTRACTS.'
#date_test  = '12/18/10 12/18/2020 12-18-10 12-18-2020 12/18/10 12/18/2020 12.18.10 12.18.2020 noise 12182010 december 18, 2010 janu 10th march 1st 3st january Dekjkj 10th  noise 10/20  noise noise 2020 10th january 2021'

def clean_column(data, col_name):
    for i, entry in enumerate(data[col_name]):    
        #We first convert to lower case and replace punctuation with space such that dates can
        #more easily be processed (eg. 10.12.2020 -> 10 12 2020 -> <NUMBER> <NUMBER> <NUMBER> instead of <NUMBER><NUMBER><DIGIT> or something)
        cleaned = initial_cleaner.transform([entry])[0]
        cleaned = general_cleaner.transform([cleaned])[0]
        cleaned = re.sub(all_dates, '<date> ', cleaned)
        cleaned = re.sub(special_symbols,'',cleaned)
        cleaned = re.sub(multiple_chars, '', cleaned)
        data.at[i, col_name] = cleaned

def clean_data(data):
    #Dropping unneeded columns
    cols_to_delete = ["Unnamed: 0","id","scraped_at","inserted_at","updated_at"]
    for column in data.columns:
        if data[column].isnull().values.all():
            cols_to_delete.append(column)

    data.drop(cols_to_delete, 1, inplace=True)
    
    clean_column(data, "content")
    clean_column(data, "title")
    return data


In [151]:
data["content"][4]

'Donald Trump has the unnerving ability to ability to create his own reality and convince millions of Americans that what he says it is true. The problem with the president lying is that he then believes his own lies. A new poll shows how that can get the country into deep trouble.\n\nThe new ABC News/Washington Post poll came out after the president’s physician gave him a physical and mental exam. The doctor gave Trump a clean bill of health, added an inch to his height, and claimed he was fit to serve for seven more years.\n\nThis poll was able to capture Americans’ opinions after a new book came out indicating that people around Trump questioned his emotional stability and ability to hold office. In addition, the new poll gave the respondents the time to hear Trump tell the public that he was a “very stable genius” before they were interviewed. He said:\n\n‘Actually, throughout my life, my two greatest assets have been mental stability and being, like, really smart.’\n\nThe ABC/Wash

Since we are working on a subset of the full dataset, there is no need to include the old index/ID.
Furthermore, since pandas adds an index column, we do not have a need for the already existing (possibly error prone) local index (Unnamed: 0).

Metadata regarding scraping, time of updates etc. do not serve a significant impact on the processing we wish to perform.

In [144]:
data = clean_data(data)
data

  data.drop(cols_to_delete, 1, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


Unnamed: 0,domain,type,url,content,title,authors,meta_keywords,meta_description
0,awm.com,unreliable,http://awm.com/church-congregation-brings-gift...,sometimes the power of christmas will make you...,church congregation brings gift to waitresses ...,Ruth Harris,[''],
1,beforeitsnews.com,fake,http://beforeitsnews.com/awakening-start-here/...,awakening of <number> strands of dna reconnect...,awakening of <number> strands of dna reconnect...,Zurich Times,[''],
2,cnnnext.com,unreliable,http://www.cnnnext.com/video/18526/never-hike-...,never hike alone a friday the th fan film usa ...,never hike alone a friday the th fan film full...,,[''],Never Hike Alone: A Friday the 13th Fan Film ...
3,awm.com,unreliable,http://awm.com/elusive-alien-of-the-sea-caught...,when a rare shark was caught scientists were l...,elusive alien of the sea caught by scientist f...,Alexander Smith,[''],
4,bipartisanreport.com,clickbait,http://bipartisanreport.com/2018/01/21/trumps-...,donald trump has the unnerving ability to abil...,trump s genius poll is complete the results ha...,Gloria Christie,[''],
5,blackagendareport.com,unreliable,https://blackagendareport.com/articlelist/Medi...,republicans and democrats alike are willing to...,black agenda report,"Margaret Kimberley, Bar Editor, Senior Columni...",[''],
6,awarenessact.com,conspiracy,http://awarenessact.com/tag/waking-up-in-the-m...,could you imagine waking up in the morgue i fo...,waking up in the morgue awareness act,Gerald Sinclair,[''],
7,beforeitsnews.com,fake,http://beforeitsnews.com/home/featuredlist/v2_...,citizen journalist by n morgan q has released ...,citizen journalist,,[''],
8,beforeitsnews.com,fake,http://beforeitsnews.com/economy/2018/01/usa-d...,usa dollar tanks on mnuchin statement that he ...,usa dollar tanks on mnuchin statement that he ...,,[''],
9,canadafreepress.com,conspiracy,http://canadafreepress.com/article/its-not-rea...,subscribe to canada free press for free neithe...,it s not really president trump who needs his ...,"Judi Mcleod, Because Without America, There Is...",[''],


In [138]:
print(data["title"][4])

trump s genius poll is complete the results have americans bursting with laughter


In [139]:
data["content"][4]

'donald trump has the unnerving ability to ability to create his own reality and convince millions of americans that what he says it is true the problem with the president lying is that he then believes his own lies a new poll shows how that can get the country into deep trouble the new abc news washington post poll came out after the president s physician gave him a physical and mental exam the doctor gave trump a clean bill of health added an inch to his height and claimed he was fit to serve for seven more years this poll was able to capture americans opinions after a new book came out indicating that people around trump questioned his emotional stability and ability to hold office in addition the new poll gave the respondents the time to hear trump tell the public that he was a very stable genius before they were interviewed he said actually throughout my life my two greatest assets have been mental stability and being like really smart the abc washington post poll discovered that 

In [4]:
# Task 3
# things to explore
# 1: clickbait type vs titles
# 2: word frequency n-gram vs article type
# 3: authors changing type

In [5]:
# Task 4
group_nr = 14
group_substring_raw = "ABCDEFGHIJKLMNOPRSTUVWZABCDEFGHIJKLMNOPRSTUVWZ"[group_nr%23:group_nr%23+10]
group_substring = ""

for letter in np.sort(list(group_substring_raw)):
    group_substring += letter
    
group_substring

'AOPRSTUVWZ'

In [6]:
import requests
from bs4 import BeautifulSoup

response = requests.get('https://en.wikinews.org/wiki/Category:Politics_and_conflicts')
contents = response.text

soup = BeautifulSoup(contents, 'html.parser')

In [7]:
cont = soup.find_all('a', class_="external text")
links = []
for line in cont:
    if np.isin((line.attrs["href"][-1]), list(group_substring)):
        links.append(line.attrs["href"])

links = [links[0]] # Go through each href and scrape untill next h3 letter is hit. TODO
for link in links:
    response = requests.get(link)
    contents = response.text

    soup = BeautifulSoup(contents, 'html.parser')

In [22]:
cont = soup.find_all("div", id="mw-pages")