In [312]:
# psh270, abc123, def456

# Task 2
import numpy as np
import pandas as pd
import re
from cleantext.sklearn import CleanTransformer
data = pd.read_csv("https://raw.githubusercontent.com/several27/FakeNewsCorpus/master/news_sample.csv")

In [313]:
initial_cleaner = CleanTransformer(fix_unicode=True,               # fix various unicode errors
                                    to_ascii=True,                  # transliterate to closest ASCII representation
                                    lower=True,                     # lowercase text
                                    no_line_breaks=True,           # fully strip line breaks as opposed to only normalizing them
                                    no_urls=True,                  # replace all URLs with a special token
                                    no_emails=True,                # replace all email addresses with a special token
                                    no_phone_numbers=True,         # replace all phone numbers with a special token
                                    no_numbers=False,               # replace all numbers with a special token
                                    no_digits=False,                # replace all digits with a special token
                                    no_currency_symbols=True,      # replace all currency symbols with a special token
                                    no_punct=True,                 # remove punctuations
                                    replace_with_punct=" ",          # instead of removing punctuations you may replace them
                                    replace_with_url="<url>",
                                    replace_with_email="<email>",
                                    replace_with_phone_number="<phone>",
                                    replace_with_currency_symbol="<cur>",
                                    lang="en"                       # set to 'de' for German special handling
                                    )

general_cleaner = CleanTransformer(fix_unicode=False,               # fix various unicode errors
                                    to_ascii=False,                  # transliterate to closest ASCII representation
                                    lower=False,                     # lowercase text
                                    no_line_breaks=False,           # fully strip line breaks as opposed to only normalizing them
                                    no_urls=False,                  # replace all URLs with a special token
                                    no_emails=False,                # replace all email addresses with a special token
                                    no_phone_numbers=False,         # replace all phone numbers with a special token
                                    no_numbers=True,               # replace all numbers with a special token
                                    no_digits=False,                # replace all digits with a special token
                                    no_currency_symbols=False,      # replace all currency symbols with a special token
                                    no_punct=False,                 # remove punctuations
                                    replace_with_number="<number>",
                                    lang="en"                       # set to 'de' for German special handling
                                    )



In [314]:
three_numb_date = r'(<number> <number> <number>)' #YYYY/MM/DD or DD/MM/YYYY or MM/DD/YYYY
literal_months_date= r'(jan|feb|mar|apr|may|jun|jul|aug|sep|nov|dec)\S* ((<number> ){1,2}|([0-9]{1,2}(st|nd|rd|th)))' #Eg. jun 2nd 2020, january 23. 2021
literal_months_reverse_date = r'((<number> {1,2})|[0-9]{1,2}(st|nd|rd|th)) *(jan|feb|mar|apr|may|jun|jul|aug|sep|nov|dec)\S*' #Eg. 10th february, 4th july
all_dates = (three_numb_date) +'|' + (literal_months_date) +'|'+ (literal_months_reverse_date)

multiple_chars = r'(.)\1{3,}'

In [315]:
#string_test='In gold, the open interest SURPRISINGLY ROSE BY A CONSIDERABLE 9126 CONTRACTS UP TO582,421 WITH THE GOOD SIZED RISE IN PRICE OF GOLD WITH YESTERDAY’S TRADING ($5.55). IN ANOTHER HUGE DEVELOPMENT, WE RECEIVED THE TOTAL NUMBER OF GOLD EFP’S ISSUED FOR WEDNESDAY AND IT TOTALED A HUMONGOUS SIZED 12,223 CONTRACTS OF WHICH FEBRUARY SAW 11,023 CONTRACTS ISSUED AND APRIL SAW THE ISSUANCE OF 1200 CONTRACTS.'
#date_test  = '12/18/10 12/18/2020 12-18-10 12-18-2020 12/18/10 12/18/2020 12.18.10 12.18.2020 noise 12182010 december 18, 2010 janu 10th march 1st 3st january Dekjkj 10th  noise 10/20  noise noise 2020 10th january 2021'
def clean_data(data):
    #Dropping unneeded columns
    cols_to_delete = ["Unnamed: 0","id","scraped_at","inserted_at","updated_at"]
    for column in data.columns:
        if data[column].isnull().values.all():
            cols_to_delete.append(column)

    data.drop(cols_to_delete, 1,inplace=True)
    
    #match = re.compile(all_dates)
    for i, content in enumerate(data["content"]):    
        #We first convert to lower case and replace punctuation with space such that dates can
        #more easily be processed (eg. 10.12.2020 -> 10 12 2020 -> <NUMBER> <NUMBER> <NUMBER> instead of <NUMBER><NUMBER><DIGIT> or something)
        new_content = initial_cleaner.transform([content])[0]
        new_content = general_cleaner.transform([new_content])[0]
        new_content = re.sub(all_dates, '<date> ', new_content)
        #all_matches = match.findall(new_content)
        data.at[i, "content"] = new_content
    return data


In [316]:
data.at[0, "content"]

'Sometimes the power of Christmas will make you do wild and wonderful things. You do not need to believe in the Holy Trinity to believe in the positive power of doing good for others. The simple act of giving without receiving is lost on many of us these days, as worries about money and success hold us back from giving to others who are in need. One congregation in Ohio was moved to action by the power of a sermon given at their church on Christmas Eve. The pastor at Grand Lake United Methodist Church in Celina, Ohio gave an emotional sermon about the importance of understanding the message of Jesus.\n\nFor many religious people the message of Jesus is to help others before yourself, to make sure the people who are suffering get the help they need to enjoy life a little bit. The sermon was really about generosity and what that can look like in our lives. Jesus lived a long time ago and he acted generously in the fashion of his time – but what would a generous act look like in our times

Since we are working on a subset of the full dataset, there is no need to include the old index/ID.
Furthermore, since pandas adds an index column, we do not have a need for the already existing (possibly error prone) local index (Unnamed: 0).

Metadata regarding scraping, time of updates etc. do not serve a significant impact on the processing we wish to perform.

In [317]:
data = clean_data(data[:10])
data

  data.drop(cols_to_delete, 1,inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


Unnamed: 0,domain,type,url,content,title,authors,meta_keywords,meta_description
0,awm.com,unreliable,http://awm.com/church-congregation-brings-gift...,sometimes the power of christmas will make you...,Church Congregation Brings Gift to Waitresses ...,Ruth Harris,[''],
1,beforeitsnews.com,fake,http://beforeitsnews.com/awakening-start-here/...,awakening of <number> strands of dna reconnect...,AWAKENING OF 12 STRANDS of DNA – “Reconnecting...,Zurich Times,[''],
2,cnnnext.com,unreliable,http://www.cnnnext.com/video/18526/never-hike-...,never hike alone a friday the 13th fan film us...,Never Hike Alone - A Friday the 13th Fan Film ...,,[''],Never Hike Alone: A Friday the 13th Fan Film ...
3,awm.com,unreliable,http://awm.com/elusive-alien-of-the-sea-caught...,when a rare shark was caught scientists were l...,Elusive ‘Alien Of The Sea ‘ Caught By Scientis...,Alexander Smith,[''],
4,bipartisanreport.com,clickbait,http://bipartisanreport.com/2018/01/21/trumps-...,donald trump has the unnerving ability to abil...,Trump’s Genius Poll Is Complete & The Results ...,Gloria Christie,[''],
5,blackagendareport.com,unreliable,https://blackagendareport.com/articlelist/Medi...,republicans and democrats alike are willing to...,Black Agenda Report,"Margaret Kimberley, Bar Editor, Senior Columni...",[''],
6,awarenessact.com,conspiracy,http://awarenessact.com/tag/waking-up-in-the-m...,could you imagine waking up in the morgue i fo...,waking up in the morgue – Awareness Act,Gerald Sinclair,[''],
7,beforeitsnews.com,fake,http://beforeitsnews.com/home/featuredlist/v2_...,citizen journalist by n morgan q has released ...,Citizen Journalist,,[''],
8,beforeitsnews.com,fake,http://beforeitsnews.com/economy/2018/01/usa-d...,usa dollar tanks on mnuchin statement that he ...,Usa Dollar Tanks On Mnuchin Statement That He ...,,[''],
9,canadafreepress.com,conspiracy,http://canadafreepress.com/article/its-not-rea...,subscribe to canada free press for free neithe...,It’s Not Really President Trump Who Needs His ...,"Judi Mcleod, Because Without America, There Is...",[''],


In [318]:
print(data["content"][9])

subscribe to canada free press for free neither rev rodriguez nor pope francis will be there but president trump will be addressing the crowd by satellite it s not really president trump who needs his mouth washed out with soap throwing the spotlight on so called men of the cloth it s not president donald trump who needs to have his mouth washed out with soap for having allegedly made use of the vulgar s word at a white house meeting on immigration there s just no getting around it that some words are more painful than cuss words particularly when they heap praise on abortion advocates members of president trump s evangelical advisory council decided to jump ship for the day and publicly sided with nancy pelosi on daca redstate <date> you have always taken the lead your commitment to the immigrant community to the dreamers is second to none so america is in a better place because of your prophetic leadership on this matter the rev samuel rodriguez who spoke at trump s inauguration last

In [4]:
# Task 3

In [5]:
# Task 4
group_nr = 14
group_substring_raw = "ABCDEFGHIJKLMNOPRSTUVWZABCDEFGHIJKLMNOPRSTUVWZ"[group_nr%23:group_nr%23+10]
group_substring = ""

for letter in np.sort(list(group_substring_raw)):
    group_substring += letter
    
group_substring

'AOPRSTUVWZ'

In [6]:
import requests
from bs4 import BeautifulSoup

response = requests.get('https://en.wikinews.org/wiki/Category:Politics_and_conflicts')
contents = response.text

soup = BeautifulSoup(contents, 'html.parser')

In [7]:
cont = soup.find_all('a', class_="external text")
links = []
for line in cont:
    if np.isin((line.attrs["href"][-1]), list(group_substring)):
        links.append(line.attrs["href"])

links = [links[0]] # Go through each href and scrape untill next h3 letter is hit. TODO
for link in links:
    response = requests.get(link)
    contents = response.text

    soup = BeautifulSoup(contents, 'html.parser')

In [22]:
cont = soup.find_all("div", id="mw-pages")