In [None]:
# psh270, jxs535, fgp424, hkp680

# Task 2
import nltk as nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.probability import FreqDist
from nltk.corpus import wordnet
from nltk.corpus import stopwords
# Fetch the NLTK resources this notebook relies on.
# NOTE(review): 'all' presumably already includes 'punkt' and 'stopwords' — confirm whether the individual calls are still needed.
# NOTE(review): 'corpus' does not look like a valid NLTK package id — confirm.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('corpus')
nltk.download('all')
import numpy as np
import pandas as pd
import re
from cleantext.sklearn import CleanTransformer # likely required to ´pip install clean-text´
# Load the FakeNewsCorpus sample (news_sample.csv) straight from GitHub into a DataFrame.
data = pd.read_csv("https://raw.githubusercontent.com/several27/FakeNewsCorpus/master/news_sample.csv")

In [None]:
def drop_useless_data(data):
    """Remove columns and rows that carry no usable information, in place.

    Drops the bookkeeping columns ("Unnamed: 0", "id", "scraped_at",
    "inserted_at", "updated_at") together with every column that holds only
    null values, then removes the rows whose "type" is missing or "unknown".

    Args:
        data: DataFrame to prune. It is modified in place.

    Returns:
        None.
    """
    # Dropping unneeded columns
    cols_to_delete = ["Unnamed: 0", "id", "scraped_at", "inserted_at", "updated_at"]
    for column in data.columns:
        if column not in cols_to_delete and data[column].isnull().values.all():
            cols_to_delete.append(column)
    # `columns=` replaces the positional axis argument, which recent pandas
    # versions no longer accept. errors="ignore" lets the cell be re-run on an
    # already pruned frame without raising KeyError.
    data.drop(columns=cols_to_delete, errors="ignore", inplace=True)

    # Dropping entries with nan type
    data.dropna(subset=["type"], inplace=True)
    # Dropping entries with unknown type
    data.drop(data.loc[data["type"] == "unknown"].index, inplace=True)

# Prune the loaded frame; the function works in place, so `data` itself is changed.
drop_useless_data(data)

In [None]:
# Number of articles per label in the "type" column, keyed in order of first appearance.
type_distribution = {}
# The loop variable is deliberately not called `type`: at notebook scope that
# would replace the builtin `type` for every later cell.
for article_type in data["type"]:
    type_distribution[article_type] = type_distribution.get(article_type, 0) + 1
type_distribution

In [None]:
# Compare the number of "fake" articles with the number of all other articles.
# .get keeps the cell working when the sample contains no "fake" label at all.
fake = type_distribution.get("fake", 0)
# `label` instead of `type`, so the builtin `type` is not shadowed at notebook scope.
other = sum(count for label, count in type_distribution.items() if label != "fake")

print(f"fake: {fake}, other: {other}")

In [None]:
def get_type_col_distribution(col, func, frame=None):
    """Accumulate func(value) of one column separately for each article type.

    Args:
        col: Name of the column whose values are passed to func.
        func: Callable applied to every value of col. Its results are added
            up per type, so they must support +.
        frame: Table providing the "type" and col columns. When omitted, the
            notebook-level `data` is used.

    Returns:
        dict mapping each type to a tuple
        (sum of the func results, number of rows of that type).
    """
    if frame is None:
        frame = data
    distribution = {}
    for article_type, val in zip(frame["type"], frame[col]):
        score = func(val)
        if article_type in distribution:
            running_total, row_count = distribution[article_type]
            distribution[article_type] = (running_total + score, row_count + 1)
        else:
            distribution[article_type] = (score, 1)
    return distribution

def average_type_col_distrubtion(dist):
    """Turn per-type (total, count) pairs into per-type averages.

    Args:
        dist: dict mapping a type to a two-element (total, count) pair.

    Returns:
        dict mapping every type of dist to total / count.
    """
    return {label: total / count for label, (total, count) in dist.items()}

In [None]:
# Average title length per article type: len() of every "title" value is
# summed per type and then divided by the number of rows of that type.
title_len_distribution = get_type_col_distribution("title", len)
avg_title_len_distribution = average_type_col_distrubtion(title_len_distribution)
avg_title_len_distribution

In [None]:
# Average content length per article type: len() of every "content" value is
# summed per type and then divided by the number of rows of that type.
content_len_distribution = get_type_col_distribution("content", len)
avg_content_len_distribution = average_type_col_distrubtion(content_len_distribution)
avg_content_len_distribution

In [None]:
# Per article type: (rows whose "authors" value does not stringify to "nan", total rows).
has_author_distribution = get_type_col_distribution("authors", lambda authors : 0 if str(authors) == "nan" else 1)
has_author_distribution

In [None]:
# First cleaning pass: fixes unicode, transliterates to ASCII, lowercases,
# strips line breaks, swaps URLs / e-mail addresses / phone numbers / currency
# symbols for the tags given below and replaces punctuation with a space.
# Numbers and digits are deliberately left untouched in this pass.
initial_cleaner = CleanTransformer(fix_unicode=True,               # fix various unicode errors
                                    to_ascii=True,                  # transliterate to closest ASCII representation
                                    lower=True,                     # lowercase text
                                    no_line_breaks=True,           # fully strip line breaks as opposed to only normalizing them
                                    no_urls=True,                  # replace all URLs with a special token
                                    no_emails=True,                # replace all email addresses with a special token
                                    no_phone_numbers=True,         # replace all phone numbers with a special token
                                    no_numbers=False,               # replace all numbers with a special token
                                    no_digits=False,                # replace all digits with a special token
                                    no_currency_symbols=True,      # replace all currency symbols with a special token
                                    no_punct=True,                 # remove punctuations
                                    replace_with_punct=" ",          # instead of removing punctuations you may replace them
                                    replace_with_url="<url>",
                                    replace_with_email="<email>",
                                    replace_with_phone_number="<phone>",
                                    replace_with_currency_symbol="<cur>",
                                    lang="en"                       # set to 'de' for German special handling
                                    )

# Second cleaning pass: every option is switched off except no_numbers, so the
# only thing it does is replace numbers with the "<number>" tag.
general_cleaner = CleanTransformer(fix_unicode=False,               # fix various unicode errors
                                    to_ascii=False,                  # transliterate to closest ASCII representation
                                    lower=False,                     # lowercase text
                                    no_line_breaks=False,           # fully strip line breaks as opposed to only normalizing them
                                    no_urls=False,                  # replace all URLs with a special token
                                    no_emails=False,                # replace all email addresses with a special token
                                    no_phone_numbers=False,         # replace all phone numbers with a special token
                                    no_numbers=True,               # replace all numbers with a special token
                                    no_digits=False,                # replace all digits with a special token
                                    no_currency_symbols=False,      # replace all currency symbols with a special token
                                    no_punct=False,                 # remove punctuations
                                    replace_with_number="<number>",
                                    lang="en"                       # set to 'de' for German special handling
                                    )



In [None]:
def WordFreq(col_name, article_number, input):
    """Return the ratio of total items to distinct items for one article.

    Args:
        col_name: Column to read.
        article_number: Index label of the article inside that column.
        input: Table indexable as input[col_name][article_number]. The
            selected entry must be a sized iterable of hashable items: a list
            of tokens, or a string (in which case characters are counted).

    Returns:
        len(entry) / len(set(entry)), or 0.0 when the entry is empty.
    """
    entry = input[col_name][article_number]
    if len(entry) == 0:
        # An empty entry has no distinct items; dividing would raise ZeroDivisionError.
        return 0.0
    return len(entry) / len(set(entry))

def WordFreqSet(set_value , set_data):
    """Compute WordFreq for every row of a table.

    Args:
        set_value: Column name handed to WordFreq.
        set_data: Table exposing `.index`; each index label is handed to
            WordFreq together with the table itself.

    Returns:
        list of the WordFreq results, in index order.
    """
    return [WordFreq(set_value, label, set_data) for label in set_data.index]

In [None]:
# Patterns applied by clean_column to text that has already been lowercased,
# had its punctuation replaced by spaces and its numbers replaced by "<number>".
three_numb_date = r'(<number> <number> <number>)' #YYYY/MM/DD or DD/MM/YYYY or MM/DD/YYYY
# Month prefixes shared by both literal-month patterns ("oct" included).
month_names = r'(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)'
# The leading \b stops the month prefix from matching inside another word
# (e.g. the "mar" in "summary <number>").
literal_months_date = r'\b' + month_names + r'\S* ((<number> ){1,2}|([0-9]{1,2}(st|nd|rd|th)))' #Eg. jun 2nd 2020, january 23. 2021
# The leading number is the "<number> " tag (once or twice) or an ordinal such as 10th.
literal_months_reverse_date = r'(((<number> ){1,2})|[0-9]{1,2}(st|nd|rd|th)) *' + month_names + r'\S*' #Eg. 10th february, 4th july
all_dates = (three_numb_date) +'|' + (literal_months_date) +'|'+ (literal_months_reverse_date)
# Any single character repeated four or more times in a row.
multiple_chars = r'(.)\1{3,}'
# Every character other than a-z, space, "<" and ">": special symbols such as © or ™, but also digits.
special_symbols = r'([^<>a-z ])'

In [None]:
# switch tags, compatibility with lemmatise() 
def switchTag(tag):
    """Map a POS tag to the matching wordnet part-of-speech constant.

    Tags starting with "V" give wordnet.VERB, with "J" or "A" wordnet.ADJ and
    with "R" wordnet.ADV. Everything else, tags starting with "N" included,
    gives wordnet.NOUN.
    """
    if tag.startswith('V'):
        return wordnet.VERB
    if tag.startswith(('J', 'A')):
        return wordnet.ADJ
    if tag.startswith('R'):
        return wordnet.ADV
    # Nouns and every unrecognised tag share the same result.
    return wordnet.NOUN

#string_test='In gold, the open interest SURPRISINGLY ROSE BY A CONSIDERABLE 9126 CONTRACTS UP TO582,421 WITH THE GOOD SIZED RISE IN PRICE OF GOLD WITH YESTERDAY’S TRADING ($5.55). IN ANOTHER HUGE DEVELOPMENT, WE RECEIVED THE TOTAL NUMBER OF GOLD EFP’S ISSUED FOR WEDNESDAY AND IT TOTALED A HUMONGOUS SIZED 12,223 CONTRACTS OF WHICH FEBRUARY SAW 11,023 CONTRACTS ISSUED AND APRIL SAW THE ISSUANCE OF 1200 CONTRACTS.'
#date_test  = '12/18/10 12/18/2020 12-18-10 12-18-2020 12/18/10 12/18/2020 12.18.10 12.18.2020 noise 12182010 december 18, 2010 janu 10th march 1st 3st january Dekjkj 10th  noise 10/20  noise noise 2020 10th january 2021'

def clean_column(data, col_name):
    """Clean every entry of one text column and write the result back in place.

    Each entry goes through initial_cleaner and general_cleaner; afterwards
    date expressions become "<date> ", characters matched by special_symbols
    are removed and runs matched by multiple_chars are removed.

    Args:
        data: DataFrame holding the column; it is modified in place.
        col_name: Name of the column to clean.
    """
    # Applied in exactly this order to every entry.
    substitutions = (
        (all_dates, '<date> '),
        (special_symbols, ''),
        (multiple_chars, ''),
    )
    for label, raw_text in data[col_name].items():
        # Lowercasing and turning punctuation into spaces comes first, so that
        # dates are easier to recognise afterwards (eg. 10.12.2020 -> 10 12 2020
        # -> <NUMBER> <NUMBER> <NUMBER> rather than one fused token).
        text = initial_cleaner.transform([raw_text])[0]
        text = general_cleaner.transform([text])[0]
        for pattern, replacement in substitutions:
            text = re.sub(pattern, replacement, text)
        data.at[label, col_name] = text

def clean_data(data):
    """Clean the "content" and then the "title" column of data in place.

    Returns:
        The same data object that was passed in.
    """
    for column in ("content", "title"):
        clean_column(data, column)
    return data

def lemmatise_column(data, col_name):
    """Lemmatise every entry of a text column.

    For each entry the "<date>" and "<number>" tags are removed, the remaining
    text is tokenised and POS-tagged, and every token is lemmatised with the
    wordnet part of speech that switchTag derives from its tag.

    Args:
        data: Table indexable as data[col_name], yielding the text entries.
        col_name: Name of the column to lemmatise.

    Returns:
        Given n articles, a list of n lists, each containing the lemmas of
        one particular article.
    """
    # One lemmatizer serves every article; it does not depend on the entry.
    lemmatizer = WordNetLemmatizer()
    lemmatised_articles = []
    for entry in data[col_name]:
        text = entry.replace('<date>', '').replace('<number>', '')
        tagged_tokens = nltk.pos_tag(word_tokenize(text))
        lemmatised_articles.append([
            lemmatizer.lemmatize(token, pos=switchTag(tag))
            for token, tag in tagged_tokens
        ])
    return lemmatised_articles

def count_and_clean(data,col_name):
    """Count the placeholder tags present in a cleaned text column.

    Args:
        data: Table indexable as data[col_name], yielding the text entries.
        col_name: Name of the column to scan.

    Returns:
        dict with the number of occurrences, summed over all entries, of each
        tag: "numbers" (<number>), "dates" (<date>), "urls" (<url>),
        "emails" (<email>), "phone_num" (<phone>) and "currencies" (<cur>).
        Entries that are not strings are skipped. data is not modified.
    """
    tags = {
        "numbers": "<number>",
        "dates": "<date>",
        "urls": "<url>",
        "emails": "<email>",
        "phone_num": "<phone>",
        "currencies": "<cur>",
    }
    counts = dict.fromkeys(tags, 0)
    for entry in data[col_name]:
        if not isinstance(entry, str):
            continue
        for key, tag in tags.items():
            # str.count gives the number of tags directly; comparing string
            # lengths before and after a replace would measure characters.
            counts[key] += entry.count(tag)
    return counts

In [None]:
# Display options: never truncate rows or cell contents when a frame is shown.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)

# Ratio on the raw "content" strings (counted per character); it is replaced
# further down by the value computed on the lemmatised tokens.
wfs = WordFreqSet("content", data)
data = clean_data(data)
# count dates and number 
df = pd.DataFrame(data, columns=['domain','type','url','content','title','authors', 'meta_keywords', 'meta_description', 'tags'])

# `data` stays a DataFrame: the lemmas go into their own frame, carrying the
# same index labels, because WordFreqSet needs `.index` and column access.
lemmatised_content = lemmatise_column(data, "content")
lemmatised_frame = pd.DataFrame(
    {"content": pd.Series(lemmatised_content, index=data.index, dtype=object)}
)
wfs = WordFreqSet("content", lemmatised_frame)
print(wfs)
#df2 = df.assign(WordFreq = wfs)


In [None]:
# `df` holds the cleaned articles as a DataFrame, which is what
# lemmatise_column expects as input.
lemmatised_data = lemmatise_column(df, "content")

#Freq Distribution for all articles
# FreqDist counts hashable items, so the per-article lemma lists are
# flattened into one stream of lemmas.
dsitr = FreqDist(lemma for article in lemmatised_data for lemma in article)

#

Task 3

We have observed that all fake news articles have been distributed by beforeitsnews.com.

Besides that, we have observed that no fake news articles contain meta_keywords or meta_description.

We also observed that conspiracy, hate and clickbait articles have a very high share of entries with a listed author.

Hate articles tend to have longer sentences, most likely due to rants.

Reliable articles seem to have a lower word-length count per sentence and a longer sentence length.



In [None]:
### Task 4
# Initialize Group SubString
group_nr = 14
# The alphabet (written without Q, X and Y) is doubled so that a 10-letter
# window starting at group_nr modulo its length never runs past the end.
group_alphabet = "ABCDEFGHIJKLMNOPRSTUVWZ"
window_start = group_nr % len(group_alphabet)
group_substring_raw = (group_alphabet * 2)[window_start:window_start + 10]
# The letters of the window in alphabetical order, as one plain string.
group_substring = "".join(sorted(group_substring_raw))

print(group_substring)

In [None]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

# Get main page and add subpage (according to group_substring) urls to list
CATEGORY_URL = 'https://en.wikinews.org/wiki/Category:Politics_and_conflicts'
# A timeout keeps the cell from hanging forever on a stalled connection;
# raise_for_status stops an error page from being parsed as the category page.
response = requests.get(CATEGORY_URL, timeout=30)
response.raise_for_status()
contents = response.text

soup = BeautifulSoup(contents, 'html.parser')

subpages = []
for a in soup.find_all('a', href=True):
    href = a["href"]
    if any("conflicts&from=" + letter in href for letter in group_substring):
        # The subpages are requested later on, so relative or protocol-relative
        # links are resolved against the category URL; absolute links are
        # returned unchanged by urljoin.
        subpage_url = urljoin(CATEGORY_URL, href)
        if subpage_url not in subpages:
            subpages.append(subpage_url)

In [None]:
# Creating a list of articles on THE FIRST PAGE ONLY, NEED TO FIX LATER
# Articles gets one list of article URLs per entry of subpages, in the same order.
Articles = []
for url in subpages:
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    contents = response.text

    soup = BeautifulSoup(contents, 'html.parser')

    # The last character of the subpage URL is the letter it was requested for.
    letter_heading = "<h3>" + url[-1] + "</h3>"
    # Reset for every subpage, so that a page without a matching group can
    # neither reuse the previous page's group nor hit an undefined name.
    pages = None
    for group in soup.find_all("div", attrs={"class": "mw-category-group"}):
        group_html = str(group)
        if letter_heading in group_html and "<ul><li><a" in group_html:
            pages = group
            break

    article_urls = []
    if pages is not None:
        # The href attributes are read from the parsed links instead of being
        # cut out of the HTML text with a regex, which breaks as soon as an
        # article title itself contains the word "title".
        for link in pages.find_all("a", href=True):
            if link["href"].startswith("/wiki"):
                article_urls.append("https://en.wikinews.org" + link["href"])

    Articles.append(article_urls)

In [None]:
def GrabArticle(url):
    """Download one article page and return its publication date.

    The date is read from the markup of the span with id "publishDate" when
    that markup contains digits. Otherwise the first "Month D, YYYY" style
    string found in the markup of the "mw-parser-output" div is used.

    Args:
        url: Address of the article page.

    Returns:
        The date as a string, or "NaN" when neither lookup finds one.
    """
    # A timeout keeps one stalled article from blocking the whole scrape.
    response = requests.get(url, timeout=30)
    contents = response.text

    soup = BeautifulSoup(contents, 'html.parser')

    # TODO: Need to convert from "MonthName Day, Year" to "Year-Month-Day"
    # [0-9] instead of [1-9]: the match has to be allowed to end on a zero,
    # otherwise a date such as 2005-04-10 is cut down to 2005-04-1.
    # When the element is missing, str(None) contains no digits and therefore
    # simply does not match.
    match = re.search('[0-9]+.*[0-9]', str(soup.find("span", attrs={"id":"publishDate"})))
    if match is None:
        match = re.search('[A-Z][a-z]+ [0-9]+, [0-9][0-9][0-9][0-9]', str(soup.find("div", attrs={"class":"mw-parser-output"})))
    date = match.group(0) if match is not None else "NaN"

    # text and srcs are collected but not returned yet (see the commented-out
    # part of the return statement).
    text = soup.get_text() #currently displays EVERYTHING on page, needs work

    #srcs section needs work
    srcs = list(soup.find_all("span",attrs={"class":"sourceTemplate"}))

    return date#,srcs,text

In [None]:
# Scrape every article page for its date; the URL and date lists stay aligned
# and become the two columns of the Task 4 dataframe.
urls = [url for articles in Articles for url in articles]
dates = [GrabArticle(url) for url in urls]

Task4df = pd.DataFrame(data = {"URL" : urls, "Date written" : dates})