In [292]:
# psh270, jxs535, fgp424, hkp680

# Task 2
import nltk as nltk
from nltk.tokenize import sent_tokenize, word_tokenize
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('corpus')
import numpy as np
import pandas as pd
import re
from cleantext.sklearn import CleanTransformer # likely required to ´pip install clean-text´
data = pd.read_csv("https://raw.githubusercontent.com/several27/FakeNewsCorpus/master/news_sample.csv")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Peter\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Peter\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Error loading corpus: Package 'corpus' not found in index


In [293]:
def drop_useless_data(data):
    #Dropping unneeded columns
    cols_to_delete = ["Unnamed: 0","id","scraped_at","inserted_at","updated_at"]
    for column in data.columns:
        if data[column].isnull().values.all():
            cols_to_delete.append(column)
    data.drop(cols_to_delete, 1, inplace=True)
    
    #Dropping entries with nan type
    data.dropna(subset = ["type"], inplace = True)
    #Dropping entries with unknown type
    data.drop(data.loc[data["type"] == "unknown"].index, inplace=True)

drop_useless_data(data)

  data.drop(cols_to_delete, 1, inplace=True)


In [294]:
type_distribution = {}
for type in data["type"]:
    if type not in type_distribution:
        type_distribution[type] = 1
    else:
        type_distribution[type] += 1
type_distribution

{'unreliable': 6,
 'fake': 155,
 'clickbait': 1,
 'conspiracy': 31,
 'reliable': 3,
 'bias': 6,
 'hate': 1,
 'junksci': 6,
 'political': 23}

In [295]:
fake = type_distribution["fake"]
other = 0
for type in type_distribution:
    if type != "fake":
        other += type_distribution[type]

print(f"fake: {fake}, other: {other}")

fake: 155, other: 77


In [296]:
def get_type_col_distribution(col, func):
    distribution = {}
    for type, val in zip(data["type"], data[col]):
        if type not in distribution:
            distribution[type] = (func(val), 1)
        else:
            cur_val, num_type = distribution[type]
            distribution[type] = cur_val + func(val), num_type + 1
    return distribution

def average_type_col_distrubtion(dist):
    avg_dist = {}
    for key in dist:
        total_val, num_type = dist[key]
        avg_val = total_val / num_type
        avg_dist[key] = avg_val
    return avg_dist

In [297]:
title_len_distribution = get_type_col_distribution("title", len)
avg_title_len_distribution = average_type_col_distrubtion(title_len_distribution)
avg_title_len_distribution

{'unreliable': 61.666666666666664,
 'fake': 61.33548387096774,
 'clickbait': 83.0,
 'conspiracy': 50.61290322580645,
 'reliable': 52.333333333333336,
 'bias': 58.666666666666664,
 'hate': 24.0,
 'junksci': 72.5,
 'political': 61.65217391304348}

In [298]:
content_len_distribution = get_type_col_distribution("content", len)
avg_content_len_distribution = average_type_col_distrubtion(content_len_distribution)
avg_content_len_distribution

{'unreliable': 1900.8333333333333,
 'fake': 4105.096774193548,
 'clickbait': 1707.0,
 'conspiracy': 4966.580645161291,
 'reliable': 2611.6666666666665,
 'bias': 2736.5,
 'hate': 334.0,
 'junksci': 2382.1666666666665,
 'political': 3992.4347826086955}

In [299]:
has_author_distribution = get_type_col_distribution("authors", lambda authors : 0 if str(authors) == "nan" else 1)
has_author_distribution

{'unreliable': (5, 6),
 'fake': (104, 155),
 'clickbait': (1, 1),
 'conspiracy': (30, 31),
 'reliable': (0, 3),
 'bias': (4, 6),
 'hate': (1, 1),
 'junksci': (2, 6),
 'political': (18, 23)}

In [300]:
initial_cleaner = CleanTransformer(fix_unicode=True,               # fix various unicode errors
                                    to_ascii=True,                  # transliterate to closest ASCII representation
                                    lower=True,                     # lowercase text
                                    no_line_breaks=True,           # fully strip line breaks as opposed to only normalizing them
                                    no_urls=True,                  # replace all URLs with a special token
                                    no_emails=True,                # replace all email addresses with a special token
                                    no_phone_numbers=True,         # replace all phone numbers with a special token
                                    no_numbers=False,               # replace all numbers with a special token
                                    no_digits=False,                # replace all digits with a special token
                                    no_currency_symbols=True,      # replace all currency symbols with a special token
                                    no_punct=True,                 # remove punctuations
                                    replace_with_punct=" ",          # instead of removing punctuations you may replace them
                                    replace_with_url="<url>",
                                    replace_with_email="<email>",
                                    replace_with_phone_number="<phone>",
                                    replace_with_currency_symbol="<cur>",
                                    lang="en"                       # set to 'de' for German special handling
                                    )

general_cleaner = CleanTransformer(fix_unicode=False,               # fix various unicode errors
                                    to_ascii=False,                  # transliterate to closest ASCII representation
                                    lower=False,                     # lowercase text
                                    no_line_breaks=False,           # fully strip line breaks as opposed to only normalizing them
                                    no_urls=False,                  # replace all URLs with a special token
                                    no_emails=False,                # replace all email addresses with a special token
                                    no_phone_numbers=False,         # replace all phone numbers with a special token
                                    no_numbers=True,               # replace all numbers with a special token
                                    no_digits=False,                # replace all digits with a special token
                                    no_currency_symbols=False,      # replace all currency symbols with a special token
                                    no_punct=False,                 # remove punctuations
                                    replace_with_number="<number>",
                                    lang="en"                       # set to 'de' for German special handling
                                    )



In [301]:
def WordFreq(col_name, article_number):
    q = data[col_name][article_number]
    qq = word_tokenize(q)
    unique_words = set(qq)
    unique_word_count = len(unique_words)
    qqq = len(q)/unique_word_count
    return qqq

print(WordFreq("content", 1))
def WordFreqSet(set_value , set_data):
    WordFreqArray = []
    q = set_data.shape[0]
    for col in set_data.index:
        WordFreqArray.append(WordFreq(set_value, col))
    return WordFreqArray
print(WordFreqSet("content", data))

9.101449275362318
[11.804255319148936, 9.101449275362318, 9.859934853420196, 10.782287822878228, 9.430939226519337, 6.0, 5.742857142857143, 6.428571428571429, 15.995495495495495, 11.822327044025156, 16.725090036014407, 7.371428571428571, 9.903225806451612, 12.92156862745098, 5.566666666666666, 11.116222760290556, 5.206896551724138, 10.858181818181817, 13.167420814479637, 7.930769230769231, 10.84920634920635, 9.7875, 9.802721088435375, 5.136363636363637, 8.455089820359282, 8.426229508196721, 13.222527472527473, 7.208333333333333, 9.057803468208093, 11.95357833655706, 9.1875, 8.738970588235293, 11.480243161094226, 10.67663043478261, 10.49546827794562, 13.12751677852349, 11.216216216216216, 9.529680365296803, 11.266917293233083, 17.268365817091453, 10.872641509433961, 5.5, 18.973826714801444, 13.551369863013699, 7.443396226415095, 8.497175141242938, 12.66594827586207, 14.68365553602812, 11.785106382978723, 12.743872549019608, 16.637447823494334, 10.975806451612904, 11.851985559566787, 12.

In [302]:
three_numb_date = r'(<number> <number> <number>)' #YYYY/MM/DD or DD/MM/YYYY or MM/DD/YYYY
literal_months_date= r'(jan|feb|mar|apr|may|jun|jul|aug|sep|nov|dec)\S* ((<number> ){1,2}|([0-9]{1,2}(st|nd|rd|th)))' #Eg. jun 2nd 2020, january 23. 2021
literal_months_reverse_date = r'((<number> {1,2})|[0-9]{1,2}(st|nd|rd|th)) *(jan|feb|mar|apr|may|jun|jul|aug|sep|nov|dec)\S*' #Eg. 10th february, 4th july
all_dates = (three_numb_date) +'|' + (literal_months_date) +'|'+ (literal_months_reverse_date)
multiple_chars = r'(.)\1{3,}'
special_symbols = r'([^<>a-z ])'#Matches special symbols such as © or ™

In [303]:
#string_test='In gold, the open interest SURPRISINGLY ROSE BY A CONSIDERABLE 9126 CONTRACTS UP TO582,421 WITH THE GOOD SIZED RISE IN PRICE OF GOLD WITH YESTERDAY’S TRADING ($5.55). IN ANOTHER HUGE DEVELOPMENT, WE RECEIVED THE TOTAL NUMBER OF GOLD EFP’S ISSUED FOR WEDNESDAY AND IT TOTALED A HUMONGOUS SIZED 12,223 CONTRACTS OF WHICH FEBRUARY SAW 11,023 CONTRACTS ISSUED AND APRIL SAW THE ISSUANCE OF 1200 CONTRACTS.'
#date_test  = '12/18/10 12/18/2020 12-18-10 12-18-2020 12/18/10 12/18/2020 12.18.10 12.18.2020 noise 12182010 december 18, 2010 janu 10th march 1st 3st january Dekjkj 10th  noise 10/20  noise noise 2020 10th january 2021'

def clean_column(data, col_name):
    for i, entry in enumerate(data[col_name]):    
        #We first convert to lower case and replace punctuation with space such that dates can
        #more easily be processed (eg. 10.12.2020 -> 10 12 2020 -> <NUMBER> <NUMBER> <NUMBER> instead of <NUMBER><NUMBER><DIGIT> or something)
        cleaned = initial_cleaner.transform([entry])[0]
        cleaned = general_cleaner.transform([cleaned])[0]
        cleaned = re.sub(all_dates, '<date> ', cleaned)
        cleaned = re.sub(special_symbols,'',cleaned)
        cleaned = re.sub(multiple_chars, '', cleaned)
        data.at[i, col_name] = cleaned

def clean_data(data):
    clean_column(data, "content")
    clean_column(data, "title")
    return data


In [304]:
data["content"][4]

'Donald Trump has the unnerving ability to ability to create his own reality and convince millions of Americans that what he says it is true. The problem with the president lying is that he then believes his own lies. A new poll shows how that can get the country into deep trouble.\n\nThe new ABC News/Washington Post poll came out after the president’s physician gave him a physical and mental exam. The doctor gave Trump a clean bill of health, added an inch to his height, and claimed he was fit to serve for seven more years.\n\nThis poll was able to capture Americans’ opinions after a new book came out indicating that people around Trump questioned his emotional stability and ability to hold office. In addition, the new poll gave the respondents the time to hear Trump tell the public that he was a “very stable genius” before they were interviewed. He said:\n\n‘Actually, throughout my life, my two greatest assets have been mental stability and being, like, really smart.’\n\nThe ABC/Wash

Since we are working on a subset of the full dataset, there is no need to include the old index/ID.
Furthermore, since pandas adds an index column, we do not have a need for the already existing (possibly error prone) local index (Unnamed: 0).

Metadata regarding scraping, time of updates etc. do not serve a significant impact on the processing we wish to perform.

In [305]:
pd.set_option('display.max_rows', None)
data = clean_data(data)
df = pd.DataFrame(data, columns=['domain','type','url','content','title','authors', 'meta_keywords', 'meta_description', 'tags'])
df = df.sort_values(by=['type'])
df
#print(WordFreqSet("content", df))

TypeError: expected string or bytes-like object

In [None]:
print(data["title"][4])

trump s genius poll is complete the results have americans bursting with laughter


In [None]:
data["content"][4]

'donald trump has the unnerving ability to ability to create his own reality and convince millions of americans that what he says it is true the problem with the president lying is that he then believes his own lies a new poll shows how that can get the country into deep trouble the new abc news washington post poll came out after the president s physician gave him a physical and mental exam the doctor gave trump a clean bill of health added an inch to his height and claimed he was fit to serve for seven more years this poll was able to capture americans opinions after a new book came out indicating that people around trump questioned his emotional stability and ability to hold office in addition the new poll gave the respondents the time to hear trump tell the public that he was a very stable genius before they were interviewed he said actually throughout my life my two greatest assets have been mental stability and being like really smart the abc washington post poll discovered that 

In [None]:
# Task 3
# things to explore
# To be fake news it has to not have meta keywords or description
# All fake news originates from beforeitsnews.com	
# 1: clickbait type vs titles
# 2: word frequency n-gram vs article type
# 3: authors changing type

In [None]:
### Task 4
# Initialize Group SubString
group_nr = 14
group_substring_raw = "ABCDEFGHIJKLMNOPRSTUVWZABCDEFGHIJKLMNOPRSTUVWZ"[group_nr%23:group_nr%23+10]
group_substring = ""

for letter in np.sort(list(group_substring_raw)):
    group_substring += letter
    
print(group_substring)

AOPRSTUVWZ


In [None]:
import requests
from bs4 import BeautifulSoup

# Get main page and add subpage (according to group_substring) urls to list
response = requests.get('https://en.wikinews.org/wiki/Category:Politics_and_conflicts')
contents = response.text

soup = BeautifulSoup(contents, 'html.parser')

subpages = []
for a in soup.find_all('a', href=True):
    for letter in group_substring:
        if "conflicts&from="+letter in a["href"]:
            if not a["href"] in subpages:
                subpages.append(a["href"])  

ModuleNotFoundError: No module named 'bs4'

In [None]:
# Creating a list of articles on THE FIRST PAGE ONLY, NEED TO FIX LATER
Articles = []
for url in subpages:
    response = requests.get(url)
    contents = response.text

    soup = BeautifulSoup(contents, 'html.parser')

    allGroups = soup.find_all("div",attrs={"class":"mw-category-group"})
    for n in allGroups:
        if "<h3>"+url[-1]+"</h3>" in str(n) and "<ul><li><a" in str(n):
            pages = n
            break
    ul = re.findall('\/wiki.*(?=title)',str(pages))
    for i in range(len(ul)):
        ul[i] = "https://en.wikinews.org" + ul[i][:-2]

    Articles.append(ul)

In [None]:
def GrabArticle(url):
    response = requests.get(url)
    contents = response.text

    soup = BeautifulSoup(contents, 'html.parser')
    
    try: # Need to convert from "MonthName Day, Year" to "Year-Month-Day"
        date = re.findall('[1-9]+.*[1-9]',str(soup.find("span", attrs={"id":"publishDate"})))[0]
    except:
        try: 
            date = re.findall('[A-Z][a-z]+ [0-9]+, [0-9][0-9][0-9][0-9]',str(soup.find("div", attrs={"class":"mw-parser-output"})))[0]
        except:
            date = "NaN"
    try:
        text = soup.get_text() #currently displays EVERYTHING on page, needs work
    except:
        text = "NaN"

    
    #srcs section needs work
    srcs = []
    try:
        src = soup.find_all("span",attrs={"class":"sourceTemplate"})
        for n in src:
            srcs.append(n)
    except:
        src = "NaN"

    return date#,srcs,text

In [None]:
# Scraping article pages for data and adding to lists for dataframe creation
urls = []
dates = []
for articles in Articles:
    for url in articles:
        urls.append(url)
        dates.append(GrabArticle(url))
        
        
Task4df = pd.DataFrame(data = {"URL" : urls, "Date written" : dates})

In [None]:
Task4df

Unnamed: 0,URL,Date written
0,https://en.wikinews.org/wiki/A_1-year_long_str...,2007-09-17
1,https://en.wikinews.org/wiki/A_policeman_is_ki...,2005-02-16
2,https://en.wikinews.org/wiki/A_timeline:_Novak...,2005-07-15
3,https://en.wikinews.org/wiki/Abbas_and_Olmert_...,2008-01-08
4,https://en.wikinews.org/wiki/Abbas_fires_secur...,2005-04-02
5,https://en.wikinews.org/wiki/Abbas_orders_more...,2005-04-02
6,"https://en.wikinews.org/wiki/Abbas,_Sharon_dec...",2005-02-08
7,https://en.wikinews.org/wiki/Abbott_calls_for_...,2007-10-02
8,https://en.wikinews.org/wiki/Abbott_labs_ends_...,2005-07-11
9,https://en.wikinews.org/wiki/Abbott_open_to_po...,2014-06-15
