In [None]:
import os
from bs4 import BeautifulSoup as bs
import pickle

basepath = '.'

In [None]:
# Load the pickled (train, val) split stored under `basepath`.
# NOTE(security): pickle.load can execute arbitrary code while unpickling;
# only open a train_val_data.pkl that comes from a trusted source.
with open(os.path.join(basepath, 'train_val_data.pkl'), 'rb') as f:
  train_data, val_data = pickle.load(f)

print('Number of train examples:', len(train_data))
print('Number of val examples:', len(val_data))

# Index 2 of each datapoint is the label. get_real_and_fake_fractions in this
# notebook treats label 0 as real and a truthy label as fake, so fake examples
# are counted here as label == 1 (the previous `== 0` reported the real share).
print('Fraction of train examples that are fake:', sum(1 for datapoint in train_data if datapoint[2] == 1) / float(len(train_data)))
print('Fraction of val examples that are fake:', sum(1 for datapoint in val_data if datapoint[2] == 1) / float(len(val_data)))

In [None]:
# Position in train_data of the single example that is printed for inspection.
example_idx = 18

In [None]:
# How many characters of the prettified HTML to show. The message and the
# slice both use this value so they cannot drift apart (the message used to
# claim 5000 characters while only 1000 were printed).
html_preview_chars = 1000

print('Number of values per data point: %d\n' % len(train_data[0]))

# Each datapoint is indexed as [0] URL, [1] HTML, [2] label.
print('URL for chosen example:', train_data[example_idx][0])
print('Label for chosen example:', train_data[example_idx][2])
# Name the parser explicitly: without it BeautifulSoup picks whichever parser
# happens to be installed (and warns), so output can differ between machines.
print('HTML for chosen example (first %d chars):\n\n' % html_preview_chars,
      bs(train_data[example_idx][1], 'html.parser').prettify()[:html_preview_chars])


In [None]:
def get_real_and_fake_fractions(train_data, hypothesis):
    """Measure how often a hypothesis holds for real and for fake examples.

    Args:
        train_data: Iterable of datapoints. Index 2 of each datapoint is the
            label; a falsy label (0) is counted as real and a truthy label
            (1) as fake.
        hypothesis: Callable taking one datapoint. Its result is converted
            with ``int()`` and added to the tally for the datapoint's class.

    Returns:
        A ``(real_fraction, fake_fraction)`` tuple of floats: for each class,
        the tallied hypothesis results divided by the number of examples in
        that class. A class with no examples yields ``0.0`` instead of
        raising ``ZeroDivisionError``.
    """
    # Float accumulators keep the divisions below in floating point.
    # Label 0, hypothesis true
    real_true = 0.0
    # Label 0 total
    real_total = 0.0
    # Label 1, hypothesis true
    fake_true = 0.0
    # Label 1 total
    fake_total = 0.0

    for datapoint in train_data:
        # Each datapoint has URL, HTML, label in that order.
        label = datapoint[2]
        hypothesis_truth = int(hypothesis(datapoint))
        if label: # Fake
            fake_total += 1
            fake_true += hypothesis_truth
        else: # Real
            real_total += 1
            real_true += hypothesis_truth

    # Guard against an empty class (or empty dataset): report 0.0 rather than
    # dividing by zero.
    real_fraction = real_true / real_total if real_total else 0.0
    fake_fraction = fake_true / fake_total if fake_total else 0.0
    return real_fraction, fake_fraction

In [None]:
def domain_extension_hypothesis(datapoint):
  """Return True when the datapoint's URL (index 0) ends with `extension`.

  `extension` is assigned on a `#@param` line so its value can be edited in
  place; the check is a plain string suffix match.
  """
  extension = ".com" #@param {type:"string"}
  return datapoint[0].endswith(extension)

# Share of real and of fake training examples whose URL passes the
# domain-extension hypothesis.
real_fraction, fake_fraction = get_real_and_fake_fractions(
    train_data, domain_extension_hypothesis)

print('Real fraction:', real_fraction)
print('Fake fraction:', fake_fraction)

# Turns the two fractions into a fake-to-real ratio that prints readably.
def pretty_ratio(fake_fraction, real_fraction):
    """Return fake_fraction / real_fraction in a print-friendly form.

    Equal fractions (including both zero) give the integer 1. Otherwise a
    positive real_fraction gives the plain quotient, and a real_fraction
    that is not positive gives the string 'Infinity'.
    """
    # Equality wins over every other case, so 0/0 reads as 1.
    if fake_fraction == real_fraction:
        return 1
    if real_fraction > 0:
        return fake_fraction / real_fraction
    return 'Infinity'

# Report the fake-to-real ratio of the two fractions computed for the hypothesis.
print('Ratio fraction:', pretty_ratio(fake_fraction, real_fraction))

In [None]:
# Both values are left as empty strings here. Judging by the names, they are
# presumably placeholders for domain extensions whose fake/real ratio comes out
# as 'Infinity' and as 0 respectively — TODO confirm and fill in.
domain_name_extension_with_ratio_infinity = ''
domain_name_extension_with_ratio_zero = ''

In [None]:
def get_count_from_html(html, hypothesis_word):
    """Count occurrences of the lowercased `hypothesis_word` in `html`.

    Only the word is lowercased here; `html` is searched exactly as given,
    so callers that want a case-insensitive count must lowercase it first.
    """
    lowered_word = hypothesis_word.lower()
    occurrences = html.count(lowered_word)
    return occurrences

def word_threshold_hypothesis(datapoint, hypothesis_word="Hilary", threshold=3):
  """Return True when a word appears more than `threshold` times in the HTML.

  Args:
    datapoint: Sequence whose index 1 holds the page HTML as a string.
    hypothesis_word: Word to count; matching is case-insensitive because both
      the HTML and the word are lowercased before counting.
    threshold: The count must be strictly greater than this value.

  The defaults reproduce the previously hard-coded values, so the function can
  still be passed directly as a one-argument hypothesis.
  """
  # NOTE(review): the default word is spelled with a single "l"; as a
  # substring it will not match "hillary" — confirm this spelling is intended.
  # Transform HTML to lowercase for consistent results.
  html = datapoint[1].lower()
  count = get_count_from_html(html, hypothesis_word)
  return count > threshold

# Share of real and of fake training examples that pass the word-count
# hypothesis, followed by the fake-to-real ratio of those shares.
real_fraction, fake_fraction = get_real_and_fake_fractions(
    train_data, word_threshold_hypothesis)

print('Real fraction:', real_fraction)
print('Fake fraction:', fake_fraction)

print('Ratio fraction:', pretty_ratio(fake_fraction, real_fraction))