In [None]:
import math
import os
import numpy as np
from bs4 import BeautifulSoup as bs
import requests
from tqdm import tqdm
from sklearn.feature_extraction.text import CountVectorizer
from torchtext.vocab import GloVe

import pickle

basepath = '.'

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

# Load the (train, val) pair stored in train_val_data.pkl under `basepath`.
# Later cells unpack every example as a (url, html, label) triple.
# NOTE(security): pickle.load can execute arbitrary code while unpickling, so
# only use a train_val_data.pkl that comes from a trusted source.
with open(os.path.join(basepath, 'train_val_data.pkl'), 'rb') as f:
  train_data, val_data = pickle.load(f)

# Report the size of each split.
print('Number of train examples:', len(train_data))
print('Number of val examples:', len(val_data))

In [None]:
def get_description_from_html(html):
  """Extract a page description from the meta tags of an HTML document.

  Args:
    html: HTML markup of the page, as a string.

  Returns:
    The `content` attribute of the first matching description meta tag, or
    '' when no such tag exists or its content is missing or empty.
  """
  # Name the parser explicitly. A bare BeautifulSoup(html) picks whichever
  # parser happens to be installed (and warns about it), so the same page
  # could be parsed differently on different machines.
  soup = bs(html, 'html.parser')

  # Lookups are tried in order. The first three are the original ones and
  # keep their priority; the last adds the standard Open Graph spelling
  # (property="og:description"), used only when the others find nothing.
  lookups = (
      {'name': 'og:description'},
      {'property': 'description'},
      {'name': 'description'},
      {'property': 'og:description'},
  )
  description_tag = None
  for attrs in lookups:
    description_tag = soup.find('meta', attrs=attrs)
    if description_tag:
      break

  if not description_tag:
    # If there is no description, return empty string.
    return ''
  # A matching tag without a usable content attribute also yields ''.
  return description_tag.get('content') or ''

def scrape_description(url):
  """Fetch a web page and return its meta description.

  Args:
    url: Address of the page, with or without an http:// or https:// prefix.

  Returns:
    Whatever get_description_from_html extracts from the response body.

  Exceptions raised by requests.get (the call uses a 10 second timeout) are
  not caught here.
  """
  # Test for a real scheme prefix rather than the bare string 'http', so a
  # host such as 'httpbin.org' still gets a scheme prepended.
  if not url.lower().startswith(('http://', 'https://')):
    url = 'http://' + url
  response = requests.get(url, timeout=10)
  return get_description_from_html(response.text)

# Demo: scrape a live page. This performs a real HTTP request, so the cell
# needs network access and its output depends on the site's current markup.
print('Description of Google.com:')
print(scrape_description('google.com'))

In [None]:
# Demo on a second site; edit `url` to try other pages (live HTTP request).
# NOTE: `url` is a notebook-level name that later cells reassign.
url = "youtube.com"
print('Description of %s:' % url)
print(scrape_description(url))

In [None]:
def get_descriptions_from_data(data):
  """Extract the meta description of every example in `data`.

  Args:
    data: iterable of (url, html, label) triples.

  Returns:
    A list of description strings, one per example and in the same order as
    `data`. Iteration is wrapped in tqdm to show progress.
  """
  return [get_description_from_html(html) for url, html, label in tqdm(data)]


# Extract the description of every training page once; later cells reuse
# `train_descriptions` for both the bag-of-words and the GloVe features.
train_descriptions = get_descriptions_from_data(train_data)
# URLs in the same order as train_descriptions, so a URL's position can be
# used to look up its description.
train_urls = [url for (url, html, label) in train_data]

print('\nNYTimes Description:')
# list.index raises ValueError if 'nytimes.com' is not among the train URLs.
print(train_descriptions[train_urls.index('nytimes.com')])

In [None]:
# Same extraction for the validation split (order matches val_data).
val_descriptions = get_descriptions_from_data(val_data)

In [None]:
# Bag-of-words vectorizer limited to 300 features (max_features=300).
vectorizer = CountVectorizer(max_features=300)

# Fit the vocabulary on the training descriptions only; the validation
# descriptions are merely transformed with it further down.
vectorizer.fit(train_descriptions)

def vectorize_data_descriptions(descriptions, vectorizer):
  """Turn descriptions into a dense bag-of-words array.

  Args:
    descriptions: iterable of description strings.
    vectorizer: an already fitted vectorizer whose transform() returns a
      sparse matrix (CountVectorizer in this notebook).

  Returns:
    A 2-D numpy.ndarray with one row per description.
  """
  # toarray() yields a plain ndarray. todense() would yield the legacy
  # np.matrix type, which recent scikit-learn releases reject as estimator
  # input and which also turns np.concatenate results into np.matrix.
  X = vectorizer.transform(descriptions).toarray()
  return X

# Build the bag-of-words feature matrix and the label list for each split.
# Labels are the third element of every (url, html, label) example.
print('\nPreparing train data...')
bow_train_X = vectorize_data_descriptions(train_descriptions, vectorizer)
bow_train_y = [label for url, html, label in train_data]

print('\nPreparing val data...')
# The validation split reuses the vocabulary fitted on the training split.
bow_val_X = vectorize_data_descriptions(val_descriptions, vectorizer)
bow_val_y = [label for url, html, label in val_data]


In [None]:
# Baseline: logistic regression on the bag-of-words features, with the
# library's default settings (the train_model helper defined further down
# uses solver='liblinear' instead).
model = LogisticRegression()

model.fit(bow_train_X, bow_train_y)

# Accuracy on the data the model was fitted on (expected to be optimistic).
bow_train_y_pred = model.predict(bow_train_X)
print('Train accuracy', accuracy_score(bow_train_y, bow_train_y_pred))

# Accuracy on the held-out validation split.
bow_val_y_pred = model.predict(bow_val_X)
print('Val accuracy', accuracy_score(bow_val_y, bow_val_y_pred))

print('Confusion matrix:')
print(confusion_matrix(bow_val_y, bow_val_y_pred))

# prf holds (precision, recall, f-score, support); each is indexed per class.
prf = precision_recall_fscore_support(bow_val_y, bow_val_y_pred)

# Index [1] picks the second class -- presumably label 1 ("fake", judging by
# the prediction cell near the end of the notebook); confirm the encoding.
# This raises IndexError if only one class is present.
print('Precision:', prf[0][1])
print('Recall:', prf[1][1])
print('F-Score:', prf[2][1])

In [None]:
# Dimensionality of the word vectors; also the width of the GloVe feature
# matrices built by glove_transform_data_descriptions.
VEC_SIZE = 300
# Pretrained GloVe vectors ('6B' set) via torchtext. Presumably downloaded or
# read from a local cache on first use, so this line can be slow.
glove = GloVe(name='6B', dim=VEC_SIZE)

# Looks up the GloVe vector of `word` (case-insensitively) and returns it as a
# numpy array; returns None when the word is not in the GloVe vocabulary.
def get_word_vector(word):
    try:
        index = glove.stoi[word.lower()]
        vector = glove.vectors[index]
        return vector.numpy()
    except KeyError:
        # Raised by the vocabulary lookup for unknown words.
        return None

In [None]:
# Demo: inspect one word vector. If 'good' were missing from the vocabulary,
# get_word_vector would return None and `.shape` would raise AttributeError.
good_vector = get_word_vector('good')
print('Shape of good vector:', good_vector.shape)
print(good_vector)

In [None]:
def cosine_similarity(vec1, vec2):
  """Cosine of the angle between two vectors.

  Args:
    vec1: 1-D array-like.
    vec2: 1-D array-like of the same length as vec1.

  Returns:
    dot(vec1, vec2) / (norm(vec1) * norm(vec2)), or 0.0 when either vector
    has zero norm (the ratio is undefined there and would otherwise come
    back as nan together with a RuntimeWarning).
  """
  norm_product = np.linalg.norm(vec1) * np.linalg.norm(vec2)
  if norm_product == 0:
    return 0.0
  return np.dot(vec1, vec2) / norm_product

# Demo inputs: edit these two words to compare other pairs.
word1 = "good"
word2 = "great"

print('Word 1:', word1)
print('Word 2:', word2)

def cosine_similarity_of_words(word1, word2):
  """Cosine similarity between the word vectors of two words.

  Prints a message for every word that has no vector.

  Returns:
    The cosine similarity, or None if at least one word has no vector.
  """
  # Look both words up first, then report the ones that are unknown.
  lookups = [(word, get_word_vector(word)) for word in (word1, word2)]
  unknown_words = [word for word, vector in lookups if vector is None]

  for word in unknown_words:
    print(word, 'is not a valid word. Try another.')
  if unknown_words:
    return None

  (_, vec1), (_, vec2) = lookups
  return cosine_similarity(vec1, vec2)


# Prints None (after a "not a valid word" message) if either word is unknown.
print('\nCosine similarity:', cosine_similarity_of_words(word1, word2))


In [None]:
def glove_transform_data_descriptions(descriptions):
    """Embed each description as the mean of its known word vectors.

    Args:
        descriptions: sequence of description strings.

    Returns:
        Array of shape (len(descriptions), VEC_SIZE). A row stays all zeros
        when none of the description's whitespace-separated words has a
        vector.
    """
    embedded = np.zeros((len(descriptions), VEC_SIZE))
    # Each `row` is a view into `embedded`, so in-place updates land there.
    for row, text in zip(embedded, descriptions):
        hits = 0
        for token in text.strip().split():
            vector = get_word_vector(token)
            if vector is None:
                continue  # Word has no vector; it does not count.
            hits += 1
            row += vector
        if hits:
            row /= hits  # Sum -> mean over the words that were found.

    return embedded

# GloVe feature matrix (one averaged word vector per description) and label
# list for each split; labels are the third element of every example.
glove_train_X = glove_transform_data_descriptions(train_descriptions)
glove_train_y = [label for (url, html, label) in train_data]

glove_val_X = glove_transform_data_descriptions(val_descriptions)
glove_val_y = [label for (url, html, label) in val_data]

In [None]:
# Logistic regression on the averaged-GloVe features. Rebinds the notebook's
# `model` name, replacing whatever model an earlier cell stored there.
model = LogisticRegression()
model.fit(glove_train_X, glove_train_y)

# Accuracy on the data the model was fitted on.
glove_train_y_pred = model.predict(glove_train_X)
print('Train accuracy', accuracy_score(glove_train_y, glove_train_y_pred))

# Accuracy on the held-out validation split.
glove_val_y_pred = model.predict(glove_val_X)
print('Val accuracy', accuracy_score(glove_val_y, glove_val_y_pred))

print('Confusion matrix:')
print(confusion_matrix(glove_val_y, glove_val_y_pred))

# prf holds (precision, recall, f-score, support); each is indexed per class.
prf = precision_recall_fscore_support(glove_val_y, glove_val_y_pred)

# Index [1] picks the second class -- presumably label 1; confirm against
# the label encoding. Raises IndexError if only one class is present.
print('Precision:', prf[0][1])
print('Recall:', prf[1][1])
print('F-Score:', prf[2][1])

In [None]:
def train_model(train_X, train_y, val_X, val_y):
  """Fit a logistic-regression classifier (liblinear solver).

  Args:
    train_X: training feature matrix.
    train_y: training labels.
    val_X: accepted for a uniform signature; not used here.
    val_y: accepted for a uniform signature; not used here.

  Returns:
    The fitted LogisticRegression instance.
  """
  classifier = LogisticRegression(solver='liblinear')
  classifier.fit(train_X, train_y)
  return classifier


def train_and_evaluate_model(train_X, train_y, val_X, val_y):
  """Train a model with train_model and print its evaluation report.

  Prints train accuracy, validation accuracy, the validation confusion
  matrix, and precision / recall / F-score for the class at index 1.

  Returns:
    The fitted model.
  """
  model = train_model(train_X, train_y, val_X, val_y)

  # Accuracy on the data the model was fitted on.
  train_accuracy = accuracy_score(train_y, model.predict(train_X))
  print('Train accuracy', train_accuracy)

  # All remaining numbers are computed on the validation split.
  val_predictions = model.predict(val_X)
  print('Val accuracy', accuracy_score(val_y, val_predictions))

  print('Confusion matrix:')
  print(confusion_matrix(val_y, val_predictions))

  # The first three entries are per-class precision, recall and F-score.
  per_class_scores = precision_recall_fscore_support(val_y, val_predictions)
  headings = ('Precision:', 'Recall:', 'F-Score:')
  for heading, scores in zip(headings, per_class_scores):
    print(heading, scores[1])

  return model

In [None]:
def prepare_data(data, featurizer):
    """Featurize every example of a dataset.

    Args:
        data: iterable of (url, html, label) triples.
        featurizer: callable (url, html) -> dict mapping feature descriptions
            to feature values. It receives the HTML lowercased.

    Returns:
        (X, y, feature_descriptions) where X is a list with one tuple of
        feature values per example, y is the list of labels, and
        feature_descriptions is the tuple of feature names of the last
        example -- an empty tuple when `data` is empty.
    """
    X = []
    y = []
    # Defined up front so that empty `data` returns cleanly instead of
    # failing on an unbound name at the return statement.
    feature_descriptions = ()
    for url, html, label in data:
        y.append(label)

        features = featurizer(url, html.lower())
        # keys()/values() iterate in the same order and, unlike unpacking
        # zip(*features.items()), also work for an empty feature dict.
        feature_descriptions = tuple(features.keys())
        X.append(tuple(features.values()))

    return X, y, feature_descriptions

# Log-scaled occurrence count: log(1 + n), where n is how many times the
# lowercased phrase/keyword occurs in `html` (as counted by str.count).
# Only the phrase is lowercased here; `html` is searched as given.
def get_normalized_count(html, phrase):
    needle = phrase.lower()
    occurrences = html.count(needle)
    return math.log(1 + occurrences)

# Builds the feature dictionary for one (url, html) pair: keys are plaintext
# feature descriptions, values are the feature values (booleans for the
# domain indicators, log-scaled counts for the keywords).
def keyword_featurizer(url, html):
    # Domain indicators: True when the URL ends with the given suffix.
    domain_suffixes = ['.com', '.org', '.net', '.info', '.biz', '.ru',
                       '.co.uk', '.co', '.tv', '.news']
    features = {suffix + ' domain': url.endswith(suffix)
                for suffix in domain_suffixes}

    # Keyword features: log-scaled occurrence count of each keyword in html.
    for keyword in ['trump', 'biden', 'clinton', 'sports', 'finance']:
        features[keyword + ' keyword'] = get_normalized_count(html, keyword)

    return features

# Keyword/domain features for both splits. train_y and val_y defined here are
# reused by the later feature-set cells. The third return value (the feature
# descriptions) is discarded.
keyword_train_X, train_y, _ = prepare_data(train_data, keyword_featurizer)
keyword_val_X, val_y, _ = prepare_data(val_data, keyword_featurizer)

# Prints the evaluation report; the returned model is not stored here.
train_and_evaluate_model(keyword_train_X, train_y, keyword_val_X, val_y)


In [None]:
# NOTE(review): repeats the vectorizer setup from the earlier bag-of-words
# cell (same max_features, same training descriptions). Looks redundant on a
# top-to-bottom run, but it keeps this cell runnable on its own.
vectorizer = CountVectorizer(max_features=300)

# Vocabulary is fitted on the training descriptions only.
vectorizer.fit(train_descriptions)

def vectorize_data_descriptions(data_descriptions, vectorizer):
  """Turn descriptions into a dense bag-of-words array.

  Args:
    data_descriptions: iterable of description strings.
    vectorizer: an already fitted vectorizer whose transform() returns a
      sparse matrix (CountVectorizer in this notebook).

  Returns:
    A 2-D numpy.ndarray with one row per description.
  """
  # toarray() yields a plain ndarray. todense() would yield the legacy
  # np.matrix type, which recent scikit-learn releases reject as estimator
  # input and which also turns np.concatenate results into np.matrix.
  X = vectorizer.transform(data_descriptions).toarray()
  return X

# Bag-of-words features for both splits, evaluated against the labels
# (train_y / val_y) produced by the keyword-feature cell.
bow_train_X = vectorize_data_descriptions(train_descriptions, vectorizer)
bow_val_X = vectorize_data_descriptions(val_descriptions, vectorizer)

# Prints the evaluation report; the returned model is not stored here.
train_and_evaluate_model(bow_train_X, train_y, bow_val_X, val_y)

In [None]:
# NOTE(review): repeats the GloVe setup from the earlier GloVe cell with the
# same name and dimension. Looks redundant on a top-to-bottom run, and
# re-creating the GloVe object may be slow.
VEC_SIZE = 300
glove = GloVe(name='6B', dim=VEC_SIZE)

# Case-insensitive GloVe lookup: gives the word's vector as a numpy array, or
# None when the word is absent from the GloVe vocabulary.
def get_word_vector(word):
    try:
        row = glove.stoi[word.lower()]
        return glove.vectors[row].numpy()
    except KeyError:
        # Unknown word.
        return None

def glove_transform_data_descriptions(descriptions):
    """Represent each description by the average of its known word vectors.

    Args:
        descriptions: sequence of description strings.

    Returns:
        Array of shape (len(descriptions), VEC_SIZE); a row is left at zero
        when no whitespace-separated word of the description has a vector.
    """
    X = np.zeros((len(descriptions), VEC_SIZE))
    for i, description in enumerate(descriptions):
        words = description.strip().split()
        # Keep only the vectors of words that have one.
        known_vectors = [vec for vec in map(get_word_vector, words)
                         if vec is not None]
        for vec in known_vectors:
            X[i] += vec
        if known_vectors:
            # Turn the accumulated sum into a mean.
            X[i] /= float(len(known_vectors))

    return X


In [None]:
# Averaged-GloVe features for both splits, evaluated against the labels
# (train_y / val_y) produced by the keyword-feature cell.
glove_train_X = glove_transform_data_descriptions(train_descriptions)
glove_val_X = glove_transform_data_descriptions(val_descriptions)

# Prints the evaluation report; the returned model is not stored here.
train_and_evaluate_model(glove_train_X, train_y, glove_val_X, val_y)

In [None]:
def combine_features(X_list):
  """Concatenate several feature matrices column-wise.

  Args:
    X_list: sequence of 2-D array-likes (nested lists/tuples, ndarrays or
      np.matrix objects) that all have the same number of rows.

  Returns:
    A plain numpy.ndarray of shape (n_rows, total number of columns).

  Raises:
    ValueError: from np.concatenate when X_list is empty or the row counts
      differ.
  """
  # Coerce every piece to a plain ndarray first: one np.matrix input would
  # otherwise make the whole result an np.matrix, which recent scikit-learn
  # releases reject as estimator input.
  return np.concatenate([np.asarray(X) for X in X_list], axis=1)

# Stack the three feature sets side by side. The column order (keyword,
# bag-of-words, GloVe) must match the order used in featurize_data_pair.
combined_train_X = combine_features([keyword_train_X, bow_train_X, glove_train_X])
combined_val_X = combine_features([keyword_val_X, bow_val_X, glove_val_X])

# Keep the fitted model: later cells use the notebook-level `model` name.
model = train_and_evaluate_model(combined_train_X, train_y, combined_val_X, val_y)

In [None]:
def get_data_pair(url):
  """Download a page and return its scheme-less URL together with its HTML.

  Args:
    url: Address of the page, with or without an http:// or https:// prefix.

  Returns:
    (url_pretty, htmltext): the URL without its scheme prefix, and the body
    of the HTTP response as text.

  Exceptions raised by requests.get (the call uses a 10 second timeout) are
  not caught here.
  """
  # Test for a real scheme prefix rather than the bare string 'http', so a
  # host such as 'httpbin.org' still gets a scheme prepended.
  if not url.lower().startswith(('http://', 'https://')):
    url = 'http://' + url

  # `url` is now guaranteed to carry a scheme; drop it for the pretty form.
  url_pretty = url.split('://', 1)[1]

  # Scrape website for HTML
  response = requests.get(url, timeout=10)
  htmltext = response.text

  return url_pretty, htmltext

# Site to classify; edit to try another one. The call below performs a live
# HTTP request and rebinds the notebook-level `url` and `html` names.
curr_url = "www.yahoo.com"

url, html = get_data_pair(curr_url)

def dict_to_features(features_dict):
  """Convert a feature dictionary into a single-row float feature array.

  Args:
    features_dict: mapping from feature description to feature value.

  Returns:
    Float array with a new leading axis of length 1 holding the dictionary's
    values in insertion order, i.e. shape (1, n_features) for scalar values.
  """
  values = np.array(list(features_dict.values()))
  as_float = values.astype(float)
  return np.expand_dims(as_float, axis=0)
def featurize_data_pair(url, html):
  """Build the combined feature row for a single (url, html) pair.

  Column order is keyword features, then bag-of-words, then GloVe. The
  bag-of-words part uses the notebook-level `vectorizer`.

  Args:
    url: URL of the page, as passed to keyword_featurizer.
    html: raw HTML of the page.

  Returns:
    The result of combine_features for the three single-row feature sets.
  """
  # Approach 1: keyword features. prepare_data lowercases the HTML before
  # calling the featurizer, so do the same here; otherwise capitalised
  # keywords in a page are not counted the way they were during training.
  keyword_X = dict_to_features(keyword_featurizer(url, html.lower()))

  # Approach 2: bag-of-words of the description. The description is taken
  # from the original (not lowercased) HTML, as get_descriptions_from_data
  # does for the training data.
  description = get_description_from_html(html)
  bow_X = vectorize_data_descriptions([description], vectorizer)

  # Approach 3: averaged GloVe vectors of the description.
  glove_X = glove_transform_data_descriptions([description])

  return combine_features([keyword_X, bow_X, glove_X])

# Feature row for the page scraped above.
curr_X = featurize_data_pair(url, html)

# NOTE(review): the combine-features cell already stored a model fitted by
# the same train_model call on the same inputs, so this refit looks
# redundant on a top-to-bottom run.
model = train_model(combined_train_X, train_y, combined_val_X, val_y)

# curr_X has a single row, so take the first (only) prediction.
curr_y = model.predict(curr_X)[0]


# Predictions below 0.5 are reported as real, everything else as fake.
if curr_y < .5:
  print(curr_url, 'appears to be real.')
else:
  print(curr_url, 'appears to be fake.')

In [None]:

# Load the held-out test examples, iterated below as (url, html, label).
# NOTE(security): pickle.load can execute arbitrary code while unpickling, so
# only use a test_data.pkl that comes from a trusted source.
with open(os.path.join(basepath, 'test_data.pkl'), 'rb') as f:
  test_data = pickle.load(f)
print('Number of test examples:', len(test_data))



# Refit on the combined training features (same call as in earlier cells).
model = train_model(combined_train_X, train_y, combined_val_X, val_y)

print('Loading test data...')
# Featurize the test pages one at a time with the same pipeline used for
# single scraped pages. NOTE: this loop rebinds the notebook-level names
# url, html, label and curr_X.
test_X = []
for url, html, label in test_data:
  # featurize_data_pair returns a single-row 2-D result; keep that row.
  curr_X = np.array(featurize_data_pair(url, html))
  test_X.append(curr_X[0])

test_X = np.array(test_X)

test_y = [label for url, html, label in test_data]

print('Done loading test data...')

test_y_pred = model.predict(test_X)

print('Test accuracy', accuracy_score(test_y, test_y_pred))

print('Confusion matrix:')
print(confusion_matrix(test_y, test_y_pred))

# prf holds (precision, recall, f-score, support); each is indexed per class.
prf = precision_recall_fscore_support(test_y, test_y_pred)

# Index [1] picks the second class -- presumably label 1 ("fake"); confirm
# against the label encoding. Raises IndexError if only one class is present.
print('Precision:', prf[0][1])
print('Recall:', prf[1][1])
print('F-Score:', prf[2][1])