In [1]:
from google.colab import drive
drive.mount('/content/drive')
project_path = '/content/drive/My Drive/NLP/'

Mounted at /content/drive


In [2]:
%%bash
# Logistics #2: install the transformers package, create a folder, download the dataset and a patch
pip -q install transformers
pip -q install datasets

# remove the directory if necessary
# rm -rf "/content/gdrive/MyDrive/6864_hw4/"

# mkdir "/content/drive/My Drive/NLP/"
cd "/content/drive/My Drive/NLP/"

In [5]:
import transformers
tokenizer = transformers.AutoTokenizer.from_pretrained('distilbert-base-uncased')

import torch
import torch.nn as nn
class FakeNewsClassifier(nn.Module):
  def __init__(self, lm):
    super(FakeNewsClassifier, self).__init__()
    self.lm = lm
    self.sigmoid = nn.Sigmoid()

  def forward(self, input_ids, attention_mask=None):
    lm_output = self.lm(input_ids, attention_mask)
    logits = lm_output.logits
    pred = self.sigmoid(logits)
    return pred

config = transformers.DistilBertConfig(num_labels=1)
distilbert = transformers.DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', config=config)
model = FakeNewsClassifier(distilbert)
model = model.cuda()
model.load_state_dict(torch.load(project_path + "distilbert/model_embed_liar.pt", map_location=torch.device('cpu')))

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'classifier.weight', 'pre_clas

<All keys matched successfully>

In [24]:
import transformers
tokenizer = transformers.AutoTokenizer.from_pretrained('roberta-base')

import torch
import torch.nn as nn
class FakeNewsClassifier(nn.Module):
  def __init__(self, lm, dropout=0.2):
    super(FakeNewsClassifier, self).__init__()
    self.lm = lm
    self.dropout = nn.Dropout(dropout)
    self.linear = nn.Linear(lm.config.hidden_size, 1)
    self.sigmoid = nn.Sigmoid()

  def forward(self, input_ids, attention_mask=None):
    lm_output = self.lm(input_ids, attention_mask)
    pooled = lm_output.pooler_output
    pooled = self.dropout(pooled)
    lin_output = self.linear(pooled)
    pred = self.sigmoid(lin_output)
    return pred

roberta = transformers.AutoModel.from_pretrained('roberta-base')
model = FakeNewsClassifier(roberta)
model = model.cuda()
model.load_state_dict(torch.load(project_path + "roberta/model_embed_liar.pt", map_location=torch.device('cpu')))

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


<All keys matched successfully>

In [25]:
def decode(pred):
  decoded = torch.where(pred >= 0.5, 1, 0)
  return decoded

import numpy as np
def tokenize_batch_farn(batch):
  x = list(batch['text'])
  y = np.array(batch['label'])
  
  x_encode = tokenizer.batch_encode_plus(
      x,
      max_length=512,
      # max_length=len(x[0]),
      truncation=True,
      padding='longest',
      return_attention_mask=True,
      return_tensors='pt'
  )
  input_ids = x_encode['input_ids'].cuda()
  input_mask = x_encode['attention_mask'].cuda()

  return input_ids, input_mask, y

In [26]:
def chunk_input_512(input):
  n = 512
  out = [input[i:i+n] for i in range(0, len(input), n)]
  out[-1] = torch.cat([out[-1], torch.zeros(512 - len(out[-1])).cuda()])
  return out
  
def decode_chunk_preds_avg(preds):
  return preds.mean()

def decode_chunk_preds_greedy(preds):
  return torch.min(preds)

In [27]:
# Famous examples of fake news
spiders_article_text = '''Rod Crawford has heard plenty of firsthand accounts of spider-swilling slumberers. “Once or twice a year, someone tells me they once recovered a spider leg in their mouth,” says Crawford, the arachnid curator at the Burke Museum of Natural History and Culture in Seattle. Luckily for all of us, the “fact” that people swallow eight spiders in their sleep yearly isn’t true. Not even close. The myth flies in the face of both spider and human biology, which makes it highly unlikely that a spider would ever end up in your mouth. Three or four spider species live in most North American homes, and they all tend to be found either tending their webs or hunting in nonhuman-infested areas. During their forays, they usually don’t intentionally crawl into a bed because it offers no prey (unless it has bed bugs, in which case that person has bigger problems). Spiders also have no interest in humans. “Spiders regard us much like they’d regard a big rock,” says Bill Shear, a biology professor at Hampden–Sydney College in Virginia and former president of the American Arachnological Society. “We’re so large that we’re really just part of the landscape,” More than anything, spiders probably find sleeping humans terrifying. A slumbering person breathes, has a beating heart and perhaps snores—all of which create vibrations that warn spiders of danger. “Vibrations are a big slice of spiders’ sensory universe,” Crawford explains, “A sleeping person is not something a spider would willingly approach.” From the standpoint of human biology, the oral spider myth also seems ridiculous. If someone is sleeping with her mouth open, she’s probably snoring—and thus scaring off any eight-legged transgressors. Plus, many people would likely be awakened by the sensation of a spider crawling over their faces and into their mouths. Shear can attest: once, while camping, he awoke to find a daddy longlegs crawling on his face. Spider experts concede that a sleeping person could plausibly swallow a spider, but “it would be a strictly random event.” People who claim they’ve swallowed spiders never seem to have any concrete evidence. “People tell me this happened to them, but they threw it (the evidence) away—flushed it down the toilet, usually,” Crawford says. There’s also a sore lack of eyewitnesses for such a frequent event as eight spiders a year. So even if you heard or read this spider statistic from a trustworthy source (such as a Snapple cap), you can rest assured that it doesn’t have a leg, or eight legs, to stand on.'''
spiders_statement_fake_text = 'people swallow eight spiders in their sleep yearly'

peanutbutter_article_fake_text = '''If you’re reading this, I’ll assume you’ve had a Reese’s peanut butter cup at least once in your lifetime. I mean, who hasn’t? Not only are they available year round, they also come in fun shapes around the holidays. Do Reese’s trees and Reese’s eggs sound familiar? If you’re a fan of the popular peanut butter cups, your mouth is probably salivating right about now. Unfortunately, a deeper look into Reese’s ingredients might make you question that last minute purchase at the checkout line. As delicious as they are, Reese’s peanut butter cups can be detrimental to your health. Reese’s peanut butter cups were invented in 1928 by Mr. Reese. He was a farmer and a shipping foreman for Milton S. Hershey. After inventing the sweet treat, Mr. Reese decided to quit the dairy farming business and start his own candy company in his basement. And the rest is history. Reese’s peanut butter cups come in many different shapes, sizes and varieties. Although the chocolate to peanut butter ratio seems like perfection, the other ingredients in the popular candy are cause for concern. Ingredients include: Milk chocolate, (milk, chocolate, sugar, cocoa butter, chocolate, no fat milk, milk fat, lactose, soy lecithin, PGPR), peanuts, sugar dextrose, salt, TBGQ and citric acid. The most questionable ingredients are: 1. Soy Lecithin Research has shown that as much as 93% of soy is genetically modified. Soy lecithin has been found to have detrimental effects on fertility and reproduction. It can cause behavioral and cerebral abnormalities. It has also been linked to breast cancer. 2. PGPR PGPR is short for polyglycerol polyricinoleate. The manufacturer of this popular candy replaced cocoa butter with PGPR to lower the cost of production. PGPR comes from castor beans and it’s used to reduce the viscosity of chocolate. It has been connected to gastrointestinal problems and allergic reactions in children. 3. TBHQ TBHQ stands for tertiary butylhdroquinone. It’s derived from petroleum and can be extremely toxic. Side effects of ingesting TBHQ include nausea, vomiting, ringing in the ears, delirium and collapse. Research has shown that TBHQ can damage the lungs and umbilical cells in humans. It can also cause stomach cancer. Children who are exposed to this chemical may show anxiety, restlessness and intensified ADHD symptoms.'''

pizzagate_article_fake_text = '''New York Police Department detectives and prosecutors working an alleged underage sexting case against former Congressman Anthony Weiner have turned over a newly-found laptop he shared with wife Huma Abedin to the FBI with enough evidence “to put Hillary (Clinton) and her crew away for life,” NYPD sources told True Pundit. NYPD sources said Clinton’s “crew” also included several unnamed yet implicated members of Congress in addition to her aides and insiders. The NYPD seized the computer from Weiner during a search warrant and detectives discovered a trove of over 500,000 emails to and from Hillary Clinton, Abedin and other insiders during her tenure as secretary of state. The content of those emails sparked the FBI to reopen its defunct email investigation into Clinton on Friday. But new revelations on the contents of that laptop, according to law enforcement sources, implicate the Democratic presidential candidate, her subordinates, and even select elected officials in far more alleged serious crimes than mishandling classified and top secret emails, sources said. NYPD sources said these new emails include evidence linking Clinton herself and associates to: Money laundering Child exploitation Sex crimes with minors (children) Perjury Pay to play through Clinton Foundation Obstruction of justice Other felony crimes NYPD detectives and a NYPD Chief, the department’s highest rank under Commissioner, said openly that if the FBI and Justice Department fail to garner timely indictments against Clinton and co- conspirators, NYPD will go public with the damaging emails now in the hands of FBI Director James Comey and many FBI field offices. “What’s in the emails is staggering and as a father, it turned my stomach,” the NYPD Chief said. “There is not going to be any Houdini-like escape from what we found. We have copies of everything. We will ship them to Wikileaks or I will personally hold my own press conference if it comes to that.” The NYPD Chief said once Comey saw the alarming contents of the emails he was forced to reopen a criminal probe against Clinton. “People are going to prison,” he said. Meanwhile, FBI sources said Abedin and Weiner were cooperating with federal agents, who have taken over the non-sexting portions the case from NYPD. The husband-and-wife Clinton insiders  are both shopping for separate immunity deals, sources said. “If they don’t cooperate they are going to see long sentences,” a federal law enforcement source said. NYPD sources said Weiner or Abedin stored all the emails in a massive Microsoft Outlook program on the laptop. The emails implicate other current and former members of Congress and one high-ranking Democratic Senator as having possibly engaged in criminal activity too, sources said. Prosecutors in the office of US Attorney Preet Bharara have issued a subpoena for Weiner’s cell phones and travel records, law enforcement sources confirmed. NYPD said it planned to order the same phone and travel records on Clinton and Abedin, however, the FBI said it was in the process of requesting the identical records. Law enforcement sources are particularly interested in cell phone activity and travel to the Bahamas, U.S. Virgin Islands and other locations that sources would not divulge. The new emails contain travel documents and itineraries indicating Hillary Clinton, President Bill Clinton, Weiner and multiple members of Congress and other government officials accompanied convicted pedophile billionaire Jeffrey Epstein on his Boeing 727 on multiple occasions to his private island in the U.S Virgin Islands, sources said. Epstein’s island has also been dubbed Orgy Island or Sex Slave Island where Epstein allegedly pimps out underage girls and boys to international dignitaries. Both NYPD and FBI sources confirm based on the new emails they now believe Hillary Clinton traveled as Epstein’s guest on at least six occasions, probably more when all the evidence is combed, sources said. Bill Clinton, it has been confirmed in media reports spanning recent years, that he too traveled with Epstein over 20 times to the island.'''

pizzagate_article_real_text = '''Incoming White House national security adviser Michael Flynn quietly deleted a tweet in which he shared a fake news story linking Democratic presidential nominee Hillary Clinton to sex crimes with minors, CNN reported. "U decide - NYPD Blows Whistle on New Hillary Emails: Money Laundering, Sex Crimes w Children, etc...MUST READ!" read the tweet, which has since been deleted. The tweet came on Nov. 2, just six days before the presidential election. It can still be viewed in the internet archive. The tweet linked to an article in True Pundit, which falsely claimed that New York police had found evidence linking Clinton and her senior staff to crimes involving underage sex rings. The article charged that emails discovered during the investigation into Anthony Weiner uncovered evidence "to put Hillary (Clinton) and her crew away for life." Flynn, who was a staunch supporter of President-elect Donald Trump during the campaign, has previously faced backlash for pushing unsubstantiated stories. His tweets came to light again earlier this month after an armed man entered a pizza shop in Washington, D.C. and claimed to be investigating a story that Clinton and her campaign chief ran a child sex room in the back room. In the past, Flynn has also retweeted theories that Clinton "waged war" on the Roman Catholic Church and that President Obama "laundered" billions of dollars in cash to terrorists.'''

onion_modern_text = '''NEW YORK—Accusing educators of brainwashing children with a liberal agenda of barnyard sounds, conservative pundits criticized local preschool Butler Academy Monday for silencing right-wing animal voices. “Every day, our children get sent to schools just like this, and are brainwashed by antifa teachers who believe that cows, chickens, and pigs only speak one way,” said Senator Ted Cruz (R-TX), who held up a picture book, flipped through the pages, and demanded to know why there wasn’t a single conservative “moo,” “baa,” or “oink” to be found. “All I see here is a bunch of chickens who are clearly part of the East Coast elite, clucking and teaching our children that the only way to make farm noises is to go to Harvard. Well, to that I say this is how real American animals talk: cock-a-doodle-doo.” At press time, conservative pundits had called a teacher from the preschool in order to shame her for only teaching left-wing shapes.'''

onion_2016_text = '''NEW YORK—Telling reporters she likes to be kept on her toes, Donald Trump surrogate Kayleigh McEnany said Thursday she has been enjoying the thrill of never knowing what comment or behavior she is going to be defending minute to minute. “Every time I sit down for an interview on cable news or a radio call-in show, I really have no clue whether I’ll need to offer an impassioned defense of disparaging remarks Mr. Trump made about a specific woman’s physical attributes, a blanket statement about an entire ethnic group, or a speech calling for an unconstitutional immigration policy—it really keeps things exciting,” said McEnany, explaining that she was thankful her job gave her such a wide variety of inflammatory statements to spin as positives and didn’t simply consist of walking back the same stale scandals day after day. “The possibilities are truly endless in this position. I could be on Anderson Cooper 360° defusing Mr. Trump’s latest impulsive remarks on anything from nuclear proliferation to criminal justice, and then just an hour or two later on CNN Tonight, I might need to explain away a series of antagonistic and barely comprehensible tweets targeting a particular journalist or politician that I read right before going on-air. Every day is a new adventure.” McEnany added that despite the challenges of her role, it was comforting to know she could respond to any question by claiming that Hillary Clinton is deathly ill.'''

import pandas as pd

examples_dataset = pd.DataFrame()

examples_dataset['text'] = np.array([spiders_article_text, spiders_statement_fake_text, peanutbutter_article_fake_text, pizzagate_article_fake_text, pizzagate_article_real_text, onion_modern_text, onion_2016_text])
examples_dataset['label'] = np.array([1, 0, 0, 0, 1, 0, 0])


In [28]:
farn = pd.read_csv(project_path + 'fake_and_real_news/combined.csv')
training_size = int(.6*farn.shape[0])
validation_size = int(.2*farn.shape[0])
testing_size = int(.2*farn.shape[0])

# data_train = dataset[:training_size]
# data_val = dataset[training_size:training_size+validation_size]
data_test_farn = farn[training_size+validation_size:]


from datasets import load_dataset
dataset = load_dataset('liar')

def bin_labels(dataset):
  mapping = {
      0: 'false',
      1: 'half-true',
      2: 'mostly-true',
      3: 'true',
      4: 'barely-true',
      5: 'pants-fire'
  }
  texts = np.array(dataset['statement'])
  labels = np.array(dataset['label'])
  true = labels == 3
  mostly_true = labels == 2
  pos_labels = np.logical_or(true, mostly_true)
  labels = np.where(pos_labels, 1, 0)

  new_dataset = pd.DataFrame()
  new_dataset['text'] = texts
  new_dataset['label'] = labels
  return new_dataset

# data_train = bin_labels(dataset['train'])
# data_val = bin_labels(dataset['validation'])
data_test_liar = bin_labels(dataset['test'])





Using custom data configuration default
Reusing dataset liar (/root/.cache/huggingface/datasets/liar/default/1.0.0/479463e757b7991eed50ffa7504d7788d6218631a484442e2098dabbf3b44514)


In [29]:


def get_chunk_data(test_sample):
  avg_pred = []
  greedy_pred = []
  for i in range(0, len(test_sample)):

    data_point = test_sample.iloc[i:i+1]

    chunk_512_input_ids, chunk_512_input_mask, label = tokenize_batch_farn(data_point)

    chunk_512_input_ids = chunk_input_512(chunk_512_input_ids[0])
    chunk_512_input_ids = torch.stack(chunk_512_input_ids).to(dtype=torch.int)
    # print(chunk_512_input_ids.shape)

    chunk_512_input_mask = chunk_input_512(chunk_512_input_mask[0])
    chunk_512_input_mask = torch.stack(chunk_512_input_mask).to(dtype=torch.int)

    preds = model.forward(chunk_512_input_ids, chunk_512_input_mask)

    label = int(label)


    avg_decoded = decode_chunk_preds_avg(preds)
    avg_decoded = decode(avg_decoded).to(dtype=torch.float32)

    greedy_decoded = decode_chunk_preds_greedy(preds)
    greedy_decoded = decode(greedy_decoded).to(dtype=torch.float32)

    avg_pred.append(avg_decoded)
    greedy_pred.append(greedy_decoded)

  return torch.tensor(avg_pred).to(dtype=torch.int), torch.tensor(greedy_pred).to(dtype=torch.int), test_sample['label']



In [30]:


avg_res, greedy_res, labels = get_chunk_data(data_test_farn)

labels = torch.tensor([l for l in labels]).to(dtype=torch.int)

# print(results.shape, labels.shape)
# results = results.reshape(-1)
# labels = labels.reshape(-1)
# print(results.shape, labels.shape)

from sklearn.metrics import precision_score, recall_score, f1_score

def print_metrics(results, labels):
  # print(results)
  # print(labels)
  accuracy = np.where(results == labels, 1, 0).sum() / len(labels)
  prec = precision_score(labels, results)
  recall = recall_score(labels, results)
  f1 = f1_score(labels, results)

  print("Acc:", accuracy, "Prec:", prec, "Recall:", recall, "F1:", f1)

print('Avg:')
print_metrics(avg_res, labels)
print('Min:')
print_metrics(greedy_res, labels)

# true_pos = torch.logical_and([results, labels])
# false_pos = torch.logical_and([results, torch.logical_not(labels)])
# false_neg = torch.logical_and([labels, torch.logical_not(results)])

# precision = true_pos.sum() / (true_pos.sum() + false_pos.sum())
# recall = true_pos.sum() / (true_pos.sum() + false_neg.sum())

Avg:
Acc: 0.6858924395946999 Prec: 0.6132398221005816 Recall: 0.864897466827503 F1: 0.7176458812931639
Min:
Acc: 0.6858924395946999 Prec: 0.6132398221005816 Recall: 0.864897466827503 F1: 0.7176458812931639
