In [None]:
# This notebook uses the Styleformer models to recreate the Snopes dataset with style-transferred claim text.
# Change the style_transfer_type integer to choose which kind of style transfer is applied:
# 0 for informal to formal, 1 for formal to informal, 2 for active to passive, 3 for passive to active

style_transfer_type = 1

from google.colab import drive
import os
import sys
# Install transformers into the Colab runtime before the model classes below are used.
!pip install transformers
# Mount Google Drive and add its root to the import path.
drive.mount('/content/drive')
sys.path.append('/content/drive/MyDrive')
print(os.getcwd())

Collecting transformers
  Downloading transformers-4.12.5-py3-none-any.whl (3.1 MB)
[K     |████████████████████████████████| 3.1 MB 5.4 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.46-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 38.2 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.2.1-py3-none-any.whl (61 kB)
[K     |████████████████████████████████| 61 kB 480 kB/s 
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 35.3 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 45.1 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attem

In [None]:
# Install the dependencies listed in the styleformer requirements file on the mounted Drive.
!pip install -r /content/drive/MyDrive/styleformer/requirements.txt

In [None]:
class Styleformer():
  """Sentence-level style transfer backed by the prithivida seq2seq checkpoints.

  Exactly one tokenizer/model pair is loaded, selected by ``style``:
  0 = casual -> formal, 1 = formal -> casual,
  2 = active -> passive, 3 = passive -> active.
  """

  def __init__(self,  style=0):
    """Load the tokenizer and model for ``style`` and create the adequacy scorer.

    Args:
      style: integer style id (0-3, see class docstring). For any other value
        no model is loaded, a notice is printed and ``model_loaded`` stays
        False.
    """
    from transformers import AutoTokenizer
    from transformers import AutoModelForSeq2SeqLM

    self.style = style
    self.adequacy = Adequacy()
    self.model_loaded = False

    if self.style == 0:
      self.ctf_tokenizer = AutoTokenizer.from_pretrained("prithivida/informal_to_formal_styletransfer")
      self.ctf_model = AutoModelForSeq2SeqLM.from_pretrained("prithivida/informal_to_formal_styletransfer")
      print("Casual to Formal model loaded...")
      self.model_loaded = True
    elif self.style == 1:
      self.ftc_tokenizer = AutoTokenizer.from_pretrained("prithivida/formal_to_informal_styletransfer")
      self.ftc_model = AutoModelForSeq2SeqLM.from_pretrained("prithivida/formal_to_informal_styletransfer")
      print("Formal to Casual model loaded...")
      self.model_loaded = True
    elif self.style == 2:
      self.atp_tokenizer = AutoTokenizer.from_pretrained("prithivida/active_to_passive_styletransfer")
      self.atp_model = AutoModelForSeq2SeqLM.from_pretrained("prithivida/active_to_passive_styletransfer")
      print("Active to Passive model loaded...")
      self.model_loaded = True
    elif self.style == 3:
      self.pta_tokenizer = AutoTokenizer.from_pretrained("prithivida/passive_to_active_styletransfer")
      self.pta_model = AutoModelForSeq2SeqLM.from_pretrained("prithivida/passive_to_active_styletransfer")
      print("Passive to Active model loaded...")
      self.model_loaded = True
    else:
      print("Only CTF, FTC, ATP and PTA are supported in the pre-release...stay tuned")

  def transfer(self, input_sentence, inference_on=0, quality_filter=0.95, max_candidates=5, max_length=32):
    """Transfer ``input_sentence`` into the style chosen at construction time.

    Args:
      input_sentence: text to rewrite.
      inference_on: 0 runs on "cpu", 1 runs on "cuda:0"; any other value
        prints a notice and runs on "cpu".
      quality_filter: minimum adequacy score a candidate must reach
        (styles 0 and 1 only).
      max_candidates: number of sampled candidates (styles 0 and 1 only).
      max_length: forwarded to ``generate`` as ``max_length``. Defaults to 32,
        the value that used to be hard-coded.

    Returns:
      The transferred sentence. ``None`` if no model is loaded, or (styles 0
      and 1) if no candidate reaches ``quality_filter``.
    """
    if not self.model_loaded:
      print("Models aren't loaded for this style, please use the right style during init")
      return None

    if inference_on == 0:
      device = "cpu"
    elif inference_on == 1:
      device = "cuda:0"
    else:
      device = "cpu"
      print("Onnx + Quantisation is not supported in the pre-release...stay tuned.")

    if self.style == 0:
      return self._casual_to_formal(input_sentence, device, quality_filter, max_candidates, max_length)
    if self.style == 1:
      return self._formal_to_casual(input_sentence, device, quality_filter, max_candidates, max_length)
    if self.style == 2:
      return self._active_to_passive(input_sentence, device, max_length)
    if self.style == 3:
      return self._passive_to_active(input_sentence, device, max_length)
    return None

  def _sample(self, tokenizer, model, prompt, device, num_candidates, max_length):
    """Sample ``num_candidates`` outputs for ``prompt``; return them decoded and stripped."""
    input_ids = tokenizer.encode(prompt, return_tensors='pt')
    input_ids = input_ids.to(device)
    preds = model.generate(
        input_ids,
        do_sample=True,
        max_length=max_length,
        top_k=50,
        top_p=0.95,
        early_stopping=True,
        num_return_sequences=num_candidates)
    return [tokenizer.decode(pred, skip_special_tokens=True).strip() for pred in preds]

  def _most_adequate(self, src_sentence, candidates, quality_filter, device):
    """Return the candidate with the highest adequacy score, or None if none qualifies."""
    # De-duplicate while keeping generation order. A set would make the winner
    # among equally scored candidates depend on string hash order.
    unique_candidates = list(dict.fromkeys(candidates))
    scored = self.adequacy.score(src_sentence, unique_candidates, quality_filter, device)
    if not scored:
      return None
    # max() keeps the first of equally scored items, matching a stable descending sort.
    return max(scored.items(), key=lambda item: item[1])[0]

  def _formal_to_casual(self, input_sentence, device, quality_filter, max_candidates, max_length=32):
    """Formal -> casual: sample candidates and keep the most adequate one."""
    self.ftc_model = self.ftc_model.to(device)
    candidates = self._sample(self.ftc_tokenizer, self.ftc_model,
                              "transfer Formal to Casual: " + input_sentence,
                              device, max_candidates, max_length)
    return self._most_adequate(input_sentence, candidates, quality_filter, device)

  def _casual_to_formal(self, input_sentence, device, quality_filter, max_candidates, max_length=32):
    """Casual -> formal: sample candidates and keep the most adequate one."""
    self.ctf_model = self.ctf_model.to(device)
    candidates = self._sample(self.ctf_tokenizer, self.ctf_model,
                              "transfer Casual to Formal: " + input_sentence,
                              device, max_candidates, max_length)
    return self._most_adequate(input_sentence, candidates, quality_filter, device)

  def _active_to_passive(self, input_sentence, device, max_length=32):
    """Active -> passive: return a single sampled rewrite (no adequacy filtering)."""
    self.atp_model = self.atp_model.to(device)
    return self._sample(self.atp_tokenizer, self.atp_model,
                        "transfer Active to Passive: " + input_sentence,
                        device, 1, max_length)[0]

  def _passive_to_active(self, input_sentence, device, max_length=32):
    """Passive -> active: return a single sampled rewrite (no adequacy filtering)."""
    self.pta_model = self.pta_model.to(device)
    return self._sample(self.pta_tokenizer, self.pta_model,
                        "transfer Passive to Active: " + input_sentence,
                        device, 1, max_length)[0]
    
class Adequacy():
  """Scores how well a paraphrase preserves the meaning of a source phrase.

  A sequence-classification checkpoint is run on (source, paraphrase) pairs.
  The adequacy score is the softmax probability of logit index 2
  ("entailment") against logit index 0, with index 1 ("neutral") discarded.
  """

  def __init__(self, model_tag='prithivida/parrot_adequacy_on_BART'):
    """Load the classifier and its tokenizer for ``model_tag``."""
    from transformers import AutoModelForSequenceClassification, AutoTokenizer
    self.nli_model = AutoModelForSequenceClassification.from_pretrained(model_tag)
    self.tokenizer = AutoTokenizer.from_pretrained(model_tag)

  def _scored_phrases(self, input_phrase, para_phrases, device):
    """Return ``[(para_phrase, adequacy_score), ...]`` in the order given."""
    import torch

    # Moving the model does not depend on the phrase, so do it once per call.
    self.nli_model = self.nli_model.to(device)
    scored = []
    # Pure inference: disable autograd so no graph is recorded per phrase.
    with torch.no_grad():
      for para_phrase in para_phrases:
        # `truncation=` is the current spelling of the deprecated `truncation_strategy=`.
        x = self.tokenizer.encode(input_phrase, para_phrase, return_tensors='pt', truncation='only_first')
        logits = self.nli_model(x.to(device))[0]
        # we throw away "neutral" (dim 1) and take the probability of "entailment" (2) as the adequacy score
        entail_contradiction_logits = logits[:,[0,2]]
        probs = entail_contradiction_logits.softmax(dim=1)
        prob_label_is_true = probs[:,1]
        scored.append((para_phrase, prob_label_is_true[0].item()))
    return scored

  def filter(self, input_phrase, para_phrases, adequacy_threshold, device="cpu"):
    """Return the paraphrases whose adequacy score is >= ``adequacy_threshold``.

    Args:
      input_phrase: the source text.
      para_phrases: iterable of candidate paraphrases.
      adequacy_threshold: minimum score (inclusive) to keep a candidate.
      device: device string the model and inputs are moved to.

    Returns:
      List of kept paraphrases, in input order.
    """
    return [phrase
            for phrase, adequacy_score in self._scored_phrases(input_phrase, para_phrases, device)
            if adequacy_score >= adequacy_threshold]

  def score(self, input_phrase, para_phrases, adequacy_threshold, device="cpu"):
    """Return ``{paraphrase: score}`` for paraphrases scoring >= ``adequacy_threshold``.

    Args:
      input_phrase: the source text.
      para_phrases: iterable of candidate paraphrases.
      adequacy_threshold: minimum score (inclusive) to keep a candidate.
      device: device string the model and inputs are moved to.

    Returns:
      Dict mapping each kept paraphrase to its adequacy score (a float).
    """
    return {phrase: adequacy_score
            for phrase, adequacy_score in self._scored_phrases(input_phrase, para_phrases, device)
            if adequacy_score >= adequacy_threshold}

In [None]:
# Show the GPU(s) visible to this runtime before running inference on "cuda:0".
!nvidia-smi

Mon Dec  6 16:11:17 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 495.44       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla K80           Off  | 00000000:00:04.0 Off |                    0 |
| N/A   48C    P8    31W / 149W |      0MiB / 11441MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
import pandas as pd
import os
import torch
import warnings
import transformers
import re
# Suppress all Python warnings from here on.
warnings.filterwarnings("ignore")

# Load the model for the style chosen via style_transfer_type (set in the first cell).
sf = Styleformer(style = style_transfer_type)

# Input TSV files on the mounted Google Drive.
snes_path = "/content/drive/MyDrive/Deep Learning NLP/paper/snes.tsv"
snes_snippets_path = "/content/drive/MyDrive/Deep Learning NLP/paper/snes_snippets.tsv"

# Column names are passed explicitly via names=, so the first row of the file is read as data.
snes_col_list = ["claimID", "claim", "label", "claimURL", "reason", "categories", "speaker",
                 "checker", "tags", "articleTitle", "publishDate", "claimDate", "entities"]
df_snes = pd.read_csv(snes_path, sep='\t', names=snes_col_list, index_col=False)

# NOTE(review): df_snippets is not used by any cell visible in this notebook — confirm it is still needed.
snes_snippets_col_list = ["rank_position", "snippet1", "snippet2", "snippet3", "snippet4", 
                 "snippet5", "snippet6", "snippet7", "snippet8", "snippet9", "snippet10"]
df_snippets = pd.read_csv(snes_snippets_path, sep='\t', names=snes_snippets_col_list, index_col=False)

Downloading:   0%|          | 0.00/908 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.52G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.84k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.36k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/773k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.32M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.74k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/850M [00:00<?, ?B/s]

Formal to Casual model loaded...


In [None]:
# Display the claims as loaded, before any style transfer is applied.
df_snes

Unnamed: 0,claimID,claim,label,claimURL,reason,categories,speaker,checker,tags,articleTitle,publishDate,claimDate,entities
0,snes-04484,Pope Francis endorsed Donald Trump for president.,false,https://www.snopes.com/fact-check/pope-francis...,,Junk News,,Dan Evon,,"Pope Francis Shocks World, Endorses Donald Tru...",10 July 2016,,['None']
1,snes-03162,A law firm working with Donald Trump was named...,mixture,https://www.snopes.com/fact-check/russian-law-...,,Politicians,,Dan Evon,,Did Donald Trump Engage the ‘Russian Law Firm ...,11 January 2017,,['None']
2,snes-05436,"McDonald's is getting rid of its ""Dollar Menu""...",mostly false,https://www.snopes.com/fact-check/mcdonalds-ge...,,Business,,Kim LaCapria,,McDonald’s Getting Rid of the Dollar Menu?,29 December 2015,,['None']
3,snes-02901,A 2016 bill before the Kentucky legislature wo...,true,https://www.snopes.com/fact-check/kentucky-bil...,,Politics,,David Emery,,Kentucky Lawmaker Introduced Bill Requiring Me...,20 February 2017,,['Kentucky']
4,snes-03347,Everyone with cancer has a pH that is too acidic.,false,https://www.snopes.com/fact-check/everyone-who...,,Medical,,Alex Kasprak,,Does Everyone with Cancer Have a pH That Is To...,14 December 2016,,['None']
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5064,snes-04959,A photograph shows William Harley and Arthur D...,false,https://www.snopes.com/fact-check/harley-david...,,Uncategorized,,Dan Evon,,William Harley and Arthur Davidson Unveiling T...,4 April 2016,,['None']
5065,snes-01599,"Image depicts ""attitude adjustment"" and ""smile...",false,https://www.snopes.com/fact-check/1930s-housew...,,Fauxtography,,Kim LaCapria,,Does This Image Show a 1930s Housewife Forced ...,12 October 2017,,['United_States']
5066,snes-02900,A cook attempted to poison George Washington b...,false,https://www.snopes.com/fact-check/red-death/,,Food,,David Mikkelson,,Attempted Tomato Assassination of George Washi...,4 February 2013,,['None']
5067,snes-03298,"Cavities are contagious, primarily between int...",mixture,https://www.snopes.com/fact-check/are-cavities...,,Medical,,Kim LaCapria,,Are Cavities Contagious?,20 December 2016,,['None']


In [None]:
# Replace every claim in df_snes with its style-transferred version.
# Claims are split on ellipses (three or more dots), each fragment is
# transferred separately, and the results are re-joined with " ... ".
for index, row in df_snes.iterrows():
  claim = row['claim']
  # pandas reads an empty TSV field as NaN (a float); re.split would raise on it.
  if not isinstance(claim, str):
    print(str(index+1) + ": skipped, claim is not text")
    continue

  transferred_parts = []
  for split_sentence in re.split(r'\.\.\.+', claim):
    # A leading or trailing ellipsis yields blank fragments; there is nothing to transfer.
    if not split_sentence.strip():
      continue
    target_sentence = sf.transfer(split_sentence, inference_on=1)
    print("-" *100)
    print("[Original] ", split_sentence)
    print("-" *100)
    if target_sentence is not None:
      print("[Informal] ",target_sentence)
      print()
    else:
      print("No good quality transfers available !")
      # Keep the original fragment rather than dropping it. Otherwise a claim
      # whose fragments all fail the quality filter would be saved as "".
      target_sentence = split_sentence.strip()
    transferred_parts.append(target_sentence)

  # If the claim held no transferable text at all, leave it as it was.
  full_target = " ... ".join(transferred_parts) if transferred_parts else claim
  print("[Full Informal] ", full_target)
  print(str(index+1) + ": " + claim)
  df_snes.at[index, 'claim'] = full_target

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
----------------------------------------------------------------------------------------------------
[Original]  The Trump International Hotel in Washington D.C. handed out flyers promoting an anti-CNN petition to guests at check-in.
----------------------------------------------------------------------------------------------------
[Informal]  at check in the Trump International Hotel in Washington D.C. distributed flyers on the anticnn petition.

[Full Informal]  at check in the Trump International Hotel in Washington D.C. distributed flyers on the anticnn petition.
4356: The Trump International Hotel in Washington D.C. handed out flyers promoting an anti-CNN petition to guests at check-in.
----------------------------------------------------------------------------------------------------
[Original]  Rihanna was barred from entering Senegal on the grounds that she was suspected of being part of the Illuminati.
--------

In [None]:
# Save the transferred claims as a tab-separated file without a header row or index column.
# NOTE(review): the output name is fixed to "snes_informal" regardless of style_transfer_type — confirm.
df_snes.to_csv('/content/drive/MyDrive/Deep Learning NLP/paper/snes_informal.tsv', 
               sep='\t', header=False, index=False)

In [None]:
# Display the frame again; the 'claim' column now holds the style-transferred text.
df_snes

Unnamed: 0,claimID,claim,label,claimURL,reason,categories,speaker,checker,tags,articleTitle,publishDate,claimDate,entities
0,snes-04484,YES....POTUS FRANCIS SPOT Donald Trump FOR PRE...,false,https://www.snopes.com/fact-check/pope-francis...,,Junk News,,Dan Evon,,"Pope Francis Shocks World, Endorses Donald Tru...",10 July 2016,,['None']
1,snes-03162,The law firm working with Donald Trump won Rus...,mixture,https://www.snopes.com/fact-check/russian-law-...,,Politicians,,Dan Evon,,Did Donald Trump Engage the ‘Russian Law Firm ...,11 January 2017,,['None']
2,snes-05436,"I hear that McDonald's is getting rid of its ""...",mostly false,https://www.snopes.com/fact-check/mcdonalds-ge...,,Business,,Kim LaCapria,,McDonald’s Getting Rid of the Dollar Menu?,29 December 2015,,['None']
3,snes-02901,2016 Kentucky law would have required men to g...,true,https://www.snopes.com/fact-check/kentucky-bil...,,Politics,,David Emery,,Kentucky Lawmaker Introduced Bill Requiring Me...,20 February 2017,,['Kentucky']
4,snes-03347,Everyone with cancer has a pH that is too acid.,false,https://www.snopes.com/fact-check/everyone-who...,,Medical,,Alex Kasprak,,Does Everyone with Cancer Have a pH That Is To...,14 December 2016,,['None']
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5064,snes-04959,William Harley and Arthur Davidson unveiled th...,false,https://www.snopes.com/fact-check/harley-david...,,Uncategorized,,Dan Evon,,William Harley and Arthur Davidson Unveiling T...,4 April 2016,,['None']
5065,snes-01599,image of attitude adduction and smile therapy ...,false,https://www.snopes.com/fact-check/1930s-housew...,,Fauxtography,,Kim LaCapria,,Does This Image Show a 1930s Housewife Forced ...,12 October 2017,,['United_States']
5066,snes-02900,cook tried to poison george washington by feed...,false,https://www.snopes.com/fact-check/red-death/,,Food,,David Mikkelson,,Attempted Tomato Assassination of George Washi...,4 February 2013,,['None']
5067,snes-03298,Cavities spread a lot...it's usually between s...,mixture,https://www.snopes.com/fact-check/are-cavities...,,Medical,,Kim LaCapria,,Are Cavities Contagious?,20 December 2016,,['None']
