In [None]:
# Colab-only setup cell: mounts Google Drive and makes MyDrive importable.
from google.colab import drive
import os
import sys
# Shell escape: installs transformers into the running kernel's environment (no version pin).
!pip install transformers
drive.mount('/content/drive', force_remount=True)
# Lets `import <module>` find .py files stored directly under MyDrive.
sys.path.append('/content/drive/MyDrive')
# Shows the working directory the later cells run from.
print(os.getcwd())

Mounted at /content/drive
/content


In [None]:
# Installs the requirements file kept in the Drive copy of styleformer.
# Per the captured log below, line 1 of that file pulls styleformer from its GitHub repository.
!pip install -r /content/drive/MyDrive/styleformer/requirements.txt

Obtaining styleformer from git+https://github.com/PrithivirajDamodaran/Styleformer.git#egg=styleformer (from -r /content/drive/MyDrive/styleformer/requirements.txt (line 1))
  Updating ./src/styleformer clone
  Running command git fetch -q --tags
  Running command git reset --hard -q 505ca67741d4f4e4e6f1997e3842b30e3f9a21f8
Installing collected packages: styleformer
  Attempting uninstall: styleformer
    Found existing installation: styleformer 0.1
    Can't uninstall 'styleformer'. No files were found to uninstall.
  Running setup.py develop for styleformer
Successfully installed styleformer-0.1


In [None]:
# !ls /content/drive/MyDrive/styleformer/*.py
# sys.path.append('/content/drive/MyDrive/styleformer/src/styleformer')

In [None]:
class Styleformer():
  """Wrapper around one of four seq2seq style-transfer checkpoints.

  An instance serves exactly one transfer direction, chosen by ``style``:
    0 - casual to formal    (attributes ``ctf_tokenizer`` / ``ctf_model``)
    1 - formal to casual    (attributes ``ftc_tokenizer`` / ``ftc_model``)
    2 - active to passive   (attributes ``atp_tokenizer`` / ``atp_model``)
    3 - passive to active   (attributes ``pta_tokenizer`` / ``pta_model``)

  Styles 0 and 1 sample several candidates and return the one with the highest
  adequacy score among those reaching ``quality_filter``; styles 2 and 3 return
  a single sampled sentence.
  """

  # style id -> (attribute prefix, checkpoint name, "loaded" message, task prefix put before the input)
  _STYLES = {
      0: ("ctf", "prithivida/informal_to_formal_styletransfer",
          "Casual to Formal model loaded...", "transfer Casual to Formal: "),
      1: ("ftc", "prithivida/formal_to_informal_styletransfer",
          "Formal to Casual model loaded...", "transfer Formal to Casual: "),
      2: ("atp", "prithivida/active_to_passive_styletransfer",
          "Active to Passive model loaded...", "transfer Active to Passive: "),
      3: ("pta", "prithivida/passive_to_active_styletransfer",
          "Passive to Active model loaded...", "transfer Passive to Active: "),
  }

  def __init__(self,  style=0):
    """Load the tokenizer/model pair for ``style`` plus the adequacy scorer.

    An unsupported ``style`` only prints a notice; ``model_loaded`` then stays
    False and ``transfer`` refuses to run.
    """
    from transformers import AutoTokenizer
    from transformers import AutoModelForSeq2SeqLM

    self.style = style
    self.adequacy = Adequacy()
    self.model_loaded = False

    # Compared with == rather than hashed, so any odd ``style`` value still
    # ends up in the friendly "not supported" branch instead of raising.
    spec = next((entry for key, entry in self._STYLES.items() if key == self.style), None)
    if spec is None:
      print("Only CTF, FTC, ATP and PTA are supported in the pre-release...stay tuned")
      return

    attr_prefix, checkpoint, loaded_message, _ = spec
    setattr(self, attr_prefix + "_tokenizer", AutoTokenizer.from_pretrained(checkpoint))
    setattr(self, attr_prefix + "_model", AutoModelForSeq2SeqLM.from_pretrained(checkpoint))
    print(loaded_message)
    self.model_loaded = True

  def transfer(self, input_sentence, inference_on=0, quality_filter=0.95, max_candidates=5):
    """Rewrite ``input_sentence`` in the style this instance was built for.

    Args:
      input_sentence: sentence to rewrite (the task prefix is added internally).
      inference_on: 0 = CPU, 1 = first CUDA device (falls back to CPU with a
        notice when CUDA is unavailable); any other value runs on CPU and
        prints that Onnx/quantisation is not supported.
      quality_filter: minimum adequacy score a candidate must reach
        (styles 0 and 1 only).
      max_candidates: number of sampled candidates (styles 0 and 1 only).

    Returns:
      The rewritten sentence, or None when no model is loaded or (styles 0/1)
      no candidate reaches ``quality_filter``.
    """
    if not self.model_loaded:
      print("Models aren't loaded for this style, please use the right style during init")
      return None

    if inference_on == 0:
      device = "cpu"
    elif inference_on == 1:
      import torch
      if torch.cuda.is_available():
        device = "cuda:0"
      else:
        # Moving the model to a missing GPU would raise; degrade gracefully instead.
        device = "cpu"
        print("CUDA is not available, running inference on CPU instead.")
    else:
      device = "cpu"
      print("Onnx + Quantisation is not supported in the pre-release...stay tuned.")

    if self.style == 0:
      return self._casual_to_formal(input_sentence, device, quality_filter, max_candidates)
    elif self.style == 1:
      return self._formal_to_casual(input_sentence, device, quality_filter, max_candidates)
    elif self.style == 2:
      return self._active_to_passive(input_sentence, device)
    elif self.style == 3:
      return self._passive_to_active(input_sentence, device)
    return None

  def _sample(self, style_id, input_sentence, device, num_candidates):
    """Sample ``num_candidates`` rewrites with the model registered for ``style_id``.

    Returns the decoded, stripped sentences in generation order.
    """
    attr_prefix, _, _, task_prefix = self._STYLES[style_id]
    tokenizer = getattr(self, attr_prefix + "_tokenizer")
    input_ids = tokenizer.encode(task_prefix + input_sentence, return_tensors='pt')
    # Keep the attribute pointing at the moved model, as the per-style methods always did.
    model = getattr(self, attr_prefix + "_model").to(device)
    setattr(self, attr_prefix + "_model", model)
    input_ids = input_ids.to(device)

    preds = model.generate(
        input_ids,
        do_sample=True,
        max_length=32,
        top_k=50,
        top_p=0.95,
        early_stopping=True,
        num_return_sequences=num_candidates)

    return [tokenizer.decode(pred, skip_special_tokens=True).strip() for pred in preds]

  def _best_adequate(self, style_id, input_sentence, device, quality_filter, max_candidates):
    """Sample candidates and return the highest-scoring adequate one, or None."""
    # dict.fromkeys drops duplicate candidates while keeping generation order.
    candidates = list(dict.fromkeys(self._sample(style_id, input_sentence, device, max_candidates)))
    # Adequacy is judged against the caller's sentence, without the task prefix.
    adequacy_scored_phrases = self.adequacy.score(input_sentence, candidates, quality_filter, device)
    ranked_sentences = sorted(adequacy_scored_phrases.items(), key=lambda item: item[1], reverse=True)
    if len(ranked_sentences) > 0:
      return ranked_sentences[0][0]
    return None

  def _formal_to_casual(self, input_sentence, device, quality_filter, max_candidates):
    """Formal -> casual rewrite, filtered and ranked by adequacy."""
    return self._best_adequate(1, input_sentence, device, quality_filter, max_candidates)

  def _casual_to_formal(self, input_sentence, device, quality_filter, max_candidates):
    """Casual -> formal rewrite, filtered and ranked by adequacy."""
    return self._best_adequate(0, input_sentence, device, quality_filter, max_candidates)

  def _active_to_passive(self, input_sentence, device):
    """Active -> passive rewrite; a single sample, no adequacy filtering."""
    return self._sample(2, input_sentence, device, 1)[0]

  def _passive_to_active(self, input_sentence, device):
    """Passive -> active rewrite; a single sample, no adequacy filtering."""
    return self._sample(3, input_sentence, device, 1)[0]



    
class Adequacy():
  """Scores how well a generated phrase matches its source phrase.

  Wraps a sequence-classification checkpoint whose logits are read as
  NLI-style classes; the adequacy score is the probability of class 2
  ("entailment") after discarding class 1 ("neutral").
  """

  def __init__(self, model_tag='prithivida/parrot_adequacy_on_BART'):
    """Load the classifier and its tokenizer from ``model_tag``."""
    from transformers import AutoModelForSequenceClassification, AutoTokenizer
    self.nli_model = AutoModelForSequenceClassification.from_pretrained(model_tag)
    self.tokenizer = AutoTokenizer.from_pretrained(model_tag)

  def _adequacy_scores(self, input_phrase, para_phrases, device):
    """Yield ``(para_phrase, adequacy_score)`` for every candidate, in input order."""
    import torch

    # The device is the same for every candidate, so move the model once.
    self.nli_model = self.nli_model.to(device)
    for para_phrase in para_phrases:
      # `truncation=` is the current spelling of the deprecated `truncation_strategy=`.
      x = self.tokenizer.encode(input_phrase, para_phrase, return_tensors='pt', truncation='only_first')
      # Pure inference: no autograd graph is needed for a score.
      with torch.no_grad():
        logits = self.nli_model(x.to(device))[0]
      # we throw away "neutral" (dim 1) and take the probability of "entailment" (2) as the adequacy score
      entail_contradiction_logits = logits[:, [0, 2]]
      probs = entail_contradiction_logits.softmax(dim=1)
      yield para_phrase, probs[0, 1].item()

  def filter(self, input_phrase, para_phrases, adequacy_threshold, device="cpu"):
    """Return the candidates whose adequacy score is >= ``adequacy_threshold``.

    Order and duplicates of ``para_phrases`` are preserved.
    """
    return [
        para_phrase
        for para_phrase, adequacy_score in self._adequacy_scores(input_phrase, para_phrases, device)
        if adequacy_score >= adequacy_threshold
    ]

  def score(self, input_phrase, para_phrases, adequacy_threshold, device="cpu"):
    """Return ``{candidate: score}`` for candidates scoring >= ``adequacy_threshold``."""
    return {
        para_phrase: adequacy_score
        for para_phrase, adequacy_score in self._adequacy_scores(input_phrase, para_phrases, device)
        if adequacy_score >= adequacy_threshold
    }


In [None]:
# Shell escape: report the GPU attached to this runtime and its current memory use.
!nvidia-smi

Wed Dec  1 06:34:23 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 495.44       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-SXM2...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   34C    P0    40W / 300W |   4313MiB / 16160MiB |      6%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
# Imports for the data-processing part, then model construction and data loading.
import torch
import warnings
import transformers
import re
# Suppresses every Python warning from here on.
warnings.filterwarnings("ignore")
import pandas as pd
import os

# style=0 selects the casual -> formal direction; `sf` is the module-level
# instance that get_formal_snippet() calls.
sf = Styleformer(style = 0)

# Input TSV files on the mounted Drive.
snes_path = "/content/drive/MyDrive/Deep Learning NLP/paper/snes.tsv"
snes_snippets_path = "/content/drive/MyDrive/Deep Learning NLP/paper/snes_snippets.tsv"

# Column names are passed explicitly via `names=` for both files.
snes_col_list = ["claimID", "claim", "label", "claimURL", "reason", "categories", "speaker",
                 "checker", "tags", "articleTitle", "publishDate", "claimDate", "entities"]
# NOTE(review): df_snes is not referenced again in the visible part of the notebook.
df_snes = pd.read_csv(snes_path, sep='\t', names=snes_col_list, index_col=False)

# NOTE(review): in the displayed frame further down, the "rank_position" column holds
# values such as "snes-04484", i.e. it looks like a claim id - confirm the intended name.
snes_snippets_col_list = ["rank_position", "snippet1", "snippet2", "snippet3", "snippet4", 
                 "snippet5", "snippet6", "snippet7", "snippet8", "snippet9", "snippet10"]
df_snippets = pd.read_csv(snes_snippets_path, sep='\t', names=snes_snippets_col_list, index_col=False)

In [None]:
# Peek at one raw snippet; the " ... " runs in it are the separators get_formal_snippet() splits on.
df_snippets.iloc[0]['snippet1']

'Oct 24, 2016 ... Q: Did Pope Francis endorse Donald Trump? ... “Pope Francis Shocks World,  Endorses Donald Trump for President, Releases Statement.'

In [None]:
def get_formal_snippet(snippet, styleformer=None, inference_on=1):
  """Rewrite a search-result snippet fragment by fragment.

  The snippet is split on runs of three or more dots; every non-blank fragment
  is passed to ``styleformer.transfer`` and the successful rewrites are joined
  back together with " ... ". Progress is printed for every fragment.

  Args:
    snippet: raw snippet text. The placeholder 'filler', blank strings and
      non-string values (e.g. the NaN pandas produces for an empty TSV field)
      are returned without calling the model.
    styleformer: object with a ``transfer(sentence, inference_on=...)`` method;
      defaults to the module-level ``sf``.
    inference_on: device flag forwarded to ``transfer`` (1 = GPU, as before).

  Returns:
    The joined rewritten text; '' if no fragment produced a rewrite; or the
    input itself for the pass-through cases above (strings come back stripped).
  """
  # A missing cell is not a str and has no .strip(); pass it through untouched.
  if not isinstance(snippet, str):
    return snippet

  snippet = snippet.strip()
  if snippet == 'filler' or snippet == '':
    return snippet

  if styleformer is None:
    styleformer = sf

  full_target = ""
  for split_sentence in re.split(r'\.\.\.+', snippet):
    if split_sentence.strip() == '':
      continue
    target_sentence = styleformer.transfer(split_sentence, inference_on=inference_on)
    print("-" *100)
    print("[Casual] ", split_sentence)
    print("-" *100)
    if target_sentence is None:
      # transfer() found no candidate above its quality threshold; drop the fragment.
      print("No good quality transfers available !")
      continue
    print("[Formal] ",target_sentence)
    print()
    if full_target != "":
      full_target += " ... " + target_sentence
    else:
      full_target += target_sentence
  print("[Full Formal] ", full_target)
  return full_target

# Rewrite every snippet cell in place: for each row, columns snippet1..snippet10 are
# run through get_formal_snippet(). `row` is the copy iterrows() handed out, so the
# writes via .at do not change what later columns of the same row read.
for index, row in df_snippets.iterrows():
  for snippet_no in range(1, 11):
    column = 'snippet' + str(snippet_no)
    # Progress marker: 1-based row number plus the snippet being processed.
    print(str(index+1) + " snippet " + str(snippet_no))
    df_snippets.at[index, column] = get_formal_snippet(row[column])

# Persist the rewritten table next to the inputs (no header row, no index column).
df_snippets.to_csv('/content/drive/MyDrive/Deep Learning NLP/paper/snes_formal_snippets.tsv',
               sep='\t', header=False, index=False)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
[Casual]   them from a  bunker less than two miles from the White House. 
----------------------------------------------------------------------------------------------------
[Formal]  They are from a bunker less than two miles from the White House.

----------------------------------------------------------------------------------------------------
[Casual]   Around The Web.
----------------------------------------------------------------------------------------------------
[Formal]  It is around the internet.

[Full Formal]  The date is February 11, 2017. ... When Barack Obama stated he was emotional by antiTrump protests, he was sending a message of approval to his troops. ... Obama has an army of more than 30,000 agitators. ... They are from a bunker less than two miles from the White House. ... It is around the internet.
2475 snippet 2
----------------------------------------------------------------------------------

In [None]:
# Writes df_snippets to the same TSV path as the end of the previous cell
# (tab-separated, no header row, no index column).
df_snippets.to_csv('/content/drive/MyDrive/Deep Learning NLP/paper/snes_formal_snippets.tsv', 
               sep='\t', header=False, index=False)


In [None]:
# Display the frame after the rewrite loop; unprocessed placeholder cells still read "filler".
df_snippets

Unnamed: 0,rank_position,snippet1,snippet2,snippet3,snippet4,snippet5,snippet6,snippet7,snippet8,snippet9,snippet10
0,snes-04484,"The date is Oct 24, 2016. ... I would like to ...","December 30th, 2016. ... While a closer examin...","The date is January 14, 2018. ... However, bef...","This is the date of December 7, 2016. ... Pope...",The date is the 22 January 2018. ... Donald Tr...,"The date is January 23, 2018. ... The article ...","That's on December 6, 2016. ... An analysis co...","It was the third of April, 2018. ... This scre...","Nov 16, 2016. ... Despite the fact that Hillar...",filler
1,snes-03162,"May 16, 2017. ... I find it incredible that a ...","May 12, 2017. ... In 2016, Trump's tax law fir...","May 12, 2017. ... A law firm stated that Donal...","It occurred January 11, 2017. ... A law firm a...","January 11, 2017. ... Trump's law firm was nam...","The date is January 11, 2017. ... Sheri Dillon...","May 12, 2017. ... Trump used the law firm of t...","The date is January 11, 2017. ... Morgan Lewis...","The date is May 2, 2016. ... Chambers & Partne...",filler
2,snes-05436,"The date is Nov 18, 2015. ... McDonald's is ge...","The date is January 23, 2017. ... McDonald's i...","The date is February 24, 2016. ... Will you st...","The date of January 5, 2018. ... The launch of...","You can purchase a $1, $2, and 3 Dollar Menu. ...","The date is December 5, 2017. ... McDonald's i...",That's how it started with the shake. Creamy v...,"The date is June 29, 2018. ... Why is McDonald...","The date is February 26, 2016. ... The article...",filler
3,snes-02901,"The date is 'feb 17, 2016'. ... KY bills requi...","The date is February 12, 2016. ... A Kentucky ...","The date is February 16, 2016. ... A Kentucky ...","The date is February 16, 2016. ... A Kentucky ...","The date is Feb 15, 2016. ... CBS News is upda...","The date is February 15, 2016. ... A Kentucky ...",This Kentucky lawmaker passed a bill which req...,"February 15, 2016. ... The bill would require ...","The date is Feb 20, 2017. ... A US politician ...",filler
4,snes-03347,"The date is July 4, 2015. ... Please become a ...",Take action when you are diagnosed with cancer...,Each cancer patient has a pH which is too acid...,"The date is May 25, 2018. ... Our blood's pH i...",The importance of understanding blood pH in th...,"The date is April 4, 2013. ... If you are diag...","Equally important, although the blood pH does ...","It is August 29, 2018. ... Something with a pH...","April 1, 2014. ... Someone told me that I have...",filler
...,...,...,...,...,...,...,...,...,...,...,...
2495,snes-00316,"In Texas, a man was freed after outliving a 99...","The date is July 27, 2018. ... A 117yearold ma...","July 21, 2018 ... A 117 year old man from Texa...",There are a very long list of sentences in pri...,"This date is Dec 19, 2011. ... The longest ser...","Nov 30, 2018. ... A man was released from pris...",A vegan lion tamer was arrested for feeding hi...,"The date is Jul 24, 2010. ... Richard Honeck o...",Your browser does not currently recognize any ...,filler
2496,snes-03070,"May 11, 2017. ... The analysis of the United S...",ISIS claimed responsibility for an explosion t...,"The date is Sep 23, 2016. ... If Donald Trump ...",Obama remembers that after the second plane st...,The rise of Donald Trump in American politics ...,Certain of this work considers marriage as an ...,"June 4, 2018. ... An online user wrote, I real...","The date is the 30th of March, 2017. ... Will ...","January 9, 2018. ... The world is a vast and c...",filler
2497,snes-00049,"The time is Sep 24, 2018. ... What's more, Chr...","Sep 19, 2018. ... A photograph shows Christine...",It is Sep 22 2018. ... One of the enduring mys...,"Sep 26, 2018. ... The first photograph shows a...","Sep 27, 2018. ... Republicans might attempt to...","Sep 20, 2018. ... Christine Blasey Ford was re...","Sep 27, 2018. ... Jenny Starrs, and Jabin Bots...","Sep 25, 2018. ... Christine Blasey Ford's alle...","This occurred on Sep 19, 2018. ... Samantha Gu...",filler
2498,snes-00836,"Armenian Americans are citizens, residents of ...","The date is Jun 28, 2000. ... State for Politi...","In Russian, that is actually a transcontinenta...","May 4th, 2018. ... UNESCO aims to build on wha...",Postcolonial Trauma Fiction : Beyond the Dark ...,Intrepid explorers have joined in with the Ame...,Aug 16 is 2017. ... President Trump meets with...,"This time is July 26, 2015. ... It was argued ...",I am welcome to leave comments at your local U...,filler
