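Reference for scoring the grammatical correctness of a sentence with language-model perplexity (BERT vs. GPT-2): https://www.scribendi.ai/comparing-bert-and-gpt-2-as-language-models-to-score-the-grammatical-correctness-of-a-sentence/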

# Installing Java-8

In [1]:
import os

In [2]:
# Remove any OpenJDK / IcedTea packages matched by the globs, then install the
# headless Java 8 JDK quietly (install output is discarded).
# NOTE(review): `purge` is run without `-y`; confirm it does not stall on a prompt.
!apt-get purge openjdk-\* icedtea-\* icedtea6-\*
!apt-get install -y openjdk-8-jdk-headless -qq > /dev/null
# Point JAVA_HOME at the Java 8 install directory for this kernel and its child processes.
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
# Print the Java version that is now on PATH to verify the install.
!java -version

Reading package lists... Done
Building dependency tree       
Reading state information... Done
Note, selecting 'openjdk-9-jre-headless' for glob 'openjdk-*'
Note, selecting 'openjdk-8-jdk' for glob 'openjdk-*'
Note, selecting 'openjdk-8-jre' for glob 'openjdk-*'
Note, selecting 'openjdk-6-jre' for glob 'openjdk-*'
Note, selecting 'openjdk-6-jre-headless' for glob 'openjdk-*'
Note, selecting 'openjdk-11-demo' for glob 'openjdk-*'
Note, selecting 'openjdk-8-demo' for glob 'openjdk-*'
Note, selecting 'openjdk-11-source' for glob 'openjdk-*'
Note, selecting 'openjdk-8-jre-dcevm' for glob 'openjdk-*'
Note, selecting 'openjdk-11-jre-headless' for glob 'openjdk-*'
Note, selecting 'openjdk-11-dbg' for glob 'openjdk-*'
Note, selecting 'openjdk-11-doc' for glob 'openjdk-*'
Note, selecting 'openjdk-8-jdk-headless' for glob 'openjdk-*'
Note, selecting 'openjdk-7-jre-headless' for glob 'openjdk-*'
Note, selecting 'openjdk-8-jre-zero' for glob 'openjdk-*'
Note, selecting 'openjdk-8-source' for glob

# Importing Libraries

In [3]:
# Install the two third-party packages imported in the next cell.
# NOTE(review): versions are unpinned; the recorded run below resolved
# transformers 4.5.1 -- consider pinning once a working set is confirmed.
!pip install transformers
!pip install language_check

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/d8/b2/57495b5309f09fa501866e225c84532d1fd89536ea62406b2181933fb418/transformers-4.5.1-py3-none-any.whl (2.1MB)
[K     |████████████████████████████████| 2.1MB 4.8MB/s 
Collecting tokenizers<0.11,>=0.10.1
[?25l  Downloading https://files.pythonhosted.org/packages/ae/04/5b870f26a858552025a62f1649c20d29d2672c02ff3c3fb4c688ca46467a/tokenizers-0.10.2-cp37-cp37m-manylinux2010_x86_64.whl (3.3MB)
[K     |████████████████████████████████| 3.3MB 37.2MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/75/ee/67241dc87f266093c533a2d4d3d69438e57d7a90abb216fa076e7d475d4a/sacremoses-0.0.45-py3-none-any.whl (895kB)
[K     |████████████████████████████████| 901kB 48.2MB/s 
Installing collected packages: tokenizers, sacremoses, transformers
Successfully installed sacremoses-0.0.45 tokenizers-0.10.2 transformers-4.5.1
Collecting language_check
  Downloading https://files.pythonhosted

In [4]:
import json
import torch
import math
import numpy as np
import pandas as pd
import language_check
import matplotlib.pyplot as plt
from google.colab import files
from transformers import GPT2LMHeadModel, GPT2TokenizerFast

In [5]:
# One LanguageTool checker per English variant, created in order (US first, then GB).
tool_us, tool_uk = (
  language_check.LanguageTool(locale) for locale in ('en-US', 'en-GB')
)

# Reading Dataset

In [6]:
# Load the raw dataset from Google Drive.
# NOTE(review): this path only exists once Drive is mounted at /content/drive.
# The encoding is given explicitly so decoding does not depend on the
# platform's default locale.
with open("/content/drive/MyDrive/Colab Notebooks/dataset.json", encoding="utf-8") as d:
  dfd_json = json.load(d)

In [7]:
# Flatten the nested dataset into three parallel lists (poem text, haiku text,
# haiku index list), keeping only haikus that pass both thresholds.
#
# Structure relied on below: `dfd_json` is an iterable of datasets; each dataset
# is an iterable of dicts; each dict has a "poem" string plus other keys whose
# values map haiku text -> index list.
# NOTE(review): the index lists look like word positions in the poem -- confirm.

MIN_THRESHOLD_HAIKU = 5    # minimum number of entries in a haiku's index list
MAX_THRESHOLD_POEM = 120   # maximum number of whitespace-separated words in a poem

poems = list()
haikus = list()
indices = list()

for dataset in dfd_json:
  for poem in dataset:
    poem_text = poem["poem"]
    # The word count depends only on the poem, so check it once per poem
    # instead of once per haiku.
    if len(poem_text.split()) > MAX_THRESHOLD_POEM:
      continue
    for grammar_index, haiku_data in poem.items():
      if grammar_index == "poem":
        continue  # the poem text itself, not a haiku mapping
      for haiku, index in haiku_data.items():
        if len(index) >= MIN_THRESHOLD_HAIKU:
          poems.append(poem_text)
          haikus.append(haiku)
          indices.append(index)

In [8]:
# The three parallel lists must have the same length.
tuple(len(column) for column in (poems, haikus, indices))

(54629, 54629, 54629)

In [9]:
# Assemble the parallel lists into one frame with a fresh 0..n-1 index.
df = pd.DataFrame(
  {
    "poem": poems,
    "haiku": haikus,
    "indices": indices,
  }
).reset_index(drop=True)

In [10]:
# Report the frame's dimensions, then preview its first rows.
n_rows, n_cols = df.shape
print((n_rows, n_cols))
df.head()

(54629, 3)


Unnamed: 0,poem,haiku,indices
0,Did the CIA tell the FBI that it knows the wor...,cia fbi the biggest weapon,"[2, 5, 9, 24, 25]"
1,Did the CIA tell the FBI that it knows the wor...,cia fbi the biggest weapon,"[2, 5, 9, 24, 25]"
2,"Dark clouds gathered overhead,\nExpelling bull...",clouds overhead bullets of the valley,"[1, 3, 5, 6, 10, 11]"
3,A vigilante lacking of heroic qualities that\n...,lacking qualities that damn criminals,"[2, 5, 6, 11, 12]"
4,"(A Diamante Poem)\nBrain\nHeavenly, hellish\nF...",diamante poem the sybaritic pathetic,"[1, 2, 10, 18, 19]"


# Defining Functions

In [11]:
def perplexityGPT2(sentences, model_id='gpt2'):
  """Return the GPT-2 perplexity of each sentence.

  Each sentence is tokenized, passed through the model with the tokens as
  their own labels, and scored as ``exp(loss)``, where ``loss`` is the first
  element of the model output.

  Args:
    sentences: Sized sequence of strings to score.
    model_id: Name or path handed to ``from_pretrained`` for both the model
      and the tokenizer. Defaults to ``'gpt2'``.

  Returns:
    A list of floats, one per input sentence and in the same order. Sentences
    that tokenize to fewer than two tokens get ``float('nan')``, because the
    language-modeling loss needs at least one next-token target.

  Note:
    Inputs are not truncated. NOTE(review): very long sentences may exceed
    the model's context window -- confirm inputs stay short.
  """
  total_length = len(sentences)
  if total_length == 0:
    # Nothing to score: skip the costly model/tokenizer load.
    return []

  model = GPT2LMHeadModel.from_pretrained(model_id)
  tokenizer = GPT2TokenizerFast.from_pretrained(model_id)
  model.eval()  # inference only: make sure dropout is disabled

  ppl = list()
  # No gradients are needed for scoring; disabling autograd avoids building a
  # graph for every sentence.
  with torch.no_grad():
    for index, sent in enumerate(sentences):
      if not (index+1)%1000:
        print(f"{index+1}/{total_length}")
      tokenize_input = tokenizer.encode(sent)
      if len(tokenize_input) < 2:
        # Empty input would otherwise crash the model; a single token yields
        # no prediction target. Report both as "undefined".
        ppl.append(float("nan"))
        continue
      tensor_input = torch.tensor([tokenize_input])
      loss = model(tensor_input, labels=tensor_input)[0]
      ppl.append(math.exp(loss.item()))

  return ppl

In [19]:
def checkGrammar(sentences, tools=None):
  """Flag each sentence as grammatical under at least one checker.

  Args:
    sentences: Sized sequence of strings to check.
    tools: Optional iterable of checker objects exposing ``check(text)`` and
      returning a (possibly empty) collection of matches. Defaults to the
      module-level ``tool_us`` and ``tool_uk`` checkers, in that order.

  Returns:
    A list of booleans, one per sentence and in the same order. ``True``
    means at least one checker returned no matches for that sentence.
  """
  if tools is None:
    # Resolved at call time so the checkers may be defined in another cell.
    tools = (tool_us, tool_uk)
  tools = tuple(tools)

  result = list()
  total_length = len(sentences)
  for index, sent in enumerate(sentences):
    if not (index+1)%1000:
      print(f"{index+1}/{total_length}")
    # any() short-circuits: once one checker accepts the sentence, the
    # remaining (expensive) checkers are not run.
    result.append(any(not tool.check(sent) for tool in tools))
  return result

# Calculating Perplexity

In [None]:
# Score every haiku with GPT-2 and attach the result as a new column.
haiku_texts = df["haiku"].tolist()
ppl_scores_gpt2 = perplexityGPT2(haiku_texts)
df["ppl-gpt2"] = ppl_scores_gpt2

# Checking Grammar

In [None]:
# Run the grammar checkers over every haiku and attach the boolean result as a new column.
haiku_texts = df["haiku"].tolist()
grammar_checks = checkGrammar(haiku_texts)
df["grammar-check"] = grammar_checks

1000/54629


# Downloading Dataset

In [None]:
# Write the annotated frame to JSON in the working directory, then hand the
# file to Colab's download helper.
export_name = "ppl-grammar-dataset.json"
df.to_json(export_name)
files.download(export_name)