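Reference for the approach used in this notebook: [Comparing BERT and GPT-2 as Language Models to Score the Grammatical Correctness of a Sentence](https://www.scribendi.ai/comparing-bert-and-gpt-2-as-language-models-to-score-the-grammatical-correctness-of-a-sentence/)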

# Installing Java-8

In [None]:
import os

In [None]:
# Purge every package matching the openjdk-*/icedtea-*/icedtea6-* globs, then
# install the headless OpenJDK 8 JDK quietly (-qq, stdout sent to /dev/null).
# NOTE(review): purge is invoked without -y, unlike the install below —
# confirm it does not stop at a confirmation prompt.
!apt-get purge openjdk-\* icedtea-\* icedtea6-\*
!apt-get install -y openjdk-8-jdk-headless -qq > /dev/null
# Point JAVA_HOME at the Java 8 directory (presumably needed by the
# LanguageTool backend used via language_check — TODO confirm).
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
# Print the Java version that is now active as a sanity check.
!java -version

Reading package lists... Done
Building dependency tree       
Reading state information... Done
Note, selecting 'openjdk-9-jre-headless' for glob 'openjdk-*'
Note, selecting 'openjdk-8-jdk' for glob 'openjdk-*'
Note, selecting 'openjdk-8-jre' for glob 'openjdk-*'
Note, selecting 'openjdk-6-jre' for glob 'openjdk-*'
Note, selecting 'openjdk-6-jre-headless' for glob 'openjdk-*'
Note, selecting 'openjdk-11-demo' for glob 'openjdk-*'
Note, selecting 'openjdk-8-demo' for glob 'openjdk-*'
Note, selecting 'openjdk-11-source' for glob 'openjdk-*'
Note, selecting 'openjdk-8-jre-dcevm' for glob 'openjdk-*'
Note, selecting 'openjdk-11-jre-headless' for glob 'openjdk-*'
Note, selecting 'openjdk-11-dbg' for glob 'openjdk-*'
Note, selecting 'openjdk-11-doc' for glob 'openjdk-*'
Note, selecting 'openjdk-8-jdk-headless' for glob 'openjdk-*'
Note, selecting 'openjdk-7-jre-headless' for glob 'openjdk-*'
Note, selecting 'openjdk-8-jre-zero' for glob 'openjdk-*'
Note, selecting 'openjdk-8-source' for glob

# Importing Libraries

In [None]:
# Install the third-party packages imported by the next cell.
# NOTE(review): versions are unpinned; the recorded output below shows that
# transformers 4.5.1 was resolved when this cell was last run.
!pip install transformers
!pip install language_check

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/d8/b2/57495b5309f09fa501866e225c84532d1fd89536ea62406b2181933fb418/transformers-4.5.1-py3-none-any.whl (2.1MB)
[K     |████████████████████████████████| 2.1MB 5.7MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/75/ee/67241dc87f266093c533a2d4d3d69438e57d7a90abb216fa076e7d475d4a/sacremoses-0.0.45-py3-none-any.whl (895kB)
[K     |████████████████████████████████| 901kB 35.7MB/s 
[?25hCollecting tokenizers<0.11,>=0.10.1
[?25l  Downloading https://files.pythonhosted.org/packages/ae/04/5b870f26a858552025a62f1649c20d29d2672c02ff3c3fb4c688ca46467a/tokenizers-0.10.2-cp37-cp37m-manylinux2010_x86_64.whl (3.3MB)
[K     |████████████████████████████████| 3.3MB 34.7MB/s 
Installing collected packages: sacremoses, tokenizers, transformers
Successfully installed sacremoses-0.0.45 tokenizers-0.10.2 transformers-4.5.1
Collecting language_check
  Downloading https://files.python

In [None]:
import json
import torch
import math
import numpy as np
import pandas as pd
import language_check
import matplotlib.pyplot as plt
from google.colab import files
from transformers import GPT2LMHeadModel, GPT2TokenizerFast

In [None]:
# Module-level LanguageTool checkers for the 'en-US' and 'en-GB' variants.
# checkGrammar() reads both of these as globals.
tool_us = language_check.LanguageTool('en-US')
tool_uk = language_check.LanguageTool('en-GB')

# Reading Dataset

In [12]:
# Load the tab-separated results file into `df`.
# NOTE(review): the path assumes Google Drive is already mounted under
# /content/drive — no mount call is visible in this notebook.
df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/haiku/results.tsv", sep='\t')

In [13]:
# Print the (rows, columns) shape, then display the first rows as the cell output.
print(df.shape)
df.head()

(37797, 4)


Unnamed: 0,model,type,poem,haiku
0,bert-clean,stf-bert-test-clean,"another thought comes to mind, before it can b...","thought comes thoughts, sleepless"
1,bert-clean,stf-bert-test-clean,homework is such a bore others never do it m y...,homework others a ton
2,bert-clean,stf-bert-test-clean,"i had a gig-horse, and i called him pleasure b...",sundays a little jaunt
3,bert-clean,stf-bert-test-clean,"the allegory - 101 march 7, 2019 the forest is...","allegory the small full ferocious animals,"
4,bert-clean,stf-bert-test-clean,"the allegory - 161 march 18, 2019 the state is...",allegory the state full ferocious animals all ...


# Defining Functions

In [14]:
def perplexityGPT2(sentences):
  """Compute the GPT-2 perplexity of each sentence.

  Each sentence is tokenized and passed to the pretrained 'gpt2' model as both
  input and labels; the perplexity is exp() of the loss the model returns.
  A progress line "<done>/<total>" is printed after every 1000 sentences.

  Args:
    sentences: Sized sequence of strings to score.

  Returns:
    A list of floats, one per sentence, in input order. A sentence that
    encodes to fewer than two tokens cannot be scored (there is no next-token
    prediction to evaluate) and is reported as float('nan').
  """
  total_length = len(sentences)
  if total_length == 0:
    # Nothing to score: skip loading the pretrained weights altogether.
    return []

  model_id = 'gpt2'
  model = GPT2LMHeadModel.from_pretrained(model_id)
  tokenizer = GPT2TokenizerFast.from_pretrained(model_id)
  model.eval()  # inference only; make sure dropout is disabled

  ppl = []
  # No gradients are needed for scoring; without no_grad() every forward pass
  # would build an autograd graph, wasting memory and time.
  with torch.no_grad():
    for index, sent in enumerate(sentences, start=1):
      if index % 1000 == 0:
        print(f"{index}/{total_length}")
      token_ids = tokenizer.encode(sent)
      if len(token_ids) < 2:
        # Empty or single-token input has no defined language-model loss.
        ppl.append(float('nan'))
        continue
      tensor_input = torch.tensor([token_ids])
      loss = model(tensor_input, labels=tensor_input)[0]
      # .item() hands math.exp a plain Python float instead of a tensor.
      ppl.append(math.exp(loss.item()))

  return ppl

In [15]:
def checkGrammar(sentences):
  """Flag each sentence that passes at least one of the two grammar checkers.

  A sentence is marked True when the module-level `tool_us` checker or the
  module-level `tool_uk` checker returns no matches for it, and False when
  both return at least one match. A progress line "<done>/<total>" is printed
  after every 1000 sentences.

  Args:
    sentences: Sized sequence of strings to check.

  Returns:
    A list of booleans, one per sentence, in input order.
  """
  result = []
  total_length = len(sentences)
  for index, sent in enumerate(sentences, start=1):
    if index % 1000 == 0:
      print(f"{index}/{total_length}")
    # `or` short-circuits: the en-GB checker is only consulted when the en-US
    # checker reported matches, which avoids a redundant call for every
    # sentence that is already clean. The resulting flag is unchanged.
    passes = (not tool_us.check(sent)) or (not tool_uk.check(sent))
    result.append(passes)
  return result

# Calculating Perplexity

In [None]:
# Score every entry of the "haiku" column with perplexityGPT2 and store the
# scores in a new "ppl-gpt2" column.
ppl_scores_gpt2 = perplexityGPT2(list(df["haiku"].values))
df["ppl-gpt2"] = ppl_scores_gpt2

# Checking Grammar

In [None]:
# Run checkGrammar over every entry of the "haiku" column and store the
# boolean flags in a new "grammar-check" column.
grammar_checks = checkGrammar(list(df["haiku"].values))
df["grammar-check"] = grammar_checks

1000/54629
2000/54629
3000/54629
4000/54629
5000/54629
6000/54629
7000/54629
8000/54629


KeyboardInterrupt: ignored

# Saving and Downloading Results

In [None]:
# Write the frame to the current working directory, once as JSON and once as
# delimited text without the index column.
# NOTE(review): the second file is tab-separated despite its .csv extension.
df.to_json("ppl-grammar-output.json")
df.to_csv("ppl-grammar-output.csv", index=False, sep='\t')

In [None]:
# Write the same two output files under the Drive "Colab Notebooks" folder.
# NOTE(review): assumes Google Drive is mounted under /content/drive — no
# mount call is visible in this notebook. The .csv file is tab-separated.
df.to_json("/content/drive/MyDrive/Colab Notebooks/ppl-grammar-output.json")
df.to_csv("/content/drive/MyDrive/Colab Notebooks/ppl-grammar-output.csv", index=False, sep='\t')

In [None]:
# Pass the two files written to the working directory to google.colab's
# files.download so they can be fetched from the Colab session.
files.download("ppl-grammar-output.json")
files.download("ppl-grammar-output.csv")