In [1]:
!pip install -U git+https://github.com/PrithivirajDamodaran/Gramformer.git

Collecting git+https://github.com/PrithivirajDamodaran/Gramformer.git
  Cloning https://github.com/PrithivirajDamodaran/Gramformer.git to /tmp/pip-req-build-u_tlynj9
  Running command git clone --filter=blob:none --quiet https://github.com/PrithivirajDamodaran/Gramformer.git /tmp/pip-req-build-u_tlynj9
  Resolved https://github.com/PrithivirajDamodaran/Gramformer.git to commit 23425cd2e98a919384cab6156af8adf1c9d0639a
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting python-Levenshtein (from gramformer==1.0)
  Downloading python_Levenshtein-0.25.1-py3-none-any.whl (9.4 kB)
Collecting fuzzywuzzy (from gramformer==1.0)
  Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl (18 kB)
Collecting errant (from gramformer==1.0)
  Downloading errant-3.0.0-py3-none-any.whl (499 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m499.3/499.3 kB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
Collecting rapidfuzz>=3.4.0 (from errant->gramformer==1.0)
  Downloading rapidfuzz-3.

In [43]:
class Gramformer:

  def __init__(self, models=1, use_gpu=False):
    from transformers import AutoTokenizer
    from transformers import AutoModelForSeq2SeqLM
    #from lm_scorer.models.auto import AutoLMScorer as LMScorer
    import errant
    self.annotator = errant.load('en')

    if use_gpu:
        device= "cuda:0"
    else:
        device = "cpu"
    batch_size = 1
    #self.scorer = LMScorer.from_pretrained("gpt2", device=device, batch_size=batch_size)
    self.device    = device
    correction_model_tag = "deepak/grammar_error_correcter_v1"
    self.model_loaded = False

    if models == 1:
        self.correction_tokenizer = AutoTokenizer.from_pretrained(correction_model_tag, use_auth_token=False)
        self.correction_model     = AutoModelForSeq2SeqLM.from_pretrained(correction_model_tag, use_auth_token=False)
        self.correction_model     = self.correction_model.to(device)
        self.model_loaded = True
        print("[Gramformer] Grammar error correct/highlight model loaded..")
    elif models == 2:
        # TODO
        print("TO BE IMPLEMENTED!!!")

  def correct(self, input_sentence, max_candidates=1):
      if self.model_loaded:
        correction_prefix = "gec: "
        input_sentence = correction_prefix + input_sentence
        input_ids = self.correction_tokenizer.encode(input_sentence, return_tensors='pt')
        input_ids = input_ids.to(self.device)

        preds = self.correction_model.generate(
            input_ids,
            do_sample=True,
            max_length=128,
           #top_k=50,
           #top_p=0.95,
            num_beams=7,
            early_stopping=True,
            num_return_sequences=max_candidates)

        corrected = set()
        for pred in preds:
          corrected.add(self.correction_tokenizer.decode(pred, skip_special_tokens=True).strip())

        #corrected = list(corrected)
        #scores = self.scorer.sentence_score(corrected, log=True)
        #ranked_corrected = [(c,s) for c, s in zip(corrected, scores)]
        #ranked_corrected.sort(key = lambda x:x[1], reverse=True)
        return corrected
      else:
        print("Model is not loaded")
        return None

  def highlight(self, orig, cor):
      edits = self._get_edits(orig, cor)
      orig_tokens = orig.split()

      ignore_indexes = []

      for edit in edits:
          edit_type = edit[0]
          edit_str_start = edit[1]
          edit_spos = edit[2]
          edit_epos = edit[3]
          edit_str_end = edit[4]

          # if no_of_tokens(edit_str_start) > 1 ==> excluding the first token, mark all other tokens for deletion
          for i in range(edit_spos+1, edit_epos):
            ignore_indexes.append(i)

          if edit_str_start == "":
              if edit_spos - 1 >= 0:
                  new_edit_str = orig_tokens[edit_spos - 1]
                  edit_spos -= 1
              else:
                  new_edit_str = orig_tokens[edit_spos + 1]
                  edit_spos += 1
              if edit_type == "PUNCT":
                st = "<a type='" + edit_type + "' edit='" + \
                    edit_str_end + "'>" + new_edit_str + "</a>"
              else:
                st = "<a type='" + edit_type + "' edit='" + new_edit_str + \
                    " " + edit_str_end + "'>" + new_edit_str + "</a>"
              orig_tokens[edit_spos] = st
          elif edit_str_end == "":
            st = "<d type='" + edit_type + "' edit=''>" + edit_str_start + "</d>"
            orig_tokens[edit_spos] = st
          else:
            st = "<c type='" + edit_type + "' edit='" + \
                edit_str_end + "'>" + edit_str_start + "</c>"
            orig_tokens[edit_spos] = st

      for i in sorted(ignore_indexes, reverse=True):
        del(orig_tokens[i])

      return(" ".join(orig_tokens))

  def detect(self, input_sentence):
        # TO BE IMPLEMENTED
        pass

  def _get_edits(self, orig, cor):
        orig = self.annotator.parse(orig)
        cor = self.annotator.parse(cor)
        alignment = self.annotator.align(orig, cor)
        edits = self.annotator.merge(alignment)

        if len(edits) == 0:
            return []

        edit_annotations = []
        for e in edits:
            e = self.annotator.classify(e)
            edit_annotations.append((e.type[2:], e.o_str, e.o_start, e.o_end,  e.c_str, e.c_start, e.c_end))

        if len(edit_annotations) > 0:
            return edit_annotations
        else:
            return []

  def get_edits(self, orig, cor):
      return self._get_edits(orig, cor)

In [44]:
from gramformer import Gramformer
import torch

def set_seed(seed):
  torch.manual_seed(seed)
  if torch.cuda.is_available():
    torch.cuda.manual_seed_all(seed)

set_seed(1212)


gf = Gramformer(models = 1, use_gpu=False) # 1=corrector, 2=detector

#Applying and running on the some examples sentences
influent_sentences = [
    "Matt like fish",
    "the collection of letters was original used by the ancient Romans",
    "We enjoys horror movies",
    "Anna and Mike is going skiing",
    "I walk to the store and I bought milk",
    "We all eat the fish and then made dessert",
    "I will eat fish for dinner and drank milk",
    "what be the reason for everyone leave the company",
]

for influent_sentence in influent_sentences:
    corrected_sentences = gf.correct(influent_sentence, max_candidates=1)
    print("[Input] ", influent_sentence)
    for corrected_sentence in corrected_sentences:
      print("[Correction] ",corrected_sentence)
    print("-" *100)



[Gramformer] Grammar error correct/highlight model loaded..
[Input]  Matt like fish
[Correction]  Matt likes fish.
----------------------------------------------------------------------------------------------------
[Input]  the collection of letters was original used by the ancient Romans
[Correction]  the collection of letters was originally used by the ancient Romans
----------------------------------------------------------------------------------------------------
[Input]  We enjoys horror movies
[Correction]  We enjoy horror movies.
----------------------------------------------------------------------------------------------------
[Input]  Anna and Mike is going skiing
[Correction]  Anna and Mike are going skiing.
----------------------------------------------------------------------------------------------------
[Input]  I walk to the store and I bought milk
[Correction]  I walked to the store and I bought milk.
------------------------------------------------------------------

**Texting on the built customized dataset**

In [28]:
import pandas as pd

df = pd.read_csv("Responses_Dataset.tsv", delimiter='\t')

# Create sentence and label lists
sentences = df.Responses.values
right_sentences = df.Corrected_Responses.values #set of the correct sentences without errors

len(sentences)



34

In [31]:
#correcting the responses of ML interview questions
correct_sentences  =  []  #set of generated correct sentences by our model
for influent_sentence in sentences:
    corrected_sentences = gf.correct(influent_sentence, max_candidates=1)
    print("[Input] ", influent_sentence)
    for corrected_sentence in corrected_sentences:
      print("[Correction] ",corrected_sentence)
      correct_sentences.append(corrected_sentence)
    print("-" *100)

[Input]  One of the most common use is in market research and customer segmentation which is then utilized to target a particular market group to expand the businesses and profitable outcomes. 
[Correction]  One of the most common uses is in market research and customer segmentation which is then utilized to target a particular market group to expand the businesses and profitable outcomes.
----------------------------------------------------------------------------------------------------
[Input]  The clustering technique can be used in multiple domains of data science like image classification, customer segmentation, and recommendation engine.
[Correction]  The clustering technique can be used in multiple domains of data science like image classification, customer segmentation, and recommendation engine.
----------------------------------------------------------------------------------------------------
[Input]  The main principle behind this method is that if we will increase the num

In [32]:
len(correct_sentences)

34

In [11]:
#highlighting the corrections in the input sentence for suggestions
for influent_sentence in sentences:
    corrected_sentences = gf.correct(influent_sentence, max_candidates=1)
    print("[Input] ", influent_sentence)
    for corrected_sentence in corrected_sentences:
      print("[Edits] ", gf.highlight(influent_sentence, corrected_sentence))
    print("-" *100)

[Input]  One of the most common use is in market research and customer segmentation which is then utilized to target a particular market group to expand the businesses and profitable outcomes. 
[Edits]  One of the most common <c type='NOUN:NUM' edit='uses'>use</c> is in market research and customer segmentation which is then utilized to target a particular market group to expand the businesses and profitable outcomes.
----------------------------------------------------------------------------------------------------
[Input]  The clustering technique can be used in multiple domains of data science like image classification, customer segmentation, and recommendation engine.
[Edits]  The clustering technique can be used in multiple domains of data science like image classification, customer segmentation, and recommendation engine.
----------------------------------------------------------------------------------------------------
[Input]  The main principle behind this method is that if 

In [13]:
!pip install --upgrade pip

Collecting pip
  Downloading pip-24.0-py3-none-any.whl (2.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m18.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 23.1.2
    Uninstalling pip-23.1.2:
      Successfully uninstalled pip-23.1.2
Successfully installed pip-24.0


In [16]:
!pip install --requirements


Usage:   
  pip install [options] <requirement specifier> [package-index-options] ...
  pip install [options] -r <requirements file> [package-index-options] ...
  pip install [options] [-e] <vcs project url> ...
  pip install [options] [-e] <local project path> ...
  pip install [options] <archive url/path> ...

no such option: --requirements


**BLEU Score

In [20]:
#importing required librarires for calculating the bleu score

import nltk
from nltk.translate.bleu_score import sentence_bleu

#Applying and calculating on an example
reference = [['this', 'is', 'a', 'test'], ['this', 'is' 'test']]
candidate = ['this', 'is', 'a', 'test']
score = sentence_bleu(reference, candidate)
print(score)

1.0


In [22]:
# install pytorch_pretrained_bert the previous version of Pytorch-Transformers
!pip install pytorch-pretrained-bert

Collecting pytorch-pretrained-bert
  Downloading pytorch_pretrained_bert-0.6.2-py3-none-any.whl.metadata (86 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/86.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━[0m [32m81.9/86.7 kB[0m [31m2.1 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.7/86.7 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
Collecting boto3 (from pytorch-pretrained-bert)
  Downloading boto3-1.34.83-py3-none-any.whl.metadata (6.6 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=0.4.1->pytorch-pretrained-bert)
  Downloading nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=0.4.1->pytorch-pretrained-bert)
  Downloading nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu1

In [33]:
#Converting the sentences into tokens
from pytorch_pretrained_bert import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case = True)
tokenized_texts = [tokenizer.tokenize(sent) for sent in sentences]
tokenized_right_texts = [tokenizer.tokenize(sent) for sent in right_sentences]
tokenized_correct_texts =  [tokenizer.tokenize(sent) for sent in correct_sentences]
tokenized_texts[0]

['one',
 'of',
 'the',
 'most',
 'common',
 'use',
 'is',
 'in',
 'market',
 'research',
 'and',
 'customer',
 'segment',
 '##ation',
 'which',
 'is',
 'then',
 'utilized',
 'to',
 'target',
 'a',
 'particular',
 'market',
 'group',
 'to',
 'expand',
 'the',
 'businesses',
 'and',
 'profitable',
 'outcomes',
 '.']

In [36]:
len(tokenized_right_texts)

34

In [41]:
reference = []

In [42]:
#Making a set for BLEU Scores
BLEU_Score = []

for i in range(len(sentences)):
  reference.append(tokenized_right_texts[i])
  candidate = tokenized_correct_texts[i]
  score = sentence_bleu(reference, candidate)
  BLEU_Score.append(score)
  refernece = []

BLEU_Score

[0.8038026896544381,
 1.0,
 0.8747394842931362,
 0.7048050905062194,
 1.0,
 1.0,
 0.9038693388414086,
 1.0,
 1.0,
 0.860789505997226,
 1.0,
 0.9240738952215708,
 1.0,
 1.0,
 1.0,
 0.9217324939947308,
 1.0,
 1.0,
 0.9257518071011758,
 1.0,
 1.0,
 1.0,
 1.0,
 0.9202722665493039,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 0.9255607445912747,
 1.0,
 1.0,
 1.0]