"""
References:
https://github.com/sunilchomal/GECwBERT/blob/master/BERT_GED_Model.ipynb
https://colab.research.google.com/drive/1vaPCdmdYojxCCnZQs1r4CC57Lof_GGVj?usp=sharing#scrollTo=ZDoyjyA79C9h
https://analyticsindiamag.com/how-to-use-bert-transformer-for-grammar-checking/


"""

In [None]:
"""
Important:

This notebook requires version 2.11 of the `transformers` library to run.

"""

In [1]:
#Import required libraries
import pandas as pd
import numpy as np
from transformers import BertTokenizer

In [2]:
# Read the review dataset from the CSV file into a DataFrame.
data = pd.read_csv(filepath_or_buffer="review_data.csv")

In [None]:
#data['text']

In [5]:
#data cleaning
import re
import string
def text_cleaning(text):
    '''
    Normalize a piece of raw text.

    Steps, applied in this order:
      1. Convert the input to ``str`` and lowercase it.
      2. Remove text enclosed in square brackets (shortest match).
      3. Remove HTML-like tags of the form ``<...>``.
      4. Remove every character listed in ``string.punctuation``.
      5. Remove newline characters (no space is inserted in their place).
      6. Remove every word that contains a digit.

    Input: text that needs to be cleaned. Non-string values are converted
           with ``str()`` first, so for example ``None`` becomes ``'none'``.
    Output: cleaned text. Whitespace left behind by removed pieces is not
            collapsed.
    '''
    # Raw strings are used for the patterns so that regex escapes such as
    # \[ , \w and \d are not parsed as (invalid) Python string escapes.
    text = str(text).lower()
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\n', '', text)
    # Runs after newline removal, so words joined across a line break are
    # treated as a single word here.
    text = re.sub(r'\w*\d\w*', '', text)
    return text
# Clean every entry of the 'text' column; the function is passed directly
# instead of being wrapped in a lambda.
data['text'] = data['text'].apply(text_cleaning)

In [6]:
from transformers import BertForSequenceClassification, AdamW, BertConfig
import torch
# Load the pre-trained tokenizer and the classifier, then restore the
# fine-tuned weights from the local checkpoint file.
# A single checkpoint name is used for both tokenizer and model so the two
# cannot drift apart (the tokenizer was previously loaded from
# 'bert-large-uncased' while the model used 'bert-base-uncased').
# NOTE(review): the two uncased checkpoints are expected to tokenize the same
# way, so predictions should be unchanged - confirm on a sample.
BERT_CHECKPOINT = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT, do_lower_case = True)
model = BertForSequenceClassification.from_pretrained(
            BERT_CHECKPOINT, num_labels = 2)
# The model is only used for inference in this notebook: make evaluation mode
# explicit (disables dropout). Done before the final statement so the cell
# still displays the result of load_state_dict.
model.eval()
# map_location forces the saved weights onto the CPU.
model.load_state_dict(torch.load('bert-based-uncased-GED.pth',map_location=torch.device('cpu')))


Downloading:   0%|          | 0.00/440M [00:00<?, ?B/s]

<All keys matched successfully>

In [7]:

def prediction(sent):
    """
    Input: sentence

    This function encodes the input sentence with the module-level
    ``tokenizer``, runs it through the module-level ``model`` and predicts
    whether the sentence is grammatically correct or not. The class with the
    highest logit is used: index 1 is reported as correct, any other index as
    incorrect.

    Output: the string "Gramatically correct" or "Gramatically in-correct".
            The spelling of these labels is intentionally left unchanged,
            because they are the values stored in the results.
    """
    encoded_dict = tokenizer.encode_plus(
                        sent,                      # Sentence to encode.
                        add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                        truncation = True,         # Truncate over-long input.
                        padding = 'longest',
                        return_attention_mask = True,   # Construct attn. masks.
                        return_tensors = 'pt',     # Return pytorch tensors.
                   )

    # Token ids and the attention mask (which differentiates padding from
    # non-padding). .long() leaves a tensor that is already int64 untouched.
    input_id = encoded_dict['input_ids'].long()
    attention_mask = encoded_dict['attention_mask'].long()

    # Inference only: no gradients are needed, so skip autograd bookkeeping.
    # This function is applied to every row of the dataset, so the saved
    # memory and time add up.
    with torch.no_grad():
        outputs = model(input_id, token_type_ids=None, attention_mask=attention_mask)

    # The first element of the model output holds the classification logits.
    logits = outputs[0]
    index = logits.argmax().item()
    if index == 1:
        return ("Gramatically correct")
    else:
        return ("Gramatically in-correct")

In [8]:
# Classify every entry of the 'text' column and store the resulting label
# in a new 'prediction' column.
data['prediction'] = data['text'].apply(prediction)



In [9]:
# Show each text next to its predicted label.
data.loc[:, ['text', 'prediction']]

Unnamed: 0,text,prediction
0,anathi khanyile,Gramatically correct
1,tony bahut funny hai hill climbing racing my f...,Gramatically in-correct
2,teturwu,Gramatically correct
3,hoooooooooooyaaaaaaaaa what a game hoooooooooo...,Gramatically in-correct
4,this game is nice,Gramatically correct
...,...,...
29995,plz my vidos viral,Gramatically in-correct
29996,nice,Gramatically correct
29997,disturbing too much as youtube ad,Gramatically in-correct
29998,koub valo vai,Gramatically correct
