In [None]:
!pip install language_tool_python
!pip install bs4
!pip install torch
!pip install spacy
!pip install tqdm
!pip install transformers
!pip install nltk

In [None]:
import pandas as pd

# Load the labelled sentence data; later cells read `df`.
df = pd.read_csv('../input/conlllabeledsentencedata/lmScoreCorrected.csv')

# Count the rows whose 'Label' value is truthy.
ones = sum(1 for index in df.index if df['Label'][index])
print('Number of Ones: ' + str(ones))

In [None]:
import encodings
from itertools import count
import language_tool_python
from bs4 import BeautifulSoup
from nltk.translate.bleu_score import sentence_bleu
import re
from transformers import GPT2LMHeadModel, GPT2TokenizerFast
import torch
from tqdm import tqdm
import spacy

# Load the GPT-2 language model and its tokenizer.
# Use the GPU when one is visible and otherwise fall back to the CPU, so the
# notebook still runs (slowly) on a machine without CUDA instead of failing
# in `.to(device)`.
device = "cuda" if torch.cuda.is_available() else "cpu"
model_id = "gpt2-large"
model = GPT2LMHeadModel.from_pretrained(model_id).to(device)
tokenizer = GPT2TokenizerFast.from_pretrained(model_id)

# Report the resident memory of this process after the model has been loaded.
import os, psutil
process = psutil.Process(os.getpid())
print(process.memory_info().rss)  # in bytes

# Load the spaCy pipeline used by the proper-noun helpers.
nlp = spacy.load("en_core_web_sm")


# Grammar checker used by `bleu`.
tool = language_tool_python.LanguageTool('en-US')
# instead of LanguageToolPublicAPI we can also use LanguageTool to run locally
def is_bad_rule(rule):
    """
    Tell whether a grammar-checker match should be left out when correcting text.
    A match is left out when its message is exactly 'Possible mistake found.'
    and its first suggested replacement starts with an upper-case character.
    :params: object exposing `message` (string) and `replacements` (sequence of strings)
    :return: bool
    """
    if rule.message != 'Possible mistake found.' or not rule.replacements:
        return False
    first_replacement = rule.replacements[0]
    # An empty replacement has no first character to inspect, so it is never "bad".
    return bool(first_replacement) and first_replacement[0].isupper()

def get_perplexity(sentenceList):
    """
    Compute a perplexity-style score for a list of sentences.
    The sentences are joined with blank lines and tokenized once. A window that
    ends just after each token position (starting at position 1) is fed to the
    model, the returned loss is collected, and the result is
    exp(sum of collected losses / end of the last window).
    :params: python List of strings
    :return: 0-dim torch tensor (tensor(1.0) when no position was scored)
    """
    token_ids = tokenizer("\n\n".join(sentenceList), return_tensors="pt").input_ids
    total_tokens = token_ids.size(1)
    context_limit = model.config.n_positions
    step = 1

    window_losses = []
    for position in tqdm(range(1, total_tokens, step)):
        window_start = max(position + step - context_limit, 0)
        window_end = min(position + step, total_tokens)
        n_targets = window_end - position  # may be different from step on last loop
        window = token_ids[:, window_start:window_end].to(device)
        labels = window.clone()
        # Only the last n_targets labels are kept; the rest are set to -100
        # (presumably the value the model's loss ignores -- confirm in the model docs).
        labels[:, :-n_targets] = -100
        with torch.no_grad():
            window_loss = model(window, labels=labels)[0] * n_targets
        window_losses.append(window_loss)

    if not window_losses:
        return torch.tensor(1.0)
    return torch.exp(torch.stack(window_losses).sum() / window_end)

def get_min_contextual_prob(sentenceList):
    """
    Score the worst-predicted position of a list of sentences.
    The sentences are joined with blank lines and tokenized once. A window that
    ends just after each token position (starting at position 1) is fed to the
    model, and the function returns exp(largest collected loss).
    NOTE(review): despite the name, the value returned is exp(max loss) rather
    than a probability -- confirm this is what callers expect.
    :params: python List of strings
    :return: 0-dim torch tensor (tensor(1.0) when no position was scored)
    """
    token_ids = tokenizer("\n\n".join(sentenceList), return_tensors="pt").input_ids
    total_tokens = token_ids.size(1)
    context_limit = model.config.n_positions
    step = 1

    window_losses = []
    for position in tqdm(range(1, total_tokens, step)):
        window_start = max(position + step - context_limit, 0)
        window_end = min(position + step, total_tokens)
        n_targets = window_end - position  # may be different from step on last loop
        window = token_ids[:, window_start:window_end].to(device)
        labels = window.clone()
        # Keep only the last n_targets labels; everything before them becomes -100.
        labels[:, :-n_targets] = -100
        with torch.no_grad():
            window_loss = model(window, labels=labels)[0] * n_targets
        window_losses.append(window_loss)

    if not window_losses:
        return torch.tensor(1.0)
    return torch.exp(torch.stack(window_losses).max())

def get_contextual_probabilites_vector(sentence):
    """
    Collect one model loss per token position of a sentence.
    A window that ends just after each token position (starting at position 1)
    is fed to the model and the returned loss is stored as a python float.
    The stored values are losses (negative log-likelihoods), not probabilities.
    :params: A string
    :return: python list of floats ([0.0] when no position was scored)
    """
    token_ids = tokenizer(sentence, return_tensors="pt").input_ids
    total_tokens = token_ids.size(1)
    context_limit = model.config.n_positions
    step = 1

    position_losses = []
    for position in tqdm(range(1, total_tokens, step)):
        window_start = max(position + step - context_limit, 0)
        window_end = min(position + step, total_tokens)
        n_targets = window_end - position
        window = token_ids[:, window_start:window_end].to(device)
        labels = window.clone()
        # Keep only the last n_targets labels; everything before them becomes -100.
        labels[:, :-n_targets] = -100
        with torch.no_grad():
            window_loss = model(window, labels=labels)[0] * n_targets
        position_losses.append(window_loss.item())

    return position_losses if position_losses else [0.0]

def get_potential_positions_for_errors(probabilityVector, scoreLimit):
    """
    Find the positions whose score is strictly greater than the given limit.
    :params: python List of floats, and the score limit to compare against
    :return: python List of integers (indices into probabilityVector, ascending)
    """
    return [
        position
        for position, score in enumerate(probabilityVector)
        if score > scoreLimit
    ]


def remove_html_tags(text):
    """
    Parse the text with BeautifulSoup's "html.parser" and return only its text content.
    """
    parsed = BeautifulSoup(text, "html.parser")
    return parsed.get_text()

def bleu(data):
    """
    Score a string against its grammar-corrected version.
    The string is checked with the grammar tool, matches flagged by is_bad_rule
    are dropped, the remaining matches are applied to get a corrected string
    (which is printed), and the BLEU score of the original against the
    corrected string is computed on space-separated tokens.
    :param data: string to score
    :return: BLEU score of the given string
    """
    kept_matches = []
    for match in tool.check(data):
        if is_bad_rule(match):
            continue
        kept_matches.append(match)

    correct_string = language_tool_python.utils.correct(data, kept_matches)
    print(correct_string)

    reference_tokens = correct_string.split(" ")
    candidate_tokens = data.split(" ")
    return sentence_bleu([reference_tokens], candidate_tokens, weights=(0.34, 0.33, 0.33, 0))

def extract_sentence(last_response):
    """
    This function is used to extract text and comment entries from a nested response.
    Every step in last_response["steps"] is scanned: for a dict-valued entry its
    "text"/"comment" sub-entries are collected, otherwise the entry itself is
    collected when its key is "text" or "comment".
    :param last_response: dict with an optional "steps" list of dicts
    :return: list of single-entry dicts ({"text": ...} or {"comment": ...}) in
             encounter order; empty when "steps" is missing or empty
    """
    wanted_keys = ("text", "comment")
    final_sentence = []
    # `.get` yields None when "steps" is absent; treat that as "no steps"
    # instead of failing while trying to iterate over None.
    for response_step in last_response.get("steps") or []:
        for key, value in response_step.items():
            if isinstance(value, dict):
                final_sentence.extend(
                    {sub_key: sub_value}
                    for sub_key, sub_value in value.items()
                    if sub_key in wanted_keys
                )
            elif key in wanted_keys:
                final_sentence.append({key: value})

    return final_sentence

def mean_bleu_score(last_response):
    """
    This function is used to extract text and comment entries from a nested
    response and calculate their bleu scores.
    Each text is cleaned with remove_html_tags and scored with bleu exactly once.
    :param last_response: dict with an optional "steps" list of dicts
    :return: (list of scores, average score); the average is 0.0 when nothing
             was scored
    """
    wanted_keys = ("text", "comment")
    score_list = []
    # A missing "steps" entry is treated as an empty list of steps.
    for response_step in last_response.get("steps") or []:
        for key, value in response_step.items():
            if isinstance(value, dict):
                for sub_key, sub_value in value.items():
                    if sub_key in wanted_keys:
                        score_list.append(bleu(remove_html_tags(sub_value)))
            elif key in wanted_keys:
                score_list.append(bleu(remove_html_tags(value)))

    # Guard the division: with no scored sentence there is nothing to average.
    avg_bleu_score = sum(score_list) / len(score_list) if score_list else 0.0
    print("avg_bleu_score = ", avg_bleu_score)
    return score_list, avg_bleu_score

def flattenDict(nested_dict,flattenedStaticList,flattenedDynamicList):
    """
    Walk a nested structure of dicts and lists and collect its templates.
    For every dict that has a 'jinja_template' key, the pair
    [template, rendered_value] (html tags and backticks removed from both) is
    appended to flattenedDynamicList when isDynamicString says the template is
    dynamic, otherwise to flattenedStaticList. Every dict value except the one
    under 'jinja_template', and every list item, is then visited recursively.
    Both lists are modified in place; nothing is returned.
    """
    if isinstance(nested_dict, dict):
        if 'jinja_template' in nested_dict:
            template = nested_dict['jinja_template']
            if isDynamicString(template):
                destination = flattenedDynamicList
            else:
                destination = flattenedStaticList
            cleaned_template = remove_html_tags(template).replace('`','')
            cleaned_value = remove_html_tags(nested_dict['rendered_value']).replace('`','')
            destination.append([cleaned_template, cleaned_value])
        for key, child in nested_dict.items():
            if key == 'jinja_template':
                continue
            flattenDict(child, flattenedStaticList, flattenedDynamicList)

    if isinstance(nested_dict, list):
        for element in nested_dict:
            flattenDict(element, flattenedStaticList, flattenedDynamicList)

    return

def isDynamicString(text):
    """
    This function is used to check if the string is dynamic or static.
    It counts every '{{' in the text and every match of "newline, then anything
    on that line up to a '{{'". The string is reported as dynamic (True) when
    the two counts differ, and as static (False) when they are equal.
    """
    total_markers = len(re.findall('{{',text))
    markers_after_newline = len(re.findall('\n.*{{',text))
    return total_markers != markers_after_newline


def markPositionsForProperNouns(sentence):
    """
    This function returns the positions of the tokens which are proper nouns.
    The sentence is parsed with the spaCy pipeline `nlp`; a token counts as a
    proper noun when its fine-grained tag is 'NNP'.
    :param sentence: string
    :return: list of token positions (indices into the parsed sentence, ascending)
    """
    doc = nlp(sentence)
    # enumerate ties each position to its own token, so the position advances
    # with every token rather than staying at 0.
    return [position for position, token in enumerate(doc) if token.tag_ == 'NNP']
    
def normalizeSentence(sentence):
    """
    This function is used to normalize the sentence: every proper noun is
    replaced by the placeholder 'Alice' and all other text is kept as written.
    The sentence is parsed with the spaCy pipeline `nlp`; a token counts as a
    proper noun when its fine-grained tag is 'NNP' (the same criterion as
    markPositionsForProperNouns).
    :param sentence: string
    :return: the normalized string
    """
    # Work on tokens, not characters, and tag them in the same pass so each
    # replacement decision applies to exactly the token it was made for.
    doc = nlp(sentence)
    pieces = []
    for token in doc:
        replacement = 'Alice' if token.tag_ == 'NNP' else token.text
        # whitespace_ is the whitespace that followed the token in the input,
        # so the original spacing is preserved.
        pieces.append(replacement + token.whitespace_)
    return ''.join(pieces)

In [None]:
# Row accumulator for the per-sentence features, and the matching column names.
data = list()
headers = [
    'Sentence',
    'Label',
    'sum',
    'min',
    'max/min',
    'max/med',
    'med/min',
    'med',
]

In [None]:
# Build one feature row per sentence from its per-position language-model losses.
# `data` is rebuilt from scratch here so that re-running this cell cannot
# append a second copy of every row.
data = []
for sentence, label in zip(df['Sentence'], df['Label']):
    # Losses sorted from largest to smallest.
    losses_desc = sorted(get_contextual_probabilites_vector(sentence), reverse=True)
    total_loss = sum(losses_desc)
    largest_loss = losses_desc[0]
    smallest_loss = losses_desc[-1]
    middle_loss = losses_desc[len(losses_desc) // 2]
    # NOTE(review): the column names read like probabilities while the values
    # are losses, so the largest loss lands in the 'min' column -- presumably
    # because the largest loss belongs to the least likely token; confirm.
    data.append([
        sentence,                       # 'Sentence'
        label,                          # 'Label'
        total_loss,                     # 'sum'
        largest_loss,                   # 'min'
        largest_loss - smallest_loss,   # 'max/min'
        largest_loss - middle_loss,     # 'max/med'
        middle_loss - smallest_loss,    # 'med/min'
        middle_loss,                    # 'med'
    ])

In [None]:
# Turn the collected rows into a DataFrame and write it to disk as CSV.
newDf = pd.DataFrame(data=data, columns=headers)
newDf.to_csv('additionalLmScores.csv')