In [None]:
!pip install transformers
!pip install sentencepiece



In [None]:
#Importing standard-library helpers
from collections import Counter

#Importing ML Modules
import torch

#Importing the QA BERT Class and BERT Tokenizer
from transformers import AutoModelForQuestionAnswering, AutoTokenizer

In [None]:
#Checkpoint identifier shared by the tokenizer and the model so the two stay in sync
model_name = "bert-large-uncased-whole-word-masking-finetuned-squad"

#Tokenizer that turns the question/context text into the model's input ids
tokenizer = AutoTokenizer.from_pretrained(model_name)

#Question-answering model initialised from the checkpoint's pre-trained weights
model = AutoModelForQuestionAnswering.from_pretrained(model_name)

In [None]:
#Evaluation data: every entry pairs a question with its reference answer.
#The commented-out entries are inactive alternatives that are not evaluated.
question_answer_set = [
    #{'question': 'What shall not be made as a basis of conciliation?', 'answer': 'monetary settlement'},
    #{'question': 'Who shall the settlement be sent to to take action as specified in the recommendation?', 'answer': 'the employer or the district officer'},
    #{'question': 'Who shall provide copies of the settlement as recorded under sub-section (2) to the aggrieved woman and the respondent?', 'answer': 'The Internal Committee or the Local Committee'},
    #{'question': 'When a settlement is reached under sub-section (/), what shall be conducted by the Internal Committee or the Local Committee?', 'answer': 'no further inquiry'}
    {
        'question': 'To what sectors Sexual Harassment at Workplace (Prevention, Prohibition and Redressal) Act, 2013 applies?',
        'answer': 'Organized (govt/private) and unorganised',
    },
]

#Two parallel lists (same order as the set above): the questions and their reference answers
questions = [entry['question'] for entry in question_answer_set]
answers = [entry['answer'] for entry in question_answer_set]
    
#Inactive alternative passage (the conciliation provisions). The leading "context = "
#is part of the string itself, so this statement binds `t`, not `context`, and `t` is
#not read by any of the visible cells.
#The trailing backslashes are line continuations inside the literal, so the numbered
#clauses are joined onto a single line.
#NOTE(review): presumably this passage belongs with the commented-out questions in
#question_answer_set - confirm before re-enabling either.
t = """context = 
(1) The Internal Committee or, as the case may be, the Local Committee, may, \
before initiating an inquiry under section 11 and at the request of the aggrieved woman take steps to settle \
the matter between her and the respondent through conciliation: \
Provided that no monetary settlement shall be made as a basis of conciliation. \
(2) Where settlement has been arrived at under sub-section (1), the Internal Committee or the Local \
Committee, as the case may be, shall record the settlement so arrived and forward the same to the \
employer or the District Officer to take action as specified in the recommendation. \
(3) The Internal Committee or the Local Committee, as the case may be, shall provide the copies of \
the settlement as recorded under sub-section (2) to the aggrieved woman and the respondent. \
(4) Where a settlement is arrived at under sub-section (1), no further inquiry shall be conducted by the \
Internal Committee or the Local Committee, as the case may be.
"""
#The context that is actually used: this text is handed to the tokenizer together with
#each question. It holds clause (o) "workplace" and clause (p) "unorganised sector".
#The literal keeps its line breaks and starts and ends with a newline.
context = """
(o) “workplace” includes—
(i) any department, organisation, undertaking, establishment, enterprise, institution, office,
branch or unit which is established, owned, controlled or wholly or substantially financed by
funds provided directly or indirectly by the appropriate Government or the local authority or a
Government company or a corporation or a co-operative society;
(ii) any private sector organisation or a private venture, undertaking, enterprise, institution,
establishment, society, trust, non-governmental organisation, unit or service provider carrying on
commercial, professional, vocational, educational, entertainmental, industrial, health services or
financial activities including production, supply, sale, distribution or service;
(iii) hospitals or nursing homes;
(iv) any sports institute, stadium, sports complex or competition or games venue, whether
residential or not used for training, sports or other activities relating thereto;
(v) any place visited by the employee arising out of or during the course of employment
including transportation by the employer for undertaking such journey;
(vi) a dwelling place or a house;
(p) “unorganised sector” in relation to a workplace means an enterprise owned by individuals or
self-employed workers and engaged in the production or sale of goods or providing service of any
kind whatsoever, and where the enterprise employs workers, the number of such workers is less than
ten
"""

In [None]:
def _join_wordpieces(pieces):
  """Rebuild readable text from a list of WordPiece tokens.

  A piece that starts with '##' continues the previous word and is appended
  without the marker; every other piece is appended after a single space.
  The first piece is always kept unchanged. An empty list gives ''.
  """
  if not pieces:
    return ''
  text = pieces[0]
  for piece in pieces[1:]:
    if piece[0:2] == '##':
      text += piece[2:]
    else:
      text += ' ' + piece
  return text


def _count_shared_words(pred_words, org_words):
  """Count the words two word lists have in common.

  Words are matched regardless of their position and with multiplicity: a
  word occurring twice in both lists counts twice. (A position-by-position
  comparison would report no overlap as soon as the two answers start at
  different words.)
  """
  return sum((Counter(pred_words) & Counter(org_words)).values())


#Running totals for the evaluation, accumulated over all questions
len_pred_words = 0      #number of words in the predicted answers
len_shared_words = 0    #number of words shared by prediction and reference
len_org_words = 0       #number of words in the reference answers
accurate = 0            #number of predictions equal to the reference (case-insensitive)

#Inputs longer than the model's position-embedding table cannot be processed,
#so the context (never the question) is truncated to this many tokens
max_input_tokens = model.config.max_position_embeddings

for question, org_answer in zip(questions, answers):
  #Tokenize the question and the context into one sequence with the special tokens added.
  #The tokenizer also returns the segment ids (token_type_ids): 0 for the question
  #segment and 1 for the context segment.
  encoded = tokenizer(question, context,
                      truncation='only_second',
                      max_length=max_input_tokens,
                      return_tensors='pt')

  #Keeping the ids and their tokens as plain lists to rebuild the answer text at the end
  input_ids = encoded['input_ids'][0].tolist()
  tokens = tokenizer.convert_ids_to_tokens(input_ids)

  #Running the model; no gradients are needed for inference
  with torch.no_grad():
    scores = model(**encoded)

  #The most likely start / end token is the argmax of the raw start / end logits
  start_index = int(torch.argmax(scores.start_logits))
  end_index = int(torch.argmax(scores.end_logits))

  #If the predicted end lies before the start, the answer is just the start token
  end_index = max(start_index, end_index)

  #Merging the subword pieces back into whole words to get a readable answer
  answer = _join_wordpieces(tokens[start_index:end_index + 1])
  print('Answer: "' + answer + '"')

  #Updating the evaluation totals with this prediction
  pred_words = answer.lower().split()
  org_words = org_answer.lower().split()
  len_pred_words += len(pred_words)
  len_org_words += len(org_words)
  len_shared_words += _count_shared_words(pred_words, org_words)
  if answer.lower() == org_answer.lower():
    accurate += 1

#Finding the precision, recall and F1 score over all questions.
#Each ratio falls back to 0.0 when its denominator is zero instead of raising.
precision = len_shared_words / len_pred_words if len_pred_words else 0.0
recall = len_shared_words / len_org_words if len_org_words else 0.0
F1 = (2*precision*recall) / (recall + precision) if (recall + precision) else 0.0

#Printing the F1 score and the simple accuracy (share of exact matches)
print("F1 : {}".format(F1*100))
print("Simple Accuracy : {}".format(accurate*100/len(questions) if questions else 0.0))

Answer: "commercial , professional , vocational , educational , entertainmental , industrial , health services or financial activities including production , supply , sale , distribution or service ; ( iii ) hospitals or nursing homes ; ( iv ) any sports institute , stadium , sports complex or competition or games venue , whether residential or not used for training , sports or other activities relating thereto ; ( v ) any place visited by the employee arising out of or during the course of employment including transportation by the employer for undertaking such journey ; ( vi ) a dwelling place or a house ; ( p ) “ unorganised"


In [None]:
def show_tokens(tokens, input_ids, sep_token_id):
    """Print the token count, then every token next to its id.

    A dashed rule is printed directly before and directly after each
    separator token so that the segment boundaries stand out.

    Defined but not called here, because the listing has one line per input
    token. To inspect the most recent input, run
    show_tokens(tokens, input_ids, tokenizer.sep_token_id).

    Args:
        tokens: token strings in input order.
        input_ids: token ids aligned one-to-one with `tokens`.
        sep_token_id: id of the separator token.
    """
    rule = '-------------------'
    print('The input has a total of {} tokens.'.format(len(input_ids)))

    for token, token_id in zip(tokens, input_ids):
        is_separator = token_id == sep_token_id

        #Add some mark around the separator token, to make it stand out.
        if is_separator:
            print(rule)

        #Print the token and the corresponding ID
        print('{:<12} {}'.format(token, token_id))

        if is_separator:
            print(rule)


"\n#Printing the result of the tokenizer in the form of tokens and token_ids and number of tokens created\nprint('The input has a total of {} tokens.'.format(len(input_ids)))\n\nfor token, id in zip(tokens, input_ids):\n    #Add some mark around the [SEP] token, to make it stand out.\n    if id == tokenizer.sep_token_id:\n        print('-------------------')\n\n    #Print the token and the corresponding ID\n    print('{:<12} {}'.format(token, id))\n\n    #Add some mark around the [SEP] token, to make it stand out.\n    if id == tokenizer.sep_token_id:\n        print('-------------------')\n"