In [1]:
!pip install transformers



In [130]:
#download the embedding model
import torch
import torch.nn.functional as F
from transformers import DistilBertModel, DistilBertTokenizer


# Example use: embed a single sentence with DistilBERT.
# Load the pretrained tokenizer and model for the 'distilbert-base-uncased' checkpoint.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertModel.from_pretrained('distilbert-base-uncased')

# Sample text
text = "Your example text goes here"

# Tokenize the input text; return_tensors="pt" asks for PyTorch tensors
inputs = tokenizer(text, return_tensors="pt")

# Run the model with gradient tracking disabled (inference only)
with torch.no_grad():
    outputs = model(**inputs)
    embeddings = outputs.last_hidden_state

# `embeddings` now holds one vector per token of the input text, indexed as
# embeddings[sequence][token], so embeddings[0][0] is the vector of the first token.
# NOTE(review): that first token is presumably the tokenizer's special start token
# rather than the first word of `text` -- confirm against the tokenizer output.
# NOTE: the name `embeddings` is rebound to a helper function in a later cell.
embeddings[0][0]

tensor([-1.8697e-01, -2.4379e-01, -2.3030e-02, -2.1108e-01, -7.3668e-02,
        -3.1618e-01,  1.9299e-01,  4.2641e-01, -1.7054e-01, -2.9437e-01,
        -2.3479e-01, -1.4615e-01, -2.3357e-01,  7.5830e-02,  1.0693e-01,
         1.2220e-01, -1.6359e-01,  3.4595e-01,  7.2259e-02, -1.0983e-01,
        -5.0424e-02, -2.2420e-01, -1.6825e-01, -2.0311e-01,  2.2911e-01,
        -1.4189e-01,  6.1983e-02, -1.7148e-01, -2.8721e-01, -3.1866e-02,
        -5.4933e-02,  1.7424e-01, -1.2294e-02, -1.0613e-01,  3.2673e-02,
        -3.8624e-02,  2.3531e-01,  3.0975e-03,  1.2292e-01,  6.1515e-02,
        -2.1099e-01, -4.0411e-02,  3.4863e-01,  3.0660e-02,  2.1300e-01,
        -8.2312e-02, -2.0454e+00, -1.0816e-01, -2.8347e-01, -2.7752e-01,
        -1.7222e-01,  1.1456e-01,  2.4926e-01,  5.1308e-01,  8.2824e-02,
         1.0089e-01, -1.9477e-01,  3.7975e-01, -9.7084e-02,  1.4499e-01,
         1.8733e-01, -1.0721e-02, -2.6618e-02,  1.4232e-02,  6.7628e-02,
         1.0630e-01,  1.0006e-02,  3.4153e-01, -4.3

In [58]:
#download the data and organize it
%%capture
!rm -rf ./QA_data
!gdown '1VExFuOW3EerwnD0pVUh23XWi8UOygtNp'
!unzip "QA_data.zip" -d ./QA_data

#organize our data into the format the model expects
import json

# The `with` statement closes each file on exit, so no explicit close() is needed.
with open('QA_data/test_gpt_verified.json','r') as f:
  raw_test = json.load(f)

with open('QA_data/train_gpt15k_verified.json','r') as f:
  raw_train = json.load(f)

#define a function to organize data
def process_data(raw,name,sep = '<delimiter!>'):
  """Write the raw QA dict to `name` as JSON lines, one record per key.

  Every key of `raw` is split on `sep` into at least four fields:
  fields 0 and 1 are concatenated into the question stem, field 2 is a single
  string holding the answer options introduced by the markers 'A. ', 'B. ',
  'C. ' and 'D. ', and field 3 is the answer key. The value stored under the
  key is written out as the explanation.

  Args:
    raw: dict mapping the `sep`-joined key string to its explanation.
    name: path of the output file; it is opened in 'w' mode, so an existing
      file is overwritten.
    sep: delimiter separating the fields inside each key.

  Raises:
    IndexError: if a key splits into fewer than four fields.

  The markers are located left to right, each search starting after the
  previous marker that was found, so a marker-like substring inside an
  earlier option (e.g. 'D. ' in "CD. ") is not mistaken for a later option.
  An option whose marker is missing gets an empty text instead of a slice
  built from a -1 index.
  """
  markers = ['A. ','B. ','C. ','D. ']
  with open(name, 'w') as file:
    for record_id, key in enumerate(raw.keys()):
      fields = key.split(sep)
      options = fields[2]

      # Locate each marker in order; -1 records a marker that is absent.
      positions = []
      cursor = 0
      for marker in markers:
        pos = options.find(marker, cursor)
        positions.append(pos)
        if pos != -1:
          cursor = pos + len(marker)

      choice_list = []
      for idx, marker in enumerate(markers):
        start = positions[idx]
        if start == -1:
          text = ''
        else:
          # The option runs up to the next marker that was actually found,
          # or to the end of the string for the last one.
          following = [p for p in positions[idx + 1:] if p != -1]
          end = following[0] if following else len(options)
          text = options[start + len(marker):end]
        choice_list.append({'text':text, 'label': marker[0]})

      line = {'id': '%s'%record_id,
              'question': {'stem':fields[0] + fields[1],
                          'choices':choice_list},
              'answerKey':fields[3],
              'explanation': raw[key]}
      file.write(json.dumps(line) + '\n')


# Remove any JSONL output left over from a previous run.
!rm -rf test_data_QA.jsonl
!rm -rf train_data_QA.jsonl

# Regenerate both files from the raw dicts loaded above (default '<delimiter!>' separator).
process_data(raw_train, 'train_data_QA.jsonl')
process_data(raw_test,'test_data_QA.jsonl')

In [59]:
#collect some example questions for testing
N_EXAMPLES = 10  # number of leading records of the training file to keep
examples = []
# The `with` statement closes the file on exit, so no explicit close() is needed.
with open('train_data_QA.jsonl','r') as file:
  for count, line in enumerate(file, start=1):
    examples.append(json.loads(line))
    if count == N_EXAMPLES:
      break

examples

[{'id': '0',
  'question': {'stem': 'Music comes in many forms;most countries have a style of their own. Poland has its folks. Hungary has its czardas. Argentina is famous for the tango. The U.S.A.is known for jazz,the type of music that has gained worldwide popularity.\nJazz is an American contribution to popular music. While classical music follows formal European tradition,jazz is rather a free form. It is full of energy,expressing the moods,interests,and emotions of the people. It is breaking and exciting with a modern sound. In the 1920s jazz sounded like America. And so it does today.\nThe origins of the music are as interesting as the music itself. Jazz was invented by American Negroes,or blacks,as they are called today,who were brought to the southern states as slaves. They were sold to farm owners and forced to work long hours in the cotton and tobacco fields. This work was hard and life was short. When a Negro died his friends and  s would gather and carry the body to have a 

In [140]:
#define a helper function for generating embedding

# Same checkpoint as the one loaded in the example cell; loaded again here.
model = DistilBertModel.from_pretrained('distilbert-base-uncased')
def embeddings(text):
  """Return the DistilBERT last-layer hidden states for `text`.

  Args:
    text: the input string to embed.

  Returns:
    The `last_hidden_state` tensor the model produces for the tokenized
    text. Callers in this notebook average it over dim=1 to obtain a single
    vector per text.
  """
  # truncation=True caps the token sequence at the tokenizer's maximum length,
  # so a long passage or explanation cannot exceed what the model accepts.
  inputs = tokenizer(text, return_tensors="pt", truncation=True)

  # Inference only: no gradients are needed.
  with torch.no_grad():
    outputs = model(**inputs)
  return outputs.last_hidden_state

In [144]:
i = 5

# Explanation text and the four answer option texts of the selected example.
explanation = examples[i]['explanation']
A, B, C, D = [examples[i]['question']['choices'][k]['text'] for k in range(4)]

# One vector per text: average the hidden states over dim=1.
ex, exA, exB, exC, exD = [embeddings(t).mean(dim=1) for t in (explanation, A, B, C, D)]

print(examples[i])
# Cosine similarity between the explanation vector and each option vector.
simA = F.cosine_similarity(ex,exA).item()
simB = F.cosine_similarity(ex,exB).item()
simC = F.cosine_similarity(ex,exC).item()
simD = F.cosine_similarity(ex,exD).item()
print('cosine similarities with A: %s'%simA)
print('cosine similarities with B: %s'%simB)
print('cosine similarities with C: %s'%simC)
print('cosine similarities with D: %s'%simD)
print('correct answer: %s'%examples[i]['answerKey'])

{'id': '5', 'question': {'stem': 'Doctor and Robber\nOne night about nine o\'clock, Dr. Eyck, a surgeon  , had a phone call from Dr. Haydon at the hospital in Clens Falls. The surgeon was asked to go there at once to operate on a very sick boy who shot himself while playing with a gun. The doctor was soon on his way to Clens Falls. It was 60 miles away. And it was snowing heavily in the city. The surgeon thought he could get there before 12 o\'clock.A few minutes later, the doctor was stopped by a man in an old black coat. Gun in hand, the man ordered the doctor to get out. Then the man drove the car down the road, leaving the doctor in the falling snow.\nIt was after two o\'clock in the morning when the doctor arrived at the hospital in Clens Falls.Dr. Haydon told him that the boy had died an hour before.The two doctors walked by the door of the hospital waiting room. There sat the man in the old black coat with his head in his hands.\n"MR. Cunningham," said Dr. Haydon to the man, "Th

In [143]:
# Same comparison for a second example.
i = 9

# Explanation text and the four answer option texts of the selected example.
explanation = examples[i]['explanation']
A, B, C, D = [examples[i]['question']['choices'][k]['text'] for k in range(4)]

# One vector per text: average the hidden states over dim=1.
ex, exA, exB, exC, exD = [embeddings(t).mean(dim=1) for t in (explanation, A, B, C, D)]

print(examples[i])
print()
# Cosine similarity between the explanation vector and each option vector.
simA = F.cosine_similarity(ex,exA).item()
simB = F.cosine_similarity(ex,exB).item()
simC = F.cosine_similarity(ex,exC).item()
simD = F.cosine_similarity(ex,exD).item()
print('cosine similarities with A: %s'%simA)
print('cosine similarities with B: %s'%simB)
print('cosine similarities with C: %s'%simC)
print('cosine similarities with D: %s'%simD)
print('correct answer: %s'%examples[i]['answerKey'])

{'id': '9', 'question': {'stem': 'Medical experts  say most Americans do not get enough sleep. They say more Americans need to take a nap--that is to rest for a short time in the middle of the day. They give people advice to sleep lightly before continuing with other activities.The experts say naps might improve health by reducing pressure .\nSome western companies have supported the idea of napping for many years. They ask people to leave work, go home and have a nap before returning. In the United States, some companies let workers rest simply in their offices. They believe this can help workers make fewer mistakes and also increase   the amount of work that a person can do.\nSleep experts say it is likely that people make more mistakes at work than at other times. They say people should not carry out important tasks when they feel sleepy. And they say the best thing to do is to take a nap. About twenty minutes of rest is all you need. Experts say this provides extra energy and can i

In [145]:
# Same comparison for a third example.

i = 7

# Explanation text and the four answer option texts of the selected example.
explanation = examples[i]['explanation']
A, B, C, D = [examples[i]['question']['choices'][k]['text'] for k in range(4)]

# One vector per text: average the hidden states over dim=1.
ex, exA, exB, exC, exD = [embeddings(t).mean(dim=1) for t in (explanation, A, B, C, D)]

print(examples[i])
print()
# Cosine similarity between the explanation vector and each option vector.
simA = F.cosine_similarity(ex,exA).item()
simB = F.cosine_similarity(ex,exB).item()
simC = F.cosine_similarity(ex,exC).item()
simD = F.cosine_similarity(ex,exD).item()
print('cosine similarities with A: %s'%simA)
print('cosine similarities with B: %s'%simB)
print('cosine similarities with C: %s'%simC)
print('cosine similarities with D: %s'%simD)
print('correct answer: %s'%examples[i]['answerKey'])

{'id': '7', 'question': {'stem': "As a foreigner,I don' t know the situation in the US well. Seeing all kinds of recent shooting incidents,I think it should be controlled more strictly than now. Somebody said that the person who has it in mind to kill another can do it without a gun,so the causes of murders are not guns but the trend of despising   life in the US. And they also insist that there is no clear evidence that the increasing number of murders is connected with possession  of guns,so the personal rights should not be restricted  by the reason that there is not enough evidence.\nSeveral months ago,I agreed with that partly. But as we know,the private groups like NRA have used the public opinion and persuaded the government to protect their profit. Other weapons such as knives,razors ,bats are made for their own usage. But guns are made for only one reason--to kill someone. Of course,there are people who have guns to protect themselves from the criminals. But crime is getting m