In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import f1_score

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

from sklearn.naive_bayes import GaussianNB


In [2]:
# source: https://stackabuse.com/levenshtein-distance-and-text-similarity-in-python/
def levenshtein_dist(seqa, seqb):
    """Return the case-insensitive Levenshtein edit distance between two strings.

    Both arguments are lower-cased before being compared. The distance is read
    from a NumPy float matrix, so it is returned as a float (e.g. ``3.0``).
    """
    left = seqa.lower()
    right = seqb.lower()
    n_rows, n_cols = len(left) + 1, len(right) + 1

    # table[r, c] holds the distance between left[:r] and right[:c].
    table = np.zeros((n_rows, n_cols))
    table[:, 0] = np.arange(n_rows)  # turning a prefix into "" costs r deletions
    table[0, :] = np.arange(n_cols)  # building a prefix from "" costs c insertions

    for row, left_char in enumerate(left, start=1):
        for col, right_char in enumerate(right, start=1):
            # Matching characters extend the diagonal for free.
            substitution_cost = 0 if left_char == right_char else 1
            table[row, col] = min(
                table[row - 1, col] + 1,
                table[row - 1, col - 1] + substitution_cost,
                table[row, col - 1] + 1,
            )
    return table[-1, -1]

In [3]:
# Load the train and test question sets: semicolon-separated files whose first
# column is used as the row index.
train_questions, test_questions = (
    pd.read_csv(csv_name, sep=";", index_col=0)
    for csv_name in ("train_questions.csv", "test_questions.csv")
)

In [4]:
# Lookup table: cognitive domain -> verbs taken as indicators of that domain.
# (The six keys look like the revised Bloom's taxonomy levels - confirm.)
# Capitalisation is mixed here; the cell that builds `domain_verb` lower-cases
# both the domain names and the verbs.
domain_verbs = {
    "Remember": [
        "Remember", "list", "locate", "name", "recognize",
        "state", "describe", "recall", "repeat", "retrieve",
    ],
    "Understand": [
        "Understand", "conclude", "define", "illustrate", "predict", "tell",
        "identify", "paraphrase", "summarize", "categorize", "classify",
        "discuss", "match", "sort", "compare", "contrast", "explain",
    ],
    "Apply": ["apply"],
    "Analyze": ["analyze"],
    "Evaluate": ["evaluate"],
    "Create": ["create"],
}

In [4]:
# domain_verbs={"Remember":["Remember","RECOGNIZe","identify","RECALL","retrieve","Choose","Define","Find","How","Label","List","Match","Name","Omit","Recall","Relate","Select","Show","Spell","Tell","What","When","Where","Which","Who","Why","describe"],
#               "Understand":["Understand","INTERPRET","clarify","paraphrase","represent","translate","EXEMPLIFY","illustrate","instantiate","CLASSIFY","categorize","subsume","SUMMARIZe","abstract","generalize","INFERR","conclude","extrapolate","interpolate","predice","COMPARe","contrast","map","match","EXPLAIN","construct models","Classify","Compare","Contrast","Demonstrate","Extend","Illustrate","Infer","Interpret","Outline","Relate","Rephrase","Show","Summarize","Translate"],
#               "Apply":["EXECUTe","carry out","IMPLEMENT","use","Apply","Build","Choose","Construct","Develop","Experiment with","Identify","Interview","Make use of","Model","Organize","Plan","Select","Solve","Utilize"],
#               "Analyze":["DIFFERENTIATe","discriminate","distinguish","focus","select","ORGANIZe","find","coherence","intergrate","outline","parse","structure","ATTRIBUTe","deconstruct","Analyze","Assume","Categorize","Classify","Compare","Conclusion","Contrast","Discover","Dissect","Distinguish","Divide","Examine","Function","Infere","Inspect","List","Motive","Relation","Simplify","Survey","Take part in","Test for","Theme"],
#               "Evaluate":["CHECK","coordinate","detect","monitor","test","CRITIQUe","judge","Agree","Appraise","Assess","Award","Choose","Compare","Conclude","Criteria","Criticize","Decide","Deduct","Defend","Determine","Disprove","Estimate","Evaluate","Explain","Importance","Influence","Interpret","Judge","Justify","Mark","Measure","Opinion","Perceive","Prioritize","Prove","Rate","Recommend","Rule on","Select","Support","Value"],
#               "Create":["GENERATe","hypothesize","PLAN","design","PRODUCe","construct","Adapt","Build","Change","Choose","Combine","Compile","Compose","Construct","Create","Delete","Design","Develop","Discuss","Elaborate","Estimate","Formulate","Happen","Imagine","Improve","Invent","Make up","Maximize","Minimize","Modify","Original","Originate","Plan","Predict","Propose","Solution","Solve","Suppose","Test","Theory"]}

In [5]:
# Borrow scikit-learn's word tokenizer as a plain callable (string -> tokens).
# The CountVectorizer itself is never fitted; only its tokenizer is kept.
# NOTE(review): with default settings the tokenizer appears to keep the original
# casing and to skip one-character tokens (the recorded outputs show "How",
# "Why", "an" but no single letters) - confirm against the CountVectorizer docs.
tokenize=CountVectorizer().build_tokenizer()

In [6]:
# Flatten `domain_verbs` into rows of [domain, verb], both lower-cased, and
# store them as a 2-column NumPy array.
# The outer loop is kept as a statement on purpose: `verbs` stays bound after it
# finishes, and the `distances=[...]` comprehension further down reads it.
domain_verb = []
for domain, verbs in domain_verbs.items():
    domain_verb.extend([domain.lower(), verb.lower()] for verb in verbs)
domain_verb = np.array(domain_verb)

In [7]:
# Key used to pick one training question for inspection
# (read as `train_questions.question[preg_num]` in later cells).
preg_num=25

In [8]:
# Show the text of the question selected by `preg_num`.
train_questions["question"][preg_num]

"When Sarah did the chromatography test, she saw that one black ink stayed black and didn't separate. She discovered that she had used a permanent marker. She wondered what would happen if she used a different liquid. She wrote in her science notebook, What if I used vinegar in place of the water? Can I get the pigments in permanent ink to separate? How will Sarah know if vinegar separated the permanent ink?"

In [9]:
# For every training question, compare each taxonomy verb with each question
# token and propose the domain that owns the most near matches
# (edit distance < 2, i.e. identical or one edit apart).
# `domain_proposed` gets exactly one entry per question; the entry is None when
# no verb is close to any token.
domain_proposed=[]

for process_question in train_questions.question:
    # Tokenize once per question rather than once per verb.
    question_tokens = tokenize(process_question)
    distances = [
        [domain, verb, tk_quest, levenshtein_dist(verb, tk_quest)]
        for domain, verb in domain_verb
        for tk_quest in question_tokens
    ]
    print(process_question)

    # Build the frame straight from the records: the distance stays numeric (no
    # round trip through a string array) and an empty record list still yields
    # a frame with the four expected columns.
    distances_df = pd.DataFrame(distances, columns=["domain", "verb", "question", "distance"])
    distances_df["distance"] = distances_df["distance"].astype("float32")

    close_matches = distances_df[distances_df["distance"] < 2].sort_values(by=["distance"])
    if close_matches.empty:
        # Nothing to count: record "no proposal" instead of failing when the
        # top row of an empty frequency table is requested.
        domain_proposed.append(None)
    else:
        freqs = pd.crosstab(close_matches["domain"], columns="count")
        domain_proposed.append(freqs.sort_values(by="count", ascending=False).index[0])

    print(close_matches)

You used several methods to separate and identify the substances in mock rocks. How did you separate the salt from the water?
          domain      verb  question  distance
51      remember  identify  identify       0.0
189     remember       how       How       0.0
1701       apply  identify  identify       0.0
387     remember      show       How       1.0
1421  understand      show       How       1.0
1541       apply       use      used       1.0
You used several methods to separate and identify the substances in mock rocks. How did you know the crystals were salt?
          domain      verb  question  distance
49      remember  identify  identify       0.0
181     remember       how       How       0.0
1624       apply  identify  identify       0.0
370     remember      show       How       1.0
481     remember     where      were       1.0
1357  understand      show       How       1.0
1471       apply       use      used       1.0
Ms. Teridann, a geologist, made a chart showing 

Susan has samples of 5 different foods. Using only the results of her experiment, how will Susan know which food contains the most sugar?
          domain   verb question  distance
197     remember    how      how       0.0
546     remember  which    which       0.0
404     remember   show      how       1.0
1485  understand   show      how       1.0
Diva's father told her she should not eat so many cookies because they were pure sugar. Diva decided to investigate the amount of sugar in Fruity Cream cookies. She performed the sugar test on 4 grams of pure sugar and on 4 grams of Fruity Cream cookies. The results are pictured at the right. Are Fruity Cream cookies pure sugar? What is your evidence?
         domain    verb question  distance
1298   remember    what     What       0.0
7906   evaluate    test     test       0.0
12556    create    test     test       0.0
1377   remember   where     were       1.0
8635   evaluate  decide  decided       1.0
Diva's father told her she should n

Below is a drawing of a river that carried sand, clay, and pebbles (past the river's mouth) to a lake. Why is the clay deposited where you indicated?
       domain   verb question  distance
572  remember  where    where       0.0
642  remember    why      Why       0.0
505  remember   what     that       1.0
617  remember    who      Why       1.0
Draw an arrow to show which direction the water flows in the river. (Rose Mountain is on the left and Happy Valley is on the right in the drawing.) Why do you think the water flows in that direction?
          domain   verb question  distance
667     remember   show     show       0.0
902     remember  which    which       0.0
1004    remember    why      Why       0.0
2500  understand   show     show       0.0
316     remember    how     show       1.0
817     remember   what     that       1.0
965     remember    who      Why       1.0
How does the water flow in a creek during a flood compared to normal water flow?
          domain     verb

Look at the circuit in the picture. The gap between the D-cell and the switch needs to be connected. Can you use a piece of aluminum foil to make the connection? Why or why not?
          domain  verb question  distance
880     remember   why      Why       0.0
882     remember   why      why       0.0
2401       apply   use      use       0.0
657     remember  tell     cell       1.0
846     remember   who      Why       1.0
848     remember   who      why       1.0
1674  understand   map      gap       1.0
Look at the diagram of a circuit at the right. Will the motor run? Why?
       domain verb question  distance
363  remember  why      Why       0.0
349  remember  who      Why       1.0
Only one circuit below will light the bulb. Which one will work? Why is that the only circuit that will work?
       domain   verb question  distance
491  remember  which    Which       0.0
537  remember    why      Why       0.0
434  remember   what     that       1.0
438  remember   what     that 

Wendy is making an electromagnet. She wrapped a long, insulated wire around an iron nail. What should Wendy do next to complete the electromagnet?
       domain  verb question  distance
474  remember  what     What       0.0
Greta used her hand to measure the length of the whiteboard in her classroom. She found that it was 14 hands long. Andre measured the whiteboard and found that it was 16 hands long. Explain why Greta and Andre got different measurements.
          domain     verb  question  distance
1085    remember      why       why       0.0
2176  understand  explain   Explain       0.0
6166    evaluate  explain   Explain       0.0
6431    evaluate  measure   measure       0.0
856     remember     what      that       1.0
868     remember     what      that       1.0
1043    remember      who       why       1.0
2941       apply      use      used       1.0
6449    evaluate  measure  measured       1.0
2 groups used marbles to weigh the same apple. Jin's group found the apple we

Nigel made solutions in the bottles as shown below. Solution A was 2 spoons solute in 50 milliliters water and was clear with no material on the bottom. Solution B was 4 spoons solute in 100 milliliters water and was clear with no material on the bottom. Nigel took 50 milliliters of solution from bottle A and put it in cup A and 50 milliliters of solution from bottle B and put it into cup B. He put the cups on each side of the balance. Which picture shows what would happen? A is heavier than B, B is heavier than A, (A and B are the same weight). Explain your choice.
           domain      verb   question  distance
2041     remember      what       what       0.0
2332     remember     which      Which       0.0
5093   understand   explain    Explain       0.0
14403    evaluate   explain    Explain       0.0
18311      create    happen     happen       0.0
19511      create  solution   Solution       0.0
19528      create  solution   Solution       0.0
19550      create  solution   solut

Jim used a solid and water to make Mixtures one (one spoon of solid in 100 milliliters water was clear with nothing on the bottom), 3 (3 spoons of solid in 100 milliliters water was clear with nothing on the bottom), 4 (4 spoons of solid in 100 milliliters water was clear with material on the bottom), and 5 (5 spoons of solid in 100 milliliters water was clear with material on the bottom) as shown below. He stirred each one and observed the results. Describe how Jim could separate the dissolved solid from the water in Mixture one. What happens to the solid and to the water?
           domain      verb  question  distance
887      remember       how       how       0.0
2112     remember      what      What       0.0
2704     remember  describe  Describe       0.0
1785     remember      show     shown       1.0
1796     remember      show       how       1.0
6532   understand      show     shown       1.0
6543   understand      show       how       1.0
7071        apply       use      us

Kate said: "An object has to move to produce sound." Do you agree with her? Why or why not?
        domain     verb question  distance
490   remember      why      Why       0.0
492   remember      why      why       0.0
2482  evaluate    agree    agree       0.0
3200    create  produce  produce       0.0
471   remember      who      Why       1.0
473   remember      who      why       1.0
3002  evaluate     rate     Kate       1.0
Why was the siren's sound designed to have this property?
       domain verb question  distance
250  remember  why      Why       0.0
240  remember  who      Why       1.0
Darla tied one end of a string around a doorknob and held the other end in her hand. When she plucked the string (pulled and let go quickly) she heard a sound. How would the pitch change if Darla pulled the string tighter?
          domain    verb question  distance
349     remember     how      How       0.0
856     remember    when     When       0.0
6913      create  change   change    

Look at the 2 solar water heaters shown here. The black triangular area is the solar collector in each container. If both heaters contain the same amount of water, would you expect the temperature change in heater X to be greater than, less than, or the same as the temperature change in heater Y after 30 minutes? (Same) Explain your answer.
           domain     verb question  distance
3013   understand  explain  Explain       0.0
8523     evaluate  explain  Explain       0.0
10009      create   change   change       0.0
10024      create   change   change       0.0
720      remember     name     same       1.0
740      remember     name     same       1.0
750      remember     name     Same       1.0
992      remember     show    shown       1.0
1283     remember    where     here       1.0
3718   understand     show    shown       1.0
The graph shows temperature data from 2 containers. One container had 100 milliliters of water and the other had 100 milliliters of dry soil. Each cont

Jack made a lifeboat out of a large paper cup. He decided to measure how far the boat sank when different numbers of marbles were added. His data are shown in the table. Identify the standard for Jack's experiment.
          domain      verb  question  distance
105     remember  identify  Identify       0.0
308     remember       how       how       0.0
794     remember      when      when       0.0
2880       apply  identify  Identify       0.0
5672    evaluate   measure   measure       0.0
641     remember      show       how       1.0
656     remember      show     shown       1.0
836     remember     where      were       1.0
2380  understand      show       how       1.0
2395  understand      show     shown       1.0
5152    evaluate    decide   decided       1.0
Explain in one sentence what each graph tells you. Graph A) number of tree rings versus time.
          domain     verb question  distance
324     remember     what     what       0.0
816   understand  explain  Explain   

As you move the multimeter leads from one bulb terminal to the next, what does it mean when the voltage reading jumps from 0 to 1.5?
       domain  verb question  distance
493  remember  what     what       0.0
521  remember  when     when       0.0
What is voltage?
      domain  verb question  distance
60  remember  what     What       0.0
What does a voltage reading of 0 tell you about the connection between a bulb terminal and a battery terminal?
       domain  verb question  distance
309  remember  tell     tell       0.0
320  remember  what     What       0.0
What does a voltage reading of 1.5 tell you about the connection between a bulb terminal and a battery terminal?
       domain  verb question  distance
309  remember  tell     tell       0.0
320  remember  what     What       0.0
Explain why you got a voltage reading of 1.5 for terminal 1 and the positive terminal.
          domain     verb question  distance
326     remember      why      why       0.0
663   understand  expl

In [10]:
# Display the domain proposed for each training question, in the order the
# questions were processed.
domain_proposed

['remember',
 'remember',
 'remember',
 'remember',
 'remember',
 'remember',
 'remember',
 'remember',
 'remember',
 'remember',
 'remember',
 'remember',
 'understand',
 'remember',
 'remember',
 'evaluate',
 'evaluate',
 'remember',
 'remember',
 'create',
 'create',
 'remember',
 'remember',
 'remember',
 'remember',
 'remember',
 'remember',
 'remember',
 'remember',
 'remember',
 'remember',
 'understand',
 'remember',
 'remember',
 'remember',
 'create',
 'remember',
 'remember',
 'remember',
 'analyze',
 'remember',
 'remember',
 'remember',
 'remember',
 'remember',
 'remember',
 'remember',
 'remember',
 'evaluate',
 'remember',
 'remember',
 'remember',
 'remember',
 'remember',
 'remember',
 'remember',
 'remember',
 'create',
 'remember',
 'create',
 'remember',
 'remember',
 'remember',
 'remember',
 'apply',
 'apply',
 'remember',
 'remember',
 'remember',
 'evaluate',
 'remember',
 'create',
 'remember',
 'remember',
 'remember',
 'create',
 'create',
 'create',
 'remem

In [11]:
# Frequency table: how many questions were assigned to each proposed domain.
pd.crosstab(
    index=np.array(domain_proposed),
    columns="count",
)

col_0,count
row_0,Unnamed: 1_level_1
analyze,4
apply,2
create,13
evaluate,15
remember,146
understand,2


In [12]:
# Flat list of every verb in `domain_verbs`, in dictionary order and with the
# original casing.
[verb for verb_group in domain_verbs.values() for verb in verb_group]

['Remember',
 'RECOGNIZe',
 'identify',
 'RECALL',
 'retrieve',
 'Choose',
 'Define',
 'Find',
 'How',
 'Label',
 'List',
 'Match',
 'Name',
 'Omit',
 'Recall',
 'Relate',
 'Select',
 'Show',
 'Spell',
 'Tell',
 'What',
 'When',
 'Where',
 'Which',
 'Who',
 'Why',
 'describe',
 'Understand',
 'INTERPRET',
 'clarify',
 'paraphrase',
 'represent',
 'translate',
 'EXEMPLIFY',
 'illustrate',
 'instantiate',
 'CLASSIFY',
 'categorize',
 'subsume',
 'SUMMARIZe',
 'abstract',
 'generalize',
 'INFERR',
 'conclude',
 'extrapolate',
 'interpolate',
 'predice',
 'COMPARe',
 'contrast',
 'map',
 'match',
 'EXPLAIN',
 'construct models',
 'Classify',
 'Compare',
 'Contrast',
 'Demonstrate',
 'Extend',
 'Illustrate',
 'Infer',
 'Interpret',
 'Outline',
 'Relate',
 'Rephrase',
 'Show',
 'Summarize',
 'Translate',
 'EXECUTe',
 'carry out',
 'IMPLEMENT',
 'use',
 'Apply',
 'Build',
 'Choose',
 'Construct',
 'Develop',
 'Experiment with',
 'Identify',
 'Interview',
 'Make use of',
 'Model',
 'Organize',

In [13]:
# Stand-alone example question used by the following cells to walk through the
# verb/token distance computation step by step.
text="Describe the relationship between the length of the string and the number of swings a swinger makes in 15 seconds."

In [14]:
# One record [domain, verb, token, edit distance] for every taxonomy verb paired
# with every token of `text`.
text_distances = [
    [domain_name, verb_name, token, levenshtein_dist(verb_name, token)]
    for domain_name, verb_name in domain_verb
    for token in tokenize(text)
]


In [15]:
# Tabulate the records; going through np.array turns every field into a string,
# so the distance column is cast back to a numeric dtype.
text_distances_df = pd.DataFrame(
    np.array(text_distances),
    columns=["domain", "verb", "question", "distance"],
).astype({"distance": "float32"})

In [16]:
# Verb/token pairs of `text` that are identical or one edit apart, closest first.
text_distances_df.loc[lambda frame: frame["distance"] < 2].sort_values(by="distance")

Unnamed: 0,domain,verb,question,distance
494,remember,describe,Describe,0.0


In [17]:
# Count, per domain, the verb/token pairs of `text` within edit distance < 2.
# This reads `text_distances_df` (built from `text`), not `distances_df`, which
# only holds the last question handled by the training loop.
# Row order does not affect the counts, so no sorting is needed beforehand.
freqs=pd.crosstab(text_distances_df[text_distances_df.distance<2].domain, columns="count")

In [18]:
# Domain with the highest count in `freqs`: first index label after sorting the
# counts in descending order.
freqs.sort_values(by="count", ascending=False).index[0]

'remember'

In [19]:
# Edit distance between each verb in `verbs` and each token of the question
# selected by `preg_num`, as [verb, token, distance] records.
# NOTE(review): `verbs` is not defined in this cell. On a top-to-bottom run it is
# whatever the loop that builds `domain_verb` left behind (the last domain's
# verb list) - confirm whether the full verb list was intended.
distances = [
    [candidate, token, levenshtein_dist(candidate, token)]
    for candidate in verbs
    for token in tokenize(train_questions.question[preg_num])
]

In [20]:
# Show `distances_df` ordered from closest to farthest verb/token pair.
# Optional pre-filter to hide exact matches (disabled):
# distances_df=distances_df.loc[distances_df.distance>0,:]
distances_df.sort_values(by="distance")

Unnamed: 0,domain,verb,question,distance
175,remember,why,Why,0.0
168,remember,who,Why,1.0
1374,create,plan,an,2.0
150,remember,when,open,2.0
1164,create,plan,an,2.0
345,understand,map,an,2.0
147,remember,when,Why,2.0
576,apply,plan,an,2.0
140,remember,what,Why,2.0
58,remember,how,an,3.0
